In [1]:
#import dependencies

import pandas as pd

In [2]:
#read csv file into df

ml_df = pd.read_csv("cumulative.csv")

#drop extra columns and null values from df

ml_df = ml_df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
cleaned_ml_df = ml_df.dropna(axis='columns', how='any')
cleaned_ml_df = cleaned_ml_df.dropna()

#display df

cleaned_ml_df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_duration,ra,dec
0,CONFIRMED,0,0,0,0,9.488036,170.53875,2.9575,291.93423,48.141651
1,CONFIRMED,0,0,0,0,54.418383,162.51384,4.507,291.93423,48.141651
2,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,1.7822,297.00482,48.134129
3,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,2.40641,285.53461,48.28521
4,CONFIRMED,0,0,0,0,2.525592,171.59555,1.6545,288.75488,48.2262


In [3]:
#create train test split data

#import dependencies

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

#assign x and y values

X = cleaned_ml_df.drop("koi_disposition", axis=1)
y = cleaned_ml_df["koi_disposition"]

#split training and testing data

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=115, stratify=y)

In [4]:
#display training set

X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_duration,ra,dec
7257,0,1,0,0,4.816611,133.763636,3.82353,281.9364,48.010059
824,0,0,0,0,162.04795,278.880573,9.8974,294.2655,44.7943
7281,0,1,0,0,22.161272,148.46378,3.699,297.07068,49.442451
3305,0,1,0,0,22.41832,172.797099,5.349,296.89938,43.707218
1759,0,0,0,0,6.314345,173.3097,2.796,285.72079,40.85696


In [5]:
#preprocessing and scaling data

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  return self.partial_fit(X, y)


In [6]:
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)
predictions = model.predict(X_test)

In [7]:
#display test and train scores

print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.7677401366234491
Testing Data Score: 0.7779171894604768


In [8]:
# Create the GridSearchCV model

from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [9]:
# Train the model with GridSearch

grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...... C=1, gamma=0.0001, score=0.7671404682274248, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.7695524884985362, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.4s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.7707112970711297, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.7671404682274248, total=   0.4s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.7695524884985362, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.7707112970711297, total=   0.3s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.7671404682274248, total=   0.3s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.7695524884985362, total=   0.3s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.7707112970711297, total=   0.3s
[CV] C=5, gamma=0.0001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   19.1s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [10]:
#print best model

print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'gamma': 0.0001}
0.7719224871044194


In [11]:
grid.score(X_train_scaled, y_train)

0.7699707235466332

In [12]:
predictions = grid.predict(X_test_scaled)
print(predictions)

['CONFIRMED' 'FALSE POSITIVE' 'FALSE POSITIVE' ... 'CANDIDATE'
 'FALSE POSITIVE' 'FALSE POSITIVE']


In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.54      0.53      0.54       562
     CONFIRMED       0.55      0.57      0.56       573
FALSE POSITIVE       0.98      0.98      0.98      1256

     micro avg       0.78      0.78      0.78      2391
     macro avg       0.69      0.69      0.69      2391
  weighted avg       0.78      0.78      0.78      2391

