In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data500rows.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [3]:
# selected features based on feature importance from the Random Forest Model
selected_features = df[['koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth', 
                        'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr']]
X = selected_features.values
print(X.shape)

(500, 9)


# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
# prepare y values
y = df['koi_disposition'].values
y

array(['CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED',
       'CONFIRMED', 'CONFIRMED', 'CONFIRMED', 'CONFIRMED', 'CONFIRMED',
       'CONFIRMED', 'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE',
       'FALSE POSITIVE', 'CONFIRMED', 'CONFIRMED', 'FALSE POSITIVE',
       'CONFIRMED', 'FALSE POSITIVE', 'CONFIRMED', 'CONFIRMED',
       'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'FALSE POSITIVE',
       'CONFIRMED', 'CONFIRMED', 'CONFIRMED', 'CONFIRMED', 'CANDIDATE',
       'FALSE POSITIVE', 'CONFIRMED', 'FALSE POSITIVE', 'CONFIRMED',
       'CONFIRMED', 'CONFIRMED', 'CONFIRMED', 'CONFIRMED',
       'FALSE POSITIVE', 'CONFIRMED', 'CONFIRMED', 'CONFIRMED',
       'CONFIRMED', 'FALSE POSITIVE', 'CONFIRMED', 'CONFIRMED',
       'CONFIRMED', 'CANDIDATE', 'CONFIRMED', 'CONFIRMED', 'CANDIDATE',
       'CANDIDATE', 'CONFIRMED', 'CONFIRMED', 'CONFIRMED', 'CONFIRMED',
       'CONFIRMED', 'CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE',
       'CONFIRMED', 'CONFIRMED', 'FALSE POSITIV

In [5]:
# Step 1: Label-encode data set

label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)
encoded_y

array([1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1,
       2, 2, 2, 1, 1, 1, 1, 0, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 2, 1, 1, 2,
       0, 1, 1, 1, 1, 1, 1, 2, 0, 2, 1, 1, 2, 2, 1, 0, 1, 2, 1, 1, 1, 1,
       1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 2,
       2, 1, 2, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1,
       1, 2, 1, 0, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       1, 1, 0, 2, 2, 0, 1, 2, 0, 2, 1, 1, 1, 2, 0, 2, 2, 2, 2, 2, 0, 2,
       2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2,

In [6]:
# 0 = 'CANDIDATE'; 1 = 'CONFIRMED'; 2 = 'FALSE POSITIVE'
y_labels = ['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE']
y_labels

['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE']

In [7]:

X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, random_state=1)
y_train

array([1, 1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 2, 2, 1, 0, 2, 2, 1, 1, 2, 2,
       1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 0, 2, 2, 2, 1, 1, 1, 0,
       2, 1, 1, 1, 2, 1, 1, 0, 1, 1, 2, 1, 1, 2, 1, 1, 1, 0, 1, 2, 1, 2,
       1, 0, 0, 1, 0, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 0, 1, 2, 0, 1,
       2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 0, 2, 1, 0, 1, 1,
       1, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 1,
       1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 0, 1, 1, 1, 2, 1, 2, 2, 1,
       1, 2, 1, 0, 0, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 2, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1,
       0, 1, 2, 1, 1, 2, 2, 2, 1, 0, 1, 2, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1,
       2, 2, 1, 2, 0, 2, 2, 1, 1, 2, 2, 1, 0, 2, 1, 2, 1, 1, 1, 1, 0, 1,
       1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2,

In [8]:
X_train

array([[2.59340641e+01, 1.49762730e+02, 1.40000000e-02, ...,
        6.95000000e+02, 5.51000000e+01, 4.22000000e+01],
       [9.27859786e-01, 1.33905978e+02, 7.95000000e-01, ...,
        1.36100000e+03, 8.13300000e+02, 9.98000000e+01],
       [1.30607936e+01, 1.37631480e+02, 7.18000000e-01, ...,
        7.34000000e+02, 6.86600000e+01, 1.94000000e+01],
       ...,
       [2.44199803e+00, 1.34570950e+02, 8.56000000e-01, ...,
        1.28200000e+03, 6.38050000e+02, 2.72000000e+01],
       [1.87464901e+01, 1.47130740e+02, 5.08000000e-01, ...,
        6.86000000e+02, 5.23800000e+01, 2.67000000e+01],
       [2.81650490e+00, 1.70967685e+02, 1.05000000e-01, ...,
        1.21900000e+03, 5.21130000e+02, 1.40680000e+03]])

# linear kernel

In [9]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(kernel='linear')

In [10]:
# Model Accuracy
print(f"Test Accuracy: {model.score(X_test, y_test)}")

Test Accuracy: 0.744


In [11]:
# Calculate classification report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions, target_names=y_labels))

                precision    recall  f1-score   support

     CANDIDATE       0.50      0.06      0.11        17
     CONFIRMED       0.73      0.90      0.81        63
FALSE POSITIVE       0.78      0.78      0.78        45

      accuracy                           0.74       125
     macro avg       0.67      0.58      0.56       125
  weighted avg       0.72      0.74      0.70       125



# polynomial kernel

In [12]:
model = SVC(kernel = 'poly')
model.fit(X_train, y_train)

SVC(kernel='poly')

In [13]:
# Model Accuracy
print(f"Test Accuracy: {model.score(X_test, y_test)}")

Test Accuracy: 0.504


In [14]:
# Calculate classification report

predictions = model.predict(X_test)
print(classification_report(y_test, predictions, target_names=y_labels))

                precision    recall  f1-score   support

     CANDIDATE       0.00      0.00      0.00        17
     CONFIRMED       0.51      0.95      0.66        63
FALSE POSITIVE       0.43      0.07      0.12        45

      accuracy                           0.50       125
     macro avg       0.31      0.34      0.26       125
  weighted avg       0.41      0.50      0.38       125



  _warn_prf(average, modifier, msg_start, len(result))


# gaussian kernel

In [15]:
# Support vector machine gaussian classifier

model = SVC(kernel = 'rbf')
model.fit(X_train, y_train)

SVC()

In [16]:
# Model Accuracy
print(f"Test Accuracy: {model.score(X_test, y_test)}")

Test Accuracy: 0.52


In [17]:
# Calculate classification report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions, target_names=y_labels))

                precision    recall  f1-score   support

     CANDIDATE       0.00      0.00      0.00        17
     CONFIRMED       0.52      0.95      0.67        63
FALSE POSITIVE       0.56      0.11      0.19        45

      accuracy                           0.52       125
     macro avg       0.36      0.35      0.29       125
  weighted avg       0.46      0.52      0.40       125



  _warn_prf(average, modifier, msg_start, len(result))


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [18]:
# Create the GridSearchCV model
# linear kernel does not use gamma parameter, so only C
# linear kernel takes much much longer than rbf and poly, so I take out linear
# n_jobs=-1 means it willl use all cpu cores/threads, my pc reached 100% cpu load!

parameters = {'C':[1, 5, 10], 'gamma':[0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, parameters, verbose=5, n_jobs=4)


In [19]:
# Train the model with GridSearch
# this fit step takes a long time!!!
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:    0.9s finished


GridSearchCV(estimator=SVC(), n_jobs=4,
             param_grid={'C': [1, 5, 10],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=5)

In [20]:
sorted(grid.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_gamma',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [21]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 1, 'gamma': 0.0005}
0.608


In [22]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)

In [23]:
# Calculate classification report
 
print(classification_report(y_test, predictions,
                            target_names=y_labels))

                precision    recall  f1-score   support

     CANDIDATE       0.00      0.00      0.00        17
     CONFIRMED       0.50      1.00      0.67        63
FALSE POSITIVE       0.00      0.00      0.00        45

      accuracy                           0.50       125
     macro avg       0.17      0.33      0.22       125
  weighted avg       0.25      0.50      0.34       125



  _warn_prf(average, modifier, msg_start, len(result))


# Save the Model

In [24]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'jamesye_svm.sav'
joblib.dump(grid, filename)

['jamesye_svm.sav']

# Load the Model

In [25]:
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test, y_test)
result

0.504