In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("Data/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

# Select your features (columns)

In [4]:
# Set features. This will also be used as your x values.
X = df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period','koi_time0bk','koi_impact','koi_duration','koi_depth','koi_prad','koi_teq', 'koi_insol',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_slogg','koi_srad','ra', 'dec','koi_kepmag']]

Use koi_disposition for the y values

In [5]:
y = df['koi_disposition'].values.reshape(-1,1)
y

array([['CONFIRMED'],
       ['FALSE POSITIVE'],
       ['FALSE POSITIVE'],
       ...,
       ['CANDIDATE'],
       ['FALSE POSITIVE'],
       ['FALSE POSITIVE']], dtype=object)

# Create a Train Test Split
Use koi_disposition for the y values

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.6)
print(X_train.shape, y_train.shape)

(2796, 20) (2796, 1)


In [7]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
5979,0,1,0,0,15.582795,135.112903,0.759,3.76683,52760.0,14.0,397,5.86,439.3,1,3849,4.703,0.546,284.62024,38.969501,15.368
4323,0,0,0,0,151.13008,278.463,0.0529,3.49,358.0,4.41,578,26.4,4.6,1,6768,3.844,2.302,299.38071,40.535671,11.867
4122,0,1,1,0,7.014761,136.88345,0.687,3.945,122.1,1.02,928,175.85,27.5,1,5912,4.501,0.87,296.8017,43.654259,13.601
3117,0,0,0,0,96.384483,144.86825,0.095,5.253,591.0,2.41,400,6.03,10.4,1,5919,4.485,1.0,284.68292,41.592411,15.602
6612,1,0,0,0,379.42818,358.9906,0.006,11.24,403.9,1.89,252,0.95,11.1,1,5947,4.487,0.945,289.98419,41.918591,13.782


In [8]:
print(X.shape, y.shape)

(6991, 20) (6991, 1)


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [11]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [None]:
#y_train.ravel()

In [12]:
#pip install keras（only if needs)

In [13]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear', random_state = 0)

classifier.fit(X_train_scaled, y_train.ravel())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

# Train the Model



In [14]:
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.8029327610872675
Testing Data Score: 0.797854588796186


Make predictions

In [15]:
#Make prediction
predictions = classifier.predict(X_test)

In [16]:
#Make the confusion Matrix
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,predictions)

In [17]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,predictions)
accuracy

0.3909415971394517

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters
https://www.youtube.com/watch?v=CgmvAMiVKFE

In [18]:
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train_scaled, y_train.ravel())

In [20]:
accuracy = grid_search.best_score_
accuracy

0.8397711015736766

In [21]:
grid_search.best_params_

{'C': 1000, 'gamma': 0.7, 'kernel': 'rbf'}

In [41]:
print(f"Best Params: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

Best Params: {'C': 1000, 'gamma': 0.7, 'kernel': 'rbf'}
Best Score: 0.8397711015736766


In [45]:
classifier = SVC(kernel = 'rbf', gamma=0.7)
classifier.fit(X_train_scaled, y_train.ravel())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.7, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [46]:
# Predicting the Test set results
y_pred = classifier.predict(X_test_scaled)

In [47]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 333,  669,   15],
       [ 124,  934,   29],
       [   4,    0, 2087]], dtype=int64)

In [48]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.799523241954708

# Save the Model

In [49]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
best_model=SVC(kernel = 'rbf', gamma=0.7)


filename = 'best_model_svc.sav'
joblib.dump(best_model, filename)

['best_model_svc.sav']

In [50]:
# make dataframe to compare prediction and actual data
pd.DataFrame({"Prediction": predictions, "Actual": y_test.flatten()})

Unnamed: 0,Prediction,Actual
0,CANDIDATE,FALSE POSITIVE
1,CANDIDATE,FALSE POSITIVE
2,CANDIDATE,FALSE POSITIVE
3,CANDIDATE,CANDIDATE
4,CANDIDATE,CANDIDATE
...,...,...
4190,CANDIDATE,FALSE POSITIVE
4191,CANDIDATE,CONFIRMED
4192,FALSE POSITIVE,FALSE POSITIVE
4193,CANDIDATE,CONFIRMED
