In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("cumulative.csv")
# df.columns

In [3]:
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", \
                      "koi_pdisposition", "koi_score", "koi_tce_delivname", \
                      "koi_period_err1", "koi_period_err2", "koi_time0bk_err1", \
                      "koi_time0bk_err2", "koi_impact_err1", "koi_impact_err2", \
                      "koi_duration_err1", "koi_duration_err2", "koi_depth_err1", \
                      "koi_depth_err2", "koi_prad_err1", "koi_prad_err2", \
                      "koi_teq_err1", "koi_teq_err2", "koi_insol_err1", \
                      "koi_insol_err2", \
                      "koi_tce_plnt_num", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,170.53875,0.146,2.9575,615.8,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [20]:
df.shape

(6904, 26)

In [5]:
# - NOT DISPOSITIONED, CANDIDATE drop values...
# df.loc[df['column_name'].isin(some_values)]
df = df.loc[(df['koi_disposition'] == "CONFIRMED") | (df['koi_disposition'] == "FALSE POSITIVE")]


In [6]:
len(df)

6904

# Create a Train Test Split

Use `koi_disposition` for the y values

In [7]:
y = df['koi_disposition']
# X = df
X = df.drop(columns=['koi_disposition'], axis=1)

### Do we need to LabelEncode Y?

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
7297,1,0,0,0,0.705891,131.62062,0.511,5.281,15.0,1.0,...,-464.0,4.077,0.242,-0.198,2.449,0.838,-0.838,294.20782,39.789291,11.321
6000,0,1,0,0,0.681138,131.57697,0.968,2.7287,105190.0,45.75,...,-195.0,4.474,0.094,-0.162,0.864,0.21,-0.113,292.02686,37.600651,12.545
8388,1,0,0,1,1.507101,132.09322,0.336,5.8097,497660.0,131.16,...,-309.0,4.099,0.19,-0.171,1.748,0.505,-0.454,297.26019,43.306358,14.15
5080,1,0,0,0,26.936878,157.7573,0.2807,4.969,246.9,1.57,...,-202.0,4.442,0.066,-0.281,0.998,0.407,-0.109,296.19614,45.634331,15.364
2949,0,0,0,0,4.454194,172.820223,0.479,2.3061,16042.0,11.76,...,-159.0,4.425,0.136,-0.221,0.915,0.242,-0.138,299.90591,45.439678,14.995


# Pre-processing

Scale the data using the MinMaxScaler

In [9]:
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Train the Support Vector Machine

In [10]:
from sklearn.svm import SVC 
model2 = SVC(kernel='linear')
model2.fit(X_train_scaled, y_train)

predictions = model2.predict(X_test)

In [11]:
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Training Data Score: 0.985129393588258
Testing Data Score: 0.9831981460023175


# Hyperparameter Tuning

Use `GridSearchCV` to tune the `C` and `gamma` parameters

In [12]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model2, param_grid, verbose=3)


In [13]:
from datetime import datetime

current_time = datetime.now()
print("Current Time = ", current_time)

Current Time =  2019-12-17 14:55:55.161982


In [14]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.964, total= 8.0min
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  8.0min remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.948, total=19.3min
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 27.3min remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.969, total=14.6min
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.965, total= 8.7min
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.959, total=15.3min
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.964, total= 7.8min
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.948, total=17.9min
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.969, total=14.5min
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.965, total= 8.6min
[CV] C=1, gamma=0.001 ................................................


KeyboardInterrupt: 

In [17]:


current_time = datetime.now()
print("Current Time = ", current_time)

Current Time =  2019-12-17 17:06:43.550297


In [None]:

print(grid.best_params_)
print(grid.best_score_)