# Cleaning and Preprocessing Data for Machine Learning

In [None]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [1]:
import warnings
warnings.simplefilter('ignore')

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Read the csv file into a pandas DataFrame

exoplanets = pd.read_csv('Resources/cumulative.csv')
# Drop the null columns where all values are null
exoplanets = exoplanets.dropna(axis='columns', how='all')
# Drop the null rows
exoplanets = exoplanets.dropna()
exoplanets

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-211.0,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
5,6,10872983,K00756.01,Kepler-228 d,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-232.0,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
6,7,10872983,K00756.02,Kepler-228 c,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-232.0,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9309,9310,7347246,K03014.01,Kepler-1411 b,CONFIRMED,CANDIDATE,0.941,0,0,0,...,-236.0,4.454,0.054,-0.216,1.053,0.357,-0.119,286.36157,42.963921,15.831
9353,9354,8895758,K03106.01,Kepler-1427 b,CONFIRMED,CANDIDATE,0.877,0,0,0,...,-203.0,4.473,0.054,-0.216,1.000,0.322,-0.107,295.34967,45.114552,15.415
9355,9356,6196457,K00285.03,Kepler-92 d,CONFIRMED,FALSE POSITIVE,0.476,1,0,0,...,-80.0,4.050,0.033,-0.027,1.670,0.118,-0.082,289.08606,41.562958,11.565
9479,9480,7503885,K03417.01,Kepler-1494 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-205.0,4.437,0.072,-0.203,1.008,0.319,-0.137,282.65741,43.162521,15.214


## Select the features (columns)

In [3]:
# Set features. This will also be used as your x values.
selected_features = exoplanets[['koi_disposition', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_insol', 'koi_model_snr']]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
X = selected_features.drop("koi_disposition", axis=1)
y = selected_features["koi_disposition"]
print(X.shape, y.shape)

(2269, 5) (2269,)


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
X_train.head()

Unnamed: 0,koi_impact,koi_duration,koi_depth,koi_insol,koi_model_snr
5356,0.244,12.4124,3628.7,2.27,93.8
1851,0.218,3.485,138.3,842.45,15.9
2202,0.68,2.437,786.7,49.26,22.6
2949,0.479,2.3061,16042.0,252.22,1022.2
771,0.166,3.651,510.6,579.9,107.1


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [7]:
# Import dependencies
from sklearn.preprocessing import LabelEncoder, MinMaxScaler,StandardScaler
from tensorflow.keras.utils import to_categorical

# scale the data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# # Label-encode data set
# label_encoder = LabelEncoder()
# label_encoder.fit(y_train)
# encoded_y_train = label_encoder.transform(y_train)
# encoded_y_test = label_encoder.transform(y_test)

# print(encoded_y_test)

# Train the Model

In [8]:
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression()
# model.fit(X_train, y_train)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [9]:
print(model.score(X_train_scaled, y_train))

0.9994121105232217


In [10]:
print(model.score(X_test_scaled, y_test))

1.0


In [11]:
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)
# predictions = model.predict(X_test_scaled)
# model

SVC(kernel='linear')

In [12]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.9994121105232217
Testing Data Score: 1.0


In [None]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(encoded_y_test, predictions,
                           target_names =["confirmed", "false positive", "candidate"]))

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [13]:
from sklearn.model_selection import GridSearchCV

In [18]:
model.get_params().keys()

dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [19]:
# Create the GridSearchCV model
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [20]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ..................... C=1, gamma=0.0001, score=nan, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=1.000, total=   5.9s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.9s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.997, total=   7.2s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=1.000, total=  13.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.994, total=  45.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ..................... C=1, gamma=0.0005, score=nan, total=   0.0s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=1.000, total=   6.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.997, total=   7.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=1.000, total=  13.0s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[CV] .................. C=50, gamma=0.0001, score=1.000, total=  11.5s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.994, total=  21.3s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=1.000, total=  23.1s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.994, total= 1.5min
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................... C=50, gamma=0.0005, score=nan, total=   0.0s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=1.000, total=  12.0s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.994, total=  23.1s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 29.0min finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=3)

In [21]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 1, 'gamma': 0.0001}
nan


# Save the Model

In [22]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'Models/vRatcliff_logisticregression.sav'
joblib.dump(model, filename)

['Models/vRatcliff_logisticregression.sav']