In [33]:
# Update scikit-learn to prevent version mismatches
# (the PyPI package is named `scikit-learn`; `sklearn` is only the import name)

# %pip install --upgrade scikit-learn

In [34]:
# Install joblib, which is used to save the trained model.
# Restart your kernel after installing.

# %pip install joblib

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [36]:
# Show every column when displaying wide DataFrames (None removes the column limit)
pd.set_option("display.max_columns", None)

# Read the CSV and Perform Basic Data Cleaning

In [37]:
# Load the exoplanet table, then discard the columns that are entirely null,
# followed by every row that still contains a null value.
df = (
    pd.read_csv("../resources/exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,0.586,0.059,-0.443,4.507,0.116,-0.116,874.8,35.5,-35.5,2.83,0.32,-0.19,443,9.11,2.87,-1.62,25.8,2,5455,81,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10829.0,171.0,-171.0,14.6,3.92,-1.31,638,39.3,31.04,-10.49,76.3,1,5853,158,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,8079.2,12.8,-12.8,33.46,8.5,-2.83,1395,891.96,668.95,-230.35,505.6,1,5805,157,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,0.701,0.235,-0.478,1.6545,0.042,-0.042,603.3,16.9,-16.9,2.75,0.88,-0.35,1406,926.16,874.33,-314.24,40.9,1,6031,169,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,0.762,0.139,-0.532,3.1402,0.0673,-0.0673,686.0,18.7,-18.7,2.77,0.9,-0.3,1160,427.65,420.33,-136.7,40.2,2,6046,189,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select features

In [38]:
# Keep only the columns that do not hold error-estimate ("err") values
columns = list(df.columns)
new_columns = [name for name in columns if 'err' not in name]

new_df = df[new_columns]
new_df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,25.8,2,5455,4.467,0.927,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,76.3,1,5853,4.544,0.868,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,1,5805,4.564,0.791,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,40.9,1,6031,4.438,1.046,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,427.65,40.2,2,6046,4.486,0.972,296.28613,48.22467,15.714


In [39]:
# Features (the x values): every remaining column except the target.
X = new_df.drop(columns="koi_disposition")
# Target (the y values): `koi_disposition` as a single-column 2-D array.
y = new_df['koi_disposition'].to_numpy().reshape(-1, 1)

print(X.shape, y.shape)

(6991, 20) (6991, 1)


# Encode classes (y values)

In [40]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder expects a 1-D array of labels. `y` is a column vector, and
# passing it as-is makes scikit-learn emit a DataConversionWarning, so it is
# flattened first. `ravel()` leaves an already 1-D array unchanged.
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y.ravel())

# Visualize the first 50 encoded labels next to their original classes
for label, original_class in zip(encoded_y[0:50], y[0:50]):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class:

  return f(*args, **kwargs)


In [41]:
# Check class categories. The position of each class in `classes_` is its
# encoded integer label (e.g. 'CONFIRMED' at index 1 is encoded as 1).
label_encoder.classes_

array(['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE'], dtype=object)

In [42]:
# Create target names for our classification report.
# The order must match `label_encoder.classes_`, because classification_report
# pairs these names with the encoded labels in sorted order (0, 1, 2).
target_names = ['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE']

In [43]:
# Preview the integer-encoded target values
encoded_y

array([1, 2, 2, ..., 0, 2, 2])

# Create a Train Test Split

In [44]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (scikit-learn's default split size,
# written out explicitly); the fixed seed keeps the split reproducible.
X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(
    X,
    encoded_y,
    test_size=0.25,
    random_state=42,
)

In [45]:
# Preview the training features; the row labels are the original DataFrame
# index values, now in shuffled order after the split.
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
6122,0,0,0,0,6.768901,133.07724,0.15,3.616,123.1,1.24,1017,253.3,10.8,1,5737,4.327,1.125,294.40472,39.351681,14.725
6370,0,1,0,1,0.733726,132.02005,0.291,2.309,114.6,0.86,1867,2891.64,13.8,1,5855,4.578,0.797,284.50391,42.46386,15.77
2879,1,0,0,0,7.652707,134.46038,0.97,79.8969,641.1,3.21,989,226.81,254.3,1,6328,4.481,0.963,295.50211,38.98354,13.099
107,0,0,0,0,7.953547,174.66224,0.3,2.6312,875.4,2.25,696,55.37,38.4,1,4768,4.536,0.779,291.15878,40.750271,15.66
29,0,0,0,0,4.959319,172.258529,0.831,2.22739,9802.0,12.21,1103,349.4,696.5,1,5712,4.359,1.082,292.16705,48.727589,15.263


In [46]:
# Preview the encoded training labels
y_train_encoded

array([0, 2, 2, ..., 2, 2, 2])

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [47]:
# Scale
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training split only, so that nothing
# from the test split influences the scaling.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)

# Apply the same training-derived scaling to the test split
X_test_scaled = X_scaler.transform(X_test)

In [48]:
# Inspect the scaled training features (a NumPy array, no longer a DataFrame)
X_train_scaled

array([[0.        , 0.        , 0.        , ..., 0.66574567, 0.17604958,
        0.64129267],
       [0.        , 1.        , 0.        , ..., 0.21268467, 0.37354005,
        0.72766344],
       [1.        , 0.        , 0.        , ..., 0.71596223, 0.15268835,
        0.5069014 ],
       ...,
       [0.        , 1.        , 0.        , ..., 0.792823  , 0.59720043,
        0.30217373],
       [1.        , 0.        , 0.        , ..., 0.77821733, 0.29000226,
        0.52153071],
       [0.        , 0.        , 1.        , ..., 0.68082222, 0.27185353,
        0.61930738]])

# Train the Model



**Logistic Regression**

In [49]:
from sklearn.linear_model import LogisticRegression

# Baseline model using scikit-learn's default hyperparameters
# (e.g. C=1.0, solver='lbfgs', max_iter=100).
classifier = LogisticRegression()
classifier

LogisticRegression()

In [50]:
# Fit the baseline classifier on the scaled training features and encoded labels
classifier.fit(X_train_scaled, y_train_encoded)

LogisticRegression()

In [51]:
# Mean accuracy of the baseline classifier on each split, rounded to 4 places
train_score = classifier.score(X_train_scaled, y_train_encoded)
test_score = classifier.score(X_test_scaled, y_test_encoded)

print(f'Training Data Score: {round(train_score, 4)}')
print(f'Testing Data Score: {round(test_score, 4)}')

Training Data Score: 0.82
Testing Data Score: 0.8009


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [52]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Search over the inverse regularization strength C, with max_iter fixed at 500
param_grid = {
    'C': [1, 5, 10, 50],
    'max_iter': [500],
}
grid = GridSearchCV(estimator=classifier, param_grid=param_grid, verbose=3)

In [23]:
# Inspect the grid-search settings; the `estimator__*` entries are the
# parameters of the wrapped LogisticRegression before tuning.
grid.get_params()

{'cv': None,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__class_weight': None,
 'estimator__dual': False,
 'estimator__fit_intercept': True,
 'estimator__intercept_scaling': 1,
 'estimator__l1_ratio': None,
 'estimator__max_iter': 100,
 'estimator__multi_class': 'auto',
 'estimator__n_jobs': None,
 'estimator__penalty': 'l2',
 'estimator__random_state': None,
 'estimator__solver': 'lbfgs',
 'estimator__tol': 0.0001,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': LogisticRegression(),
 'n_jobs': None,
 'param_grid': {'C': [1, 5, 10, 50], 'max_iter': [500]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 3}

In [24]:
# Train the model with GridSearch.
# verbose=3 (set when `grid` was created) prints one line per cross-validation fit.
grid.fit(X_train_scaled, y_train_encoded)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ..............................C=1, max_iter=500; total time=   0.2s
[CV 2/5] END ..............................C=1, max_iter=500; total time=   0.1s
[CV 3/5] END ..............................C=1, max_iter=500; total time=   0.1s
[CV 4/5] END ..............................C=1, max_iter=500; total time=   0.1s
[CV 5/5] END ..............................C=1, max_iter=500; total time=   0.1s
[CV 1/5] END ..............................C=5, max_iter=500; total time=   0.5s
[CV 2/5] END ..............................C=5, max_iter=500; total time=   0.2s
[CV 3/5] END ..............................C=5, max_iter=500; total time=   0.2s
[CV 4/5] END ..............................C=5, max_iter=500; total time=   0.3s
[CV 5/5] END ..............................C=5, max_iter=500; total time=   0.2s
[CV 1/5] END .............................C=10, max_iter=500; total time=   0.3s
[CV 2/5] END .............................C=10, m

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [1, 5, 10, 50], 'max_iter': [500]}, verbose=3)

In [25]:
# Report the winning parameter combination and its mean cross-validated score
print(f'Best parameters: {grid.best_params_}')
print(f'Best score: {grid.best_score_}')

Best parameters: {'C': 50, 'max_iter': 500}
Best score: 0.8237634533798092


In [26]:
# Make predictions with the hypertuned model.
# With refit=True (the default), predict() uses the best estimator refit on
# the whole training set.
predictions = grid.predict(X_test_scaled)

In [27]:
# Classification Report
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out test set
report = classification_report(
    y_test_encoded,
    predictions,
    target_names=target_names,
)
print(report)

                precision    recall  f1-score   support

     CANDIDATE       0.61      0.62      0.62       411
     CONFIRMED       0.68      0.65      0.66       484
FALSE POSITIVE       0.98      1.00      0.99       853

      accuracy                           0.81      1748
     macro avg       0.76      0.76      0.76      1748
  weighted avg       0.81      0.81      0.81      1748



#### Observation on model performance
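* The best mean cross-validated accuracy found by the grid search is 0.824 (with `C=50`), and the weighted-average F1 score on the test set is 0.81.
* The model is best at predicting "FALSE POSITIVE" (F1 = 0.99); it is noticeably weaker at telling "CANDIDATE" (F1 = 0.62) apart from "CONFIRMED" (F1 = 0.66).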

# Save the Model

In [32]:
from pathlib import Path

import joblib

# Persist the fitted GridSearchCV object (it wraps the refit best estimator).
# NOTE: the fitted MinMaxScaler and LabelEncoder are not stored in this file,
# so new data must be scaled with the same fitted scaler before predict().
filename = '../saved_ml_models/logistic_regression.sav'

# joblib.dump does not create missing directories; make sure the target
# folder exists so a fresh "Restart & Run All" does not fail here.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(grid, filename)

['../saved_ml_models/logistic_regression.sav']