In [1]:
# Update sklearn to prevent version mismatches

# !pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save model. 
# Restart your kernel after installing

# !pip install joblib

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Visualize more columns
pd.options.display.max_columns = None

# Read the CSV and Perform Basic Data Cleaning

In [5]:
df = pd.read_csv("../resources/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,0.586,0.059,-0.443,4.507,0.116,-0.116,874.8,35.5,-35.5,2.83,0.32,-0.19,443,9.11,2.87,-1.62,25.8,2,5455,81,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10829.0,171.0,-171.0,14.6,3.92,-1.31,638,39.3,31.04,-10.49,76.3,1,5853,158,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,8079.2,12.8,-12.8,33.46,8.5,-2.83,1395,891.96,668.95,-230.35,505.6,1,5805,157,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,0.701,0.235,-0.478,1.6545,0.042,-0.042,603.3,16.9,-16.9,2.75,0.88,-0.35,1406,926.16,874.33,-314.24,40.9,1,6031,169,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,0.762,0.139,-0.532,3.1402,0.0673,-0.0673,686.0,18.7,-18.7,2.77,0.9,-0.3,1160,427.65,420.33,-136.7,40.2,2,6046,189,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select features

In [6]:
# Selecting columns that don't consist error estiamte values
columns = list(df.columns)

new_columns = []
for column in columns:
    if 'err' not in column:
        new_columns.append(column)

new_df = df[new_columns]
new_df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,25.8,2,5455,4.467,0.927,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,76.3,1,5853,4.544,0.868,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,1,5805,4.564,0.791,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,40.9,1,6031,4.438,1.046,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,427.65,40.2,2,6046,4.486,0.972,296.28613,48.22467,15.714


In [7]:
# Set features. This will also be used as the x values.
X = new_df.drop("koi_disposition", axis=1)
# Use `koi_disposition` for the y values
y = new_df['koi_disposition'].values.reshape(-1, 1)

print(X.shape, y.shape)

(6991, 20) (6991, 1)


In [8]:
# Create a list of feature names for features selection
feature_names = list(X.columns)
feature_names

['koi_fpflag_nt',
 'koi_fpflag_ss',
 'koi_fpflag_co',
 'koi_fpflag_ec',
 'koi_period',
 'koi_time0bk',
 'koi_impact',
 'koi_duration',
 'koi_depth',
 'koi_prad',
 'koi_teq',
 'koi_insol',
 'koi_model_snr',
 'koi_tce_plnt_num',
 'koi_steff',
 'koi_slogg',
 'koi_srad',
 'ra',
 'dec',
 'koi_kepmag']

# Encode classes (y values)

In [9]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

# Visualize encoded y 
for label, original_class in zip(encoded_y[0:50], y[0:50]):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class:

  return f(*args, **kwargs)


In [10]:
# Check class categories
label_encoder.classes_

array(['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE'], dtype=object)

In [11]:
# Create target names for our classification report
target_names = ['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE']

In [12]:
encoded_y

array([1, 2, 2, ..., 0, 2, 2])

# Create a Train Test Split

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(X, encoded_y, random_state=42)

In [14]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
6122,0,0,0,0,6.768901,133.07724,0.15,3.616,123.1,1.24,1017,253.3,10.8,1,5737,4.327,1.125,294.40472,39.351681,14.725
6370,0,1,0,1,0.733726,132.02005,0.291,2.309,114.6,0.86,1867,2891.64,13.8,1,5855,4.578,0.797,284.50391,42.46386,15.77
2879,1,0,0,0,7.652707,134.46038,0.97,79.8969,641.1,3.21,989,226.81,254.3,1,6328,4.481,0.963,295.50211,38.98354,13.099
107,0,0,0,0,7.953547,174.66224,0.3,2.6312,875.4,2.25,696,55.37,38.4,1,4768,4.536,0.779,291.15878,40.750271,15.66
29,0,0,0,0,4.959319,172.258529,0.831,2.22739,9802.0,12.21,1103,349.4,696.5,1,5712,4.359,1.082,292.16705,48.727589,15.263


In [15]:
y_train_encoded

array([0, 2, 2, ..., 2, 2, 2])

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [16]:
# Scale
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
X_train_scaled

array([[0.        , 0.        , 0.        , ..., 0.66574567, 0.17604958,
        0.64129267],
       [0.        , 1.        , 0.        , ..., 0.21268467, 0.37354005,
        0.72766344],
       [1.        , 0.        , 0.        , ..., 0.71596223, 0.15268835,
        0.5069014 ],
       ...,
       [0.        , 1.        , 0.        , ..., 0.792823  , 0.59720043,
        0.30217373],
       [1.        , 0.        , 0.        , ..., 0.77821733, 0.29000226,
        0.52153071],
       [0.        , 0.        , 1.        , ..., 0.68082222, 0.27185353,
        0.61930738]])

# Train the Model



**Random Forests**

In [18]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=200)

In [19]:
# Fit
rf_model.fit(X_train_scaled, y_train_encoded)

RandomForestClassifier(n_estimators=200)

In [20]:
print(f'Training Data Score: {round(rf_model.score(X_train_scaled, y_train_encoded), 4)}')
print(f'Testing Data Score: {round(rf_model.score(X_test_scaled, y_test_encoded), 4)}')

Training Data Score: 1.0
Testing Data Score: 0.909


**Observation on model performance before feature selection**
* This model's r2 score on the test data is 0.907.

# Feature Selection

Use `feature_importances_` to filter out insignificant variables and retrain models with only the significant features.

In [21]:
# Sort the features by their importances
sorted_imp = sorted(zip(rf_model.feature_importances_, feature_names), reverse=True)
sorted_imp

[(0.13403106687082245, 'koi_fpflag_nt'),
 (0.13223743996814008, 'koi_model_snr'),
 (0.12342603607135198, 'koi_fpflag_co'),
 (0.10391879665888877, 'koi_fpflag_ss'),
 (0.06705898857431608, 'koi_prad'),
 (0.053890424506466854, 'koi_depth'),
 (0.045688997614096304, 'koi_fpflag_ec'),
 (0.04425550518303751, 'koi_period'),
 (0.04228787647065348, 'koi_impact'),
 (0.032520086995920704, 'koi_teq'),
 (0.030355040923125756, 'koi_duration'),
 (0.02889779901779093, 'koi_time0bk'),
 (0.027615570956835665, 'koi_insol'),
 (0.022371554627256072, 'koi_steff'),
 (0.021814018843663987, 'ra'),
 (0.021120943246911748, 'dec'),
 (0.020932626320317456, 'koi_kepmag'),
 (0.019900271021476953, 'koi_srad'),
 (0.01929574419244177, 'koi_slogg'),
 (0.008381211936485645, 'koi_tce_plnt_num')]

#### Let's see what happens if we remove less than half of the variables.
* Let's try removing anything that has an importance of less than 3% (0.03)

In [22]:
# Create a new list of columns (X features) for retrain
new_features = []

for pair in sorted_imp:
        if pair[0] >= 0.03:
            new_features.append(pair[1])
new_features

['koi_fpflag_nt',
 'koi_model_snr',
 'koi_fpflag_co',
 'koi_fpflag_ss',
 'koi_prad',
 'koi_depth',
 'koi_fpflag_ec',
 'koi_period',
 'koi_impact',
 'koi_teq',
 'koi_duration']

In [23]:
# Recreate X
X_retrain = X[new_features]
X_retrain.columns

Index(['koi_fpflag_nt', 'koi_model_snr', 'koi_fpflag_co', 'koi_fpflag_ss',
       'koi_prad', 'koi_depth', 'koi_fpflag_ec', 'koi_period', 'koi_impact',
       'koi_teq', 'koi_duration'],
      dtype='object')

In [24]:
# Scale
X_train_retrain, X_test_retrain, y_train_encoded_retrain, y_test_encoded_retrain = train_test_split(X_retrain, encoded_y, random_state=42)
X_scaler_retrain = MinMaxScaler().fit(X_train_retrain)
X_train_scaled_retrain = X_scaler_retrain.transform(X_train_retrain)
X_test_scaled_retrain = X_scaler_retrain.transform(X_test_retrain)

# Retrain

Now we have a new list of features that we think are more significant, we'll use them to retrain the model

In [25]:
rf_model_retrain = RandomForestClassifier(n_estimators=200)
rf_model_retrain.fit(X_train_scaled_retrain, y_train_encoded_retrain)

RandomForestClassifier(n_estimators=200)

In [26]:
print(f'Training Data Score: {round(rf_model_retrain.score(X_train_scaled_retrain, y_train_encoded_retrain), 4)}')
print(f'Testing Data Score: {round(rf_model_retrain.score(X_test_scaled_retrain, y_test_encoded_retrain), 4)}')

Training Data Score: 1.0
Testing Data Score: 0.8993


**Observation on model performance before feature selection**
* This model's r2 score on the test data is 0.895.
* The score is not better than the original score using all the features.

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

#### Since the model before feature selections performed slightly better, we will use that model for hyperparameter tuning.

In [27]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Try different numbers of trees
param_grid = {'n_estimators': [50, 100, 200, 300]}
grid = GridSearchCV(rf_model, param_grid, verbose=3)

In [28]:
rf_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [29]:
grid.get_params()

{'cv': None,
 'error_score': nan,
 'estimator__bootstrap': True,
 'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': 'auto',
 'estimator__max_leaf_nodes': None,
 'estimator__max_samples': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_impurity_split': None,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 200,
 'estimator__n_jobs': None,
 'estimator__oob_score': False,
 'estimator__random_state': None,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': RandomForestClassifier(n_estimators=200),
 'n_jobs': None,
 'param_grid': {'n_estimators': [50, 100, 200, 300]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 3}

In [30]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train_encoded)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ................................n_estimators=50; total time=   0.6s
[CV 2/5] END ................................n_estimators=50; total time=   1.1s
[CV 3/5] END ................................n_estimators=50; total time=   1.0s
[CV 4/5] END ................................n_estimators=50; total time=   0.6s
[CV 5/5] END ................................n_estimators=50; total time=   0.5s
[CV 1/5] END ...............................n_estimators=100; total time=   1.7s
[CV 2/5] END ...............................n_estimators=100; total time=   1.3s
[CV 3/5] END ...............................n_estimators=100; total time=   1.4s
[CV 4/5] END ...............................n_estimators=100; total time=   1.1s
[CV 5/5] END ...............................n_estimators=100; total time=   1.1s
[CV 1/5] END ...............................n_estimators=200; total time=   2.5s
[CV 2/5] END ...............................n_est

GridSearchCV(estimator=RandomForestClassifier(n_estimators=200),
             param_grid={'n_estimators': [50, 100, 200, 300]}, verbose=3)

In [31]:
print(f'Best parameters: {grid.best_params_}')
print(f'Best score: {grid.best_score_}')

Best parameters: {'n_estimators': 300}
Best score: 0.8941410940262993


In [39]:
# Make predictions with the hypertuned model
predictions_tuned = grid.predict(X_test_scaled)

In [40]:
# Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test_encoded, predictions_tuned,
                            target_names=target_names))

                precision    recall  f1-score   support

     CANDIDATE       0.84      0.78      0.81       411
     CONFIRMED       0.83      0.86      0.84       484
FALSE POSITIVE       0.98      1.00      0.99       853

      accuracy                           0.91      1748
     macro avg       0.88      0.88      0.88      1748
  weighted avg       0.91      0.91      0.91      1748



#### Observation on model performance
* This model's best score is 0.895 when the number of trees = 300, and the weighted average is 0.90.
* It is best at predicting "False Positive".
* There is virtually no difference between the tuned model and the original model using n_estimators=200 which has a score of 0.907 on the test data. Thus in order to minimize saved model's file size, we will go ahead and save the original model below.

In [41]:
# Classification report on the original model (n_estimators=200)
predictions_original = rf_model.predict(X_test_scaled)
print(classification_report(y_test_encoded, predictions_original,
                            target_names=target_names))

                precision    recall  f1-score   support

     CANDIDATE       0.84      0.79      0.81       411
     CONFIRMED       0.84      0.86      0.85       484
FALSE POSITIVE       0.98      1.00      0.99       853

      accuracy                           0.91      1748
     macro avg       0.89      0.88      0.88      1748
  weighted avg       0.91      0.91      0.91      1748



# Save the Model

In [38]:
import joblib

# We will 
filename = '../saved_ml_models/random_forests.sav'
joblib.dump(rf_model, filename)

['../saved_ml_models/random_forests.sav']