In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the full dataset and clean it in one chained expression:
#   1) discard columns in which every value is null,
#   2) discard any remaining row that still contains a null.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [3]:
# Columns used as model inputs (the x values).
# Disposition flag columns and the *_err1 / *_err2 columns are deliberately left out.
feature_columns = [
    'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration',
    'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol',
    'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_slogg',
    'koi_srad', 'ra', 'dec', 'koi_kepmag',
]
selected_features = df[feature_columns]
X = selected_features.values
# Sanity check: (number of rows, number of features).
print(X.shape)

(6991, 16)


# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
# Target vector: the raw disposition label of every row.
y = df['koi_disposition'].values

# Convert the string labels to integer class codes in a single step;
# the fitted encoder stays available as label_encoder.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y)

encoded_y

array([1, 2, 2, ..., 0, 2, 2])

In [5]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_y,
    random_state=1,
)

In [6]:
# Inspect the training feature matrix produced by the split.
X_train

array([[1.05484133e+01, 1.39064020e+02, 1.01700000e+00, ...,
        2.98095430e+02, 4.47370610e+01, 1.32040000e+01],
       [2.47543849e+01, 1.40207320e+02, 7.09000000e-01, ...,
        2.95735350e+02, 4.25762480e+01, 1.55140000e+01],
       [1.05733568e+00, 1.31792007e+02, 2.62000000e-01, ...,
        2.92184170e+02, 4.93100400e+01, 1.54140000e+01],
       ...,
       [1.07027283e+00, 1.31835891e+02, 1.26900000e+00, ...,
        2.99170620e+02, 4.08710590e+01, 1.59260000e+01],
       [1.87464901e+01, 1.47130740e+02, 5.08000000e-01, ...,
        2.88164700e+02, 4.58165090e+01, 1.58430000e+01],
       [2.10907909e+01, 1.33963160e+02, 8.32000000e-01, ...,
        2.85475040e+02, 4.80349310e+01, 1.42230000e+01]])

# Random Forest Training

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: only the seed is set, every other hyperparameter is left
# at its default. fit() hands back the estimator, so construction and training
# are chained into one assignment.
rf_model = RandomForestClassifier(random_state=1).fit(X_train, y_train)

In [8]:
# Accuracy of the baseline forest on the training rows and on the held-out test rows.
train_score = rf_model.score(X_train, y_train)
test_score = rf_model.score(X_test, y_test)
print(f"train_score: {train_score}")
print(f"test_score: {test_score}")

# Pair each importance value with its column name, in column order.
# zip() is a single-pass iterator, so it is materialised into a list here;
# otherwise the variable would be left exhausted (empty) after its first display.
feature_importance = list(zip(rf_model.feature_importances_, selected_features.columns))
feature_importance


train_score: 1.0
test_score: 0.7654462242562929


[(0.06983886619639082, 'koi_period'),
 (0.056339065884505504, 'koi_time0bk'),
 (0.08204143111698879, 'koi_impact'),
 (0.06885226911480116, 'koi_duration'),
 (0.08170508570901058, 'koi_depth'),
 (0.12124406537329678, 'koi_prad'),
 (0.0611652648297357, 'koi_teq'),
 (0.061253698645244145, 'koi_insol'),
 (0.1412132144321545, 'koi_model_snr'),
 (0.0156662982924218, 'koi_tce_plnt_num'),
 (0.03984142338687951, 'koi_steff'),
 (0.03638505632707633, 'koi_slogg'),
 (0.03641348293926122, 'koi_srad'),
 (0.04756430572012431, 'ra'),
 (0.04070664542470149, 'dec'),
 (0.039769826607407334, 'koi_kepmag')]

In [9]:
# Rank the (importance, column) pairs from most to least important.
# sorted() already returns a list, so it is displayed directly.
sorted_feature_importance = sorted(
    zip(rf_model.feature_importances_, selected_features.columns),
    reverse=True,
)
sorted_feature_importance


[(0.1412132144321545, 'koi_model_snr'),
 (0.12124406537329678, 'koi_prad'),
 (0.08204143111698879, 'koi_impact'),
 (0.08170508570901058, 'koi_depth'),
 (0.06983886619639082, 'koi_period'),
 (0.06885226911480116, 'koi_duration'),
 (0.061253698645244145, 'koi_insol'),
 (0.0611652648297357, 'koi_teq'),
 (0.056339065884505504, 'koi_time0bk'),
 (0.04756430572012431, 'ra'),
 (0.04070664542470149, 'dec'),
 (0.03984142338687951, 'koi_steff'),
 (0.039769826607407334, 'koi_kepmag'),
 (0.03641348293926122, 'koi_srad'),
 (0.03638505632707633, 'koi_slogg'),
 (0.0156662982924218, 'koi_tce_plnt_num')]

# Hyperparameter Tuning with GridSearch

In [10]:
# Show the hyperparameters of the current (baseline) model as a starting point for tuning.
print(rf_model.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}


In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter: 1 * 3 * 3 * 3 * 4 * 4 = 432 combinations.
param_grid = dict(
    bootstrap=[True],
    max_depth=[85, 90, 95],
    max_features=[2, 3, 4],
    min_samples_leaf=[2, 3, 4],
    min_samples_split=[6, 7, 8, 9],
    n_estimators=[250, 300, 400, 500],
)

# Exhaustive search over the grid: 3-fold cross-validation, 4 parallel jobs,
# verbose=3 so progress is reported while fitting.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=4,
    verbose=3,
)

In [12]:
# Fit grid search to the training data. Every parameter combination is
# cross-validated, so this cell is slow (about 16 minutes in the recorded run).
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   15.1s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:  3.1min
[Parallel(n_jobs=4)]: Done 504 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  9.5min
[Parallel(n_jobs=4)]: Done 1144 tasks      | elapsed: 13.6min
[Parallel(n_jobs=4)]: Done 1296 out of 1296 | elapsed: 15.8min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=1), n_jobs=4,
             param_grid={'bootstrap': [True], 'max_depth': [85, 90, 95],
                         'max_features': [2, 3, 4],
                         'min_samples_leaf': [2, 3, 4],
                         'min_samples_split': [6, 7, 8, 9],
                         'n_estimators': [250, 300, 400, 500]},
             verbose=3)

In [13]:
# Parameter combination selected by the grid search.
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 85,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 6,
 'n_estimators': 250}

In [14]:
# Cross-validation score of the selected parameter combination.
print(grid_search.best_score_)

0.7596808651378827


In [15]:
# Score the tuned estimator chosen by the grid search on the same train/test rows.
# Note: train_score / test_score are reused here and overwrite the baseline values.
best_grid = grid_search.best_estimator_
train_score, test_score = (
    best_grid.score(X_train, y_train),
    best_grid.score(X_test, y_test),
)
print(f"train_score: {train_score}", f"test_score: {test_score}", sep="\n")

train_score: 0.955750524508869
test_score: 0.7734553775743707


Improvement in test accuracy from tuning: 0.7735 - 0.7654 ≈ 0.008, i.e. about 0.8 percentage points.

# Remove any feature which has importance less than 0.05

In [16]:
# Reduced feature set: 9 of the original 16 columns are kept.
reduced_feature_columns = [
    'koi_period', 'koi_time0bk', 'koi_impact',
    'koi_duration', 'koi_depth', 'koi_prad',
    'koi_teq', 'koi_insol', 'koi_model_snr',
]
selected_features2 = df[reduced_feature_columns]
X2 = selected_features2.values
# Sanity check: (number of rows, number of features).
print(X2.shape)

(6991, 9)


In [17]:
# Split the reduced feature matrix, reusing random_state=1 from the first split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    encoded_y,
    random_state=1,
)

In [18]:
# Second forest, trained on the reduced feature set with 200 trees.
# random_state is fixed (same seed as the baseline model) so that the scores and
# feature importances of this model are reproducible across kernel restarts;
# without a seed every run builds a different forest.
rf_model2 = RandomForestClassifier(n_estimators=200, random_state=1)
rf_model2 = rf_model2.fit(X_train2, y_train2)

In [19]:
# Accuracy of the reduced-feature forest on its training rows and held-out test rows.
train_score2 = rf_model2.score(X_train2, y_train2)
test_score2 = rf_model2.score(X_test2, y_test2)
print(f"train_score2: {train_score2}")
print(f"test_score2: {test_score2}")

# Pair each importance value with its column name, in column order.
# zip() is a single-pass iterator, so it is materialised into a list here;
# otherwise the variable would be left exhausted (empty) after its first display.
feature_importance2 = list(zip(rf_model2.feature_importances_, selected_features2.columns))
feature_importance2

train_score2: 1.0
test_score2: 0.7654462242562929


[(0.10707327075244918, 'koi_period'),
 (0.07782068761782276, 'koi_time0bk'),
 (0.10628674740361546, 'koi_impact'),
 (0.10114738339056215, 'koi_duration'),
 (0.10062295314106147, 'koi_depth'),
 (0.165897023590182, 'koi_prad'),
 (0.08626121239053582, 'koi_teq'),
 (0.08631650271123194, 'koi_insol'),
 (0.16857421900253927, 'koi_model_snr')]

In [20]:
# Rank the reduced-set (importance, column) pairs from most to least important.
# sorted() already returns a list, so it is displayed directly.
sorted_feature_importance2 = sorted(
    zip(rf_model2.feature_importances_, selected_features2.columns),
    reverse=True,
)
sorted_feature_importance2

[(0.16857421900253927, 'koi_model_snr'),
 (0.165897023590182, 'koi_prad'),
 (0.10707327075244918, 'koi_period'),
 (0.10628674740361546, 'koi_impact'),
 (0.10114738339056215, 'koi_duration'),
 (0.10062295314106147, 'koi_depth'),
 (0.08631650271123194, 'koi_insol'),
 (0.08626121239053582, 'koi_teq'),
 (0.07782068761782276, 'koi_time0bk')]

## Conclusion: 

In [21]:
# Removing the less important features does not improve accuracy,
# but a smaller feature set could help speed up computation, e.g. in a deep learning model.

# Save the Model

In [24]:
# Persist the tuned estimator (best_grid) to disk with joblib.
# If joblib fails to import, install it from a terminal / git-bash first.
import joblib

filename = 'jamesye_random_forest_model.sav'
joblib.dump(value=best_grid, filename=filename)

['jamesye_random_forest_model.sav']

# Load Model

In [25]:
# Reload the saved model from disk and re-score it on the test set as a
# round-trip check; the value should equal the tuned model's test_score.
# NOTE(review): joblib files are pickle-based — only load files from a trusted source.
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test, y_test)
result

0.7734553775743707