In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("Resources\exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [5]:
# Set features. This will also be used as your x values.
#selected_features = df.drop("koi_disposition", axis=1)

selected_features = df.drop(["koi_disposition",
                             "koi_tce_plnt_num"
                            ],axis=1)

selected_features.head(25)

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714
5,0,0,0,0,2.566589,1.78e-05,-1.78e-05,179.55437,0.00461,-0.00461,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714
6,0,0,0,0,16.068647,1.09e-05,-1.09e-05,173.621937,0.000517,-0.000517,...,-83,4.485,0.083,-0.028,0.848,0.033,-0.072,286.99948,48.37579,15.841
7,0,0,0,0,2.470613,2.7e-08,-2.7e-08,122.763305,9e-06,-9e-06,...,-78,4.457,0.024,-0.024,0.964,0.038,-0.038,286.80847,49.316399,11.338
8,0,1,0,0,2.204735,4.3e-08,-4.3e-08,121.358542,1.6e-05,-1.6e-05,...,-89,4.019,0.033,-0.027,1.952,0.099,-0.11,292.24728,47.969521,10.463
9,0,0,0,0,3.522498,1.98e-07,-1.98e-07,121.119423,4.7e-05,-4.7e-05,...,-137,4.169,0.055,-0.045,1.451,0.11,-0.11,281.28812,42.45108,13.563


# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
X = selected_features
y = df["koi_disposition"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y, train_size=0.65,test_size=0.35)
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
4430,1,0,0,0,149.237142,0.002931,-0.002931,168.1563,0.0164,-0.0164,...,-142,4.576,0.027,-0.202,0.812,0.249,-0.062,288.32599,41.027618,15.683
2330,0,0,0,0,20.605501,0.000259,-0.000259,146.5069,0.0102,-0.0102,...,-85,4.193,0.016,-0.018,1.308,0.054,-0.054,285.47659,41.632717,11.049
4356,0,0,0,0,3.290304,4e-06,-4e-06,131.90944,0.00178,-0.00178,...,-33,4.81,0.01,-0.02,0.427,0.036,-0.009,292.57986,37.376411,15.669
2142,0,0,0,0,8.073941,4.8e-05,-4.8e-05,540.04161,0.00487,-0.00487,...,-82,4.869,0.049,-0.055,0.398,0.049,-0.049,285.73981,40.980049,16.109
856,0,0,0,0,21.060753,5.7e-05,-5.7e-05,143.59983,0.00227,-0.00227,...,-103,4.382,0.099,-0.11,1.034,0.148,-0.121,298.17722,42.237041,13.182


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [8]:
# Scale your data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [9]:
# Random Forest Classifier
rf = RandomForestClassifier(n_estimators=500)
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

0.8917041275030649

In [10]:
# predecictions
predictions = rf.predict(X_test_scaled)
print(predictions)

['FALSE POSITIVE' 'FALSE POSITIVE' 'CONFIRMED' ... 'FALSE POSITIVE'
 'CONFIRMED' 'CANDIDATE']


In [11]:
print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8917041275030649


In [12]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.0957573 , 0.06829415, 0.11118917, 0.03559354, 0.02450434,
       0.01741622, 0.01706945, 0.01364591, 0.02620902, 0.02003406,
       0.01731711, 0.01083146, 0.01083529, 0.02465868, 0.03122317,
       0.03162242, 0.02089385, 0.01524027, 0.01436019, 0.04607276,
       0.03632019, 0.02847448, 0.01627688, 0.01409235, 0.01596062,
       0.01339039, 0.05861809, 0.00971179, 0.03346949, 0.02641646,
       0.00920573, 0.00895687, 0.01095944, 0.00953084, 0.01186682,
       0.00904457, 0.01310086, 0.01075307, 0.01108273])

In [13]:
feature_names = selected_features.columns

sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.11118916504766128, 'koi_fpflag_co'),
 (0.09575729772066079, 'koi_fpflag_nt'),
 (0.06829415345059059, 'koi_fpflag_ss'),
 (0.05861809227265134, 'koi_model_snr'),
 (0.046072755643169, 'koi_prad'),
 (0.036320187036088275, 'koi_prad_err1'),
 (0.03559353854882506, 'koi_fpflag_ec'),
 (0.033469485768623096, 'koi_steff_err1'),
 (0.0316224234456887, 'koi_duration_err2'),
 (0.031223173292396558, 'koi_duration_err1'),
 (0.028474480224283262, 'koi_prad_err2'),
 (0.02641646033497795, 'koi_steff_err2'),
 (0.026209016204859804, 'koi_time0bk_err1'),
 (0.02465868488549725, 'koi_duration'),
 (0.02450433653294376, 'koi_period'),
 (0.02089385446058378, 'koi_depth'),
 (0.02003405659782935, 'koi_time0bk_err2'),
 (0.01741622395566034, 'koi_period_err1'),
 (0.017317114874146226, 'koi_impact'),
 (0.01706945065056662, 'koi_period_err2'),
 (0.016276876410365168, 'koi_teq'),
 (0.015960619257094542, 'koi_insol_err1'),
 (0.015240265638282818, 'koi_depth_err1'),
 (0.014360186804663623, 'koi_depth_err2'),
 (0.0140

In [14]:
print(classification_report(y_test, 
                            predictions, 
                            target_names = ["Confirmed", "False Positive", "Candidate"]))

                precision    recall  f1-score   support

     Confirmed       0.81      0.74      0.77       591
False Positive       0.80      0.82      0.81       630
     Candidate       0.97      1.00      0.99      1226

      accuracy                           0.89      2447
     macro avg       0.86      0.85      0.86      2447
  weighted avg       0.89      0.89      0.89      2447



# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [15]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [500, 750, 1000],
              'max_depth': [100, 200, 300],
              'criterion': ['gini', 'entropy']}
grid = GridSearchCV(rf, param_grid, verbose=3)

In [16]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] criterion=gini, max_depth=100, n_estimators=500 .................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=gini, max_depth=100, n_estimators=500, score=0.875, total=   4.9s
[CV] criterion=gini, max_depth=100, n_estimators=500 .................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.8s remaining:    0.0s


[CV]  criterion=gini, max_depth=100, n_estimators=500, score=0.892, total=   4.9s
[CV] criterion=gini, max_depth=100, n_estimators=500 .................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.7s remaining:    0.0s


[CV]  criterion=gini, max_depth=100, n_estimators=500, score=0.890, total=   4.8s
[CV] criterion=gini, max_depth=100, n_estimators=500 .................
[CV]  criterion=gini, max_depth=100, n_estimators=500, score=0.894, total=   4.9s
[CV] criterion=gini, max_depth=100, n_estimators=500 .................
[CV]  criterion=gini, max_depth=100, n_estimators=500, score=0.888, total=   5.1s
[CV] criterion=gini, max_depth=100, n_estimators=750 .................
[CV]  criterion=gini, max_depth=100, n_estimators=750, score=0.875, total=   7.4s
[CV] criterion=gini, max_depth=100, n_estimators=750 .................
[CV]  criterion=gini, max_depth=100, n_estimators=750, score=0.892, total=   7.3s
[CV] criterion=gini, max_depth=100, n_estimators=750 .................
[CV]  criterion=gini, max_depth=100, n_estimators=750, score=0.891, total=   7.4s
[CV] criterion=gini, max_depth=100, n_estimators=750 .................
[CV]  criterion=gini, max_depth=100, n_estimators=750, score=0.901, total=   7.4s


[CV]  criterion=entropy, max_depth=100, n_estimators=1000, score=0.893, total=  15.6s
[CV] criterion=entropy, max_depth=100, n_estimators=1000 .............
[CV]  criterion=entropy, max_depth=100, n_estimators=1000, score=0.890, total=  16.1s
[CV] criterion=entropy, max_depth=100, n_estimators=1000 .............
[CV]  criterion=entropy, max_depth=100, n_estimators=1000, score=0.901, total=  16.1s
[CV] criterion=entropy, max_depth=100, n_estimators=1000 .............
[CV]  criterion=entropy, max_depth=100, n_estimators=1000, score=0.889, total=  15.2s
[CV] criterion=entropy, max_depth=200, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=200, n_estimators=500, score=0.876, total=   7.4s
[CV] criterion=entropy, max_depth=200, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=200, n_estimators=500, score=0.899, total=   7.4s
[CV] criterion=entropy, max_depth=200, n_estimators=500 ..............
[CV]  criterion=entropy, max_depth=200, n_estimators=500, sc

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 14.2min finished


GridSearchCV(estimator=RandomForestClassifier(n_estimators=500),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [100, 200, 300],
                         'n_estimators': [500, 750, 1000]},
             verbose=3)

In [17]:
print(grid.best_params_)
print(grid.best_score_)

{'criterion': 'entropy', 'max_depth': 100, 'n_estimators': 750}
0.8926055160582138


# Save the Model

In [18]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'Models\RandomForest.sav'
joblib.dump(rf, filename)

['Models\\RandomForest.sav']