In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [3]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
pd.set_option('display.max_columns', 999)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,0.586,0.059,-0.443,4.507,0.116,-0.116,874.8,35.5,-35.5,2.83,0.32,-0.19,443,9.11,2.87,-1.62,25.8,2,5455,81,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10829.0,171.0,-171.0,14.6,3.92,-1.31,638,39.3,31.04,-10.49,76.3,1,5853,158,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,8079.2,12.8,-12.8,33.46,8.5,-2.83,1395,891.96,668.95,-230.35,505.6,1,5805,157,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,0.701,0.235,-0.478,1.6545,0.042,-0.042,603.3,16.9,-16.9,2.75,0.88,-0.35,1406,926.16,874.33,-314.24,40.9,1,6031,169,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,0.762,0.139,-0.532,3.1402,0.0673,-0.0673,686.0,18.7,-18.7,2.77,0.9,-0.3,1160,427.65,420.33,-136.7,40.2,2,6046,189,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [4]:
df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

In [5]:
# Set features. This will also be used as your x values.
X = df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_impact', 'koi_duration',
       'koi_depth', 'koi_prad', 'koi_teq', 'koi_model_snr']]

In [6]:
#check features shape
X.shape

(6991, 11)

# Create a Train Test Split

Use `koi_disposition` for the y values

In [7]:
#import label encoder to get numerical values for koi_disposition
from sklearn.preprocessing import LabelEncoder

In [8]:
y = df['koi_disposition']

In [9]:
labels = LabelEncoder()
labels.fit(y)
encoded_y = labels.transform(y)

In [10]:
labels_df = pd.DataFrame({"Disposition": y, "Code": encoded_y}).drop_duplicates(subset=None, keep="first")
labels_df

Unnamed: 0,Disposition,Code
0,CONFIRMED,1
1,FALSE POSITIVE,2
29,CANDIDATE,0


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, random_state=42)

In [12]:
print(len(X_train))

5243


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [13]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

In [14]:
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [15]:
#import random forest model
from sklearn.ensemble import RandomForestClassifier

In [16]:
model = RandomForestClassifier(n_estimators=100)
model = model.fit(X_train_scaled, y_train)

In [17]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.898741418764302


In [18]:
feature_names = X.columns

In [19]:
#check feature importance 
sorted(zip(model.feature_importances_, feature_names), reverse=True)

[(0.1620020566626708, 'koi_model_snr'),
 (0.14204389095673103, 'koi_fpflag_nt'),
 (0.1420087541459002, 'koi_fpflag_co'),
 (0.11210381343498921, 'koi_fpflag_ss'),
 (0.10074255126440997, 'koi_prad'),
 (0.06622752682450285, 'koi_period'),
 (0.06585511956432906, 'koi_depth'),
 (0.0563886975342797, 'koi_teq'),
 (0.055863201712236785, 'koi_impact'),
 (0.05077582594044856, 'koi_duration'),
 (0.045988561959501885, 'koi_fpflag_ec')]

In [30]:
from sklearn.metrics import classification_report

In [38]:
predictions = model.predict(X_test_scaled)
print(classification_report(y_test, predictions,
                            target_names=["candidate", "confirmed", "false positive"]))

                precision    recall  f1-score   support

     candidate       0.82      0.76      0.79       411
     confirmed       0.82      0.84      0.83       484
false positive       0.98      1.00      0.99       853

      accuracy                           0.90      1748
     macro avg       0.87      0.87      0.87      1748
  weighted avg       0.90      0.90      0.90      1748



# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [20]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

In [49]:
# Train the model with GridSearch
# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [5, 10, 50, 100],
    'n_estimators': [100, 200, 300, 500]
}
# Create a based model
model2 = RandomForestClassifier()
# Instantiate the grid search model
grid = GridSearchCV(model2, param_grid, verbose=3)

In [50]:
grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] max_depth=5, n_estimators=100 ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....... max_depth=5, n_estimators=100, score=0.897, total=   0.3s
[CV] max_depth=5, n_estimators=100 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ....... max_depth=5, n_estimators=100, score=0.878, total=   0.3s
[CV] max_depth=5, n_estimators=100 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] ....... max_depth=5, n_estimators=100, score=0.865, total=   0.3s
[CV] max_depth=5, n_estimators=200 ...................................
[CV] ....... max_depth=5, n_estimators=200, score=0.898, total=   0.5s
[CV] max_depth=5, n_estimators=200 ...................................
[CV] ....... max_depth=5, n_estimators=200, score=0.880, total=   0.5s
[CV] max_depth=5, n_estimators=200 ...................................
[CV] ....... max_depth=5, n_estimators=200, score=0.867, total=   0.5s
[CV] max_depth=5, n_estimators=300 ...................................
[CV] ....... max_depth=5, n_estimators=300, score=0.897, total=   0.8s
[CV] max_depth=5, n_estimators=300 ...................................
[CV] ....... max_depth=5, n_estimators=300, score=0.879, total=   0.8s
[CV] max_depth=5, n_estimators=300 ...................................
[CV] ....... max_depth=5, n_estimators=300, score=0.867, total=   0.8s
[CV] max_depth=5, n_estimators=500 ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   49.7s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
           

In [51]:
#check best params and scores
print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 50, 'n_estimators': 200}
0.8935723822239175


In [52]:
#define model using best hyperparameters
model_opt =  RandomForestClassifier(n_estimators = 200, max_depth = 50)
model_opt.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=50, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [53]:
#make predictions with optimized model
predictions2 = model_opt.predict(X_test_scaled)

In [54]:
#classification report
print(classification_report(y_test, predictions2,
                            target_names=["candidate", "confirmed", "false positive"]))

                precision    recall  f1-score   support

     candidate       0.82      0.78      0.80       411
     confirmed       0.83      0.84      0.83       484
false positive       0.98      1.00      0.99       853

      accuracy                           0.90      1748
     macro avg       0.88      0.87      0.87      1748
  weighted avg       0.90      0.90      0.90      1748



In [55]:
#check testing data score 
print(f"Training Data Score: {model_opt.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_opt.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.9016018306636155


In [56]:
#check feature importance 
sorted(zip(model_opt.feature_importances_, feature_names), reverse=True)

[(0.15388545256142208, 'koi_model_snr'),
 (0.14725994211890037, 'koi_fpflag_nt'),
 (0.1408870164754371, 'koi_fpflag_co'),
 (0.10798149825257665, 'koi_fpflag_ss'),
 (0.10129702067309179, 'koi_prad'),
 (0.07010955340285795, 'koi_depth'),
 (0.0671368656975991, 'koi_period'),
 (0.05652212138064443, 'koi_impact'),
 (0.05639774497121136, 'koi_teq'),
 (0.05103755970244254, 'koi_duration'),
 (0.047485224763816496, 'koi_fpflag_ec')]

# Save the Model

In [57]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'keith_woodfin.sav'
joblib.dump(model_opt, filename)

['keith_woodfin.sav']