In [1]:
# imports
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
df = df.dropna(axis='columns', how='all')
df = df.dropna()

In [3]:
# redefine future target into 2 classes: CANDIDATE or CONFIRMED =TruePositives; FALSE POSITIVE =FalsePositive
df_orig = df 
df['koi_disposition']= df['koi_disposition'].replace({'CANDIDATE':'TruePositive','CONFIRMED':'TruePositive','FALSE POSITIVE':'FalsePositive'})

# Select your features (columns)

In [4]:
#select target
target = pd.DataFrame(df['koi_disposition'])
target.head()

Unnamed: 0,koi_disposition
0,TruePositive
1,FalsePositive
2,FalsePositive
3,TruePositive
4,TruePositive


In [5]:
features = df.drop('koi_disposition', axis=1)
features_names = features.columns

# Create a Train Test Split



In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.333, random_state=1708)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [7]:
# Scale feature data
from sklearn.preprocessing import StandardScaler
x_scaler = StandardScaler().fit(X_train)
X_train_s = x_scaler.transform(X_train)
X_test_s = x_scaler.transform(X_test)

# Train the RF Model


In [8]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rf.fit(X_train_s, y_train)
rf.score(X_test_s, y_test)

  This is separate from the ipykernel package so we can avoid doing imports until


0.986689566337484

# Reduce features

In [9]:
coeff_feat_list =sorted(zip(rf.feature_importances_, features_names), reverse=True)

# top 10 features
coeff_feat_list[0:10]

[(0.17280259715174354, 'koi_fpflag_co'),
 (0.1516126764832017, 'koi_fpflag_nt'),
 (0.11777724205510921, 'koi_fpflag_ss'),
 (0.05886597636169335, 'koi_fpflag_ec'),
 (0.05407065007539873, 'koi_prad'),
 (0.044405253970770435, 'koi_prad_err2'),
 (0.03945953128603987, 'koi_prad_err1'),
 (0.025810764052282806, 'koi_depth'),
 (0.025199801685924526, 'koi_steff_err1'),
 (0.02512445468860042, 'koi_period')]

In [10]:
# use top 6 features
reduce_feat = df_orig[['koi_fpflag_co', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_prad', 'koi_fpflag_ec', 'koi_prad_err1']]
reduce_feat.head()

Unnamed: 0,koi_fpflag_co,koi_fpflag_nt,koi_fpflag_ss,koi_prad,koi_fpflag_ec,koi_prad_err1
0,0,0,0,2.83,0,0.32
1,0,0,1,14.6,0,3.92
2,0,0,1,33.46,0,8.5
3,0,0,0,2.75,0,0.88
4,0,0,0,2.77,0,0.9


## Train RF model with reduced features (top 6 features)

In [11]:
# split sets
X_train, X_test, y_train, y_test = train_test_split(reduce_feat, target, test_size=0.333, random_state=1708)

# scale
x_scaler = StandardScaler().fit(X_train)
X_train_s = x_scaler.transform(X_train)
X_test_s = x_scaler.transform(X_test)

# model
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rf.fit(X_train_s, y_train)
rf.score(X_test_s, y_test)

  # This is added back by InteractiveShellApp.init_path()


0.9875483039931301

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [12]:
model = RandomForestClassifier(n_jobs=-1)
model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [13]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [10, 50, 100, 150, 200]}

grid = GridSearchCV(model, param_grid, verbose=3)

In [14]:
# Train the model with GridSearch
grid.fit(X_train_s, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] n_estimators=10 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)


[CV] ..................... n_estimators=10, score=0.988, total=   3.2s
[CV] n_estimators=10 .................................................
[CV] ..................... n_estimators=10, score=0.983, total=   0.1s
[CV] n_estimators=10 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.1s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.3s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV] ..................... n_estimators=10, score=0.995, total=   0.1s
[CV] n_estimators=10 .................................................
[CV] ..................... n_estimators=10, score=0.989, total=   0.1s
[CV] n_estimators=10 .................................................


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] ..................... n_estimators=10, score=0.990, total=   0.1s
[CV] n_estimators=50 .................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..................... n_estimators=50, score=0.990, total=   0.2s
[CV] n_estimators=50 .................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..................... n_estimators=50, score=0.984, total=   0.2s
[CV] n_estimators=50 .................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..................... n_estimators=50, score=0.994, total=   0.2s
[CV] n_estimators=50 .................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..................... n_estimators=50, score=0.990, total=   0.2s
[CV] n_estimators=50 .................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..................... n_estimators=50, score=0.989, total=   0.2s
[CV] n_estimators=100 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=100, score=0.990, total=   0.4s
[CV] n_estimators=100 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=100, score=0.984, total=   0.3s
[CV] n_estimators=100 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=100, score=0.994, total=   0.4s
[CV] n_estimators=100 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=100, score=0.989, total=   0.3s
[CV] n_estimators=100 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=100, score=0.989, total=   0.3s
[CV] n_estimators=150 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=150, score=0.991, total=   0.4s
[CV] n_estimators=150 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=150, score=0.983, total=   0.4s
[CV] n_estimators=150 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=150, score=0.994, total=   0.4s
[CV] n_estimators=150 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=150, score=0.989, total=   0.4s
[CV] n_estimators=150 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=150, score=0.989, total=   0.4s
[CV] n_estimators=200 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=200, score=0.990, total=   0.5s
[CV] n_estimators=200 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=200, score=0.984, total=   0.5s
[CV] n_estimators=200 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=200, score=0.994, total=   0.5s
[CV] n_estimators=200 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=200, score=0.988, total=   0.6s
[CV] n_estimators=200 ................................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] .................... n_estimators=200, score=0.989, total=   0.5s


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   11.3s finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=False,
                                              rand

In [15]:
print(grid.best_params_)
print(grid.best_score_)

{'n_estimators': 50}
0.9894904985992852


In [16]:
predictions = grid.predict(X_test_s)

from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["True Pos", "False Pos"]))

              precision    recall  f1-score   support

    True Pos       0.98      1.00      0.99      1155
   False Pos       1.00      0.98      0.99      1174

    accuracy                           0.99      2329
   macro avg       0.99      0.99      0.99      2329
weighted avg       0.99      0.99      0.99      2329



# Save the Model

In [17]:
# model
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)  #best param from search: 'n_estimators': 100
rf.fit(X_train_s, y_train)
rf.score(X_test_s, y_test)

  This is separate from the ipykernel package so we can avoid doing imports until


0.9875483039931301

In [18]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash

import joblib
filename = 'DanViassolo.sav'

joblib.dump(rf, filename)

['DanViassolo.sav']