In [1]:
#Import all the libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

In [4]:
#Load the exoplanet table from the local resources folder into a DataFrame
data_path = 'resources/exoplanet_data.csv'
df = pd.read_csv(data_path)

In [5]:
#Final list of the 17 feature columns used by the model
feature_columns = [
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_period', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_prad',
    'koi_teq', 'koi_model_snr', 'koi_steff', 'koi_slogg', 'koi_srad',
    'ra', 'dec', 'koi_kepmag',
]

#Column holding the label to be predicted
label_column = 'koi_disposition'

#The label has to stay in the last position: X and y are separated by position further down
dataset = df[feature_columns + [label_column]]

In [6]:
#Separate the table into the feature matrix X and the label vector y
#The label sits in the last column of dataset; every column before it is a feature
label_position = dataset.shape[1] - 1
X = dataset.iloc[:, :label_position].values
y = dataset.iloc[:, label_position].values

In [7]:
#Scale the features using MinMax
mms = MinMaxScaler()
X_mms = mms.fit_transform(X)

In [8]:
#Create the train test split
X_train, X_test, y_train, y_test = train_test_split(X_mms, y, test_size=0.25, random_state=101)

In [9]:
#First I set up my desired grid parameters: 7 x 6 x 6 = 252 combinations
grid_params = {
    'n_estimators':[5,10,20,50,100,150,200],
    'max_features':[2,4,7,12,15,17],
    'max_depth':[2,4,6,8,10,15]
}

#Next I create the grid search with my parameters
#random_state is fixed on the forest so every combination is compared with the same seed
#and re-running the search on the same data picks the same winner
rfgs = GridSearchCV(
    RandomForestClassifier(random_state=101),
    grid_params,
    verbose = 1,   #print progress messages while fitting
    cv = 5,        #5-fold cross validation: 252 x 5 = 1260 cross-validation fits
    n_jobs = -1    #run the fits in parallel on all available processors
)

In [None]:
#WARNING!! time and computer intensive, use at your own risk!!!
#Hard coded results are given below, so this cell can be skipped
#Run the grid search on the training data to check every parameter combination
rfgs.fit(X_train, y_train)

#fit() returns the search object itself, so the results name points at the fitted search
rfgs_results = rfgs

In [None]:
#Reads the best parameter combination found by the grid search and stores it in best_params
#rfgs_results only exists if the grid search cell above was run in this session
#NOTE, if you choose to just use the hard coded answers, you can skip the cell above as well as this one
best_params = rfgs_results.best_params_

In [10]:
#Best parameters reported by the grid search, hard coded here in case you want to skip the grid search
#Note, I used google colab for the grid search on random forest (as well as other models) to save on computer time.
#For RF specifically it took almost 17 minutes.
hard_coded_params = {'n_estimators': 200, 'max_depth': 10, 'max_features': 7}

#If the grid search cells were run in this session, their best_params is used;
#otherwise the hard coded values are. No line has to be commented in or out any more.
rf_params = globals().get('best_params', hard_coded_params)

#Using the best parameters I have my most optimized random forest model
#random_state is fixed so the fitted forest and its predictions are the same on every run
best_rf = RandomForestClassifier(random_state=101, **rf_params)

In [11]:
#Train the tuned random forest on the training split
#The call is left as the last expression so the notebook displays the fitted estimator
best_rf.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=10, max_features=7, n_estimators=200)

In [12]:
#With the model trained, predict a label for every row of the test split
#These predictions are what gets compared to the actual test values
rf_pred = best_rf.predict(X=X_test)

In [None]:
#Finally we see how well 