In [5]:
#Import all the libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Load the raw data set; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='exoplanet_data.csv')

In [8]:
def test_data(df, test_size=0.25, random_state=101):
    """Split, scale, and evaluate a feature frame with the KNN and Random Forest models.

    The last column of ``df`` is treated as the label and every preceding
    column as a feature, so callers must place the target column last.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the label column in the final position.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split`` to size the held-out test set.
    random_state : int, default 101
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    None
        The results are printed by ``knn_model`` and ``rf_model``.
    """
    # Features are all columns but the last; the label is the last column.
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    # Hold out part of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Min-max scaling (not standardization). The scaler is fitted on the
    # training rows only and then applied to the test rows, so no test-set
    # statistics leak into training.
    mms = MinMaxScaler()
    X_train_mms = mms.fit_transform(X_train)
    X_test_mms = mms.transform(X_test)

    # Evaluate both models on the same scaled split.
    knn_model(X_train_mms, X_test_mms, y_train, y_test)
    rf_model(X_train_mms, X_test_mms, y_train, y_test)

In [4]:
def knn_model(X_train,X_test,y_train,y_test):
    """Fit a k-nearest-neighbours classifier and print its test-set results.

    Trains on ``X_train``/``y_train``, predicts ``X_test``, then prints a
    header line, the confusion matrix, the accuracy score, and a blank line.
    Nothing is returned.
    """
    # 5 neighbours; Minkowski distance with p=2 is the Euclidean distance.
    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    # Compute both metrics before anything is printed.
    matrix = confusion_matrix(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)

    # The trailing empty string yields the blank separator line.
    print('Results for KNN model', matrix, accuracy, '', sep='\n')

In [7]:
def rf_model(X_train,X_test,y_train,y_test):
    """Fit a random forest classifier and print its test-set results.

    Trains on ``X_train``/``y_train``, predicts ``X_test``, then prints a
    header line, the confusion matrix, the accuracy score, and a blank line.
    Nothing is returned.
    """
    # 50 trees split on entropy; the fixed seed keeps the forest reproducible.
    forest = RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=0)
    forest.fit(X_train, y_train)
    predicted = forest.predict(X_test)

    # Compute both metrics before anything is printed.
    matrix = confusion_matrix(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)

    # The trailing empty string yields the blank separator line.
    print('Results for Random forest model', matrix, accuracy, '', sep='\n')

In [13]:
# First pass at feature selection: only the columns I knew for sure should not
# be included were eliminated.
# NOTE: 'koi_disposition' must stay last, because test_data() uses the final
# column as the label and every other column as a feature.
feature_select = df.loc[:, [
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_period', 'koi_period_err1', 'koi_period_err2',
    'koi_impact', 'koi_impact_err1', 'koi_impact_err2',
    'koi_duration', 'koi_duration_err1', 'koi_duration_err2',
    'koi_depth', 'koi_depth_err1', 'koi_depth_err2',
    'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
    'koi_teq',
    'koi_model_snr',
    'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
    'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
    'koi_srad', 'koi_srad_err1', 'koi_srad_err2',
    'ra',
    'dec',
    'koi_kepmag',
    'koi_disposition',
]]

In [10]:
# Baseline run: evaluate both models on the first-pass feature set.
test_data(df=feature_select)

Results for KNN model
[[233 185  10]
 [111 316   8]
 [  2   0 883]]
0.8192219679633868

Results for Random forest model
[[320  89  19]
 [ 60 368   7]
 [  1   2 882]]
0.8981693363844394



In [11]:
#The test above represents the baseline features I originally intended to use.
#Results show KNN had an accuracy of ~82% and Random Forest had ~90%.
#The function runs both of these models at the same time and does so in a matter of seconds.
#Next I will look at various subsets of this feature selection to see if I can improve the score.

In [14]:
# Feature set restricted to the 4 fpflag columns.
# NOTE: 'koi_disposition' must stay last, because test_data() uses the final
# column as the label.
feature_select_flag = df.loc[:, [
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
    'koi_disposition',
]]

In [15]:
# Evaluate both models using only the 4 fpflag features.
test_data(df=feature_select_flag)

Results for KNN model
[[418   0  10]
 [426   0   9]
 [  1   0 884]]
0.7448512585812357

Results for Random forest model
[[  0 418  10]
 [  0 426   9]
 [  0   1 884]]
0.7494279176201373



In [16]:
#In previous tests (when I erroneously removed the candidate rows), these 4 features dominated the prediction.
#Now when I run the test there is a significant drop in model performance, which is kind of a good thing.
#This shows these 4 features are not the only features that help in the prediction.
#Next I want to simply remove the error features from the original set.

In [17]:
# Same as the first-pass feature set, minus the error columns
# (the ones suffixed _err1 / _err2).
# NOTE: 'koi_disposition' must stay last, because test_data() uses the final
# column as the label.
feature_select_ne = df.loc[:, [
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_period', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_prad',
    'koi_teq', 'koi_model_snr',
    'koi_steff', 'koi_slogg', 'koi_srad',
    'ra', 'dec', 'koi_kepmag',
    'koi_disposition',
]]

In [18]:
# Evaluate both models on the feature set without the error columns.
test_data(df=feature_select_ne)

Results for KNN model
[[224 194  10]
 [133 293   9]
 [  1   0 884]]
0.801487414187643

Results for Random forest model
[[331  87  10]
 [ 66 360   9]
 [  1   0 884]]
0.9010297482837528



In [19]:
#Results for this feature selection are similar to the initial test set.
#KNN is slightly worse and Random Forest is marginally better.
#I don't notice any significant differences between the distributions of the answers.
#This is important as it removed about half of the initial features.
#When I move to the grid search, this will help cut down on compute time with no real change in performance.