In [None]:
#Import all the libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
#Load the exoplanet table from the CSV file; the path is relative to the working directory
df = pd.read_csv(filepath_or_buffer='exoplanet_data.csv')

In [None]:
def test_data(df, test_size=0.25, random_state=101):
    """Split, scale and evaluate a feature table with the KNN and Random Forest models.

    Every column of ``df`` except the last is used as a feature; the last
    column is used as the target label.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the target column.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split``; share (or count) of rows held out
        for testing.
    random_state : int, default 101
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    None
        The results are printed by ``knn_model`` and ``rf_model``.

    Raises
    ------
    ValueError
        If ``df`` has fewer than two columns (no feature left once the
        target column is removed).
    """
    if df.shape[1] < 2:
        raise ValueError('df must contain at least one feature column followed by the target column')

    #Create X and y: all columns but the last are features, the last one is the label
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    #Create Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    #Min-max scale the features. The scaler is fitted on the training set only
    #and then re-used on the test set, so the test rows do not influence the scaling
    mms = MinMaxScaler()
    X_train_mms = mms.fit_transform(X_train)
    X_test_mms = mms.transform(X_test)

    #Test using KNN model
    knn_model(X_train_mms, X_test_mms, y_train, y_test)

    #Test using Random Forest model
    rf_model(X_train_mms, X_test_mms, y_train, y_test)

In [None]:
def knn_model(X_train,X_test,y_train,y_test):
    """Fit a K-nearest-neighbours classifier and print how it does on the test set.

    The classifier uses 5 neighbours and the Minkowski metric with p=2.
    It is fitted on (X_train, y_train) and used to predict X_test. The
    confusion matrix and the accuracy against y_test are printed after a
    header line, followed by an empty line. Nothing is returned.
    """
    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    #Compute both metrics before printing anything
    report = (
        'Results for KNN model',
        confusion_matrix(y_test, predicted),
        accuracy_score(y_test, predicted),
        '',
    )
    for entry in report:
        print(entry)

In [None]:
def rf_model(X_train,X_test,y_train,y_test):
    """Fit a Random Forest classifier and print how it does on the test set.

    The forest has 50 trees, uses the entropy split criterion and a fixed
    random_state of 0. It is fitted on (X_train, y_train) and used to
    predict X_test. The confusion matrix and the accuracy against y_test
    are printed after a header line, followed by an empty line. Nothing is
    returned.
    """
    forest = RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=0)
    forest.fit(X_train, y_train)
    predicted = forest.predict(X_test)

    #Compute both metrics before printing anything
    report = (
        'Results for Random forest model',
        confusion_matrix(y_test, predicted),
        accuracy_score(y_test, predicted),
        '',
    )
    for entry in report:
        print(entry)

In [None]:
#Preview the first five rows of the loaded table
df.head(n=5)

In [None]:
#For each measured quantity add a '<name>_err' column: the difference between
#its two error columns ('<name>_err1' - '<name>_err2') divided by the measured value.
#NOTE(review): a zero in the measured column would make the ratio non-finite - confirm this cannot happen
for _quantity in ('koi_period', 'koi_impact', 'koi_duration', 'koi_depth',
                  'koi_prad', 'koi_steff', 'koi_slogg', 'koi_srad'):
    _spread = df[_quantity + '_err1'] - df[_quantity + '_err2']
    df[_quantity + '_err'] = _spread / df[_quantity]

In [None]:
#Preview the first five rows again, now that the '_err' columns have been added
df.head(n=5)

In [None]:
#Summary statistics for the table, including the newly added '_err' columns
df.describe(include=None)

In [None]:
#First pass at feature selection: only the columns known for sure to be
#irrelevant are left out. 'koi_disposition' is listed last because test_data
#uses the final column as the target.
#NOTE(review): 'koi_impact_err' is computed above but not selected here - confirm this is intentional
feature_select_err = df.loc[:, [
    #False-positive flags
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    #Measured quantities, each followed by its derived '_err' column where one is used
    'koi_period', 'koi_period_err',
    'koi_impact',
    'koi_duration', 'koi_duration_err',
    'koi_depth', 'koi_depth_err',
    'koi_prad', 'koi_prad_err',
    'koi_teq', 'koi_model_snr',
    'koi_steff', 'koi_steff_err',
    'koi_slogg', 'koi_slogg_err',
    'koi_srad', 'koi_srad_err',
    'ra', 'dec', 'koi_kepmag',
    #Target column (must stay last)
    'koi_disposition',
]]

In [None]:
#Evaluate both models on the feature set that includes the derived '_err' columns
test_data(df=feature_select_err)

In [None]:
#The best overall model so far is the one that uses the features without any error features.
#Adding these new calculated features has no meaningful effect on the accuracy.
#However, it does add more features, which can slow down testing.