In [1]:
#Import all the libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Load the exoplanet table from a CSV located relative to the working directory
df = pd.read_csv(filepath_or_buffer='exoplanet_data.csv')

In [3]:
def test_data(df, test_size=0.25, random_state=101):
    """Split, rescale, and benchmark three classifiers on a feature frame.

    Every column except the last is treated as a feature and the last
    column as the target label. The rows are split into train and test
    sets, the features are min-max scaled, and the scaled data is handed
    to ``knn_model``, ``rf_model`` and ``nb_model``, each of which prints
    its own results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose final column holds the class label. The remaining
        columns are passed to ``MinMaxScaler``, so they are presumably
        numeric and free of values the scaler or models reject -- verify
        against the data before calling.
    test_size : float or int, default 0.25
        Forwarded unchanged to ``train_test_split``.
    random_state : int, default 101
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    None
        Nothing is returned; results are printed by the model functions.
    """
    # Features are all columns but the last; the last column is the label.
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Min-max scaling (not standardization). The scaler is fitted on the
    # training rows only, then reused for the test rows, so no test-set
    # statistics leak into preprocessing.
    mms = MinMaxScaler()
    X_train_mms = mms.fit_transform(X_train)
    X_test_mms = mms.transform(X_test)

    # Evaluate each model on the same scaled split so results are comparable.
    knn_model(X_train_mms, X_test_mms, y_train, y_test)
    rf_model(X_train_mms, X_test_mms, y_train, y_test)
    nb_model(X_train_mms, X_test_mms, y_train, y_test)

In [4]:
def knn_model(X_train,X_test,y_train,y_test):
    """Fit a k-nearest-neighbours classifier and print its test-set results.

    The classifier uses 5 neighbours with the Minkowski metric at p=2
    (i.e. Euclidean distance). After fitting on the training data it
    predicts the test data, then prints a header line, the confusion
    matrix, the accuracy score, and a trailing blank line.

    Returns None.
    """
    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    predictions = classifier.fit(X_train, y_train).predict(X_test)

    # Compute both metrics before printing anything, so a metric failure
    # produces no partial report.
    matrix, accuracy = (
        confusion_matrix(y_test, predictions),
        accuracy_score(y_test, predictions),
    )

    print('Results for KNN model', matrix, accuracy, sep='\n')
    print()

In [5]:
def rf_model(X_train,X_test,y_train,y_test):
    """Fit a random-forest classifier and print its test-set results.

    The forest has 50 trees, splits on the entropy criterion, and is
    seeded with random_state=0. After fitting on the training data it
    predicts the test data, then prints a header line, the confusion
    matrix, the accuracy score, and a trailing blank line.

    Returns None.
    """
    forest = RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=0)
    predictions = forest.fit(X_train, y_train).predict(X_test)

    # Compute both metrics before printing anything, so a metric failure
    # produces no partial report.
    matrix, accuracy = (
        confusion_matrix(y_test, predictions),
        accuracy_score(y_test, predictions),
    )

    print('Results for Random forest model', matrix, accuracy, sep='\n')
    print()

In [8]:
def nb_model(X_train,X_test,y_train,y_test):
    """Fit a Gaussian Naive Bayes classifier and print its test-set results.

    The classifier is built with its default settings. After fitting on
    the training data it predicts the test data, then prints a header
    line, the confusion matrix, the accuracy score, and a trailing blank
    line.

    Returns None.
    """
    bayes = GaussianNB()
    predictions = bayes.fit(X_train, y_train).predict(X_test)

    # Compute both metrics before printing anything, so a metric failure
    # produces no partial report.
    matrix, accuracy = (
        confusion_matrix(y_test, predictions),
        accuracy_score(y_test, predictions),
    )

    print('Results for Naive Bayes model', matrix, accuracy, sep='\n')
    print()

In [7]:
# Feature columns followed by the label column. 'koi_disposition' must stay
# last because test_data takes the final column as the target.
feature_select_ne = df.loc[:, [
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
    'koi_period',
    'koi_impact',
    'koi_duration',
    'koi_depth',
    'koi_prad',
    'koi_teq',
    'koi_model_snr',
    'koi_steff',
    'koi_slogg',
    'koi_srad',
    'ra',
    'dec',
    'koi_kepmag',
    'koi_disposition',
]]

In [9]:
# Run the train/test split, scaling, and all three classifiers on the selected columns
test_data(df=feature_select_ne)

Results for KNN model
[[224 194  10]
 [133 293   9]
 [  1   0 884]]
0.801487414187643

Results for Random forest model
[[331  87  10]
 [ 66 360   9]
 [  1   0 884]]
0.9010297482837528

Results for Naive Bayes model
[[415   0  13]
 [426   0   9]
 [  1   0 884]]
0.7431350114416476

