In [None]:
import pandas as pd

In [None]:
# Load the raw table from the CSV into `df`, then preview its first rows.
# The path is relative, so the file must be in the notebook's working directory.
df = pd.read_csv('exoplanet_data.csv')
df.head()

In [None]:
# Show the column headers so that their descriptions can be looked up
df.columns

In [None]:
# First pass at feature selection: only the columns that clearly should not be
# included were left out. Column order matters downstream, because the label
# ('koi_disposition') is expected to be the last column.
feature_select = df[[
    # koi_fpflag_* columns
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    # koi_period, koi_impact, koi_duration, koi_depth, koi_prad (each with err1/err2)
    'koi_period', 'koi_period_err1', 'koi_period_err2',
    'koi_impact', 'koi_impact_err1', 'koi_impact_err2',
    'koi_duration', 'koi_duration_err1', 'koi_duration_err2',
    'koi_depth', 'koi_depth_err1', 'koi_depth_err2',
    'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
    # koi_teq and koi_model_snr (no error columns selected)
    'koi_teq', 'koi_model_snr',
    # koi_steff, koi_slogg, koi_srad (each with err1/err2)
    'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
    'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
    'koi_srad', 'koi_srad_err1', 'koi_srad_err2',
    # ra, dec, koi_kepmag
    'ra', 'dec', 'koi_kepmag',
    # label column, kept last
    'koi_disposition',
]]

In [None]:
# Preview the first rows of the reduced table to confirm the column selection
feature_select.head()

In [None]:
# Count how many rows fall under each 'koi_disposition' value
feature_select['koi_disposition'].value_counts()

In [None]:
# This data contains KOIs that are confirmed, false positives, or candidates.
# The candidates are split off into their own dataset, `gameset`:
# 1. They cannot be used for training because they are unknowns, as opposed to
#    a confirmed planet or a confirmed non-planet.
# 2. They can, however, serve as a real-world application of the model: once a
#    model is selected, it can predict whether this "new" data should yield a
#    planet or a false positive.
dataset = feature_select.loc[feature_select['koi_disposition'].ne('CANDIDATE')]
gameset = feature_select.loc[feature_select['koi_disposition'].eq('CANDIDATE')]

# Display the label counts within `gameset`
gameset['koi_disposition'].value_counts()

In [None]:
# Separate the table into a feature matrix, X (every column except the label),
# and a target vector, y (the 'koi_disposition' label column).
X = dataset.drop(columns='koi_disposition').values
y = dataset['koi_disposition'].values

In [None]:
from sklearn.model_selection import train_test_split

# Create the train/test split: 25% of the rows are held out for testing, and
# the fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

In [None]:
# Inspect the unscaled training features
print(X_train)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaling to both the training and the test features.
mms = MinMaxScaler().fit(X_train)
X_train_mms = mms.transform(X_train)
X_test_mms = mms.transform(X_test)

In [None]:
# Inspect the training features after min-max scaling
print(X_train_mms)

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier on the scaled training data
# (fit() returns the estimator itself, so the call can be chained).
log_cla = LogisticRegression(random_state=0).fit(X_train_mms, y_train)

# Predict labels for the scaled test features
log_pred = log_cla.predict(X_test_mms)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Evaluate the logistic regression predictions on the test set.
# In the confusion matrix, rows are true labels and columns are predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=log_pred)
print(cm)

# Fraction of test rows whose predicted label matches the true label
accuracy_score(y_true=y_test, y_pred=log_pred)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train a 5-nearest-neighbours classifier on the scaled training data.
# The Minkowski metric with p=2 is the Euclidean distance.
knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train_mms, y_train)

# Predict labels for the scaled test features
knn_pred = knn.predict(X_test_mms)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Evaluate the k-nearest-neighbours predictions on the test set.
# In the confusion matrix, rows are true labels and columns are predicted labels.
# NOTE: this reassigns `cm`, which an earlier cell also assigns.
cm = confusion_matrix(y_true=y_test, y_pred=knn_pred)
print(cm)

# Fraction of test rows whose predicted label matches the true label
accuracy_score(y_true=y_test, y_pred=knn_pred)