In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
# NOTE(review): the recorded preview shows an "Unnamed: 0" column, which usually
# means the CSV was written with its index. Consider read_csv(..., index_col=0)
# after confirming nothing relies on that column.
df = pd.read_csv('resources/exoplanet_data.csv')
# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [11]:
def log_reg_model(df, target='koi_disposition', test_size=0.25, random_state=101):
    """Fit and evaluate a logistic-regression classifier on ``df``.

    Rows whose ``target`` value is ``'CANDIDATE'`` are dropped, the remaining
    rows are split into train/test sets, the features are min-max scaled
    (scaler fitted on the training split only), and a ``LogisticRegression``
    model is trained. The confusion matrix and accuracy on the test split are
    printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the label column named by ``target``. Every
        column other than ``target`` is used as a predictor.
    target : str, default 'koi_disposition'
        Name of the label column. The label is selected by name, so it does
        not have to be the last column of ``df``.
    test_size : float, default 0.25
        Forwarded to ``train_test_split``.
    random_state : int, default 101
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    None
        Results are printed: first the confusion matrix, then the accuracy.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # Remove the CANDIDATE rows so only the remaining labels are modelled.
    dataset = df[df[target] != 'CANDIDATE']

    # Create X and y. Selecting the label by name (instead of by position)
    # keeps the row filter above and the target definition in agreement.
    X = dataset.drop(columns=[target]).values
    y = dataset[target].values

    # Create the train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Min-max scale the features. The scaler is fitted on the training split
    # only and then applied to the test split, so no test data leaks into it.
    mms = MinMaxScaler()
    X_train_mms = mms.fit_transform(X_train)
    X_test_mms = mms.transform(X_test)

    # Fit logistic regression, then predict on the scaled test set.
    log_cla = LogisticRegression(random_state=0)
    log_cla.fit(X_train_mms, y_train)
    log_pred = log_cla.predict(X_test_mms)

    # Print the results: confusion matrix first, accuracy second.
    cm = confusion_matrix(y_test, log_pred)
    print(cm)
    score = accuracy_score(y_test, log_pred)
    print(score)

In [5]:
# Baseline feature set: no koi_fpflag_* columns; the 'koi_disposition' label
# is the final column.
feature_select = df.loc[:, [
    'koi_period', 'koi_period_err1', 'koi_period_err2',
    'koi_impact', 'koi_impact_err1', 'koi_impact_err2',
    'koi_duration', 'koi_duration_err1', 'koi_duration_err2',
    'koi_depth', 'koi_depth_err1', 'koi_depth_err2',
    'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
    'koi_teq', 'koi_model_snr',
    'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
    'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
    'koi_srad', 'koi_srad_err1', 'koi_srad_err2',
    'ra', 'dec', 'koi_kepmag',
    'koi_disposition',
]]

In [12]:
# Evaluate the baseline feature set; prints the confusion matrix, then the accuracy.
log_reg_model(feature_select)

[[323 124]
 [ 71 808]]
0.8529411764705882


In [9]:
# Full feature set: the four koi_fpflag_* columns followed by the remaining
# selected columns, with the 'koi_disposition' label as the final column.
feature_select_full = df.loc[:, [
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_period', 'koi_period_err1', 'koi_period_err2',
    'koi_impact', 'koi_impact_err1', 'koi_impact_err2',
    'koi_duration', 'koi_duration_err1', 'koi_duration_err2',
    'koi_depth', 'koi_depth_err1', 'koi_depth_err2',
    'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
    'koi_teq', 'koi_model_snr',
    'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
    'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
    'koi_srad', 'koi_srad_err1', 'koi_srad_err2',
    'ra', 'dec', 'koi_kepmag',
    'koi_disposition',
]]

In [13]:
# Evaluate the feature set that also includes the four koi_fpflag_* columns.
log_reg_model(feature_select_full)

[[436  11]
 [  0 879]]
0.9917043740573153


In [14]:
# Only the four koi_fpflag_* columns as predictors, label column last.
feature_select_trouble = df.loc[:, [
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
    'koi_disposition',
]]

In [15]:
# Evaluate using only the four koi_fpflag_* columns as predictors.
log_reg_model(feature_select_trouble)

[[436  11]
 [  0 879]]
0.9917043740573153


In [16]:
# One frame per koi_fpflag_* column: a single predictor plus the label column.
feature_select_nt = df.loc[:, ['koi_fpflag_nt', 'koi_disposition']]
feature_select_ss = df.loc[:, ['koi_fpflag_ss', 'koi_disposition']]
feature_select_co = df.loc[:, ['koi_fpflag_co', 'koi_disposition']]
feature_select_ec = df.loc[:, ['koi_fpflag_ec', 'koi_disposition']]

In [17]:
# Single-predictor run: koi_fpflag_nt only.
log_reg_model(feature_select_nt)

[[  0 447]
 [  0 879]]
0.6628959276018099


In [18]:
# Single-predictor run: koi_fpflag_ss only.
log_reg_model(feature_select_ss)

[[  0 447]
 [  0 879]]
0.6628959276018099


In [19]:
# Single-predictor run: koi_fpflag_co only.
log_reg_model(feature_select_co)

[[  0 447]
 [  0 879]]
0.6628959276018099


In [20]:
# Single-predictor run: koi_fpflag_ec only.
log_reg_model(feature_select_ec)

[[  0 447]
 [  0 879]]
0.6628959276018099


In [23]:
# Pairs that combine koi_fpflag_nt with each of the other three flag columns;
# the label column stays last.
feature_select_nt_ss = df.loc[:, ['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_disposition']]
feature_select_nt_co = df.loc[:, ['koi_fpflag_nt', 'koi_fpflag_co', 'koi_disposition']]
feature_select_nt_ec = df.loc[:, ['koi_fpflag_nt', 'koi_fpflag_ec', 'koi_disposition']]

In [22]:
# Two-predictor run: koi_fpflag_nt + koi_fpflag_ss.
log_reg_model(feature_select_nt_ss)

[[438   9]
 [199 680]]
0.8431372549019608


In [24]:
# Two-predictor run: koi_fpflag_nt + koi_fpflag_co.
log_reg_model(feature_select_nt_co)

[[440   7]
 [326 553]]
0.748868778280543


In [25]:
# Two-predictor run: koi_fpflag_nt + koi_fpflag_ec.
log_reg_model(feature_select_nt_ec)

[[442   5]
 [443 436]]
0.6621417797888386


In [30]:
# Pairs that combine koi_fpflag_ss with koi_fpflag_co and koi_fpflag_ec;
# the label column stays last.
feature_select_ss_co = df.loc[:, ['koi_fpflag_ss', 'koi_fpflag_co', 'koi_disposition']]
feature_select_ss_ec = df.loc[:, ['koi_fpflag_ss', 'koi_fpflag_ec', 'koi_disposition']]

In [27]:
# Two-predictor run: koi_fpflag_ss + koi_fpflag_co.
log_reg_model(feature_select_ss_co)

[[441   6]
 [238 641]]
0.8159879336349924


In [28]:
# Two-predictor run: koi_fpflag_ss + koi_fpflag_ec.
log_reg_model(feature_select_ss_ec)

[[443   4]
 [311 568]]
0.7624434389140271


In [31]:
# Remaining flag pair: koi_fpflag_co with koi_fpflag_ec, label column last.
feature_select_co_ec = df.loc[:, ['koi_fpflag_co', 'koi_fpflag_ec', 'koi_disposition']]

In [32]:
# Two-predictor run: koi_fpflag_co + koi_fpflag_ec.
log_reg_model(feature_select_co_ec)

[[  0 447]
 [  0 879]]
0.6628959276018099
