In [1]:
# imports
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
df = df.dropna(axis='columns', how='all')
df = df.dropna()

In [3]:
# redefine future target into 2 classes: CANDIDATE or CONFIRMED =TruePositives; FALSE POSITIVE =FalsePositive
df_orig = df 
df['koi_disposition']= df['koi_disposition'].replace({'CANDIDATE':'TruePositive','CONFIRMED':'TruePositive','FALSE POSITIVE':'FalsePositive'})

# Select your features (columns)

In [4]:
#select target
target = pd.DataFrame(df['koi_disposition'])

In [5]:
features = df.drop('koi_disposition', axis=1)
features_names = features.columns

# Create a Train Test Split



In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.333, random_state=1708)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [7]:
# Scale feature data
from sklearn.preprocessing import StandardScaler
x_scaler = StandardScaler().fit(X_train)
X_train_s = x_scaler.transform(X_train)
X_test_s = x_scaler.transform(X_test)

# Train the Log Regression Model


In [8]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state =123)
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
lr.fit(X_train_s, y_train)
lr.score(X_test_s, y_test)

  y = column_or_1d(y, warn=True)


0.9879776728209532

# Reduce features

In [10]:
features_names

Index(['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk',
       'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1',
       'koi_impact_err2', 'koi_duration', 'koi_duration_err1',
       'koi_duration_err2', 'koi_depth', 'koi_depth_err1', 'koi_depth_err2',
       'koi_prad', 'koi_prad_err1', 'koi_prad_err2', 'koi_teq', 'koi_insol',
       'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_steff', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg',
       'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad', 'koi_srad_err1',
       'koi_srad_err2', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')

In [11]:
lr.coef_[0]

array([-3.37695348, -3.51714289, -3.95419736, -3.05890193, -0.58543905,
        0.01693658, -0.01693658, -0.05400916, -0.09249291,  0.09249291,
       -1.43474228, -0.29511756, -0.3596192 , -0.03048296, -0.05500409,
        0.05500409, -0.92216284,  0.02159099, -0.02159099, -0.07380233,
       -0.06962177,  0.04721527, -0.35825658, -0.35304021,  0.2914424 ,
       -0.38511766, -0.06440293,  0.32718345,  0.19862047, -0.2242176 ,
        0.23612115,  0.02129866,  0.09237442,  0.37494871,  0.30266683,
       -0.22089198,  0.4118479 , -0.02480901,  0.05454229,  0.06088528])

In [12]:
# use top 6 features - extracted by looking at coeffs above, taking largest coeffs in absolute value
reduce_feat = df_orig[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_impact', 'koi_duration_err1','koi_depth']]
reduce_feat

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_impact,koi_duration_err1,koi_depth
0,0,0,0,0.586,0.11600,874.8
1,0,1,0,0.969,0.03410,10829.0
2,0,1,0,1.276,0.00537,8079.2
3,0,0,0,0.701,0.04200,603.3
4,0,0,0,0.762,0.06730,686.0
...,...,...,...,...,...,...
6986,0,0,0,0.765,0.63400,87.7
6987,0,1,1,1.252,0.01740,1579.2
6988,0,0,0,0.043,0.22900,48.5
6989,0,0,1,0.147,0.16200,103.6


## Train RF model with reduced features (top 6 features)

In [13]:
# split sets
X_train, X_test, y_train, y_test = train_test_split(reduce_feat, target, test_size=0.333, random_state=1708)

# scale
x_scaler = StandardScaler().fit(X_train)
X_train_s = x_scaler.transform(X_train)
X_test_s = x_scaler.transform(X_test)

# model
lr = LogisticRegression(random_state =123)
lr.fit(X_train_s, y_train)
lr.score(X_test_s, y_test)

  y = column_or_1d(y, warn=True)


0.9776728209531989

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [14]:
model = LogisticRegression(random_state =123, n_jobs=-1)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [.1, .5, 1, 5]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [16]:
# Train the model with GridSearch
grid.fit(X_train_s, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)


[CV] ............................... C=0.1, score=0.981, total=   2.0s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s
  y = column_or_1d(y, warn=True)


[CV] ............................... C=0.1, score=0.973, total=   0.8s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.7s remaining:    0.0s
  y = column_or_1d(y, warn=True)


[CV] ............................... C=0.1, score=0.979, total=   0.8s
[CV] C=0.1 ...........................................................


  y = column_or_1d(y, warn=True)


[CV] ............................... C=0.1, score=0.971, total=   0.8s
[CV] C=0.1 ...........................................................
[CV] ............................... C=0.1, score=0.981, total=   0.0s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.981, total=   0.0s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.973, total=   0.0s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.979, total=   0.0s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.971, total=   0.0s
[CV] C=0.5 ...........................................................
[CV] ............................... C=0.5, score=0.981, total=   0.0s
[CV] C=1 .............................................................
[CV] .

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] ................................. C=1, score=0.973, total=   0.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.979, total=   0.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.971, total=   0.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.981, total=   0.0s
[CV] C=5 .............................................................
[CV] ................................. C=5, score=0.981, total=   0.0s
[CV] C=5 .............................................................
[CV] ................................. C=5, score=0.973, total=   0.0s
[CV] C=5 .............................................................
[CV] ................................. C=5, score=0.977, total=   0.0s
[CV] C=5 .............................................................
[CV] .

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    4.7s finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=-1, penalty='l2',
                                          random_state=123, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None, param_grid={'C': [0.1, 0.5, 1, 5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [17]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 0.1}
0.976833924439599


In [18]:
predictions = grid.predict(X_test_s)
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["True Pos", "False Pos"]))

              precision    recall  f1-score   support

    True Pos       0.98      0.98      0.98      1155
   False Pos       0.98      0.98      0.98      1174

    accuracy                           0.98      2329
   macro avg       0.98      0.98      0.98      2329
weighted avg       0.98      0.98      0.98      2329



# Save the Model 
###  Using best parameters found via GridSearchCV and for reduced number of features (top 6)

In [19]:
# model
lr = LogisticRegression(C=.1, random_state =123, n_jobs=-1)  #using best params found by GridSearchCV above
lr.fit(X_train_s, y_train)
lr.score(X_test_s, y_test)

  y = column_or_1d(y, warn=True)


0.9776728209531989

In [20]:
import joblib
filename = 'DanViassolo_LR.sav'
joblib.dump(lr, filename)

['DanViassolo_LR.sav']