In [2]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [3]:
df = pd.read_csv("exoplanet_data.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove Candidate so can predict Confirmed or False Positive
features_df = df[df.koi_disposition != 'CANDIDATE']

# Display Dataframe
features_df

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6983,FALSE POSITIVE,0,1,0,0,21.513523,2.714000e-04,-2.714000e-04,132.335600,0.012200,...,-141,3.508,0.187,-0.153,3.318,0.665,-0.813,287.46786,37.966640,10.630
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


# Select your features (columns)

In [4]:
# Set features. This will also be used as your x values.
X = features_df.drop(columns=['koi_disposition'])
y = features_df['koi_disposition']#.values.reshape(-1,1)
print(X.shape,y.shape)

(5304, 40) (5304,)


In [5]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X,y)
rf.score(X,y)

1.0

In [6]:
#Find relevant variables

importances = rf.feature_importances_
importances

array([0.06599833, 0.06740756, 0.13107001, 0.04019547, 0.0157802 ,
       0.0092723 , 0.00812989, 0.00711472, 0.02153539, 0.02452095,
       0.01791866, 0.00505666, 0.00380416, 0.00764849, 0.03779811,
       0.03854953, 0.01282065, 0.01417315, 0.00917446, 0.06328264,
       0.04560511, 0.04921217, 0.0140814 , 0.01751375, 0.01960622,
       0.01535987, 0.02488107, 0.00127922, 0.0035465 , 0.08096079,
       0.06817971, 0.00618841, 0.00317807, 0.0127284 , 0.00511971,
       0.01439047, 0.00473123, 0.00439234, 0.00318539, 0.00460883])

In [17]:
# Map importances to variables

importances_mapped = sorted(zip(importances , X), reverse=True)
importances_mapped

[(0.13107000836284755, 'koi_fpflag_co'),
 (0.08096078783680338, 'koi_steff_err1'),
 (0.0681797050386322, 'koi_steff_err2'),
 (0.06740756249789426, 'koi_fpflag_ss'),
 (0.06599833401764849, 'koi_fpflag_nt'),
 (0.06328264249898576, 'koi_prad'),
 (0.04921217327891795, 'koi_prad_err2'),
 (0.04560510529060518, 'koi_prad_err1'),
 (0.04019546771213352, 'koi_fpflag_ec'),
 (0.03854953163994521, 'koi_duration_err2'),
 (0.03779811110300887, 'koi_duration_err1'),
 (0.02488106716731001, 'koi_model_snr'),
 (0.02452094978437265, 'koi_time0bk_err2'),
 (0.02153538980501611, 'koi_time0bk_err1'),
 (0.019606222326904718, 'koi_insol_err1'),
 (0.017918658866075163, 'koi_impact'),
 (0.0175137517698148, 'koi_insol'),
 (0.015780202682358687, 'koi_period'),
 (0.015359865325454004, 'koi_insol_err2'),
 (0.01439046770805236, 'koi_srad_err1'),
 (0.01417314719255667, 'koi_depth_err1'),
 (0.014081395910448926, 'koi_teq'),
 (0.012820654571876368, 'koi_depth'),
 (0.012728398201821313, 'koi_slogg_err2'),
 (0.009272302546

In [23]:
#Overwrite X with variables who have feature imporance > .05

X = features_df[['koi_fpflag_co','koi_fpflag_ss', 'koi_fpflag_nt','koi_steff_err1', 'koi_steff_err2','koi_prad']]
X

Unnamed: 0,koi_fpflag_co,koi_fpflag_ss,koi_fpflag_nt,koi_steff_err1,koi_steff_err2,koi_prad
0,0,0,0,81,-81,2.83
1,0,1,0,158,-176,14.60
2,0,1,0,157,-174,33.46
3,0,0,0,169,-211,2.75
4,0,0,0,189,-232,2.77
...,...,...,...,...,...,...
6983,0,1,0,128,-141,534.47
6986,0,0,0,169,-152,1.11
6987,1,1,0,139,-166,29.35
6989,1,0,0,193,-236,1.07


# Create a Train Test Split

Use `koi_disposition` for the y values

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

y_train

979     FALSE POSITIVE
6201    FALSE POSITIVE
5648    FALSE POSITIVE
229          CONFIRMED
1302         CONFIRMED
             ...      
4043    FALSE POSITIVE
4985    FALSE POSITIVE
6836    FALSE POSITIVE
6880    FALSE POSITIVE
996          CONFIRMED
Name: koi_disposition, Length: 3978, dtype: object

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [25]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
X_scaler = MinMaxScaler().fit(X_train)
X_scaler

MinMaxScaler()

In [27]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

for label, original_class in zip(encoded_y_train, y_train):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
-----------

------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded La

Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class

Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIV

Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded 

Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Enc

------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
--------

------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded La

------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSI

In [7]:
#from tensorflow.keras.utils import to_categorical

# Step 2: Convert encoded labels to one-hot-encoding
#y_train_categorical = to_categorical(encoded_y_train)
#y_test_categorical = to_categorical(encoded_y_test)

#y_train_categorical.shape

(5243, 3)

# Train the Model



In [32]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model

LogisticRegression()

In [33]:
model.fit(X_train_scaled, encoded_y_train)

LogisticRegression()

In [36]:
model_train_accuracy = model.score(X_train_scaled, encoded_y_train)
model_test_accuracy = model.score(X_test_scaled, encoded_y_test)
print(f"Training Data Score: {model_train_accuracy}")
print(f"Testing Data Score: {model_test_accuracy}")

Training Data Score: 0.9761186525892408
Testing Data Score: 0.969079939668175


In [38]:
print(
    f"Logistic Regression using Random Forest and Label Encoder has an accuracy of {model_test_accuracy}")

Logistic Regression using Random Forest and Label Encoder has an accuracy of 0.969079939668175


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [42]:
# Create the GridSearchCV model
#from sklearn.svm import SVC 
#model = SVC(kernel='linear')
#model

from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50]}
              #'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [43]:
# Train the model with GridSearch
grid.fit(X_train_scaled, encoded_y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.969, total=   0.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.971, total=   0.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.974, total=   0.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.989, total=   0.0s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.979, total=   0.0s
[CV] C=5 .............................................................
[CV] ................................. C=5, score=0.969, total=   0.0s
[CV] C=5 .............................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ................................. C=5, score=0.977, total=   0.0s
[CV] C=10 ............................................................
[CV] ................................ C=10, score=0.969, total=   0.0s
[CV] C=10 ............................................................
[CV] ................................ C=10, score=0.971, total=   0.0s
[CV] C=10 ............................................................
[CV] ................................ C=10, score=0.971, total=   0.0s
[CV] C=10 ............................................................
[CV] ................................ C=10, score=0.989, total=   0.0s
[CV] C=10 ............................................................
[CV] ................................ C=10, score=0.976, total=   0.0s
[CV] C=50 ............................................................
[CV] ................................ C=50, score=0.969, total=   0.0s
[CV] C=50 ............................................................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.5s finished


GridSearchCV(estimator=LogisticRegression(), param_grid={'C': [1, 5, 10, 50]},
             verbose=3)

In [44]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 1}
0.97612243607977


# Save the Model

In [45]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'logistic_reg.h5'
joblib.dump(model, filename)
#model.save(filename)

['logistic_reg.h5']