# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [1]:
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("../data/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [3]:
df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

In [4]:
# Set features. This will also be used as your x values.
selected_features = df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
                        'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
                        'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
                        'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
                        'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
                        'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
                        'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
                        'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
                        'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
                        'koi_srad', 'koi_srad_err1', 'koi_srad_err2'
                       ]]

selected_features

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2
0,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,-0.003520,...,2,5455,81,-81,4.467,0.064,-0.096,0.927,0.105,-0.061
1,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,-0.000581,...,1,5853,158,-176,4.544,0.044,-0.176,0.868,0.233,-0.078
2,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,-0.000115,...,1,5805,157,-174,4.564,0.053,-0.168,0.791,0.201,-0.067
3,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,-0.001130,...,1,6031,169,-211,4.438,0.070,-0.210,1.046,0.334,-0.133
4,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,-0.001900,...,2,6046,189,-232,4.486,0.054,-0.229,0.972,0.315,-0.105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,-0.015700,...,1,5638,169,-152,4.296,0.231,-0.189,1.088,0.313,-0.228
6987,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,-0.000170,...,1,5638,139,-166,4.529,0.035,-0.196,0.903,0.237,-0.079
6988,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,-0.007690,...,1,6119,165,-220,4.444,0.056,-0.224,1.031,0.341,-0.114
6989,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,-0.002850,...,1,6173,193,-236,4.447,0.056,-0.224,1.041,0.341,-0.114


# Create a Train Test Split

Use `koi_disposition` for the y values

In [5]:
# Assign X (data) and y (target)

X = selected_features
y = df["koi_disposition"].values.reshape(-1,1)

print(X.shape, y.shape)

(6991, 37) (6991, 1)


In [6]:
# Split the data into training and testing

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2
6122,0,0,0,0,6.768901,7.38e-05,-7.38e-05,133.07724,0.00844,-0.00844,...,1,5737,154,-171,4.327,0.153,-0.187,1.125,0.31,-0.207
6370,0,1,0,1,0.733726,6.06e-06,-6.06e-06,132.02005,0.00795,-0.00795,...,1,5855,158,-175,4.578,0.033,-0.187,0.797,0.211,-0.056
2879,1,0,0,0,7.652707,6.54e-05,-6.54e-05,134.46038,0.00619,-0.00619,...,1,6328,151,-189,4.481,0.05,-0.2,0.963,0.29,-0.097
107,0,0,0,0,7.953547,1.91e-05,-1.91e-05,174.66224,0.00182,-0.00182,...,1,4768,76,-85,4.536,0.056,-0.016,0.779,0.023,-0.049
29,0,0,0,0,4.959319,5.15e-07,-5.15e-07,172.258529,8.3e-05,-8.3e-05,...,1,5712,77,-77,4.359,0.11,-0.11,1.082,0.173,-0.13


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [8]:
# Scale your data

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

  return f(**kwargs)


In [10]:
# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [11]:
y_train_categorical[:10]

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]], dtype=float32)

In [12]:
print(y_train_categorical.shape)

(5243, 3)


# Train the Model



In [13]:
# Create an empty sequential model
model = Sequential()

# Add the first layer where the input dimensions are the 561 columns of the training data
model.add(Dense(100, activation='relu', input_dim=X_train_scaled.shape[1]))

# Add a second hidden layer
model.add(Dense(100, activation='relu'))

# Add output layer
model.add(Dense(y_train_categorical.shape[1], activation="softmax"))

# Compile the model using categorical_crossentropy for the loss function, the adam optimizer,
# and add accuracy to the training metrics
model.compile(loss="categorical_crossentropy",
              optimizer="adam", metrics=['accuracy'])

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               3800      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 14,203
Trainable params: 14,203
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Use the training data to fit (train) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=150,
    shuffle=True,
    verbose=2
)

Epoch 1/100
164/164 - 1s - loss: 0.5685 - accuracy: 0.7166
Epoch 2/100
164/164 - 0s - loss: 0.3692 - accuracy: 0.8119
Epoch 3/100
164/164 - 0s - loss: 0.3541 - accuracy: 0.8104
Epoch 4/100
164/164 - 0s - loss: 0.3378 - accuracy: 0.8354
Epoch 5/100
164/164 - 0s - loss: 0.3308 - accuracy: 0.8394
Epoch 6/100
164/164 - 0s - loss: 0.3253 - accuracy: 0.8449
Epoch 7/100
164/164 - 0s - loss: 0.3217 - accuracy: 0.8419
Epoch 8/100
164/164 - 0s - loss: 0.3131 - accuracy: 0.8505
Epoch 9/100
164/164 - 0s - loss: 0.3095 - accuracy: 0.8545
Epoch 10/100
164/164 - 0s - loss: 0.3124 - accuracy: 0.8564
Epoch 11/100
164/164 - 0s - loss: 0.3055 - accuracy: 0.8617
Epoch 12/100
164/164 - 0s - loss: 0.2994 - accuracy: 0.8653
Epoch 13/100
164/164 - 0s - loss: 0.3018 - accuracy: 0.8623
Epoch 14/100
164/164 - 0s - loss: 0.2991 - accuracy: 0.8652
Epoch 15/100
164/164 - 0s - loss: 0.2967 - accuracy: 0.8652
Epoch 16/100
164/164 - 0s - loss: 0.2880 - accuracy: 0.8720
Epoch 17/100
164/164 - 0s - loss: 0.2902 - accura

<tensorflow.python.keras.callbacks.History at 0x21348300c50>

In [19]:
model_loss, model_accuracy = model.evaluate(X_train_scaled, y_train_categorical, verbose=2)
model2_loss, model2_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)

164/164 - 0s - loss: 0.2210 - accuracy: 0.9010
55/55 - 0s - loss: 0.2626 - accuracy: 0.8953


In [16]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train_categorical)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test_categorical)}")

AttributeError: 'Sequential' object has no attribute 'score'

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model

In [None]:
# Train the model with GridSearch

In [None]:
print(grid2.best_params_)
print(grid2.best_score_)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)