In [1]:
# Update sklearn to prevent version mismatches

# !pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save model. 
# Restart your kernel after installing

# !pip install joblib

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Visualize more columns
pd.options.display.max_columns = None

# Read the CSV and Perform Basic Data Cleaning

In [5]:
df = pd.read_csv("../resources/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,0.586,0.059,-0.443,4.507,0.116,-0.116,874.8,35.5,-35.5,2.83,0.32,-0.19,443,9.11,2.87,-1.62,25.8,2,5455,81,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10829.0,171.0,-171.0,14.6,3.92,-1.31,638,39.3,31.04,-10.49,76.3,1,5853,158,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,8079.2,12.8,-12.8,33.46,8.5,-2.83,1395,891.96,668.95,-230.35,505.6,1,5805,157,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,0.701,0.235,-0.478,1.6545,0.042,-0.042,603.3,16.9,-16.9,2.75,0.88,-0.35,1406,926.16,874.33,-314.24,40.9,1,6031,169,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,0.762,0.139,-0.532,3.1402,0.0673,-0.0673,686.0,18.7,-18.7,2.77,0.9,-0.3,1160,427.65,420.33,-136.7,40.2,2,6046,189,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select features

In [6]:
# Selecting columns that don't consist error estiamte values
columns = list(df.columns)

new_columns = []
for column in columns:
    if 'err' not in column:
        new_columns.append(column)

new_df = df[new_columns]
new_df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,25.8,2,5455,4.467,0.927,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,76.3,1,5853,4.544,0.868,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,1,5805,4.564,0.791,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,40.9,1,6031,4.438,1.046,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,427.65,40.2,2,6046,4.486,0.972,296.28613,48.22467,15.714


In [7]:
# Set features. This will also be used as the x values.
X = new_df.drop("koi_disposition", axis=1)
# Use `koi_disposition` for the y values
y = new_df['koi_disposition'].values.reshape(-1, 1)

print(X.shape, y.shape)

(6991, 20) (6991, 1)


# Encode classes (y values)

In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

# Visualize encoded y 
for label, original_class in zip(encoded_y[0:50], y[0:50]):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['FALSE POSITIVE']
Encoded Label: 2
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class: ['CONFIRMED']
Encoded Label: 1
------------
Original Class:

  return f(*args, **kwargs)


In [9]:
# Check class categories
label_encoder.classes_

array(['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE'], dtype=object)

In [10]:
# Create target names for our classification report
target_names = ['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE']

In [11]:
encoded_y

array([1, 2, 2, ..., 0, 2, 2])

# Create a Train Test Split

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(X, encoded_y, random_state=42)

In [13]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
6122,0,0,0,0,6.768901,133.07724,0.15,3.616,123.1,1.24,1017,253.3,10.8,1,5737,4.327,1.125,294.40472,39.351681,14.725
6370,0,1,0,1,0.733726,132.02005,0.291,2.309,114.6,0.86,1867,2891.64,13.8,1,5855,4.578,0.797,284.50391,42.46386,15.77
2879,1,0,0,0,7.652707,134.46038,0.97,79.8969,641.1,3.21,989,226.81,254.3,1,6328,4.481,0.963,295.50211,38.98354,13.099
107,0,0,0,0,7.953547,174.66224,0.3,2.6312,875.4,2.25,696,55.37,38.4,1,4768,4.536,0.779,291.15878,40.750271,15.66
29,0,0,0,0,4.959319,172.258529,0.831,2.22739,9802.0,12.21,1103,349.4,696.5,1,5712,4.359,1.082,292.16705,48.727589,15.263


In [14]:
y_train_encoded

array([0, 2, 2, ..., 2, 2, 2])

**Using the Keras to perform one-hot encoding on y (for Neural Networks/Deep Learning purpose)**

In [15]:
from keras.utils import to_categorical

y_train_one_hot = to_categorical(y_train_encoded)
y_test_one_hot = to_categorical(y_test_encoded)

Using TensorFlow backend.


In [16]:
y_train_one_hot

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [17]:
# Scale
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
X_train_scaled

array([[0.        , 0.        , 0.        , ..., 0.66574567, 0.17604958,
        0.64129267],
       [0.        , 1.        , 0.        , ..., 0.21268467, 0.37354005,
        0.72766344],
       [1.        , 0.        , 0.        , ..., 0.71596223, 0.15268835,
        0.5069014 ],
       ...,
       [0.        , 1.        , 0.        , ..., 0.792823  , 0.59720043,
        0.30217373],
       [1.        , 0.        , 0.        , ..., 0.77821733, 0.29000226,
        0.52153071],
       [0.        , 0.        , 1.        , ..., 0.68082222, 0.27185353,
        0.61930738]])

# Train the Model



**Deep Learning**

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [20]:
# Create model and add layers
dl_model = Sequential()
dl_model.add(Dense(units=100, activation='relu', input_dim=20))
dl_model.add(Dense(units=100, activation='relu'))
dl_model.add(Dense(units=3, activation='softmax'))

In [21]:
# Compile and fit the model
dl_model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [22]:
dl_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               2100      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 12,503
Trainable params: 12,503
Non-trainable params: 0
_________________________________________________________________


In [23]:
dl_model.fit(
    X_train_scaled,
    y_train_one_hot,
    epochs=60,
    shuffle=True,
    verbose=2
)

Train on 5243 samples
Epoch 1/60
5243/5243 - 1s - loss: 0.5292 - accuracy: 0.7233
Epoch 2/60
5243/5243 - 1s - loss: 0.3775 - accuracy: 0.7879
Epoch 3/60
5243/5243 - 1s - loss: 0.3693 - accuracy: 0.7942
Epoch 4/60
5243/5243 - 0s - loss: 0.3651 - accuracy: 0.8022
Epoch 5/60
5243/5243 - 1s - loss: 0.3603 - accuracy: 0.8053
Epoch 6/60
5243/5243 - 1s - loss: 0.3584 - accuracy: 0.8053
Epoch 7/60
5243/5243 - 0s - loss: 0.3555 - accuracy: 0.8131
Epoch 8/60
5243/5243 - 0s - loss: 0.3531 - accuracy: 0.8175
Epoch 9/60
5243/5243 - 1s - loss: 0.3509 - accuracy: 0.8179
Epoch 10/60
5243/5243 - 0s - loss: 0.3503 - accuracy: 0.8171
Epoch 11/60
5243/5243 - 1s - loss: 0.3516 - accuracy: 0.8144
Epoch 12/60
5243/5243 - 0s - loss: 0.3489 - accuracy: 0.8165
Epoch 13/60
5243/5243 - 1s - loss: 0.3483 - accuracy: 0.8108
Epoch 14/60
5243/5243 - 1s - loss: 0.3462 - accuracy: 0.8159
Epoch 15/60
5243/5243 - 0s - loss: 0.3447 - accuracy: 0.8152
Epoch 16/60
5243/5243 - 1s - loss: 0.3450 - accuracy: 0.8215
Epoch 17/60

<tensorflow.python.keras.callbacks.History at 0x7fdde0e12d30>

In [24]:
# Quantify trained model
model_loss, model_accuracy = dl_model.evaluate(
    X_test_scaled, y_test_one_hot, verbose=2)

print(
    f"Model - Loss: {model_loss}, Accuracy: {model_accuracy}")

1748/1 - 0s - loss: 0.3258 - accuracy: 0.8444
Model - Loss: 0.3344597158361081, Accuracy: 0.8443936109542847


In [25]:
# Make predictions with the hypertuned model
# predictions = dl_model.predict(X_test_scaled)

In [26]:
# predictions

#### Observation on model performance
* This model's overall accuracy is 0.854.

# Save the Model

In [27]:
# Save the model
dl_model.save("../saved_ml_models/deep_learning.h5")

In [28]:
# Testing to make sure model saved correctly...

# from tensorflow.keras.models import load_model
# loaded_model = load_model("../saved_ml_models/deep_network.h5")