In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import GridSearchCV
import pickle

Using TensorFlow backend.


## Preprocess the Data
* Separate the data into training and testing data.
* Use MinMaxScaler to scale the numerical data.

In [2]:
# Read the training data
training_df = pd.read_csv("Data/Cleaned_Data.csv") 
training_df.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec,koi_disposition
0,0,0,0,0,9.488036,170.53875,0.146,2.9575,615.8,2.26,793.0,93.59,35.8,5455.0,4.467,0.927,291.93423,48.141651,CONFIRMED
1,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443.0,9.11,25.8,5455.0,4.467,0.927,291.93423,48.141651,CONFIRMED
2,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638.0,39.3,76.3,5853.0,4.544,0.868,297.00482,48.134129,FALSE POSITIVE
3,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395.0,891.96,505.6,5805.0,4.564,0.791,285.53461,48.28521,FALSE POSITIVE
4,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406.0,926.16,40.9,6031.0,4.438,1.046,288.75488,48.2262,CONFIRMED


In [3]:
# select target feature
target_feature = "koi_disposition"

In [4]:
# Split the data
X = training_df.drop(columns=[target_feature])
y = training_df[[target_feature]].values.ravel()
#
print(X.shape, y.shape)

(9201, 18) (9201,)


In [5]:
# Create a Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [6]:
# Scale the training data to better train
def scale_data(X, X_train, X_test):
    '''
        return the scaled training and testing data with the scaler
    '''
    # create the scaler for the core data
    X_scaler = MinMaxScaler().fit(X)
    # apply the scale to training and testing data sets
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    #
    return X_train_scaled, X_test_scaled, X_scaler

X_train, X_test, X_scaler = scale_data(X, X_train, X_test)

In [7]:
# Label Encode the target feature from human words to machine id's
def label_data(y, y_train, y_test):
    '''
        return the labled training and testing data with the encoder
    '''
    # create and fit the label encoder
    label_encoder = LabelEncoder()
    label_encoder.fit(y)
    # create the label encoded values
    y_train_encoded = label_encoder.transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)
    # create the labeled data
    y_train_labeled = to_categorical(y_train_encoded)
    y_test_labeled = to_categorical(y_test_encoded)
    #
    return y_train_labeled, y_test_labeled, label_encoder
    
y_train, y_test, y_label_encoder = label_data(y, y_train, y_test)

## Tune Model Parameters

In [8]:
print(X_train.shape, y_train.shape)

(6900, 18) (6900, 3)


In [9]:
# Create the model and the layers
model = Sequential()
model.add(Dense(units=50, activation="relu", input_dim=X_train.shape[1]))
model.add(Dense(units=100, activation="selu"))
model.add(Dense(units=200, activation="elu"))
model.add(Dense(units=100, activation="selu"))
model.add(Dense(units=y_train.shape[1], activation="softmax"))

In [10]:
# Compile the model
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                950       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               5100      
_________________________________________________________________
dense_2 (Dense)              (None, 200)               20200     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 303       
Total params: 46,653
Trainable params: 46,653
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fit the model to the training data!
model.fit(
    x=X_train,
    y=y_train,
    batch_size=None,
    epochs=500,
    verbose=2,
    callbacks=None,
    shuffle=True,
    class_weight=None,
    sample_weight=None,
    initial_epoch=0,
    steps_per_epoch=None,
    validation_steps=None,
    validation_freq=0.001,
    max_queue_size=1000,
    workers=4,
    use_multiprocessing=False,
)

In [12]:
# Print the models loss and accuracy scores
model_loss, model_accuracy = model.evaluate(
    X_test, 
    y_test, 
    verbose=2
)

2301/2301 - 0s - loss: 0.6855 - accuracy: 0.8653


In [17]:
# save the model to disk
model.save("Deep_Learning_Model.h5")