In [35]:
import glob

import pandas as pd
import numpy as np
import keras
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Dense, BatchNormalization, Dropout, Input, LeakyReLU, PReLU
from keras.callbacks import LearningRateScheduler
from keras.optimizers import SGD, Adam
from keras.utils import to_categorical
from keras import backend as K

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

import warnings

In [2]:
warnings.filterwarnings("ignore")

In [3]:
files = glob.glob('*.csv')
print(files)

['dataset-rocky-no-STemp.csv', 'dataset-rocky-MR.csv', 'dataset-2-class.csv', 'dataset-rocky-restricted-feat-set.csv', 'dataset_no_ST_related_features.csv', 'dataset-rocky-all-feats.csv']


In [4]:
def rebalance(frame, col='hab_lbl', factor=1):
    max_size = frame[col].value_counts().max()
    lst = [frame]
    for class_index, group in frame.groupby(col):
        lst.append(group.sample(int((max_size-len(group)) / factor), replace=True))
    frame_new = pd.concat(lst)
    
    return frame_new

In [19]:
def preprocess(frame):
    """
    Preprocess a dataset:
    * Remove the P.Habitable column
    * Split dataset
    * Normalize dataset
    * Rebalance training set
    * Categorize outputs
    
    Returns:
        Tuple, (x_train, x_test, y_train, y_test)
    """
    if 'P. Habitable' in frame.columns:
        frame.drop('P. Habitable', axis=1)
    
    train_df, test_df = train_test_split(frame, train_size=0.8)
    train_df = rebalance(train_df)
    
    y = train_df['hab_lbl']
    y_test = test_df['hab_lbl']
    train_df.drop('hab_lbl', axis=1, inplace=True)
    test_df.drop('hab_lbl', axis=1, inplace=True)
    
    y = to_categorical(np.array(y))
    y_test = to_categorical(np.array(y_test))
    
    x_train = np.array(train_df)
    x_test = np.array(test_df)
    
    return (x_train, x_test, y, y_test)

In [6]:
batch_size = 32

In [7]:
def get_model(shape, classes):
    inp = Input(shape=(shape,))

    bn1 = BatchNormalization(name='first_bn')(inp)
    relu = Dense(20, activation='relu', name='dense1')(bn1)
    drop1 = Dropout(0.2, name='dropout1')(relu)

    bn = BatchNormalization(name='bn1')(drop1)
    relu = Dense(10, activation='relu', name='dense2')(bn)
    drop2 = Dropout(0.2)(relu)

    interm = keras.layers.Concatenate()([drop1, drop2])

    bn = BatchNormalization(name='bn2')(interm)
    relu = Dense(10, activation='relu', name='dense3')(bn)
    drop = Dropout(0.2)(relu)

    interm = keras.layers.Concatenate()([drop, drop2])

    bn = BatchNormalization()(interm)
    out = Dense(classes, activation='softmax', name='dense4')(bn)

    model = Model(inputs=inp, outputs=out)
    return model

In [10]:
def get_model2(shape, classes):
    inp = Input(shape=(shape,))
    bn = BatchNormalization()(inp)
    relu = Dense(20, activation='relu')(bn)
    relu = Dense(20, activation='relu')(relu)
    relu = Dense(20, activation='relu')(relu)
    relu = Dense(20, activation='relu')(relu)
    relu = Dense(20, activation='relu')(relu)
    relu = Dense(20, activation='relu')(relu)
    out = Dense(classes, activation='softmax')(relu)
    
    return Model(inputs=inp, outputs=out)

In [16]:
for file in files:
    df = pd.read_csv(file)
    x_train, x_test, y_train, y_test = preprocess(df)
    
    print('Processing:', file)
    print('============' + '=' * len(file))
    
    if file == 'dataset-2-class.csv':
        model = get_model2(x_train.shape[1], 2)
    else:
        model = get_model2(x_train.shape[1], 3)
    model.compile(SGD(0.01, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, verbose=1)
    
    predictions = np.argmax(model.predict(x_test), axis=1)
    true_y = np.argmax(y_test, axis=1)
    
    matrix = confusion_matrix(true_y, predictions)
    print('Accuracy scores:', matrix.diagonal()/matrix.sum(axis=1))
    
    print('============' + '=' * len(file))

Processing: dataset-rocky-no-STemp.csv
Train on 3999 samples, validate on 343 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy scores: [0.99698795 1.         1.        ]
Processing: dataset-rocky-MR.csv
Train on 3996 samples, validate on 343 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy scores: [0.94894895 1.         0.        ]
Processing: dataset-2-class.csv
Train on 6052 samples, validate on 766 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy scores: [0.99867198 1.        ]
Processing: dataset-rocky-restricted-feat-set.csv
Train on 3996 samples, validate on 343 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy scores: [0.96696697 0.75       0.        ]
Processing: datas

Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy scores: [0.99404762 1.         1.        ]


## Leaky ReLU

In [20]:
def get_model_leaky(shape, classes):
    inp = Input(shape=(shape,))
    bn = BatchNormalization()(inp)
    relu = Dense(20)(bn)
    relu = LeakyReLU()(relu)
    relu = Dense(20)(relu)
    relu = LeakyReLU()(relu)
    relu = Dense(20)(relu)
    relu = LeakyReLU()(relu)
    relu = Dense(20)(relu)
    relu = LeakyReLU()(relu)
    relu = Dense(20)(relu)
    relu = LeakyReLU()(relu)
    relu = Dense(20)(relu)
    relu = LeakyReLU()(relu)
    out = Dense(classes, activation='softmax')(relu)
    
    return Model(inputs=inp, outputs=out)

In [22]:
for file in files:
    df = pd.read_csv(file)
    x_train, x_test, y_train, y_test = preprocess(df)
    
    print('Processing:', file)
    print('============' + '=' * len(file))
    
    if file == 'dataset-2-class.csv':
        model = get_model_leaky(x_train.shape[1], 2)
    else:
        model = get_model_leaky(x_train.shape[1], 3)
    model.compile(SGD(0.01, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, verbose=0)
    
    predictions = np.argmax(model.predict(x_test), axis=1)
    true_y = np.argmax(y_test, axis=1)
    
    matrix = confusion_matrix(true_y, predictions)
    print('Accuracy scores:', matrix.diagonal()/matrix.sum(axis=1))
    
    print('============' + '=' * len(file))

Processing: dataset-rocky-no-STemp.csv
Accuracy scores: [0.99703264 1.         1.        ]
Processing: dataset-rocky-MR.csv
Accuracy scores: [0.94940476 1.         0.5       ]
Processing: dataset-2-class.csv
Accuracy scores: [1. 1.]
Processing: dataset-rocky-restricted-feat-set.csv
Accuracy scores: [1.   0.   0.25]
Processing: dataset_no_ST_related_features.csv
Accuracy scores: [0.94687915 1.         0.5       ]
Processing: dataset-rocky-all-feats.csv
Accuracy scores: [0.99698795 0.5        0.77777778]


## PReLU

In [23]:
def get_model_P(shape, classes):
    inp = Input(shape=(shape,))
    bn = BatchNormalization()(inp)
    relu = Dense(20)(bn)
    relu = PReLU()(relu)
    relu = Dense(20)(relu)
    relu = PReLU()(relu)
    relu = Dense(20)(relu)
    relu = PReLU()(relu)
    relu = Dense(20)(relu)
    relu = PReLU()(relu)
    relu = Dense(20)(relu)
    relu = PReLU()(relu)
    relu = Dense(20)(relu)
    relu = PReLU()(relu)
    out = Dense(classes, activation='softmax')(relu)
    
    return Model(inputs=inp, outputs=out)

In [26]:
for file in files:
    df = pd.read_csv(file)
    x_train, x_test, y_train, y_test = preprocess(df)
    
    print('Processing:', file)
    print('============' + '=' * len(file))
    
    if file == 'dataset-2-class.csv':
        model = get_model_P(x_train.shape[1], 2)
    else:
        model = get_model_P(x_train.shape[1], 3)
    model.compile(SGD(0.01, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, verbose=0)
    
    predictions = np.argmax(model.predict(x_test), axis=1)
    true_y = np.argmax(y_test, axis=1)
    
    matrix = confusion_matrix(true_y, predictions)
    print('Accuracy scores:', matrix.diagonal()/matrix.sum(axis=1))
    
    print('============' + '=' * len(file))

Processing: dataset-rocky-no-STemp.csv
Accuracy scores: [0.99700599 0.5        1.        ]
Processing: dataset-rocky-MR.csv
Accuracy scores: [0.72171254 0.8        0.27272727]
Processing: dataset-2-class.csv
Accuracy scores: [0.99736495 1.        ]
Processing: dataset-rocky-restricted-feat-set.csv
Accuracy scores: [0.93712575 1.         0.71428571]
Processing: dataset_no_ST_related_features.csv
Accuracy scores: [0.95225464 0.75       0.25      ]
Processing: dataset-rocky-all-feats.csv
Accuracy scores: [0.9939577  0.66666667 0.88888889]


## Original model

In [38]:
def orig_model(shape, classes, activation='relu'):
    inp = Input(shape=(shape,))
    bn = BatchNormalization()(inp)
    
    if activation == 'relu':
        hidden = Dense(12, activation='relu')(bn)
    elif activation == 'leaky':
        hidden = Dense(12)(bn)
        hidden = LeakyReLU(0.1)(hidden)
    elif activation == 'prelu':
        hidden = Dense(12)(bn)
        hidden = PReLU()(hidden)

    out = Dense(classes, activation='softmax')(hidden)
    return Model(inputs=inp, outputs=out)

In [34]:
for file in files:
    df = pd.read_csv(file)
    x_train, x_test, y_train, y_test = preprocess(df)
    
    print('Processing:', file)
    print('============' + '=' * len(file))
    
    if file == 'dataset-2-class.csv':
        model = orig_model(x_train.shape[1], 2)
    else:
        model = orig_model(x_train.shape[1], 3)
    model.compile(SGD(0.01, momentum=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, verbose=0, batch_size=128)
    
    predictions = np.argmax(model.predict(x_test), axis=1)
    true_y = np.argmax(y_test, axis=1)
    
    matrix = confusion_matrix(true_y, predictions)
    print('Accuracy scores:', matrix.diagonal()/matrix.sum(axis=1))
    
    print('============' + '=' * len(file))

Processing: dataset-rocky-no-STemp.csv
Accuracy scores: [0.9845679 1.        0.4375   ]
Processing: dataset-rocky-MR.csv
Accuracy scores: [0.89602446 0.8        0.72727273]
Processing: dataset-2-class.csv
Accuracy scores: [0.99735099 1.        ]
Processing: dataset-rocky-restricted-feat-set.csv
Accuracy scores: [0.9112426       nan 1.       ]
Processing: dataset_no_ST_related_features.csv
Accuracy scores: [0.91567852        nan 0.71428571]
Processing: dataset-rocky-all-feats.csv
Accuracy scores: [0.97297297 0.5        1.        ]


## Original model, leaky ReLU

In [39]:
for file in files:
    df = pd.read_csv(file)
    x_train, x_test, y_train, y_test = preprocess(df)
    
    print('Processing:', file)
    print('============' + '=' * len(file))
    
    if file == 'dataset-2-class.csv':
        model = orig_model(x_train.shape[1], 2, activation='leaky')
    else:
        model = orig_model(x_train.shape[1], 3, activation='leaky')
    model.compile(SGD(0.01, momentum=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, verbose=0, batch_size=128)
    
    predictions = np.argmax(model.predict(x_test), axis=1)
    true_y = np.argmax(y_test, axis=1)
    
    matrix = confusion_matrix(true_y, predictions)
    print('Accuracy scores:', matrix.diagonal()/matrix.sum(axis=1))
    print('Other metrics:', precision_recall_fscore_support(true_y, predictions))
    
    print('============' + '=' * len(file))

Processing: dataset-rocky-no-STemp.csv
Accuracy scores: [0.97916667 1.         0.66666667]
Other metrics: (array([1.        , 0.5       , 0.33333333]), array([0.97916667, 1.        , 0.66666667]), array([0.98947368, 0.66666667, 0.44444444]), array([336,   4,   3]))
Processing: dataset-rocky-MR.csv
Accuracy scores: [0.97597598 0.66666667 0.71428571]
Other metrics: (array([0.99693252, 0.25      , 0.55555556]), array([0.97597598, 0.66666667, 0.71428571]), array([0.98634294, 0.36363636, 0.625     ]), array([333,   3,   7]))
Processing: dataset-2-class.csv
Accuracy scores: [0.99468792 1.        ]
Other metrics: (array([1.        , 0.76470588]), array([0.99468792, 1.        ]), array([0.99733688, 0.86666667]), array([753,  13]))
Processing: dataset-rocky-restricted-feat-set.csv
Accuracy scores: [0.93674699 1.         0.28571429]
Other metrics: (array([1.        , 0.18181818, 0.2       ]), array([0.93674699, 1.        , 0.28571429]), array([0.96734059, 0.30769231, 0.23529412]), array([332,   