# Init

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from keras.models import Model, load_model
from keras.layers import Input, Activation, Dense, concatenate, LSTM, GRU, Dropout
from sklearn import metrics
from keras.utils import np_utils

import gc
import os
import datetime
import joblib
import pandas as pd
import h5py
import numpy as np

In [33]:
X_test = h5py.File("../data/Dreem/X_test.h5","r")

def get_train():
    X_train = h5py.File("../data/Dreem/X_train.h5","r")
    y_train = pd.read_csv("../data/Dreem/y_train.csv","r", delimiter=',').set_index('id')

    X_train = X_train["features"][:]
    return X_train, y_train

In [34]:
X_train, y_train = get_train()

# Features

In [35]:
def extract_features(data):
    features = []
    features.append(data[:, :11])
    features.append(data[:, 11:].max(1).reshape(-1, 1))
    features.append(data[:, 11:].min(1).reshape(-1, 1))
    features.append(np.abs(data[:, 11:]).mean(1).reshape(-1, 1))
    features = np.concatenate(features, 1)
    return features

def extract_manual(data):
    ts_arr = data[:, 11:]
    X = data[:, :11]
    features = np.concatenate((
        np.mean(X, axis=1)[..., np.newaxis],
        np.amin(X, axis=1)[..., np.newaxis],
        np.amax(X, axis=1)[..., np.newaxis]
    ), axis=1)
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for q in quantiles:
        features = np.concatenate((features, np.quantile(X, q, axis=1)[..., np.newaxis]), axis=1)
    X = np.concatenate((features, X, ts_arr), axis=1)
    return X

In [36]:
X = extract_manual(X_train)
X_test = extract_manual(X_test["features"][:])
X, X_val, y, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)
y_val = np.squeeze(y_val)
del(X_train)
gc.collect()

(261634, 3)
(261634, 12)
(261634, 1273)


31

In [37]:
fe_train = X[:, :23]
ts_train = X[:, 23:]
ts_train = ts_train[..., np.newaxis]
fe_val = X_val[:, :23]
ts_val = X_val[:, 23:]

# Train

* **Benchmark:** Random Forest on manual features

In [38]:
rf = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0, class_weight='balanced')

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 800, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 6, cv = 3, verbose=0, random_state=42, n_jobs = -1)
rf_random.fit(fe_train, y)

best_random = rf_random.best_estimator_
print(rf_random.best_score_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=50, bootstrap=True 
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=50, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=50, bootstrap=True 
[CV] n_estimators=333, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=90, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=50, bootstrap=True, total= 3.9min
[CV] n_estimators=333, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=90, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=50, bootstrap=True, total= 3.9min
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=50, bootstrap=True, total= 4.0min
[CV] n_estimators=333, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=90, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV] n_estimators=333, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=60, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=333, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=90, bootstrap=False, total=10.8min
[CV] n_estimators=333, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=60, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=333, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=90, bootstrap=False, total=11.0min
[CV]  n_estimators=333, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=90, bootstrap=False, total=11.0min
[CV] n_estimators=333, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=60, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV] n_estimators=600, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=333, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=60, bootstrap=False, total=11.7min
[CV] n_estimators=600, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=333, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=60, bootstrap=False, total=11.4min
[CV] n_estimators=600, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=333, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=60, bootstrap=False, total=11.3min
[CV] n_estimators=466, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=80, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=600, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, total=12.6min
[CV] n_estimators=466, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=80, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=600, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, total=12.7min
[CV] n_estimators=466, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=80, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=600, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, total=11.8min
[CV] n_estimators=266, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=60, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=466, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=80, bootstrap=False, total=14.2min
[CV] n_estimators=266, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=60, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=466, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=80, bootstrap=False, total=13.9min
[CV] n_estimators=266, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=60, bootstrap=False 


  estimator.fit(X_train, y_train, **fit_params)


[CV]  n_estimators=266, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=60, bootstrap=False, total= 7.8min
[CV]  n_estimators=466, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=80, bootstrap=False, total=13.8min
[CV]  n_estimators=266, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=60, bootstrap=False, total= 5.6min
[CV]  n_estimators=266, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=60, bootstrap=False, total= 5.1min


[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 48.5min finished
  self.best_estimator_.fit(X, y, **fit_params)


0.49510491801488454


In [41]:
y_prev_val = best_random.predict(fe_val)

* **LSTM + MLP**

In [77]:
timestep_nb = ts_train.shape[1]
spike_per_ts = 1
cell_nb = 124
dropout = 0.2
batch_size = 64

input_tensor = Input(shape=(timestep_nb, spike_per_ts))
X = LSTM(cell_nb, return_sequences=True, dropout=dropout)(input_tensor)
X = LSTM(cell_nb, return_sequences=False)(X)

additional_features = fe_train.shape[1]
fe_input = Input(shape=(additional_features,)) # A tensor containing the engineered features
latent = Dense(64, activation='relu')(fe_input)
latent = Dropout(rate=dropout)(latent)
latent = Dense(32, activation='relu')(latent)
latent = Dropout(rate=dropout)(latent)
input_tensor = [input_tensor, fe_input]
X = concatenate([X, latent])   
    
output_tensor = Dense(3, activation='softmax')(X)

model = Model(input_tensor, output_tensor)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 23)           0                                            
__________________________________________________________________________________________________
dense_7 (Dense)                 (None, 64)           1536        input_6[0][0]                    
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 1238, 1)      0                                            
__________________________________________________________________________________________________
dropout_5 (Dropout)             (None, 64)           0           dense_7[0][0]                    
__________________________________________________________________________________________________
lstm_5 (LS

In [None]:
y_train_encoded = np_utils.to_categorical(y)

model.compile(metrics=['accuracy'], loss='categorical_crossentropy', optimizer='adam')
history = model.fit([ts_train[:50000], fe_train[:50000]], y_train_encoded[:50000], epochs=2, validation_split=0.2, batch_size=batch_size)

Train on 40000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2

# Evaluate

In [42]:
def evaluate(y_true, y_pred):
    print("ACC: ", metrics.accuracy_score(y_true, y_pred))
    #print("ROC_AUC: ", metrics.roc_auc_score(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred))

evaluate(y_prev_val, y_val)

ACC:  0.4967703303563466
             precision    recall  f1-score   support

          0       0.77      0.56      0.65     46387
          1       0.30      0.39      0.34     18318
          2       0.27      0.42      0.33     13786

avg / total       0.57      0.50      0.52     78491



In [24]:
y_val.shape

(65409, 2)

# Predict

In [None]:
def predict(X_test, model):
    # Predict on custom X_test
    y_pred = model.predict(X_test)
    y_pred = np.reshape(y_pred, (y_pred.shape[0],))
    print (y_pred.shape)
    
    # Convert sigmoid output to 0s and 1s
    y_pred[y_pred >= 0.5] = 1
    y_pred[y_pred < 0.5] = 0
  
    # Format .csv in ENS style
    dfy_pred = pd.DataFrame(data=y_pred, columns=["TARGET"], dtype=int)
    dfy_pred.index.name = "ID"
    dfy_pred.index += 16635
    return dfy_pred

In [47]:
fe_test = X_test[:, :23]
ts_test = X_test[:, 23:]
dfy_pred = predict(fe_test, best_random)

(238366,)


# Test

In [57]:
def saveExp(dfy_pred, model):
    """ Create directory in which to save predictions, experiment parameters and model object. """

    directory = "../experiments/{}".format(datetime.datetime.now().strftime("%m%d%H%M%S"))
    if not os.path.exists(directory):
        os.makedirs(directory)

    dfy_pred.to_csv(directory + '/y_pred.csv', sep=',')
    
    joblib.dump(model, directory + '/model.h5')

    return directory

# Save model
saveExp(dfy_pred, best_random)

'../experiments/0628220938'