In [4]:
import tensorflow
import keras
import os
import pandas as pd
import h5py
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import LSTM
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from keras.layers import Bidirectional
from sklearn.model_selection import KFold

### Load the dataset

In [5]:
path = os.getcwd()
print(path)
filename = '/encoding_LSTM_chunks20.h5'

/home/qls/deep-learning-pd/code


In [6]:
h5f = h5py.File(path + filename,'r')
X_train = h5f['train'][:]
Y_train = h5f['target'][:]
h5f.close()

print('Target shape', Y_train.shape)
print('Train shape', X_train.shape)

Target shape (16723,)
Train shape (16723, 20, 3)


### Modify features

In [7]:
# remove hold time
# X_train = X_train[:,:,1:]

# leave only hold time
# X_train = X_train[:,:,:1]

### Split to train and test

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.1, random_state=15)

In [9]:
X_train[0]

array([[6., 4., 3.],
       [6., 4., 4.],
       [5., 5., 5.],
       [3., 3., 5.],
       [6., 3., 6.],
       [4., 5., 4.],
       [3., 6., 3.],
       [5., 6., 3.],
       [6., 4., 5.],
       [4., 3., 6.],
       [5., 3., 6.],
       [5., 5., 3.],
       [3., 3., 6.],
       [4., 5., 3.],
       [6., 5., 5.],
       [3., 3., 3.],
       [6., 4., 4.],
       [3., 6., 4.],
       [3., 6., 5.],
       [3., 5., 3.]])

### Build model

In [10]:
np.random.seed(14)  # fix the random numbers generator state

batch_size = 16
hidden_units = 50
input_shape = X_train.shape[1:]
nb_epochs = 10
nb_classes = 1
dropout = 0.05
early_stopping = EarlyStopping(monitor='loss', min_delta=0.01, patience=2, verbose=1)
print('Build model...')

# sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)

model = Sequential()
model.add(Bidirectional(LSTM(units=hidden_units, kernel_initializer='uniform', recurrent_initializer='uniform', 
               dropout=dropout, use_bias=True, unit_forget_bias=True, activation='tanh', recurrent_activation='sigmoid', 
               input_shape=input_shape), input_shape=input_shape, merge_mode='concat'))

model.add(Dense(16))
model.add(Activation('linear'))

model.add(Dense(nb_classes))
model.add(Activation('sigmoid'))

model.summary()

model.compile(loss='binary_crossentropy', metrics=['binary_accuracy'], optimizer='adam')

print("Train...")
history = model.fit(X_train, Y_train, batch_size=batch_size, epochs=nb_epochs, verbose=1, callbacks=[early_stopping])

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 100)               21600     
_________________________________________________________________
dense_1 (Dense)              (None, 16)                1616      
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total params: 23,233
Trainable params: 23,233
Non-trainable params: 0
_________________________________________________________________
Train...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 00003: early stopping


### Test the model performance

In [11]:
# Y_pred = model.predict(X_test)
metr = model.evaluate(X_test, Y_test)



In [17]:
Y_pred = model.predict_proba(X_test)
print(Y_pred)
print(len(Y_pred))
print('AUC', roc_auc_score(Y_test, Y_pred))

Y_pred = model.predict(X_test)
print(Y_pred)
print(len(Y_pred))
print('Accuracy', accuracy_score(Y_test, np.round(Y_pred)))

[[0.48663464]
 [0.48654884]
 [0.48658192]
 ...
 [0.48295724]
 [0.47510287]
 [0.48661962]]
1673
AUC 0.4702015437392796
[[0.48663464]
 [0.48654884]
 [0.48658192]
 ...
 [0.48295724]
 [0.47510287]
 [0.48661962]]
1673
Accuracy 0.5068738792588166


In [20]:
Y_pred = model.predict_proba(X_test)
print(Y_pred.shape)
Y_pred.transpose()
print(Y_pred.transpose())

(1673, 1)
[[0.48663464 0.48654884 0.48658192 ... 0.48295724 0.47510287 0.48661962]]


In [22]:
Y_test, np.round(Y_pred).T

(array([1, 0, 1, ..., 0, 1, 1]),
 array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32))

### 10-fold cross-validation

In [25]:
seed = 7
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

acc_list = []
AUC_list = []
# f1_list = []

np.random.seed(14)  # fix the random numbers generator state

batch_size = 16
hidden_units = 50
input_shape = X_train.shape[1:]
nb_epochs = 20
nb_classes = 1
dropout = 0.05
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=3, verbose=1)
print('Build model...')

# sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)

for train, test in kfold.split(X_train, Y_train):
    print('Train indices:', train)
    print('Test indices:', test)
    
#     train_set = np.array((0, 0, 0))
#     target_set = np.array(0)
#     X_train, Y_train = merge_df_rows(df.iloc[train])
    
#     train_set = np.array((0, 0, 0))
#     target_set = np.array(0)
#     X_test, Y_test = merge_df_rows(df.iloc[test])
    
    input_shape = X_train.shape[1:]
#     print(input_shape)

    model = Sequential()
#     model.add(Bidirectional(LSTM(units=hidden_units, kernel_initializer='uniform', recurrent_initializer='uniform', 
#                    dropout=dropout, use_bias=True, unit_forget_bias=True, activation='tanh', recurrent_activation='sigmoid', 
#                    input_shape=input_shape), input_shape=input_shape, merge_mode='concat'))
    model.add(LSTM(units=hidden_units, kernel_initializer='uniform', recurrent_initializer='uniform', 
                   dropout=dropout, use_bias=True, unit_forget_bias=True, activation='tanh', recurrent_activation='sigmoid', 
                   input_shape=input_shape))

    model.add(Dense(16))
    model.add(Activation('linear'))

    model.add(Dense(nb_classes))
    model.add(Activation('sigmoid'))

    model.summary()

    model.compile(loss='binary_crossentropy', metrics=['binary_accuracy'], optimizer='adam')

    print("Train...")
    history = model.fit(X_train[train], Y_train[train], batch_size=batch_size, epochs=nb_epochs, verbose=1,
                        callbacks=[early_stopping], validation_split=0.15)

    Y_pred = model.predict_proba(X_train[test])
    acc = accuracy_score(Y_train[test], np.round(Y_pred))
    print('Accuracy is', acc)
    AUC = roc_auc_score(Y_train[test], Y_pred)
    print('AUC is', AUC)
#     f1 = f1_score(Y_train[test], np.round(Y_pred))
#     print('F1-score is', f1)
    
    acc_list.append(acc)
    AUC_list.append(AUC)
#     f1_list.append(f1)

print(np.mean(AUC_list))
print(np.mean(acc_list))

Build model...
Train indices: [    0     1     3 ... 15047 15048 15049]
Test indices: [    2    12    21 ... 15026 15029 15039]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 50)                10800     
_________________________________________________________________
dense_9 (Dense)              (None, 16)                816       
_________________________________________________________________
activation_9 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 17        
_________________________________________________________________
activation_10 (Activation)   (None, 1)                 0         
Total params: 11,633
Trainable params: 11,633
Non-trainable params: 0
_________________________________________________________________
Train...
T

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 50)                10800     
_________________________________________________________________
dense_17 (Dense)             (None, 16)                816       
_________________________________________________________________
activation_17 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 17        
_________________________________________________________________
activation_18 (Activation)   (None, 1)                 0         
Total params: 11,633
Trainable params: 11,633
Non-trainable params: 0
_________________________________________________________________
Train...
Train on 11513 samples, validate on 2032 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 00004: early stopping
Accuracy

Train on 11513 samples, validate on 2032 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 00004: early stopping
Accuracy is 0.5754152823920266
AUC is 0.5441671087533156
Train indices: [    0     1     2 ... 15047 15048 15049]
Test indices: [   18    44    49 ... 15014 15018 15041]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_14 (LSTM)               (None, 50)                10800     
_________________________________________________________________
dense_27 (Dense)             (None, 16)                816       
_________________________________________________________________
activation_27 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 17        
_________________________________________________________________
activation_28 (Activation)   (None, 1)           

In [8]:
AUC_list

[0.7604895104895104,
 0.5900360144057623,
 0.7279761904761904,
 0.5932400932400932,
 0.659698025551684,
 0.7179962894248608,
 0.756578947368421,
 0.5884498480243161,
 0.5930993456276026,
 0.5849849849849851]