In [1]:
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution1D, TimeDistributed, Conv2D
from keras.layers.recurrent import LSTM, GRU
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import *
from keras.optimizers import Adam, Nadam

%matplotlib

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Using matplotlib backend: Qt5Agg


## Import Tick Data and Create 5min RTH Bars

In [2]:
# Load the processed ES tick file, keep only ticks whose 'date' is after
# 2017-07-29, and promote the 'date' column to the index (removing it from
# the columns) in a single chain.
tick_data = (
    pd.read_feather('../data/processed/ES_tick.feather')
    .loc[lambda ticks: ticks['date'] > '2017-07-29']
    .set_index('date')
)
tick_data.tail()

Unnamed: 0_level_0,last,bid,ask,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-25 19:54:52.612000-05:00,2844.0,2843.75,2844.0,1
2018-01-25 19:54:52.615000-05:00,2844.0,2843.75,2844.0,1
2018-01-25 19:54:54.157000-05:00,2844.0,2843.75,2844.0,1
2018-01-25 19:54:54.157000-05:00,2844.0,2843.75,2844.0,1
2018-01-25 19:54:55.332000-05:00,2844.0,2843.75,2844.0,2


In [3]:
#Resample to get 5min bars
# Open/high/low/close of the 'last' price per 5-minute bucket. loffset moves
# every bar label forward by 5 minutes.
# NOTE(review): presumably so a bar is stamped with the END of its interval --
# confirm. `loffset` is deprecated in newer pandas releases; verify against the
# pinned pandas version before upgrading.
five_min_data = pd.DataFrame(
    tick_data['last'].resample('5Min', loffset=datetime.timedelta(minutes=5)).ohlc())

import pandas_market_calendars as mcal
#We hack the NYSE Calendar extending the close until 4:15
class CMERTHCalendar(mcal.exchange_calendar_nyse.NYSEExchangeCalendar):
    """NYSE exchange calendar whose ``close_time`` is overridden to 16:15."""
    @property
    def close_time(self):
        """Return 16:15 as the session close time."""
        return datetime.time(16, 15)
    
#Create RTH Calendar
nyse = CMERTHCalendar()
# One schedule row per session between the first and last bar; the code below
# reads its 'market_open' and 'market_close' columns.
schedule = nyse.schedule(start_date=five_min_data.index.min(), 
                         end_date=five_min_data.index.max())

#Keep only those bars that occur during RTH (everything else is dropped)
# 'dates' holds the calendar date of each bar so it can be matched against
# schedule.index.
# NOTE(review): assumes schedule.index holds session dates comparable to these
# values -- confirm. Index.to_datetime() is not available in newer pandas.
five_min_data['dates'] = pd.to_datetime(five_min_data.index.to_datetime().date)
five_min_data['valid_date'] = five_min_data['dates'].isin(schedule.index)
five_min_data['valid_time'] = False
# A bar is kept when its date is a session date and its label is strictly after
# that session's open and at or before its close.
# NOTE(review): schedule.loc[...] is also evaluated for dates that are not in
# the schedule; how .loc treats missing labels differs across pandas versions
# -- verify on the installed version.
during_rth = five_min_data['valid_date'] & \
        (five_min_data.index > schedule.loc[five_min_data['dates'],'market_open']) & \
        (five_min_data.index <= schedule.loc[five_min_data['dates'],'market_close'])
five_min_data.loc[during_rth, 'valid_time'] = True
five_min_data = five_min_data[five_min_data['valid_time'] == True]
# The helper columns were only needed to build the mask.
five_min_data.drop(['dates','valid_date','valid_time'], axis=1, inplace=True)

#Add ema
# 20-bar exponential moving average of the close. It is computed AFTER the RTH
# filter, so it runs straight across session boundaries; min_periods=20 leaves
# it undefined until 20 bars are available.
five_min_data['ema'] = five_min_data['close'].ewm(span=20, min_periods=20).mean()

#Reset index
# The bar timestamps become an ordinary 'date' column with a 0..n-1 index.
five_min_data.reset_index(inplace=True)

# Show five rows starting at positional row 81.
five_min_data[81:].head()

Unnamed: 0,date,open,high,low,close,ema
81,2017-08-01 09:35:00-04:00,2475.5,2476.0,2472.5,2473.5,2470.908594
82,2017-08-01 09:40:00-04:00,2473.5,2474.0,2471.5,2472.5,2471.060194
83,2017-08-01 09:45:00-04:00,2472.25,2473.25,2471.75,2473.0,2471.244978
84,2017-08-01 09:50:00-04:00,2473.0,2473.25,2472.0,2472.75,2471.388343
85,2017-08-01 09:55:00-04:00,2472.75,2473.0,2471.25,2471.25,2471.375165


## Create Test / Train Datasets

In [4]:
# Skip the first 162 rows of the 5-minute frame, then pull each price column
# out as a plain Python list for the windowing loop.
data = five_min_data[162:]

openp, highp, lowp, closep, emap = (
    data[column].tolist() for column in ('open', 'high', 'low', 'close', 'ema'))

In [18]:
WINDOW = 162 #Number of bars in a trading day
EMB_SIZE = 5
STEP = 1
FORECAST = 1


def _zscore(values):
    """Standardise one window: subtract its mean and divide by its (population) std.

    NOTE(review): a perfectly flat window has std == 0 and yields NaN/inf
    features rather than raising -- confirm this cannot occur in the data.
    """
    values = np.array(values)
    return (values - np.mean(values)) / np.std(values)


# Build one sample per window start i:
#   X[k] : (WINDOW, EMB_SIZE) array of per-window z-scored open/high/low/close/ema
#   Y[k] : one-hot label comparing the window's last close with the close
#          FORECAST bars later.
X, Y = [], []

# The label bar sits at index (i + WINDOW - 1) + FORECAST, so the last usable
# start is len(closep) - WINDOW - FORECAST. Bounding the range explicitly
# replaces the former blanket `except Exception: break`, which stopped at the
# same index but also hid any genuine error (and rebound `e`, the EMA window,
# to the exception object).
last_start = len(closep) - WINDOW - FORECAST
for i in range(0, last_start + 1, STEP):
    window = slice(i, i + WINDOW)
    features = [_zscore(series[window])
                for series in (openp, highp, lowp, closep, emap)]

    last_close = closep[i + WINDOW - 1]
    next_close = closep[(i + WINDOW - 1) + FORECAST]

    # [1, 0] -> next close is at or below the last close; [0, 1] -> it is higher.
    y_i = [1, 0] if last_close >= next_close else [0, 1]
    x_i = np.column_stack(features)

    X.append(x_i)
    Y.append(y_i)

In [19]:
# Chronological train / test split at a hand-picked sample index.
# NOTE(review): the bars above start on 2017-08-01 and the tick file ends on
# 2018-01-25, so the intent is presumably train = 8/1/17 - 12/31/17 and
# test = 1/1/18 - 1/25/18 -- confirm that p lands on that boundary.
p = 8547 #Manual split for now
# alternative split point: p = 8448
X, Y = np.array(X), np.array(Y)
X_train, X_test = X[:p], X[p:]
Y_train, Y_test = Y[:p], Y[p:]

#Shuffle the training samples and their labels together (the test set keeps its original order)
def shuffle_in_unison(a, b):
    """Return copies of ``a`` and ``b`` shuffled along axis 0 with one shared permutation.

    Row ``i`` of each input is written to row ``permutation[i]`` of the
    corresponding output, so rows that were paired in the inputs stay paired
    in the outputs. The permutation is drawn from NumPy's global random state.

    Args:
        a: NumPy array (its ``shape`` and ``dtype`` are used for the output).
        b: NumPy array with the same length as ``a``.

    Returns:
        Tuple ``(shuffled_a, shuffled_b)`` of newly allocated arrays; the
        inputs are not modified.

    Raises:
        AssertionError: if ``len(a) != len(b)``.
    """
    # courtesy http://stackoverflow.com/users/190280/josh-bleecher-snyder
    assert len(a) == len(b)
    shuffled_a = np.empty(a.shape, dtype=a.dtype)
    shuffled_b = np.empty(b.shape, dtype=b.dtype)
    permutation = np.random.permutation(len(a))
    # Vectorised scatter: identical result to looping over
    # enumerate(permutation) and assigning one row at a time.
    shuffled_a[permutation] = a
    shuffled_b[permutation] = b
    return shuffled_a, shuffled_b

# Shuffle the training windows and their labels with one shared permutation.
X_train, Y_train = shuffle_in_unison(X_train, Y_train)

# Force both feature tensors to (samples, timesteps, EMB_SIZE).
# NOTE(review): each window is already column-stacked into EMB_SIZE columns,
# so this looks like a no-op kept as a shape safeguard -- confirm.
n_train, train_steps = X_train.shape[:2]
n_test, test_steps = X_test.shape[:2]
X_train = X_train.reshape((n_train, train_steps, EMB_SIZE))
X_test = X_test.reshape((n_test, test_steps, EMB_SIZE))
X_test.shape

(1215, 162, 5)

## Train CNN Model

In [192]:
# Scratch model: a single strided 7x7 Conv2D (32 filters) applied to every
# element of a variable-length sequence of 540x960x2 inputs.
model = Sequential()
per_frame_conv = Conv2D(32, (7, 7), padding='same', strides=2)
model.add(TimeDistributed(per_frame_conv, input_shape=(None, 540, 960, 2)))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_37 (TimeDis (None, None, 270, 480, 32 3168      
Total params: 3,168
Trainable params: 3,168
Non-trainable params: 0
_________________________________________________________________


In [110]:
# CNN -> LSTM variant. Every layer before the LSTM is wrapped in
# TimeDistributed, so the Conv1D / Dense stack is applied independently to each
# element of an outer sequence and the LSTM then consumes the resulting
# 64-unit vectors.
# NOTE(review): with input_shape=(None, WINDOW, EMB_SIZE) this model expects
# 4-D batches (batch, steps, WINDOW, EMB_SIZE), whereas the arrays prepared in
# the split cell are reshaped to 3-D -- they would need an extra axis before
# being fed to this model.
model = Sequential()
model.add(TimeDistributed(Convolution1D(filters=16,
                                        kernel_size=4,
                                        padding='same'),
                          input_shape = (None, WINDOW, EMB_SIZE)))
model.add(TimeDistributed(BatchNormalization()))
model.add(TimeDistributed(LeakyReLU()))
model.add(TimeDistributed(Dropout(0.5)))

model.add(TimeDistributed(Convolution1D(filters=8,
                        kernel_size=4,
                        padding='same')))
model.add(TimeDistributed(BatchNormalization()))
model.add(TimeDistributed(LeakyReLU()))
model.add(TimeDistributed(Dropout(0.5)))

# Collapse each element's (length, filters) feature map to a vector.
model.add(TimeDistributed(Flatten()))

model.add(TimeDistributed(Dense(64)))
model.add(TimeDistributed(BatchNormalization()))
model.add(TimeDistributed(LeakyReLU()))

# Sequence summariser; only the final LSTM state is passed on.
model.add(LSTM(32, dropout=0.25, stateful=False))
model.add(Dropout(0.5))

# Two-way softmax matching the one-hot [1, 0] / [0, 1] labels.
model.add(Dense(2))
model.add(Activation('softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_302 (TimeDi (None, None, 81, 16)      336       
_________________________________________________________________
time_distributed_303 (TimeDi (None, None, 81, 16)      64        
_________________________________________________________________
time_distributed_304 (TimeDi (None, None, 81, 16)      0         
_________________________________________________________________
time_distributed_305 (TimeDi (None, None, 81, 16)      0         
_________________________________________________________________
time_distributed_306 (TimeDi (None, None, 81, 8)       520       
_________________________________________________________________
time_distributed_307 (TimeDi (None, None, 81, 8)       32        
_________________________________________________________________
time_distributed_308 (TimeDi (None, None, 81, 8)       0         
__________

In [20]:
# 1-D CNN classifier over a single (WINDOW, EMB_SIZE) window.
model = Sequential()

# Two convolutional stages (16 then 8 filters), each
# Conv1D -> BatchNorm -> LeakyReLU -> Dropout(0.5). Only the first layer
# declares the input shape.
for n_filters, first_layer_kwargs in ((16, {'input_shape': (WINDOW, EMB_SIZE)}),
                                      (8, {})):
    model.add(Convolution1D(filters=n_filters,
                            kernel_size=4,
                            padding='same',
                            **first_layer_kwargs))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(Dropout(0.5))

model.add(Flatten())

# Fully connected head.
model.add(Dense(32))
model.add(BatchNormalization())
model.add(LeakyReLU())

# Two-way softmax matching the one-hot [1, 0] / [0, 1] labels.
model.add(Dense(2))
model.add(Activation('softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_7 (Conv1D)            (None, 162, 16)           336       
_________________________________________________________________
batch_normalization_10 (Batc (None, 162, 16)           64        
_________________________________________________________________
leaky_re_lu_10 (LeakyReLU)   (None, 162, 16)           0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 162, 16)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 162, 8)            520       
_________________________________________________________________
batch_normalization_11 (Batc (None, 162, 8)            32        
_________________________________________________________________
leaky_re_lu_11 (LeakyReLU)   (None, 162, 8)            0         
__________

In [21]:
# Compile with Nadam and categorical cross-entropy, tracking accuracy.
opt = Nadam(lr=0.001)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# Callbacks:
#  * reduce_lr    - multiplies the learning rate by 0.9 after 30 epochs without
#                   improvement in val_acc (floor 1e-6).
#  * checkpointer - writes model.hdf5 whenever the monitored value improves
#                   (the training log reports val_loss).
reduce_lr = ReduceLROnPlateau(monitor='val_acc',
                              factor=0.9,
                              patience=30,
                              min_lr=0.000001,
                              verbose=1)
checkpointer = ModelCheckpoint(filepath="model.hdf5",
                               verbose=1,
                               save_best_only=True)

# NOTE(review): X_test / Y_test double as the validation set here, so the
# checkpoint that is later evaluated on X_test was also selected on X_test.
fit_options = dict(epochs=100,
                   batch_size=128,
                   verbose=1,
                   validation_data=(X_test, Y_test),
                   callbacks=[reduce_lr, checkpointer],
                   shuffle='batch')
history = model.fit(X_train, Y_train, **fit_options)

Train on 8547 samples, validate on 1215 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.70099, saving model to model.hdf5
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.70099
Epoch 3/100

Epoch 00003: val_loss improved from 0.70099 to 0.69696, saving model to model.hdf5
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.69696
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.69696
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.69696
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.69696
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.69696
Epoch 9/100

Epoch 00009: val_loss did not improve from 0.69696
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.69696
Epoch 11/100

Epoch 00011: val_loss did not improve from 0.69696
Epoch 12/100

Epoch 00012: val_loss did not improve from 0.69696
Epoch 13/100

Epoch 00013: val_loss did not improve from 0.69696
Epoch 14/100

Epoch 00014: val_loss did not improve from


Epoch 00043: val_loss did not improve from 0.69696
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.69696
Epoch 45/100

Epoch 00045: val_loss did not improve from 0.69696
Epoch 46/100

Epoch 00046: val_loss did not improve from 0.69696
Epoch 47/100

Epoch 00047: val_loss did not improve from 0.69696
Epoch 48/100

Epoch 00048: val_loss did not improve from 0.69696
Epoch 49/100

Epoch 00049: val_loss did not improve from 0.69696
Epoch 50/100

Epoch 00050: val_loss did not improve from 0.69696
Epoch 51/100

Epoch 00051: val_loss did not improve from 0.69696
Epoch 52/100

Epoch 00052: val_loss did not improve from 0.69696
Epoch 53/100

Epoch 00053: val_loss did not improve from 0.69696
Epoch 54/100

Epoch 00054: val_loss did not improve from 0.69696
Epoch 55/100

Epoch 00055: val_loss did not improve from 0.69696
Epoch 56/100

Epoch 00056: val_loss did not improve from 0.69696
Epoch 57/100

Epoch 00057: val_loss did not improve from 0.69696
Epoch 58/100

Epoch 00058: val_loss di


Epoch 00087: val_loss did not improve from 0.69696
Epoch 88/100

Epoch 00088: val_loss did not improve from 0.69696
Epoch 89/100

Epoch 00089: val_loss did not improve from 0.69696
Epoch 90/100

Epoch 00090: val_loss did not improve from 0.69696
Epoch 91/100

Epoch 00091: val_loss did not improve from 0.69696
Epoch 92/100

Epoch 00092: val_loss did not improve from 0.69696
Epoch 93/100

Epoch 00093: val_loss did not improve from 0.69696
Epoch 94/100

Epoch 00094: val_loss did not improve from 0.69696
Epoch 95/100

Epoch 00095: val_loss did not improve from 0.69696
Epoch 96/100

Epoch 00096: ReduceLROnPlateau reducing learning rate to 0.0009000000427477062.

Epoch 00096: val_loss did not improve from 0.69696
Epoch 97/100

Epoch 00097: val_loss did not improve from 0.69696
Epoch 98/100

Epoch 00098: val_loss did not improve from 0.69696
Epoch 99/100

Epoch 00099: val_loss did not improve from 0.69696
Epoch 100/100

Epoch 00100: val_loss did not improve from 0.69696


In [22]:
# One figure per tracked metric, each showing the training curve followed by
# the validation curve (labelled 'test' in the legend).
curves = (
    ('loss', 'val_loss', 'model loss', 'loss'),
    ('acc', 'val_acc', 'model accuracy', 'accuracy'),
)
for train_key, val_key, heading, y_label in curves:
    plt.figure()
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(heading)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='best')
    plt.show()

In [85]:
from sklearn.metrics import confusion_matrix

# Restore the weights written by the checkpoint callback and score the test windows.
model.load_weights("model.hdf5")
pred = model.predict(np.array(X_test), batch_size=128)

# Rows = true class, columns = predicted class (argmax of the one-hot label /
# of the softmax output).
C = confusion_matrix(np.argmax(Y_test, axis=1), np.argmax(pred, axis=1))

# Row-normalise so each row sums to 1. The builtin float replaces np.float,
# an alias that has been removed from NumPy and raises AttributeError on
# current releases.
print(C / C.astype(float).sum(axis=1)[:, None])

[[0.98774885 0.01225115]
 [0.98405104 0.01594896]]


In [20]:
# Inspect the two class probabilities predicted for the first test window.
pred[0]

array([0.51347464, 0.48652527], dtype=float32)

In [62]:
# Raw (un-normalised) confusion-matrix counts: rows = true class, columns = predicted class.
C

array([[562,  97],
       [516, 121]])

In [45]:
# Row-normalised confusion matrix (each row sums to 1). The builtin float
# replaces np.float, an alias that has been removed from NumPy.
C / C.astype(float).sum(axis=1)[:, None]

array([[0.83673469, 0.15855573, 0.00470958],
       [0.78915663, 0.20481928, 0.0060241 ],
       [0.86335404, 0.13043478, 0.00621118]])

In [69]:
# Empirical class frequencies of the training labels: per-class count of the
# one-hot rows divided by the number of training samples.
class_counts = Y_train.sum(axis=0)
probs = class_counts / Y_train.shape[0]
probs

array([0.55539956, 0.44460044])

In [68]:
# Display the predicted class probabilities for the test windows (NumPy elides the middle rows).
pred

array([[0.4436597 , 0.5563404 ],
       [0.5040323 , 0.49596766],
       [0.5359964 , 0.4640036 ],
       ...,
       [0.57311577, 0.4268842 ],
       [0.5362818 , 0.46371824],
       [0.5139088 , 0.48609126]], dtype=float32)

In [120]:
# Random baseline: predict class 1 with the training-set frequency of class 1,
# independently for every test sample. A locally seeded generator makes the
# baseline numbers below reproducible and leaves the global NumPy random state
# untouched.
baseline_rng = np.random.RandomState(0)
s = baseline_rng.binomial(1, probs[1], pred.shape[0])
s

array([1, 0, 1, ..., 0, 0, 1])

In [121]:
# Confusion matrix of the random baseline `s` against the true test classes,
# row-normalised so each row sums to 1. The builtin float replaces np.float,
# an alias that has been removed from NumPy.
C1 = confusion_matrix(np.argmax(Y_test, axis=1), s)
print(C1 / C1.astype(float).sum(axis=1)[:, None])

[[0.58421851 0.41578149]
 [0.59026688 0.40973312]]


In [122]:
# Accuracy of the random baseline: share of test samples whose true class
# (argmax of the one-hot label) equals the random draw.
baseline_hits = np.argmax(Y_test, axis=1) == s
baseline_hits.sum() / pred.shape[0]

0.4984567901234568

<keras.models.Sequential at 0x7f61cf415860>