## Replicate - DeepLOB: Deep Convolutional Neural Networks for Limit Order Books
(Paper authors: Zihao Zhang, Stefan Zohren, Stephen Roberts)

Dataset source: https://drive.google.com/drive/folders/1Xen3aRid9ZZhFqJRgEMyETNazk02cNmv?usp=sharing

In [2]:
# import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from keras.utils import np_utils
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, Conv2D, LeakyReLU, MaxPooling2D, concatenate, LSTM, Reshape, Dense
from keras.callbacks import EarlyStopping

import pandas_market_calendars as mcal

#### Data Preprocessing

In [4]:
nyse = mcal.get_calendar('NYSE')
dates = list(nyse.schedule(start_date='2020-01-01', end_date='2020-01-09').index)

In [5]:
dates_str_list = []
for trading_day in dates:
    dates_str_list.append(str(trading_day.date()))

In [6]:
daily_data_dict= {}

In [7]:
for i in range(len(dates_str_list)):
    date = dates_str_list[i]
    if date not in daily_data_dict.keys():
        date = dates_str_list[i]
        daily_data_dict[date] = np.array(pd.read_csv('./data/JNJ_orderbook/JNJ_' + date + '_34200000_57600000_orderbook_10.csv',header = None))

In [8]:
normalization_mean_dict = {}
normalization_stddev_dict = {}

In [9]:
for i in range(5,len(dates_str_list)):
    date = dates_str_list[i]
    
    if (date not in normalization_mean_dict.keys()) or (date not in normalization_stddev_dict.keys()):
        look_back_dates_list = dates_str_list[(i-5):i]
        prev_5_day_orderbook_np = None
        for look_back_date in look_back_dates_list:
            if prev_5_day_orderbook_np is None:
                prev_5_day_orderbook_np = daily_data_dict[look_back_date]
            else:
                prev_5_day_orderbook_np = np.vstack((prev_5_day_orderbook_np, daily_data_dict[look_back_date]))
                
        
        price_mean = prev_5_day_orderbook_np[:,range(0,prev_5_day_orderbook_np.shape[1],2)].mean()
        price_std = prev_5_day_orderbook_np[:,range(0,prev_5_day_orderbook_np.shape[1],2)].std()
        size_mean = prev_5_day_orderbook_np[:,range(1,prev_5_day_orderbook_np.shape[1],2)].mean()
        size_std = prev_5_day_orderbook_np[:,range(1,prev_5_day_orderbook_np.shape[1],2)].std()
        
        normalization_mean_dict[date] = np.repeat([[price_mean,size_mean]], 20, axis=0).flatten()
        normalization_stddev_dict[date] = np.repeat([[price_std,size_std]], 20, axis=0).flatten()

In [10]:
daily_norm_data_dict = {}

In [11]:
for i in range(5,len(dates_str_list)):
    date = dates_str_list[i]
    if date not in daily_norm_data_dict.keys():
        daily_norm_data_dict[date] = (daily_data_dict[date] - normalization_mean_dict[date])/ normalization_stddev_dict[date]

In [12]:
daily_norm_data_dict['2020-01-09'].shape

(159941, 40)

In [13]:
def moving_average(x, k):
    return np.convolve(x, np.ones(k), 'valid') / k

In [14]:
def generate_labels(k, alpha, daily_data_dict):
    daily_label_dict = {}
    for date in list(daily_data_dict.keys())[5:]:
        price_ask = daily_data_dict[date][:,0]
        size_ask = daily_data_dict[date][:,1]
        price_bid = daily_data_dict[date][:,2]
        size_bid = daily_data_dict[date][:,3]
        mid_price = (price_ask * size_bid + price_bid * size_ask) / (size_ask + size_bid)
        future_k_avg_mid_price = moving_average(mid_price, k)[1:]
        change_pct = (future_k_avg_mid_price - mid_price[:-k])/mid_price[:-k]
        y_label = (-(change_pct < -alpha).astype(int))  + (change_pct > alpha).astype(int)
        
        daily_label_dict[date] = y_label.reshape(-1,1)
    return daily_label_dict

In [15]:
k=26
date = '2020-01-09'
price_ask = daily_data_dict[date][:,0]
size_ask = daily_data_dict[date][:,1]
price_bid = daily_data_dict[date][:,2]
size_bid = daily_data_dict[date][:,3]
mid_price = (price_ask * size_bid + price_bid * size_ask) / (size_ask + size_bid)
future_k_avg_mid_price = moving_average(mid_price, k)[1:]
change_pct = (future_k_avg_mid_price - mid_price[:-k])/mid_price[:-k]

In [68]:
np.percentile(change_pct,33.33)

-1.970212233871668e-05

In [69]:
np.percentile(change_pct,66.67)

1.8990615131648153e-05

In [16]:
def generate_X_y(k, alpha, timestamp_per_sample, daily_norm_data_dict, daily_data_dict):
    #k is the number of future timesteps used to generate the label y
    data_x = None
    for date in daily_norm_data_dict.keys():
        if data_x is None:
            data_x = daily_norm_data_dict[date].copy()[:-k,:]
        else:
            data_x = np.vstack((data_x, daily_norm_data_dict[date][:-k,:]))
    print(data_x.shape)
    
    daily_label_dict = generate_labels(k, alpha, daily_data_dict)
    data_y = None
    for date in daily_label_dict.keys():
        if data_y is None:
            data_y = daily_label_dict[date].copy()
        else:
            data_y = np.vstack((data_y, daily_label_dict[date]))
            
    [N, P_x] = data_x.shape
#     P_y = data_y.shape[1]
    
    x = np.zeros([(N-timestamp_per_sample+1), timestamp_per_sample, P_x])
    
    for i in range(N-timestamp_per_sample+1):
        x[i] = data_x[i:(i+timestamp_per_sample), :]
        
    x = x.reshape(x.shape + (1,))
    y = data_y[(timestamp_per_sample-1):]
    y = np_utils.to_categorical(y, 3)
    
    return x, y

In [22]:
# k = 8
# alpha = 7e-6
# price_ask = daily_data_dict['2020-01-09'][:,0]
# size_ask = daily_data_dict['2020-01-09'][:,1]
# price_bid = daily_data_dict['2020-01-09'][:,2]
# size_bid = daily_data_dict['2020-01-09'][:,3]
# mid_price = (price_ask * size_bid + price_bid * size_ask) / (size_ask + size_bid)
# future_k_avg_mid_price = moving_average(mid_price, k)[1:]
# change_pct = (future_k_avg_mid_price - mid_price[:-k])/mid_price[:-k]
# y_label = (-(change_pct < -alpha).astype(int))  + (change_pct > alpha).astype(int) 

In [23]:
# np.percentile(change_pct,33.33)

In [24]:
# np.percentile(change_pct,66.67)

In [25]:
X,y = generate_X_y(k=8, alpha=7e-6, timestamp_per_sample=100,
                   daily_norm_data_dict= daily_norm_data_dict, 
                   daily_data_dict = daily_data_dict)

(159933, 40)


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

In [27]:
X_train.shape

(127867, 100, 40, 1)

In [29]:
y_train.shape

(127867, 3)

In [30]:
X_test.shape

(31967, 100, 40, 1)

In [31]:
y_test.shape

(31967, 3)

In [72]:
# del X
# del y
# del X2
# del y2

In [17]:
X2,y2 = generate_X_y(k=80, alpha=3.63e-5, timestamp_per_sample=100,
                     daily_norm_data_dict= daily_norm_data_dict, 
                   daily_data_dict = daily_data_dict)

(159861, 40)


In [18]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2,y2,test_size = 0.2)

In [19]:
X3,y3 = generate_X_y(k=26, alpha=1.93e-5, timestamp_per_sample=100,
                     daily_norm_data_dict= daily_norm_data_dict, 
                   daily_data_dict = daily_data_dict)

(159915, 40)


In [20]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3,y3,test_size = 0.2)

#### Hypter Parameters

In [22]:
lookback_timestep = 100
feature_num = 40

#Conv param
conv_filter_num = 16

#Inception module param
inception_num = 32

#LSTM param
LSTM_num = 64

#Activation param
leaky_relu_alpha = 0.01


#### Training Hyper Params

In [23]:
# categorical crossentropy loss
loss = 'categorical_crossentropy'

# ADAM is used
learning_rate = 0.01
adam_epsilon = 1
optimizer = Adam(lr=learning_rate, epsilon=1)

# accuracy is used for stopping training
metrics = ['accuracy']

#max epoch num is not specified in paper, use 120 because paper mentions training stops at about 100 epochs
num_epoch = 10000
#stop training when validation accuracy does not improve for 20 epochs
stop_epoch_num = 20
#mini-batch size 32 from paper
batch_size = 32


Inception model

From paper: In our case, we split the input into a small set of lowerdimensional representations by using 1 × 1 convolutions, transform the representations by a set of filters, here 3 × 1 and 5 × 1, and then merge the outputs. A max-pooling layer is used inside the Inception Module, with stride 1 and zero padding. “Inception@32” represents one module and indicates all convolutional layers have 32 filters in this module, and the approach is depicted schematically in Figure 4

#### Model

In [24]:

def initiate_DeepLOB_model(lookback_timestep, feature_num, conv_filter_num, inception_num, LSTM_num, leaky_relu_alpha,
                          loss, optimizer, metrics):
    
    input_tensor = Input(shape=(lookback_timestep, feature_num, 1))
    
    # Conv block1
    print(input_tensor.shape)
    conv_layer1 = Conv2D(conv_filter_num, (1,2), strides=(1, 2))(input_tensor)
    print(conv_layer1.shape)
    conv_layer1 =LeakyReLU(alpha=leaky_relu_alpha)(conv_layer1)
    print(conv_layer1.shape)
    conv_layer1 = Conv2D(conv_filter_num, (4,1), padding='same')(conv_layer1)
    conv_first1 = LeakyReLU(alpha=leaky_relu_alpha)(conv_layer1)
    print(conv_layer1.shape)
    conv_layer1 = Conv2D(conv_filter_num, (4,1), padding='same')(conv_layer1)
    conv_layer1 = LeakyReLU(alpha=leaky_relu_alpha)(conv_layer1)
    print(conv_layer1.shape)

    # Conv block2
    conv_layer2 = Conv2D(conv_filter_num, (1,2), strides=(1, 2))(conv_layer1)
    conv_layer2 = LeakyReLU(alpha=leaky_relu_alpha)(conv_layer2)
    print(conv_layer2.shape)
    conv_layer2 = Conv2D(conv_filter_num, (4,1), padding='same')(conv_layer2)
    conv_layer2 = LeakyReLU(alpha=leaky_relu_alpha)(conv_layer2)
    print(conv_layer2.shape)
    conv_layer2 = Conv2D(conv_filter_num, (4,1), padding='same')(conv_layer2)
    conv_layer2 = LeakyReLU(alpha=leaky_relu_alpha)(conv_layer2)
    print(conv_layer2.shape)

    # Conv block3
    conv_layer3 = Conv2D(conv_filter_num, (1,10))(conv_layer2)
    conv_layer3 = LeakyReLU(alpha=leaky_relu_alpha)(conv_layer3)
    print(conv_layer3.shape)
    conv_layer3 = Conv2D(conv_filter_num, (4,1), padding='same')(conv_layer3)
    conv_layer3 = LeakyReLU(alpha=leaky_relu_alpha)(conv_layer3)
    print(conv_layer3.shape)
    conv_layer3 = Conv2D(conv_filter_num, (4,1), padding='same')(conv_layer3)
    conv_layer3 = LeakyReLU(alpha=leaky_relu_alpha)(conv_layer3)
    print(conv_layer3.shape)
    
    # Inception module
    inception_module1 = Conv2D(inception_num, (1,1), padding='same')(conv_layer3)
    inception_module1 = LeakyReLU(alpha=leaky_relu_alpha)(inception_module1)
    print(inception_module1.shape)
    inception_module1 = Conv2D(inception_num, (3,1), padding='same')(inception_module1)
    inception_module1 = LeakyReLU(alpha=leaky_relu_alpha)(inception_module1)
    print(inception_module1.shape)

    inception_module2 = Conv2D(inception_num, (1,1), padding='same')(conv_layer3)
    inception_module2 = LeakyReLU(alpha=leaky_relu_alpha)(inception_module2)
    print(inception_module2.shape)
    inception_module2 = Conv2D(inception_num, (5,1), padding='same')(inception_module2)
    inception_module2 = LeakyReLU(alpha=leaky_relu_alpha)(inception_module2)
    print(inception_module2.shape)

    inception_module3 = MaxPooling2D((3,1), strides=(1,1), padding='same')(conv_layer3)
    print(inception_module3.shape)
    inception_module3 = Conv2D(inception_num, (1,1), padding='same')(inception_module3)
    print(inception_module3.shape)
    inception_module3 = LeakyReLU(alpha=leaky_relu_alpha)(inception_module3)
    print(inception_module3.shape)
    
    inception_module_final = concatenate([inception_module1, inception_module2, inception_module3], axis=3)
    print(inception_module_final.shape)
    inception_module_final = Reshape((inception_module_final.shape[1], inception_module_final.shape[3]))(inception_module_final)
    print(inception_module_final.shape)

    # LSTM
    LSTM_output = LSTM(LSTM_num)(inception_module_final)
    print(LSTM_output.shape)

    # Fully Connected Layer with softmax activation function for output
    model_output = Dense(3, activation='softmax')(LSTM_output)
    print(model_output.shape)
    
    DeepLOB_model = Model(inputs=input_tensor, outputs= model_output)  
    es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1)
    
    DeepLOB_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    return DeepLOB_model


In [35]:
DeepLOB_model = initiate_DeepLOB_model(lookback_timestep, feature_num, conv_filter_num, inception_num, LSTM_num, leaky_relu_alpha,
                          loss, optimizer, metrics)

(None, 100, 40, 1)
(None, 100, 20, 16)
(None, 100, 20, 16)
(None, 100, 20, 16)
(None, 100, 20, 16)
(None, 100, 10, 16)
(None, 100, 10, 16)
(None, 100, 10, 16)
(None, 100, 1, 16)
(None, 100, 1, 16)
(None, 100, 1, 16)
(None, 100, 1, 32)
(None, 100, 1, 32)
(None, 100, 1, 32)
(None, 100, 1, 32)
(None, 100, 1, 16)
(None, 100, 1, 32)
(None, 100, 1, 32)
(None, 100, 1, 96)
(None, 100, 96)
(None, 64)
(None, 3)


In [36]:
DeepLOB_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100, 40, 1)] 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 100, 20, 16)  48          input_1[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu (LeakyReLU)         (None, 100, 20, 16)  0           conv2d[0][0]                     
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 100, 20, 16)  1040        leaky_re_lu[0][0]                
______________________________________________________________________________________________

#### Model Training

In [44]:
es = EarlyStopping(monitor='val_accuracy', mode='max', patience = stop_epoch_num, verbose=1)
DeepLOB_model_2.fit(X_train, y_train, epochs=10000, batch_size=batch_size, verbose=2, validation_data=(X_test, y_test), callbacks = [es])

Epoch 1/10000
3996/3996 - 315s - loss: 1.0985 - accuracy: 0.3387 - val_loss: 1.0984 - val_accuracy: 0.3381
Epoch 2/10000
3996/3996 - 352s - loss: 1.0980 - accuracy: 0.3441 - val_loss: 1.0973 - val_accuracy: 0.3445
Epoch 3/10000
3996/3996 - 311s - loss: 1.0945 - accuracy: 0.3667 - val_loss: 1.0889 - val_accuracy: 0.3887
Epoch 4/10000
3996/3996 - 297s - loss: 1.0681 - accuracy: 0.4186 - val_loss: 1.0506 - val_accuracy: 0.4420
Epoch 5/10000
3996/3996 - 326s - loss: 1.0330 - accuracy: 0.4594 - val_loss: 1.0238 - val_accuracy: 0.4697
Epoch 6/10000
3996/3996 - 335s - loss: 1.0199 - accuracy: 0.4744 - val_loss: 1.0192 - val_accuracy: 0.4770
Epoch 7/10000
3996/3996 - 369s - loss: 1.0140 - accuracy: 0.4824 - val_loss: 1.0146 - val_accuracy: 0.4837
Epoch 8/10000
3996/3996 - 365s - loss: 1.0097 - accuracy: 0.4873 - val_loss: 1.0114 - val_accuracy: 0.4798
Epoch 9/10000
3996/3996 - 349s - loss: 1.0058 - accuracy: 0.4915 - val_loss: 1.0036 - val_accuracy: 0.4951
Epoch 10/10000
3996/3996 - 349s - los

Epoch 77/10000
3996/3996 - 332s - loss: 0.8113 - accuracy: 0.6377 - val_loss: 0.8903 - val_accuracy: 0.5929
Epoch 78/10000
3996/3996 - 326s - loss: 0.8088 - accuracy: 0.6401 - val_loss: 0.8831 - val_accuracy: 0.5955
Epoch 79/10000
3996/3996 - 324s - loss: 0.8020 - accuracy: 0.6434 - val_loss: 0.8900 - val_accuracy: 0.5936
Epoch 80/10000
3996/3996 - 326s - loss: 0.8004 - accuracy: 0.6432 - val_loss: 0.8783 - val_accuracy: 0.6022
Epoch 81/10000
3996/3996 - 328s - loss: 0.7949 - accuracy: 0.6481 - val_loss: 0.8923 - val_accuracy: 0.5955
Epoch 82/10000
3996/3996 - 335s - loss: 0.7921 - accuracy: 0.6502 - val_loss: 0.8768 - val_accuracy: 0.6055
Epoch 83/10000
3996/3996 - 329s - loss: 0.7881 - accuracy: 0.6523 - val_loss: 0.8684 - val_accuracy: 0.6067
Epoch 84/10000
3996/3996 - 327s - loss: 0.7837 - accuracy: 0.6538 - val_loss: 0.8756 - val_accuracy: 0.6045
Epoch 85/10000
3996/3996 - 328s - loss: 0.7792 - accuracy: 0.6568 - val_loss: 0.8888 - val_accuracy: 0.5974
Epoch 86/10000
3996/3996 - 3

Epoch 153/10000
3996/3996 - 302s - loss: 0.5890 - accuracy: 0.7577 - val_loss: 0.8001 - val_accuracy: 0.6663
Epoch 154/10000
3996/3996 - 305s - loss: 0.5948 - accuracy: 0.7552 - val_loss: 0.7463 - val_accuracy: 0.6948
Epoch 155/10000
3996/3996 - 300s - loss: 0.5823 - accuracy: 0.7597 - val_loss: 0.7629 - val_accuracy: 0.6862
Epoch 156/10000
3996/3996 - 311s - loss: 0.5837 - accuracy: 0.7586 - val_loss: 0.7551 - val_accuracy: 0.6874
Epoch 157/10000
3996/3996 - 303s - loss: 0.5816 - accuracy: 0.7607 - val_loss: 0.7496 - val_accuracy: 0.6889
Epoch 158/10000
3996/3996 - 301s - loss: 0.5822 - accuracy: 0.7596 - val_loss: 0.7935 - val_accuracy: 0.6719
Epoch 159/10000
3996/3996 - 301s - loss: 0.5805 - accuracy: 0.7617 - val_loss: 0.7535 - val_accuracy: 0.6895
Epoch 160/10000
3996/3996 - 304s - loss: 0.5824 - accuracy: 0.7598 - val_loss: 0.7802 - val_accuracy: 0.6774
Epoch 161/10000
3996/3996 - 302s - loss: 0.5807 - accuracy: 0.7617 - val_loss: 0.7396 - val_accuracy: 0.6966
Epoch 162/10000
399

<tensorflow.python.keras.callbacks.History at 0x2003fe51a60>

In [75]:
DeepLOB_model_3_k26 = initiate_DeepLOB_model(lookback_timestep, feature_num, conv_filter_num, inception_num, LSTM_num, leaky_relu_alpha,
                          loss, optimizer, metrics)

(None, 100, 40, 1)
(None, 100, 20, 16)
(None, 100, 20, 16)
(None, 100, 20, 16)
(None, 100, 20, 16)
(None, 100, 10, 16)
(None, 100, 10, 16)
(None, 100, 10, 16)
(None, 100, 1, 16)
(None, 100, 1, 16)
(None, 100, 1, 16)
(None, 100, 1, 32)
(None, 100, 1, 32)
(None, 100, 1, 32)
(None, 100, 1, 32)
(None, 100, 1, 16)
(None, 100, 1, 32)
(None, 100, 1, 32)
(None, 100, 1, 96)
(None, 100, 96)
(None, 64)
(None, 3)


In [28]:
es = EarlyStopping(monitor='val_accuracy', mode='max', patience = stop_epoch_num, verbose=1)
history_k26 = DeepLOB_model_3_k26.fit(X_train3, y_train3, epochs=num_epoch, batch_size=batch_size, verbose=2, validation_data=(X_test3, y_test3), callbacks = [es])

Epoch 1/120
3996/3996 - 296s - loss: 1.0978 - accuracy: 0.3454 - val_loss: 1.0965 - val_accuracy: 0.3413
Epoch 2/120
3996/3996 - 298s - loss: 1.0924 - accuracy: 0.3705 - val_loss: 1.0972 - val_accuracy: 0.3639
Epoch 3/120
3996/3996 - 302s - loss: 1.0727 - accuracy: 0.4107 - val_loss: 1.0507 - val_accuracy: 0.4428
Epoch 4/120
3996/3996 - 302s - loss: 1.0364 - accuracy: 0.4601 - val_loss: 1.0221 - val_accuracy: 0.4733
Epoch 5/120
3996/3996 - 301s - loss: 1.0136 - accuracy: 0.4812 - val_loss: 1.0104 - val_accuracy: 0.4813
Epoch 6/120
3996/3996 - 302s - loss: 1.0052 - accuracy: 0.4898 - val_loss: 1.0027 - val_accuracy: 0.4869
Epoch 7/120
3996/3996 - 305s - loss: 0.9996 - accuracy: 0.4926 - val_loss: 1.0000 - val_accuracy: 0.4907
Epoch 8/120
3996/3996 - 302s - loss: 0.9948 - accuracy: 0.4959 - val_loss: 1.0002 - val_accuracy: 0.4890
Epoch 9/120
3996/3996 - 305s - loss: 0.9902 - accuracy: 0.4990 - val_loss: 0.9917 - val_accuracy: 0.4989
Epoch 10/120
3996/3996 - 302s - loss: 0.9861 - accuracy

Epoch 79/120
3996/3996 - 318s - loss: 0.4981 - accuracy: 0.7964 - val_loss: 0.5825 - val_accuracy: 0.7582
Epoch 80/120
3996/3996 - 322s - loss: 0.4984 - accuracy: 0.7969 - val_loss: 0.5771 - val_accuracy: 0.7599
Epoch 81/120
3996/3996 - 325s - loss: 0.4904 - accuracy: 0.8009 - val_loss: 0.6058 - val_accuracy: 0.7531
Epoch 82/120
3996/3996 - 326s - loss: 0.4955 - accuracy: 0.7987 - val_loss: 0.5614 - val_accuracy: 0.7717
Epoch 83/120
3996/3996 - 325s - loss: 0.4877 - accuracy: 0.8012 - val_loss: 0.6697 - val_accuracy: 0.7244
Epoch 84/120
3996/3996 - 320s - loss: 0.4864 - accuracy: 0.8005 - val_loss: 0.5677 - val_accuracy: 0.7680
Epoch 85/120
3996/3996 - 328s - loss: 0.4782 - accuracy: 0.8041 - val_loss: 0.6067 - val_accuracy: 0.7538
Epoch 86/120
3996/3996 - 325s - loss: 0.4744 - accuracy: 0.8076 - val_loss: 0.5478 - val_accuracy: 0.7763
Epoch 87/120
3996/3996 - 326s - loss: 0.4703 - accuracy: 0.8094 - val_loss: 0.5855 - val_accuracy: 0.7600
Epoch 88/120
3996/3996 - 324s - loss: 0.4705 -