In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras as tfk

In [2]:
# Get limit order book data 
orderbook = pd.read_csv('/Users/tanvipotdar/Projects/LOBster/data_tqap/INTC_2015-01-01_2015-01-31_10/INTC_2015-01-02_34200000_57600000_orderbook_10.csv')

In [3]:
# Normalise the data
from scipy.stats import zscore
normalised_data = orderbook.apply(zscore)
normalised_data.head(10)

Unnamed: 0,ask_price_1,ask_size_1,bid_price_1,bid_size_1,ask_price_2,ask_size_2,bid_price_2,bid_size_2,ask_price_3,ask_size_3,...,bid_price_8,bid_size_8,ask_price_9,ask_size_9,bid_price_9,bid_size_9,ask_price_10,ask_size_10,bid_price_10,bid_size_10
0,0.796834,-0.070811,0.114857,-1.063739,0.831288,-1.895349,0.011343,-1.984987,0.934676,-1.788479,...,-0.816904,-0.872724,2.450812,-0.888307,-0.885917,-1.164202,2.58852,-0.313841,-1.092996,-1.491542
1,0.796834,-0.070811,0.114857,-1.063739,0.831288,-1.895349,0.011343,-1.984987,0.934676,-1.788479,...,-0.816904,-0.872724,2.450812,-0.888307,-0.885917,-1.164202,2.58852,-0.313841,-1.092996,-1.491542
2,0.796834,-0.070811,0.114857,-1.154853,0.831288,-1.895349,0.011343,-1.984987,0.934676,-1.788479,...,-0.816904,-0.872724,2.450812,-0.888307,-0.885917,-1.164202,2.58852,-0.313841,-1.092996,-1.491542
3,0.796834,-0.070811,0.114857,-1.154853,0.831288,-1.895349,0.011343,-1.984987,0.934676,-1.788479,...,-0.437242,-0.872724,2.450812,-0.888307,-0.782372,-1.164202,2.58852,-0.313841,-0.851389,-1.491542
4,0.796834,-0.070811,0.114857,-1.154853,0.831288,-1.895349,0.011343,-1.984987,0.934676,-1.788479,...,-0.437242,-0.872724,2.450812,-0.888307,-0.782372,-1.164202,2.58852,-0.313841,-0.851389,-1.491542
5,0.796834,-0.070811,0.114857,-1.154853,0.831288,-1.895349,0.011343,-1.984987,0.934676,-1.788479,...,-0.437242,-0.872724,2.450812,-0.888307,-0.782372,-1.164202,2.58852,-0.313841,-0.851389,-1.491542
6,0.796834,-0.070811,0.114857,-1.154853,0.831288,-1.895349,0.011343,-1.984987,0.934676,-1.788479,...,-0.437242,-0.872724,2.450812,-0.888307,-0.782372,-1.164202,2.58852,-0.313841,-0.851389,-1.491542
7,0.796834,-0.070811,0.114857,-1.154853,0.831288,-1.895349,0.011343,-1.984987,0.934676,-1.788479,...,-0.437242,-0.872724,2.450812,-0.888307,-0.782372,-1.164202,2.58852,-0.313841,-0.851389,-1.491542
8,0.796834,-0.070811,0.632554,-1.154853,0.831288,-1.895349,0.149397,-2.053207,0.934676,-1.788479,...,-0.402727,-0.894391,2.450812,-0.888307,-0.40271,-1.164202,2.58852,-0.313841,-0.747843,-1.491542
9,0.796834,-0.070811,0.632554,-1.124482,0.831288,-1.895349,0.149397,-2.053207,0.934676,-1.788479,...,-0.402727,-0.894391,2.450812,-0.888307,-0.40271,-1.164202,2.58852,-0.313841,-0.747843,-1.491542


In [4]:
# calculate the midprice
normalised_data['midprice'] = (normalised_data.ask_price_1+normalised_data.bid_price_1)/2

In [5]:
# smoothed labelling of the midprice/ k is the prediction horizon
k = 10
# mean of previous k mid-prices
normalised_data['m_minus'] = normalised_data['midprice'].rolling(window=k).mean()
# mean of next k mid-prices
normalised_data['m_plus'] = normalised_data['midprice'][::-1].rolling(window=k).mean()[::-1]

In [6]:
# label the smoothed mid-prices based on a threshold/ alpha is the threshold 
alpha = 0.0001
normalised_data['change'] = (normalised_data.m_plus - normalised_data.m_minus)/normalised_data.m_minus
# assign categories up, down, stationary
normalised_data['label'] = pd.cut(normalised_data.change, bins=[-np.inf, -alpha, alpha, np.inf], 
                                  labels=["down", "stationary", "up"])
# drop all unlabelled values (will be first and last k values as they have no m_minus/m_plus value)
normalised_data.dropna(inplace=True)
normalised_data.head()

Unnamed: 0,ask_price_1,ask_size_1,bid_price_1,bid_size_1,ask_price_2,ask_size_2,bid_price_2,bid_size_2,ask_price_3,ask_size_3,...,bid_size_9,ask_price_10,ask_size_10,bid_price_10,bid_size_10,midprice,m_minus,m_plus,change,label
9,0.796834,-0.070811,0.632554,-1.124482,0.831288,-1.895349,0.149397,-2.053207,0.934676,-1.788479,...,-1.164202,2.58852,-0.313841,-0.747843,-1.491542,0.714694,0.507615,0.714694,0.407945,up
10,0.796834,-0.070811,0.632554,-1.124482,0.831288,-1.895349,0.149397,-2.053207,0.934676,-1.788479,...,-1.164202,2.58852,-0.313841,-0.747843,-1.491542,0.714694,0.5335,0.714694,0.339633,up
11,0.796834,-0.070811,0.632554,-1.124482,0.831288,-1.895349,0.149397,-2.053207,0.934676,-1.788479,...,-1.164202,2.58852,-0.313841,-0.747843,-1.491542,0.714694,0.559385,0.714694,0.277643,up
12,0.796834,-0.070811,0.632554,-1.09411,0.831288,-1.895349,0.149397,-2.053207,0.934676,-1.788479,...,-1.164202,2.58852,-0.313841,-0.747843,-1.491542,0.714694,0.58527,0.714694,0.221136,up
13,0.796834,-0.070811,0.632554,-1.09411,0.831288,-1.895349,0.149397,-2.053207,0.934676,-1.788479,...,-1.197339,2.58852,-0.313841,-0.368176,-1.491542,0.714694,0.611155,0.714694,0.169416,up


In [7]:
# get input data in correct shape
input_data = normalised_data[:100]
output_data = normalised_data[:100]
cols=input_data.columns.to_list()[:40]
input_data = input_data[cols]
input_array = input_data.to_numpy().reshape(1,100,10,4)
output_data = output_data[['label']]
output_array = output_data.to_numpy().reshape(1,100,1,1)

In [68]:
# convolutional layer
model = tfk.Sequential()
model.add(tfk.layers.Conv2D(filters=16, kernel_size=(1,2), input_shape=(100,10,4), strides=(1, 2)))
model.add(tfk.layers.LeakyReLU(alpha=0.01))

model.add(tfk.layers.Conv2D(filters=16, kernel_size=(1,2), strides=(1, 2)))
model.add(tfk.layers.LeakyReLU(alpha=0.01))

model.add(tfk.layers.Conv2D(filters=16, kernel_size=(1,2), input_shape=(100,10,1)))
model.add(tfk.layers.Dense(1, activation='relu'))
model.compile(optimizer='adam', loss='mse')
# model.fit(input_data, output_data, epochs=1000, verbose=1)
model.summary()

Instructions for updating:
Use tf.cast instead.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_153 (Conv2D)          (None, 100, 5, 16)        144       
_________________________________________________________________
leaky_re_lu_52 (LeakyReLU)   (None, 100, 5, 16)        0         
_________________________________________________________________
conv2d_154 (Conv2D)          (None, 100, 2, 16)        528       
_________________________________________________________________
leaky_re_lu_53 (LeakyReLU)   (None, 100, 2, 16)        0         
_________________________________________________________________
conv2d_155 (Conv2D)          (None, 100, 1, 16)        528       
_________________________________________________________________
dense_4 (Dense)              (None, 100, 1, 1)         17        
Total params: 1,217
Trainable params: 1,217
Non-trainable params: 0
__________________________

In [29]:
# convolutional layer
model = tfk.Sequential()
model.add(tfk.layers.Conv1D(filters=16, kernel_size=(2), input_shape=(100,40), strides=(2)))
model.add(tfk.layers.Conv1D(filters=16, kernel_size=(2), strides=(2)))
# model.add(tfk.layers.Conv2D(filters=16, kernel_size=(1,10)))
# model.compile(optimizer='adam', loss='mse')
# model.fit(input_data, output_data, epochs=1000, verbose=1)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_7 (Conv1D)            (None, 50, 16)            1296      
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 25, 16)            528       
Total params: 1,824
Trainable params: 1,824
Non-trainable params: 0
_________________________________________________________________
