In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model as lm
import seaborn as sns

# Modelling Time Slots

Because bid and ask prices are not constant across the time periods that follow a liquidity shock, we split those periods into time slots of a configurable granularity (number of time steps per slot) and model each slot separately.

In [2]:
# Maximum number of consecutive time steps grouped into one modelling slot.
granularity = 10

In [3]:
# Half-open interval [start, end) of time steps that is split into slots:
# `start` is the first time step covered, `end` is one past the last.
start = 52
end = 101

In [4]:
# Split the half-open interval [start, end) into consecutive slots of at most
# `granularity` time steps each; the final slot is truncated at `end`.
#
# Stepping through the slot starts with range() (instead of a manual
# `while True` loop) never yields an empty trailing slot when (end - start) is
# an exact multiple of `granularity` -- an empty slot would break every later
# `ranges[i][0]` lookup -- and it cannot loop forever on a non-positive
# granularity (a zero step raises ValueError, a negative one yields no slots).
#
# Each slot is materialised as a list so that it displays, indexes and
# iterates the same way on Python 2 and Python 3.
ranges = [
    list(range(slot_start, min(slot_start + granularity, end)))
    for slot_start in range(start, end, granularity)
]

In [5]:
# Display the slots to check their boundaries.
ranges

[[52, 53, 54, 55, 56, 57, 58, 59, 60, 61],
 [62, 63, 64, 65, 66, 67, 68, 69, 70, 71],
 [72, 73, 74, 75, 76, 77, 78, 79, 80, 81],
 [82, 83, 84, 85, 86, 87, 88, 89, 90, 91],
 [92, 93, 94, 95, 96, 97, 98, 99, 100]]

In [6]:
# Load the train and test subsets from CSV; the paths are relative, so they
# resolve against the current working directory.
# NOTE(review): "OHE" in the file names presumably means one-hot encoded -- confirm.
train_table = pd.read_csv('../data/subset_train_OHE.csv')
test_table = pd.read_csv('../data/subset_test_OHE.csv')

In [7]:
# Collect the target columns 'bid<t>' and 'ask<t>' for every time step t in
# [start, end), ordered by time step.
#
# Columns are matched by their exact name rather than by prefix/suffix: a
# suffix test such as endswith('5') would also pick up 'bid15', 'bid25', ...
# as soon as `start` is lowered, while the cells below rely on exactly one bid
# and one ask column per time step (they index these lists by position and
# read targets as 'bid' + str(t) / 'ask' + str(t)). The bounds come from
# `start` / `end` so they cannot drift apart from the slot definition.
available_columns = set(train_table.columns.values)

bidPredictionColumns = [
    'bid' + str(step)
    for step in range(start, end)
    if 'bid' + str(step) in available_columns
]

askPredictionColumns = [
    'ask' + str(step)
    for step in range(start, end)
    if 'ask' + str(step) in available_columns
]

# All target columns: bids first, then asks.
predictionColumns = bidPredictionColumns + askPredictionColumns

In [8]:
# Base feature set: every training column that is not a prediction target,
# not an explicitly ignored column, and not a 'time*' column. The original
# column order of the table is kept.
columnsToIgnore = ['row_id']
featureColumns = [
    name
    for name in train_table.columns.values
    if not (
        name in predictionColumns
        or name in columnsToIgnore
        or name.startswith('time')
    )
]

In [9]:
# One feature list per slot. Slot 0 uses the base features only; every later
# slot additionally gets the bid/ask columns of all time steps that belong to
# the slots before it (appended in bid, ask order per time step).
featureColumnsTimeWise = [featureColumns]

currentFeatureColumns = list(featureColumns)
index = 0  # position of the next time step in the bid/ask target lists
for earlier_slot in ranges[:-1]:  # the last slot never feeds a later one
    for _ in earlier_slot:
        bid_column = bidPredictionColumns[index]
        currentFeatureColumns.append(bid_column)
        ask_column = askPredictionColumns[index]
        currentFeatureColumns.append(ask_column)
        index += 1
    # Snapshot the running list so later appends do not alter this slot.
    featureColumnsTimeWise.append(list(currentFeatureColumns))

In [10]:
# Per-slot fitted models; filled in by the training cell further down.
bid_models, ask_models = [], []

In [11]:
# Per-slot training inputs and the matching ask / bid target vectors.
trainX, trainY_ask, trainY_bid = [], [], []

In [13]:
# Build, for every slot, the training matrix and the ask / bid target vectors.
#
# * The feature matrix holds the slot's feature columns for all training rows.
# * The targets are the ask / bid values at the FIRST time step of the slot
#   (`ranges[i][0]`); one model per slot is fitted against that single step.
#
# The columns are pulled out of the DataFrame in one vectorised selection
# instead of copying row by row with iterrows(), which yields the same float
# arrays far faster. The result lists are re-created here so that re-running
# this cell replaces the previous matrices instead of appending duplicates
# behind them.
trainX = []
trainY_ask = []
trainY_bid = []

for i in range(len(ranges)):
    first_step = str(ranges[i][0])

    # np.array(...) copies, so the matrices are independent of train_table.
    trainX.append(np.array(train_table[featureColumnsTimeWise[i]], dtype=float))
    trainY_ask.append(np.array(train_table['ask' + first_step], dtype=float))
    trainY_bid.append(np.array(train_table['bid' + first_step], dtype=float))

In [14]:
# Sanity check: print the (rows, features) shape of each slot's training matrix.
for slot_number, _ in enumerate(ranges):
    print(trainX[slot_number].shape)

(81309, 209)
(81309, 229)
(81309, 249)
(81309, 269)
(81309, 289)


In [25]:
# Inspect the feature columns used for the second slot (index 1).
featureColumnsTimeWise[1]

['is_security_73',
 'is_security_102',
 'is_security_18',
 'p_tcount',
 'p_value',
 'trade_vwap',
 'trade_volume',
 'is_buyer',
 'is_seller',
 'transtype1_is_T',
 'transtype1_is_Q',
 'bid1',
 'ask1',
 'transtype2_is_T',
 'transtype2_is_Q',
 'bid2',
 'ask2',
 'transtype3_is_T',
 'transtype3_is_Q',
 'bid3',
 'ask3',
 'transtype4_is_T',
 'transtype4_is_Q',
 'bid4',
 'ask4',
 'transtype5_is_T',
 'transtype5_is_Q',
 'bid5',
 'ask5',
 'transtype6_is_T',
 'transtype6_is_Q',
 'bid6',
 'ask6',
 'transtype7_is_T',
 'transtype7_is_Q',
 'bid7',
 'ask7',
 'transtype8_is_T',
 'transtype8_is_Q',
 'bid8',
 'ask8',
 'transtype9_is_T',
 'transtype9_is_Q',
 'bid9',
 'ask9',
 'transtype10_is_T',
 'transtype10_is_Q',
 'bid10',
 'ask10',
 'transtype11_is_T',
 'transtype11_is_Q',
 'bid11',
 'ask11',
 'transtype12_is_T',
 'transtype12_is_Q',
 'bid12',
 'ask12',
 'transtype13_is_T',
 'transtype13_is_Q',
 'bid13',
 'ask13',
 'transtype14_is_T',
 'transtype14_is_Q',
 'bid14',
 'ask14',
 'transtype15_is_T',
 'tra

In [15]:
# Build one test feature matrix per slot, with the same column layout as the
# corresponding training matrix.
#
# The whole matrix is selected in one vectorised step instead of copying row
# by row with iterrows(). reindex(columns=...) is used for the selection so
# that any listed column missing from test_table becomes a NaN placeholder
# rather than raising KeyError; the bid/ask columns of earlier slots are
# overwritten with model predictions by the prediction cell further down.
# np.array(...) copies, so those later in-place writes never touch test_table.
testX = [
    np.array(test_table.reindex(columns=featureColumnsTimeWise[i]), dtype=float)
    for i in range(len(ranges))
]

In [22]:
# Inspect the test feature matrix of the first slot.
testX[0]

array([[  0.00000000e+00,   0.00000000e+00,   1.00000000e+00, ...,
          1.00000000e+00,   4.35000000e+02,   4.35200000e+02],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00, ...,
          1.00000000e+00,   4.31900000e+02,   4.32100000e+02],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.00000000e+00,   3.15150000e+03,   3.15350000e+03],
       ..., 
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.00000000e+00,   3.16950000e+03,   3.17050000e+03],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00, ...,
          1.00000000e+00,   4.26500000e+02,   4.26600000e+02],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.00000000e+00,   3.15500000e+03,   3.15600000e+03]])

In [23]:
# Fit one ordinary-least-squares model per slot for the ask price and one for
# the bid price.
#
# The `normalize` keyword is no longer passed: it was only ever given its
# default value (False), and it has been removed from LinearRegression in
# recent scikit-learn releases, where passing it raises TypeError.
# The model lists are re-created here so that re-running this cell replaces
# the previous models instead of appending new ones behind stale ones.
ask_models = []
bid_models = []

for i in range(len(ranges)):
    model_ask = lm.LinearRegression(fit_intercept=True, n_jobs=1)
    model_ask.fit(trainX[i], trainY_ask[i])
    ask_models.append(model_ask)

    model_bid = lm.LinearRegression(fit_intercept=True, n_jobs=1)
    model_bid.fit(trainX[i], trainY_bid[i])
    bid_models.append(model_bid)

In [24]:
# Load the prediction template; the cells below look rows up by 'row_id' and
# write the 'bid<t>' / 'ask<t>' columns into it.
prediction = pd.read_csv('../predictions/template_prediction.csv')

In [34]:
# Predict slot by slot. After each slot (except the last) its predictions are
# written into the test matrices of all later slots, in the columns that hold
# this slot's bid/ask values, so later models see predicted prices as inputs.
testY_ask = []
testY_bid = []
final_slot = len(ranges) - 1

for i in range(len(ranges)):
    testY_ask_temp = ask_models[i].predict(testX[i])
    testY_bid_temp = bid_models[i].predict(testX[i])
    testY_ask.append(testY_ask_temp)
    testY_bid.append(testY_bid_temp)

    print("predicted ", i)

    if i == final_slot:
        # Nothing comes after the last slot, so there is nothing to fill.
        continue

    # Columns for this slot start right after the current matrix's width and
    # alternate bid, ask for each time step of the slot. The single per-slot
    # prediction is repeated for every time step in the slot.
    first_new_column = testX[i].shape[1]
    for later_matrix in testX[i + 1:len(ranges)]:
        for offset in range(0, 2 * len(ranges[i]), 2):
            later_matrix[:, first_new_column + offset] = testY_bid_temp
            later_matrix[:, first_new_column + offset + 1] = testY_ask_temp

predicted  0
predicted  1
predicted  2
predicted  3
predicted  4


In [38]:
# Copy the per-slot predictions into the template: every time step of a slot
# receives that slot's single predicted bid / ask value.
#
# The template index for each row_id is looked up in a dictionary built once
# up front, instead of scanning the whole `prediction` frame with a boolean
# mask for every test row (which made this cell quadratic). As before, the
# first template row carrying a given row_id wins. A row_id that is absent
# from the template now raises KeyError (it used to surface as IndexError).
index_by_row_id = {}
for template_label, template_row_id in zip(prediction.index, prediction['row_id']):
    index_by_row_id.setdefault(template_row_id, template_label)

# `index` is the positional row number in test_table, which is also the row
# position inside every testY_* vector.
for index, row_id in enumerate(test_table['row_id']):
    index_in_pred = index_by_row_id[row_id]

    for i in range(len(ranges)):
        bid = testY_bid[i][index]
        ask = testY_ask[i][index]

        for k in ranges[i]:
            prediction.at[index_in_pred, 'bid' + str(k)] = bid
            prediction.at[index_in_pred, 'ask' + str(k)] = ask

In [39]:
# Preview the first rows of the filled-in prediction table.
prediction.head()

Unnamed: 0,row_id,bid52,ask52,bid53,ask53,bid54,ask54,bid55,ask55,bid56,...,bid96,ask96,bid97,ask97,bid98,ask98,bid99,ask99,bid100,ask100
254689,254690,435.025307,435.16143,435.025307,435.16143,435.025307,435.16143,435.025307,435.16143,435.025307,...,434.979941,435.135814,434.979941,435.135814,434.979941,435.135814,434.979941,435.135814,434.979941,435.135814
500810,500811,431.927565,432.051395,431.927565,432.051395,431.927565,432.051395,431.927565,432.051395,431.927565,...,431.886699,432.057831,431.886699,432.057831,431.886699,432.057831,431.886699,432.057831,431.886699,432.057831
196078,196079,3151.550008,3153.438208,3151.550008,3153.438208,3151.550008,3153.438208,3151.550008,3153.438208,3151.550008,...,3151.572784,3153.450002,3151.572784,3153.450002,3151.572784,3153.450002,3151.572784,3153.450002,3151.572784,3153.450002
226436,226437,935.182805,935.531167,935.182805,935.531167,935.182805,935.531167,935.182805,935.531167,935.182805,...,935.137861,935.57722,935.137861,935.57722,935.137861,935.57722,935.137861,935.57722,935.137861,935.57722
339574,339575,947.211576,948.002552,947.211576,948.002552,947.211576,948.002552,947.211576,948.002552,947.211576,...,947.221189,948.008883,947.221189,948.008883,947.221189,948.008883,947.221189,948.008883,947.221189,948.008883
