In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn import linear_model as lm
from matplotlib import pyplot as plt
import seaborn as sns

# Model Description

Simply use all the predictors available and generate one ask-bid value (just after the liquidity shock) and repeat that value across the other time-points

## Construct the X and Y and train

In [2]:
train_table = pd.read_csv('../data/subset_train_OHE.csv')
test_table = pd.read_csv('../data/subset_test_OHE.csv')

In [3]:
train_table.head()

Unnamed: 0,row_id,is_security_73,is_security_102,is_security_18,p_tcount,p_value,trade_vwap,trade_volume,is_buyer,is_seller,...,bid96,ask96,bid97,ask97,bid98,ask98,bid99,ask99,bid100,ask100
586498,586499,0,1,0,13226,8271646293,951.4,1021,0,1,...,951.8,951.9,951.6,951.9,951.6,951.8,951.6,951.8,951.6,951.9
675744,675745,1,0,0,19304,14603738564,3287.0,267,0,1,...,3289.0,3290.0,3289.0,3290.0,3289.0,3290.0,3289.0,3290.0,3289.0,3291.0
340312,340313,0,1,0,17510,11402659276,940.3,1368,1,0,...,941.1,941.8,941.1,941.7,941.1,941.6,940.9,941.6,940.8,941.6
136024,136025,0,0,1,22521,19264529682,431.9,551,1,0,...,431.7,431.85,431.7,431.85,431.7,431.85,431.7,431.85,431.7,431.85
197970,197971,1,0,0,14971,12747929651,3142.0,10,0,1,...,3141.0,3142.5,3141.0,3142.5,3141.0,3142.5,3141.0,3142.5,3141.0,3142.5


In [4]:
test_table.head()

Unnamed: 0,row_id,is_security_73,is_security_102,is_security_18,p_tcount,p_value,trade_vwap,trade_volume,is_buyer,is_seller,...,transtype49_is_T,transtype49_is_Q,time49,bid49,ask49,transtype50_is_T,transtype50_is_Q,time50,bid50,ask50
254689,254690,0,0,1,18917,10969715628,435.1,66,0,1,...,1,0,15:47:18.160,435.1,435.2,0,1,15:47:18.160,435.0,435.2
500810,500811,0,0,1,12917,9792408270,432.0,7661,0,1,...,1,0,08:32:12.389,432.0,432.1,0,1,08:32:12.389,431.9,432.1
196078,196079,1,0,0,14971,12747929651,3152.0,610,1,0,...,1,0,13:50:52.674,3151.5,3152.0,0,1,13:50:52.674,3151.5,3153.5
226436,226437,0,1,0,13344,8492047588,935.4,45,1,0,...,1,0,16:03:38.133,935.2,935.4,0,1,16:03:38.133,935.2,935.5
339574,339575,0,1,0,17510,11402659276,947.8,500,1,0,...,1,0,08:42:51.975,947.1,947.8,0,1,08:42:51.975,947.1,948.0


In [5]:
predictionColumns = []
for i in range(52, 101):
    for column in train_table.columns.values:
        if column.endswith(str(i)) and (column.startswith('ask') or column.startswith('bid')):
            predictionColumns.append(column)

In [6]:
featureColumns = []
for column in train_table.columns.values:
    if ((column not in predictionColumns) and (column != 'row_id') and (not column.startswith('time'))):
        featureColumns.append(column)

In [7]:
trainX = np.zeros((train_table.shape[0] * 2, len(featureColumns) + 1))
trainY = np.zeros((train_table.shape[0] * 2))

In [8]:
testX = np.zeros((test_table.shape[0] * 2, len(featureColumns) + 1))

In [9]:
index = 0
for ix, row in train_table.iterrows():
    X_features = (np.array(row[featureColumns])).flatten('F')
    X = np.concatenate((X_features, np.array([0])))
    Y = row[predictionColumns[0]]
    trainX[index, :] = X
    trainY[index] = Y
    
    index = index+1
    
    X = np.concatenate((X_features, np.array([1])))
    Y = row[predictionColumns[1]]
    trainX[index, :] = X
    trainY[index] = Y
    index = index+1

In [13]:
index = 0
for ix, row in test_table.iterrows():
    X_features = (np.array(row[featureColumns])).flatten('F')
    X = np.concatenate((X_features, np.array([0])))
    testX[index, :] = X
    
    index = index+1
    
    X = np.concatenate((X_features, np.array([1])))
    testX[index, :] = X
    index = index+1

In [16]:
LR_model = lm.LinearRegression(fit_intercept=True, normalize=False, n_jobs=1)

In [18]:
LR_model.fit(trainX, trainY)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## Create the prediction file to be evaluated

In [20]:
testY = LR_model.predict(testX)

In [24]:
prediction = pd.read_csv('../predictions/template_prediction.csv')

In [26]:
i = 0
for ix, row in test_table.iterrows():
    row_id = row['row_id']
    
    index_in_pred = prediction[prediction['row_id'] == row_id].index.tolist()[0]
    
    bid = testY[i]
    i = i+1
    ask = testY[i]
    i = i+1
    
    for column in predictionColumns:
        if column.startswith('bid'):
            prediction.at[index_in_pred, column] = bid
        else:
            prediction.at[index_in_pred, column] = ask

In [27]:
prediction.head()

Unnamed: 0,row_id,bid52,ask52,bid53,ask53,bid54,ask54,bid55,ask55,bid56,...,bid96,ask96,bid97,ask97,bid98,ask98,bid99,ask99,bid100,ask100
254689,254690,434.647346,435.533534,434.647346,435.533534,434.647346,435.533534,434.647346,435.533534,434.647346,...,434.647346,435.533534,434.647346,435.533534,434.647346,435.533534,434.647346,435.533534,434.647346,435.533534
500810,500811,431.548295,432.434483,431.548295,432.434483,431.548295,432.434483,431.548295,432.434483,431.548295,...,431.548295,432.434483,431.548295,432.434483,431.548295,432.434483,431.548295,432.434483,431.548295,432.434483
196078,196079,3152.053638,3152.939826,3152.053638,3152.939826,3152.053638,3152.939826,3152.053638,3152.939826,3152.053638,...,3152.053638,3152.939826,3152.053638,3152.939826,3152.053638,3152.939826,3152.053638,3152.939826,3152.053638,3152.939826
226436,226437,934.915415,935.801603,934.915415,935.801603,934.915415,935.801603,934.915415,935.801603,934.915415,...,934.915415,935.801603,934.915415,935.801603,934.915415,935.801603,934.915415,935.801603,934.915415,935.801603
339574,339575,947.171433,948.057621,947.171433,948.057621,947.171433,948.057621,947.171433,948.057621,947.171433,...,947.171433,948.057621,947.171433,948.057621,947.171433,948.057621,947.171433,948.057621,947.171433,948.057621
