In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

## Model Description
Here we model the whole curve (restricted to the first three time ticks for modelling purposes) rather than just its first value, by adding a "time-tick" input to the model:

f(x, time_tick) = y

In [3]:
# Read the train and test subsets (train first, then test).
# NOTE(review): the "_OHE" suffix in the file names suggests one-hot-encoded
# features — confirm against the data preparation step.
train_table, test_table = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/subset_train_OHE.csv', '../data/subset_test_OHE.csv')
)

In [4]:
def _columns_for_ticks(prefix, first_tick=52, last_tick=100):
    """Collect the `train_table` columns for one side of the book.

    A column is selected when its name starts with `prefix` and ends with the
    decimal tick number. Ticks are scanned in ascending order from
    `first_tick` to `last_tick` (inclusive); within one tick the columns keep
    the order they have in `train_table`.
    """
    all_columns = train_table.columns.values
    return [
        name
        for tick in range(first_tick, last_tick + 1)
        for name in all_columns
        if name.startswith(prefix) and name.endswith(str(tick))
    ]


bidPredictionColumns = _columns_for_ticks('bid')
askPredictionColumns = _columns_for_ticks('ask')

# Full target list: all bid columns first, then all ask columns.
predictionColumns = bidPredictionColumns + askPredictionColumns

In [5]:
# Model inputs: every column of train_table that is neither a prediction
# target, nor listed in columnsToIgnore, nor named with a 'time' prefix.
columnsToIgnore = ['row_id']
featureColumns = [
    name
    for name in train_table.columns.values
    if not (
        name in predictionColumns
        or name in columnsToIgnore
        or name.startswith('time')
    )
]

In [16]:
# Each training row is expanded into three samples (one per modelled time
# tick); the design matrix gets one extra column that holds the tick number.
n_train_samples = train_table.shape[0] * 3
n_inputs = len(featureColumns) + 1

trainX = np.zeros((n_train_samples, n_inputs))

# One target vector per side of the book, aligned with the rows of trainX.
trainY_bid = np.zeros(n_train_samples)
trainY_ask = np.zeros(n_train_samples)

In [17]:
# Fill the pre-allocated training arrays in one vectorised pass instead of
# iterating row by row with iterrows() (which builds a Series and a fresh
# concatenated array for every sample).
#
# Layout (unchanged): for every row of train_table there are N_TRAIN_TICKS
# consecutive samples that share the row's features and differ only in the
# last column, which holds the tick number 0 .. N_TRAIN_TICKS - 1. The target
# of the sample for tick i is the row's i-th ask / bid prediction column.
N_TRAIN_TICKS = 3

train_features = train_table[featureColumns].to_numpy(dtype=float)

# Repeat every feature row once per tick: r0, r0, r0, r1, r1, r1, ...
trainX[:, :-1] = np.repeat(train_features, N_TRAIN_TICKS, axis=0)
# Tick index cycles 0, 1, 2, 0, 1, 2, ... in step with the repeated rows.
trainX[:, -1] = np.tile(np.arange(N_TRAIN_TICKS), train_table.shape[0])

# Row-major ravel yields (row0 tick0, row0 tick1, row0 tick2, row1 tick0, ...),
# i.e. exactly the sample order used for trainX above.
trainY_ask[:] = train_table[askPredictionColumns[:N_TRAIN_TICKS]].to_numpy(dtype=float).ravel()
trainY_bid[:] = train_table[bidPredictionColumns[:N_TRAIN_TICKS]].to_numpy(dtype=float).ravel()

In [18]:
# Build the test design matrix with the same layout as trainX, vectorised
# instead of iterating with iterrows(): every test row is repeated once per
# modelled tick and the tick number (0 .. N_TEST_TICKS - 1) is appended as the
# last column.
N_TEST_TICKS = 3

test_features = test_table[featureColumns].to_numpy(dtype=float)

testX = np.column_stack((
    # r0, r0, r0, r1, r1, r1, ...
    np.repeat(test_features, N_TEST_TICKS, axis=0),
    # 0, 1, 2, 0, 1, 2, ... (cast to float by column_stack)
    np.tile(np.arange(N_TEST_TICKS), test_table.shape[0]),
))

In [19]:
from sklearn import linear_model as lm

In [20]:
# Linear regression for the ask side, fitted on (features + tick) -> ask value.
# The `normalize` keyword is intentionally not passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, where supplying it raises TypeError.
# Its old default (False) is the behaviour requested here, so omitting it
# gives the same model on old versions and keeps the cell runnable on new ones.
LR_model_ask = lm.LinearRegression(fit_intercept=True, n_jobs=1)
LR_model_ask.fit(trainX, trainY_ask)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [21]:
# Linear regression for the bid side, fitted on (features + tick) -> bid value.
# The `normalize` keyword is intentionally not passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, where supplying it raises TypeError.
# Its old default (False) is the behaviour requested here, so omitting it
# gives the same model on old versions and keeps the cell runnable on new ones.
LR_model_bid = lm.LinearRegression(fit_intercept=True, n_jobs=1)
LR_model_bid.fit(trainX, trainY_bid)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [22]:
# Score the expanded test matrix with both fitted models (ask first, then
# bid); each result has one value per row of testX.
testY_ask, testY_bid = (
    fitted_model.predict(testX)
    for fitted_model in (LR_model_ask, LR_model_bid)
)

In [24]:
# Load the submission template that the predicted bid/ask values are written into.
template_csv = '../predictions/template_prediction.csv'
prediction = pd.read_csv(template_csv)

In [25]:
# Copy the model outputs into the submission template.
#
# testY_ask / testY_bid hold N_PRED_TICKS consecutive values per test row, in
# the row order of test_table, so they are reshaped into one short curve per
# row. Prediction columns beyond the modelled ticks are filled with the value
# of the last modelled tick (flat continuation of the curve).
N_PRED_TICKS = 3
ask_curves = np.asarray(testY_ask).reshape(-1, N_PRED_TICKS)
bid_curves = np.asarray(testY_bid).reshape(-1, N_PRED_TICKS)

# Build the row_id -> template index label lookup once, instead of scanning
# the whole template with a boolean mask for every test row (quadratic).
# setdefault keeps the FIRST matching template row, as the original scan did.
first_pred_index = {}
for pred_label, pred_row_id in zip(prediction.index, prediction['row_id'].tolist()):
    first_pred_index.setdefault(pred_row_id, pred_label)

for row_pos, row_id in enumerate(test_table['row_id'].tolist()):
    if row_id not in first_pred_index:
        # Previously surfaced as an opaque "list index out of range".
        raise KeyError('row_id %r from test_table is missing in the prediction template' % (row_id,))
    index_in_pred = first_pred_index[row_id]

    for tick, (bid_col, ask_col) in enumerate(zip(bidPredictionColumns, askPredictionColumns)):
        # Ticks past the modelled range reuse the last modelled value.
        source_tick = min(tick, N_PRED_TICKS - 1)
        prediction.at[index_in_pred, bid_col] = bid_curves[row_pos, source_tick]
        prediction.at[index_in_pred, ask_col] = ask_curves[row_pos, source_tick]

In [26]:
# Preview the first five rows of the filled-in template.
prediction.head(n=5)

Unnamed: 0,row_id,bid52,ask52,bid53,ask53,bid54,ask54,bid55,ask55,bid56,...,bid96,ask96,bid97,ask97,bid98,ask98,bid99,ask99,bid100,ask100
254689,254690,435.019248,435.161629,435.014753,435.169636,435.010258,435.177643,435.010258,435.177643,435.010258,...,435.010258,435.177643,435.010258,435.177643,435.010258,435.177643,435.010258,435.177643,435.010258,435.177643
500810,500811,431.932132,432.040772,431.927637,432.048779,431.923142,432.056787,431.923142,432.056787,431.923142,...,431.923142,432.056787,431.923142,432.056787,431.923142,432.056787,431.923142,432.056787,431.923142,432.056787
196078,196079,3151.594587,3153.360737,3151.590092,3153.368744,3151.585597,3153.376751,3151.585597,3153.376751,3151.585597,...,3151.585597,3153.376751,3151.585597,3153.376751,3151.585597,3153.376751,3151.585597,3153.376751,3151.585597,3153.376751
226436,226437,935.184028,935.554995,935.179533,935.563002,935.175039,935.57101,935.175039,935.57101,935.175039,...,935.175039,935.57101,935.175039,935.57101,935.175039,935.57101,935.175039,935.57101,935.175039,935.57101
339574,339575,947.246311,948.025961,947.241816,948.033968,947.237321,948.041976,947.237321,948.041976,947.237321,...,947.237321,948.041976,947.237321,948.041976,947.237321,948.041976,947.237321,948.041976,947.237321,948.041976
