In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn import linear_model as lm
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
train_table = pd.read_csv('../data/subset_train_OHE.csv')
test_table = pd.read_csv('../data/subset_test_OHE.csv')

In [3]:
bidPredictionColumns = []
for i in range(52, 101):
    for column in train_table.columns.values:
        if column.endswith(str(i)) and column.startswith('bid'):
            bidPredictionColumns.append(column)

askPredictionColumns = []
for i in range(52, 101):
    for column in train_table.columns.values:
        if column.endswith(str(i)) and column.startswith('ask'):
            askPredictionColumns.append(column)

predictionColumns = bidPredictionColumns + askPredictionColumns

In [4]:
featureColumns = []
columnsToIgnore = []
for column in train_table.columns.values:
    if ((column not in predictionColumns) and (column not in columnsToIgnore) and (not column.startswith('time'))):
        featureColumns.append(column)

In [5]:
trainX = np.zeros((train_table.shape[0], len(featureColumns)))

trainY_ask = np.zeros((train_table.shape[0]))
trainY_bid = np.zeros((train_table.shape[0]))

testX = np.zeros((test_table.shape[0], len(featureColumns)))

In [6]:
index = 0
for ix, row in train_table.iterrows():
    X = (np.array(row[featureColumns])).flatten('F')
    Y_bid = row[bidPredictionColumns[0]]
    Y_ask = row[askPredictionColumns[0]]
    
    trainX[index, :] = X
    trainY_ask[index] = Y_ask
    trainY_bid[index] = Y_bid
    
    index = index+1

In [7]:
index = 0
for ix, row in test_table.iterrows():
    X = (np.array(row[featureColumns])).flatten('F')
    testX[index, :] = X

    index = index+1

In [8]:
LR_model_ask = lm.LinearRegression(fit_intercept=True, normalize=False, n_jobs=1)
LR_model_ask.fit(trainX, trainY_ask)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
LR_model_bid = lm.LinearRegression(fit_intercept=True, normalize=False, n_jobs=1)
LR_model_bid.fit(trainX, trainY_bid)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [10]:
testY_ask = LR_model_ask.predict(testX)
testY_bid = LR_model_bid.predict(testX)

In [11]:
prediction = pd.read_csv('../predictions/template_prediction.csv')

i = 0
for ix, row in test_table.iterrows():
    row_id = row['row_id']

    index_in_pred = prediction[prediction['row_id'] == row_id].index.tolist()[0]

    bid = testY_bid[i]
    ask = testY_ask[i]
    i = i+1
    
    for column in predictionColumns:
        if column.startswith('bid'):
            prediction.at[index_in_pred, column] = bid
        else:
            prediction.at[index_in_pred, column] = ask

In [12]:
prediction.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,row_id,bid52,ask52,bid53,ask53,bid54,ask54,bid55,...,bid96,ask96,bid97,ask97,bid98,ask98,bid99,ask99,bid100,ask100
0,0,0.0,559051,87643100.0,95035750.0,87643100.0,95035750.0,87643100.0,95035750.0,87643100.0,...,87643100.0,95035750.0,87643100.0,95035750.0,87643100.0,95035750.0,87643100.0,95035750.0,87643100.0,95035750.0
1,1,0.0,315342,49437690.0,53607620.0,49437690.0,53607620.0,49437690.0,53607620.0,49437690.0,...,49437690.0,53607620.0,49437690.0,53607620.0,49437690.0,53607620.0,49437690.0,53607620.0,49437690.0,53607620.0
2,2,0.0,503296,78899610.0,85554960.0,78899610.0,85554960.0,78899610.0,85554960.0,78899610.0,...,78899610.0,85554960.0,78899610.0,85554960.0,78899610.0,85554960.0,78899610.0,85554960.0,78899610.0,85554960.0
3,3,0.0,314897,49367630.0,53531650.0,49367630.0,53531650.0,49367630.0,53531650.0,49367630.0,...,49367630.0,53531650.0,49367630.0,53531650.0,49367630.0,53531650.0,49367630.0,53531650.0,49367630.0,53531650.0
4,4,0.0,560880,87929220.0,95346000.0,87929220.0,95346000.0,87929220.0,95346000.0,87929220.0,...,87929220.0,95346000.0,87929220.0,95346000.0,87929220.0,95346000.0,87929220.0,95346000.0,87929220.0,95346000.0
