In [1]:
%matplotlib inline
from ast import literal_eval
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from typing import Tuple

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score

import warnings

from lob_data_utils import lob, db_result

sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

In [2]:
n = 15
bp = ['bid_p' + str(i) for i in range(0, n)]
ap = ['ask_p' + str(i) for i in range(0, n)]
bv = ['bid_v' + str(i) for i in range(0, n)]
av = ['ask_v' + str(i) for i in range(0, n)]

In [3]:
def transform_to_orders(df: pd.DataFrame) -> pd.DataFrame:
    order_list = []
    for idx, row in df.iterrows():
        l_bid = [literal_eval(row.get('bid'))][0]
        bid_dict = [{'price': p, 'volume': v} for p, v in l_bid]
        d_bid = pd.DataFrame(bid_dict, columns=['price', 'volume'])
        d_bid = d_bid.sort_values(by='price', ascending=False).iloc[0:n]
        
        l_ask = [literal_eval(row.get('ask'))][0]
        ask_dict = [{'price': p, 'volume': v} for p, v in l_ask]
        d_ask = pd.DataFrame(ask_dict, columns=['price', 'volume'])
        d_ask = d_ask.sort_values(by='price', ascending=True).iloc[0:n]
        
        new_row_dict = {}
        mid_price = row['mid_price']
        
        d_bid['volume'] = d_bid['volume'] * d_bid['price']
        d_ask['volume'] = d_ask['volume'] * d_ask['price']
        
        total_capital = d_bid['volume'].sum() + d_ask['volume'].sum()
        d_bid['volume'] /= total_capital
        d_ask['volume'] /= total_capital
        
        d_bid['price'] = 1000.0 * (d_bid['price'] - mid_price) / mid_price
        d_ask['price'] = 1000.0 * (d_ask['price'] - mid_price) / mid_price
        for i in range(0, n):
            new_row_dict['bid_p' + str(i)] = d_bid.iloc[i]['price']
            new_row_dict['bid_v' + str(i)] = d_bid.iloc[i]['volume']
            new_row_dict['ask_p' + str(i)] = d_ask.iloc[i]['price']
            new_row_dict['ask_v' + str(i)] = d_ask.iloc[i]['volume']
            new_row_dict['mid_price'] = mid_price
            new_row_dict['mid_price_indicator'] = row.get('mid_price_indicator')
        order_list.append(new_row_dict)
    order_df = pd.DataFrame(order_list, columns=bp+bv+ap+av+['mid_price', 'mid_price_indicator'])
    return order_df

In [4]:
data_length = 5050

dfs = {}
dfs_test = {}
dfs_cv = {}

stocks = ['9061', '9062', '9063', '9064', '9065']

for s in stocks:
    d, d_cv, d_test = lob.load_prepared_data(s, data_dir='data/', cv=True, length=data_length)
    dfs[s] = transform_to_orders(d)
    print(dfs[s].head())
    dfs_cv[s] = transform_to_orders(d_cv)

     bid_p0    bid_p1    bid_p2    bid_p3    bid_p4    bid_p5    bid_p6  \
0 -0.212841 -0.354736 -0.496630 -0.638524 -0.780419 -0.922313 -1.064207   
1 -0.212932 -0.354887 -0.496842 -0.638796 -0.780751 -0.922706 -1.064660   
2 -0.141945 -0.283889 -0.425834 -0.567779 -0.709723 -0.851668 -0.993612   
3 -0.213023 -0.355038 -0.497053 -0.639068 -0.781084 -0.923099 -1.065114   
4 -0.212962 -0.354937 -0.496912 -0.638887 -0.780862 -0.922837 -1.064812   

     bid_p7    bid_p8    bid_p9         ...             ask_v7    ask_v8  \
0 -1.206101 -1.347996 -1.489890         ...           0.048016  0.011753   
1 -1.206615 -1.348570 -1.490525         ...           0.037337  0.024471   
2 -1.135557 -1.277502 -1.419446         ...           0.029431  0.024023   
3 -1.207129 -1.349144 -1.633175         ...           0.076909  0.041458   
4 -1.206786 -1.348761 -1.490736         ...           0.039361  0.010093   

     ask_v9   ask_v10   ask_v11   ask_v12   ask_v13   ask_v14  mid_price  \
0  0.007948  0.0

In [5]:
dfs['9061'].head()

Unnamed: 0,bid_p0,bid_p1,bid_p2,bid_p3,bid_p4,bid_p5,bid_p6,bid_p7,bid_p8,bid_p9,...,ask_v7,ask_v8,ask_v9,ask_v10,ask_v11,ask_v12,ask_v13,ask_v14,mid_price,mid_price_indicator
0,-0.212841,-0.354736,-0.49663,-0.638524,-0.780419,-0.922313,-1.064207,-1.206101,-1.347996,-1.48989,...,0.048016,0.011753,0.007948,0.027306,0.035609,0.001513,0.001566,0.047744,704.75,0.0
1,-0.212932,-0.354887,-0.496842,-0.638796,-0.780751,-0.922706,-1.06466,-1.206615,-1.34857,-1.490525,...,0.037337,0.024471,0.024513,0.032763,0.01196,0.008484,0.027781,0.030698,704.45,1.0
2,-0.141945,-0.283889,-0.425834,-0.567779,-0.709723,-0.851668,-0.993612,-1.135557,-1.277502,-1.419446,...,0.029431,0.024023,0.029104,0.032163,0.011738,0.008329,0.027272,0.030136,704.5,0.0
3,-0.213023,-0.355038,-0.497053,-0.639068,-0.781084,-0.923099,-1.065114,-1.207129,-1.349144,-1.633175,...,0.076909,0.041458,0.006005,0.031919,0.020106,0.019852,0.027777,0.004941,704.15,1.0
4,-0.212962,-0.354937,-0.496912,-0.638887,-0.780862,-0.922837,-1.064812,-1.206786,-1.348761,-1.490736,...,0.039361,0.010093,0.043301,0.019241,0.031081,0.009939,0.0079,0.027301,704.35,1.0


In [6]:
def svm_classification(df):
    clf = SVC()        
    X = df.loc[:, bp+ap+bv+av]
    y = df['mid_price_indicator'].values.reshape(-1, 1)
    y[0] = 0
    clf.fit(X, y)
    return clf

In [7]:
clfs = {}
for s in stocks:
    clf = svm_classification(dfs[s])
    clfs[s] = clf
    predictions = clf.predict(dfs[s].loc[:, bp+ap+bv+av])
    print(s, roc_auc_score(predictions, dfs[s]['mid_price_indicator']))

9061 0.543381214976
9062 0.754794973545
9063 0.620840735945
9064 0.558448266956
9065 0.533406192324


In [8]:
for s in stocks:
    predictions_cv = clfs[s].predict(dfs_cv[s].loc[:, bp+ap+bv+av])
    try:
        print(s, roc_auc_score(predictions_cv, dfs_cv[s]['mid_price_indicator']))
    except Exception as e:
        print(s, e)

9061 0.484393503019
9062 0.744776119403
9063 0.582337317397
9064 0.559378509232
9065 0.510539097189


In [9]:
for s in stocks:
    d, d_cv, d_test = lob.load_prepared_data(s, data_dir='data/', cv=True, length=data_length)
    dfs_test[s] = transform_to_orders(d_test)

In [10]:
for s in stocks:
    predictions_test = clfs[s].predict(dfs_test[s].loc[:, bp+ap+bv+av])
    try:
        print(s, roc_auc_score(predictions_test, dfs_test[s]['mid_price_indicator']))
    except Exception as e:
        print(s, e)

9061 0.50438350864
9062 0.758671952428
9063 0.662184594954
9064 0.600428367444
9065 0.523685480951
