In [1]:
%matplotlib inline
from ast import literal_eval
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from typing import Tuple

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score

import warnings

from lob_data_utils import lob, db_result

sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

In [2]:
n = 30
bp = ['bid_p' + str(i) for i in range(0, n)]
ap = ['ask_p' + str(i) for i in range(0, n)]
bv = ['bid_v' + str(i) for i in range(0, n)]
av = ['ask_v' + str(i) for i in range(0, n)]

In [3]:
def transform_to_orders(df: pd.DataFrame) -> pd.DataFrame:
    order_list = []
    for idx, row in df.iterrows():
        l_bid = [literal_eval(row.get('bid'))][0]
        bid_dict = [{'price': p, 'volume': v} for p, v in l_bid]
        d_bid = pd.DataFrame(bid_dict, columns=['price', 'volume'])
        d_bid = d_bid.sort_values(by='price', ascending=False).iloc[0:n]
        
        l_ask = [literal_eval(row.get('ask'))][0]
        ask_dict = [{'price': p, 'volume': v} for p, v in l_ask]
        d_ask = pd.DataFrame(ask_dict, columns=['price', 'volume'])
        d_ask = d_ask.sort_values(by='price', ascending=True).iloc[0:n]
        
        new_row_dict = {}
        mid_price = row['mid_price']
        
        d_bid['volume'] = d_bid['volume'] * d_bid['price']
        d_ask['volume'] = d_ask['volume'] * d_ask['price']
        
        total_capital = d_bid['volume'].sum() + d_ask['volume'].sum()
        d_bid['volume'] /= total_capital
        d_ask['volume'] /= total_capital
        
        d_bid['price'] = (d_bid['price'] - mid_price) * (d_bid['price'] - mid_price) / mid_price
        d_ask['price'] = (d_ask['price'] - mid_price) * (d_ask['price'] - mid_price) / mid_price
        for i in range(0, n):
            new_row_dict['bid_p' + str(i)] = d_bid.iloc[i]['price']
            new_row_dict['bid_v' + str(i)] = d_bid.iloc[i]['volume']
            new_row_dict['ask_p' + str(i)] = d_ask.iloc[i]['price']
            new_row_dict['ask_v' + str(i)] = d_ask.iloc[i]['volume']
            new_row_dict['mid_price'] = mid_price
            new_row_dict['mid_price_indicator'] = row.get('mid_price_indicator')
        order_list.append(new_row_dict)
    order_df = pd.DataFrame(order_list, columns=bp+bv+ap+av+['mid_price', 'mid_price_indicator'])
    return order_df

In [4]:
data_length = 5050

dfs = {}
dfs_test = {}
dfs_cv = {}

stocks = ['9061'] #, '9062', '9063', '9064', '9065']

for s in stocks:
    d, d_cv, d_test = lob.load_prepared_data(s, data_dir='data/', cv=True, length=data_length)
    dfs[s] = transform_to_orders(d)
    print(dfs[s].head())
    dfs_cv[s] = transform_to_orders(d_cv)

     bid_p0    bid_p1    bid_p2    bid_p3    bid_p4    bid_p5    bid_p6  \
0  0.000032  0.000089  0.000174  0.000287  0.000429  0.000600  0.000798   
1  0.000032  0.000089  0.000174  0.000287  0.000429  0.000600  0.000798   
2  0.000014  0.000057  0.000128  0.000227  0.000355  0.000511  0.000696   
3  0.000032  0.000089  0.000174  0.000288  0.000430  0.000600  0.000799   
4  0.000032  0.000089  0.000174  0.000287  0.000429  0.000600  0.000799   

     bid_p7    bid_p8    bid_p9         ...            ask_v22   ask_v23  \
0  0.001025  0.001281  0.001564         ...           0.004307  0.002872   
1  0.001026  0.001281  0.001565         ...           0.019366  0.002980   
2  0.000908  0.001150  0.001419         ...           0.019110  0.001471   
3  0.001026  0.001282  0.001878         ...           0.008117  0.083633   
4  0.001026  0.001281  0.001565         ...           0.080648  0.018053   

    ask_v24   ask_v25   ask_v26   ask_v27   ask_v28   ask_v29  mid_price  \
0  0.002872  0.0

In [5]:
dfs['9061'].head()

Unnamed: 0,bid_p0,bid_p1,bid_p2,bid_p3,bid_p4,bid_p5,bid_p6,bid_p7,bid_p8,bid_p9,...,ask_v22,ask_v23,ask_v24,ask_v25,ask_v26,ask_v27,ask_v28,ask_v29,mid_price,mid_price_indicator
0,3.2e-05,8.9e-05,0.000174,0.000287,0.000429,0.0006,0.000798,0.001025,0.001281,0.001564,...,0.004307,0.002872,0.002872,0.016088,0.001773,0.013176,0.016738,0.012941,704.75,0.0
1,3.2e-05,8.9e-05,0.000174,0.000287,0.000429,0.0006,0.000798,0.001026,0.001281,0.001565,...,0.019366,0.00298,0.002981,0.002981,0.016698,0.00184,0.013675,0.017372,704.45,1.0
2,1.4e-05,5.7e-05,0.000128,0.000227,0.000355,0.000511,0.000696,0.000908,0.00115,0.001419,...,0.01911,0.001471,0.002941,0.002942,0.016477,0.001816,0.013494,0.017143,704.5,0.0
3,3.2e-05,8.9e-05,0.000174,0.000288,0.00043,0.0006,0.000799,0.001026,0.001282,0.001878,...,0.008117,0.083633,0.018721,0.001441,0.002882,0.011488,0.016142,0.001779,704.15,1.0
4,3.2e-05,8.9e-05,0.000174,0.000287,0.000429,0.0006,0.000799,0.001026,0.001281,0.001565,...,0.080648,0.018053,0.001389,0.002779,0.011078,0.015566,0.001715,0.004449,704.35,1.0


In [6]:
def svm_classification(df):
    clf = SVC()        
    X = df.loc[:, bp+ap+bv+av]
    y = df['mid_price_indicator'].values.reshape(-1, 1)
    y[0] = 0
    clf.fit(X, y)
    return clf

In [7]:
clfs = {}
for s in stocks:
    clf = svm_classification(dfs[s])
    clfs[s] = clf
    predictions = clf.predict(dfs[s].loc[:, bp+ap+bv+av])
    print(s, roc_auc_score(predictions, dfs[s]['mid_price_indicator']))

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
for s in stocks:
    predictions_cv = clfs[s].predict(dfs_cv[s].loc[:, bp+ap+bv+av])
    try:
        print(s, roc_auc_score(predictions_cv, dfs_cv[s]['mid_price_indicator']))
    except Exception as e:
        print(s, e)

In [None]:
for s in stocks:
    d, d_cv, d_test = lob.load_prepared_data(s, data_dir='data/', cv=True, length=data_length)
    dfs_test[s] = transform_to_orders(d_test)

In [None]:
for s in stocks:
    predictions_test = clfs[s].predict(dfs_test[s].loc[:, bp+ap+bv+av])
    try:
        print(s, roc_auc_score(predictions_test, dfs_test[s]['mid_price_indicator']))
    except Exception as e:
        print(s, e)