In [1]:
%matplotlib inline
from ast import literal_eval
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from typing import Tuple

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score

import warnings

from lob_data_utils import lob, db_result

sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

In [2]:
n = 15
bp = ['bid_p' + str(i) for i in range(0, n)]
ap = ['ask_p' + str(i) for i in range(0, n)]
bv = ['bid_v' + str(i) for i in range(0, n)]
av = ['ask_v' + str(i) for i in range(0, n)]

In [3]:
def transform_to_orders(df: pd.DataFrame) -> pd.DataFrame:
    order_list = []
    for idx, row in df.iterrows():
        l_bid = [literal_eval(row.get('bid'))][0]
        bid_dict = [{'price': p, 'volume': v} for p, v in l_bid]
        d_bid = pd.DataFrame(bid_dict, columns=['price', 'volume'])
        d_bid = d_bid.sort_values(by='price', ascending=False).iloc[0:n]
        
        l_ask = [literal_eval(row.get('ask'))][0]
        ask_dict = [{'price': p, 'volume': v} for p, v in l_ask]
        d_ask = pd.DataFrame(ask_dict, columns=['price', 'volume'])
        d_ask = d_ask.sort_values(by='price', ascending=True).iloc[0:n]
        
        new_row_dict = {}
        for i in range(0, n):
            new_row_dict['bid_p' + str(i)] = d_bid.iloc[i]['price']
            new_row_dict['bid_v' + str(i)] = d_bid.iloc[i]['volume']
            new_row_dict['ask_p' + str(i)] = d_ask.iloc[i]['price']
            new_row_dict['ask_v' + str(i)] = d_ask.iloc[i]['volume']
            new_row_dict['mid_price'] = row.get('mid_price')
            new_row_dict['mid_price_indicator'] = row.get('mid_price_indicator')
        order_list.append(new_row_dict)
    order_df = pd.DataFrame(order_list, columns=bp+bv+ap+av+['mid_price', 'mid_price_indicator'])
    return order_df

In [4]:
data_length = 10000

dfs = {}
dfs_test = {}
dfs_cv = {}

stocks = ['9061', '9062', '9063', '9064', '9065']

for s in stocks:
    d, d_cv, d_test = lob.load_prepared_data(s, data_dir='data/', cv=True, length=data_length)
    dfs[s] = transform_to_orders(d)
    dfs_cv[s] = transform_to_orders(d_cv)

In [5]:
dfs['9061'].head()

Unnamed: 0,bid_p0,bid_p1,bid_p2,bid_p3,bid_p4,bid_p5,bid_p6,bid_p7,bid_p8,bid_p9,...,ask_v7,ask_v8,ask_v9,ask_v10,ask_v11,ask_v12,ask_v13,ask_v14,mid_price,mid_price_indicator
0,708.6,708.5,708.4,708.3,708.2,708.1,708.0,707.9,707.8,707.7,...,3700.0,1700.0,1700.0,5744.0,5766.0,35470.0,5907.0,2220.0,708.7,0.0
1,707.5,707.4,707.3,707.2,707.1,707.0,706.9,706.8,706.7,706.4,...,5966.0,10223.0,3308.0,8985.0,18592.0,5810.0,10789.0,2136.0,707.65,0.0
2,707.4,707.3,707.2,707.1,707.0,706.9,706.8,706.7,706.6,706.5,...,6600.0,5961.0,7589.0,8028.0,29123.0,1308.0,6985.0,11713.0,707.45,0.0
3,707.1,707.0,706.9,706.8,706.7,706.6,706.5,706.4,706.3,706.2,...,11426.0,9626.0,25552.0,1359.0,1440.0,1305.0,12274.0,1308.0,707.2,1.0
4,707.3,707.2,707.1,707.0,706.9,706.8,706.7,706.6,706.5,706.4,...,10669.0,1359.0,740.0,1305.0,12274.0,1308.0,2473.0,11713.0,707.4,0.0


In [6]:
def svm_classification(df):
    clf = SVC()        
    X = df.loc[:, bp+ap+bv+av]
    y = df['mid_price_indicator'].values.reshape(-1, 1)
    y[0] = 0
    clf.fit(X, y)
    return clf

In [7]:
clfs = {}
for s in stocks:
    clf = svm_classification(dfs[s])
    clfs[s] = clf
    predictions = clf.predict(dfs[s].loc[:, bp+ap+bv+av])
    print(s, roc_auc_score(predictions, dfs[s]['mid_price_indicator']))

9061 1.0
9062 1.0
9063 1.0
9064 1.0
9065 1.0


In [8]:
for s in stocks:
    predictions_cv = clfs[s].predict(dfs_cv[s].loc[:, bp+ap+bv+av])
    try:
        print(s, roc_auc_score(predictions_cv, dfs_cv[s]['mid_price_indicator']))
    except Exception as e:
        print(s, e)

9061 Only one class present in y_true. ROC AUC score is not defined in that case.
9062 Only one class present in y_true. ROC AUC score is not defined in that case.
9063 Only one class present in y_true. ROC AUC score is not defined in that case.
9064 Only one class present in y_true. ROC AUC score is not defined in that case.
9065 Only one class present in y_true. ROC AUC score is not defined in that case.
