In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
 
from sklearn import metrics

from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model
from lob_data_utils.svm_calculation import lob_svm
import os


# Apply seaborn's 'whitegrid' style to every figure drawn after this point.
sns.set_style('whitegrid')
# NOTE(review): called without a category, this suppresses *all* Python warnings
# for the rest of the session, including deprecation and convergence warnings.
warnings.filterwarnings('ignore')

  """)


In [2]:
# Sample-count setting for the data sets used in this notebook.
# NOTE(review): the data-loading cell passes a literal `length=15000` to
# `lob.load_prepared_data`; keep that literal in sync with this value.
data_length = 15000
# Stock identifiers, kept as strings (they are cast with `int(...)` when matched
# against the numeric `stock` column of the result frames).
stocks = ['9064', '9061', '9265']

In [3]:
def convert_scores(df, column):
    """Reduce every entry of ``df[column]`` to its arithmetic mean.

    Two kinds of entries are supported:

    * numbers or numeric sequences, which are averaged directly with ``np.mean``;
    * strings holding a Python literal (for example ``"[0.1, 0.2]"``, as a
      list-valued cell looks after being read back from a text file), which are
      parsed with ``literal_eval`` and cast to ``float64`` before averaging.

    The entry type is checked explicitly instead of relying on a bare
    ``except:``, so unrelated failures (and ``KeyboardInterrupt``) are no longer
    swallowed and re-reported as a parsing problem.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to reduce.

    Returns
    -------
    list
        One mean per row, in row order; empty when ``df`` has no rows.

    Raises
    ------
    ValueError, SyntaxError
        If a string entry is not a valid Python literal.
    """
    scores = []
    for value in df[column]:
        if isinstance(value, str):
            # Text entries hold a serialised literal; turn them into a float array first.
            value = np.array(literal_eval(value)).astype(np.float64)
        scores.append(np.mean(value))
    return scores
# Metric columns (plain and `train_`-prefixed) whose per-row values are reduced
# to a single mean with `convert_scores` in later cells.
scores_columns = ['f1', 'kappa', 'matthews', 'precision', 'recall', 'roc_auc', 'train_f1', 'train_kappa',
       'train_matthews', 'train_precision', 'train_recall', 'train_roc_auc']

In [4]:
# Locations of the pre-computed result CSVs and of the prepared GDF data sets.
data_results_dir = '../gaussian_filter/data_res_gdf_feature_scaling'  # alternative: '../gaussian_filter/data_res_logistic/'
data_dir = '../gaussian_filter/data_gdf_feature_scaling/'  # alternative: '../gaussian_filter/data_gdf/'

# Half-open range of GDF feature indices used as model inputs (gdf_24, gdf_25).
gdf_start = 24
gdf_end = 26
gdf_columns = ['gdf_{}'.format(i) for i in range(gdf_start, gdf_end)]

# Grid of `r` and `s` values for which a result file exists per stock.
rr = [0.01, 0.05, 0.1, 0.5, 1.0]
ss = [0.01, 0.05, 0.1, 0.5, 1.0]

# Collect one frame per (stock, r, s) and concatenate once at the end:
# `DataFrame.append` inside a loop copies the accumulated frame on every
# iteration and no longer exists in pandas >= 2.0.
result_frames = []
for stock in stocks:
    for r in rr:
        for s in ss:
            filename = 'res_{}_len{}_r{}_s{}_K{}-{}.csv'.format(
                stock, data_length, r, s, gdf_start, gdf_end)
            df = pd.read_csv(os.path.join(data_results_dir, filename))
            # Drop incomplete rows, then the index column written by `to_csv`.
            df = df.dropna().drop(columns=['Unnamed: 0'])
            # Collapse each score cell to its mean.
            for col in scores_columns:
                df[col] = convert_scores(df, col)
            result_frames.append(df)

df_scores_all = pd.concat(result_frames, ignore_index=True)

# Best row per stock: sort by Matthews score, keep the first row of every group.
df_scores_all.sort_values(by='matthews', ascending=False).groupby('stock').head(1)

Unnamed: 0,C,K,f1,gamma,kappa,matthews,method,precision,r,recall,roc_auc,s,stock,train_f1,train_kappa,train_matthews,train_precision,train_recall,train_roc_auc
1822,10000,50,0.376846,100,0.018853,0.034561,svm_rbf,0.516538,1.0,0.448246,0.509489,0.1,9265,0.399704,0.02864,0.031279,0.464086,0.391779,0.514267
268,1000,50,0.528482,1000,0.019708,0.028951,svm_rbf,0.526985,0.1,0.649098,0.510002,0.01,9064,0.648165,0.017494,0.022453,0.515761,0.878371,0.508629
868,1000,50,0.417016,1000,0.021158,0.027098,svm_rbf,0.52325,0.05,0.402623,0.510714,1.0,9061,0.443466,0.076661,0.082439,0.548516,0.38663,0.537946


In [5]:
# Highest-`matthews` row for every stock: sort descending, then keep the first
# row of each `stock` group.
df_best = (
    df_scores_all
    .sort_values(by='matthews', ascending=False)
    .groupby('stock')
    .head(1)
)

# Per-stock frames returned by `lob.load_prepared_data`, keyed by stock id.
dfs, dfs_test = {}, {}
for stock in stocks:
    # The `stock` column is numeric while `stocks` holds strings, hence int().
    best_row = df_best[df_best['stock'] == int(stock)]
    r = best_row['r'].values[0]
    s = best_row['s'].values[0]
    print(r, s, stock)

    gdf_filename = 'gdf_{}_r{}_s{}_K50_feature_scaling'.format(stock, r, s)
    print(gdf_filename)

    df_first, df_second = lob.load_prepared_data(
        gdf_filename, data_dir=data_dir, cv=False, length=15000)
    dfs[stock] = df_first
    dfs_test[stock] = df_second

0.1 0.01 9064
gdf_9064_r0.1_s0.01_K50_feature_scaling
0.05 1.0 9061
gdf_9061_r0.05_s1.0_K50_feature_scaling
1.0 0.1 9265
gdf_9265_r1.0_s0.1_K50_feature_scaling


In [6]:
# Preview the first rows of one loaded frame.
# NOTE(review): `stock` is not assigned in this cell; it holds whatever value the
# most recently executed `for stock in stocks` loop left behind.
dfs[stock].head()

Unnamed: 0.1,Unnamed: 0,datetime,gdf_0,gdf_1,gdf_10,gdf_11,gdf_12,gdf_13,gdf_14,gdf_15,...,gdf_48,gdf_49,gdf_5,gdf_6,gdf_7,gdf_8,gdf_9,mid_price,mid_price_indicator,queue_imbalance
3000,3000,2013-09-16 09:45:00,7.65393e-136,3.3427140000000005e-125,5.530716e-49,1.09661e-42,7.998882e-37,2.146419e-31,2.1189039999999998e-26,7.695345e-22,...,1.096607e-42,5.53071e-49,5.520948e-87,1.6246359999999998e-78,1.7587499999999998e-70,7.004184e-63,1.026164e-55,3340.75,1.0,
3001,3001,2013-09-16 09:46:00,7.65393e-136,3.3427140000000005e-125,5.530716e-49,1.09661e-42,7.998882e-37,2.146419e-31,2.1189039999999998e-26,7.695345e-22,...,1.096607e-42,5.53071e-49,5.520948e-87,1.6246359999999998e-78,1.7587499999999998e-70,7.004184e-63,1.026164e-55,3341.0,1.0,
3002,3002,2013-09-16 09:48:00,7.65393e-136,3.3427140000000005e-125,5.530716e-49,1.09661e-42,7.998882e-37,2.146419e-31,2.1189039999999998e-26,7.695345e-22,...,1.096607e-42,5.53071e-49,5.520948e-87,1.6246359999999998e-78,1.7587499999999998e-70,7.004184e-63,1.026164e-55,3341.75,0.0,
3003,3003,2013-09-16 09:49:00,7.65393e-136,3.3427140000000005e-125,5.530716e-49,1.09661e-42,7.998882e-37,2.146419e-31,2.1189039999999998e-26,7.695345e-22,...,1.096607e-42,5.53071e-49,5.520948e-87,1.6246359999999998e-78,1.7587499999999998e-70,7.004184e-63,1.026164e-55,3341.25,0.0,
3004,3004,2013-09-16 09:50:00,7.65393e-136,3.3427140000000005e-125,5.530716e-49,1.09661e-42,7.998882e-37,2.146419e-31,2.1189039999999998e-26,7.695345e-22,...,1.096607e-42,5.53071e-49,5.520948e-87,1.6246359999999998e-78,1.7587499999999998e-70,7.004184e-63,1.026164e-55,3341.0,1.0,


In [7]:
def get_scores_dict_for_data(functions_to_run, dfss, log_clf, stock, feature_columns=None):
    """Score a fitted classifier on several data frames with several metrics.

    Predictions are computed once per frame and reused for every metric, rather
    than calling ``log_clf.predict`` again for each (metric, frame) pair.

    Parameters
    ----------
    functions_to_run : dict
        Maps a metric name to a callable invoked as ``func(y_true, y_pred)``.
    dfss : dict
        Maps a data-set name to a frame that contains the feature columns and a
        ``'mid_price_indicator'`` column, which is used as ``y_true``.
    log_clf : object
        Classifier exposing ``predict(X)``.
    stock : object
        Stored unchanged under the ``'stock'`` key of the result.
    feature_columns : list, optional
        Columns passed to ``predict``. Defaults to the notebook-level
        ``gdf_columns`` when omitted.

    Returns
    -------
    dict
        ``{'stock': stock, '<df_name>_<func_name>': value, ...}`` with keys
        inserted metric by metric, and frame by frame within each metric.
    """
    if feature_columns is None:
        feature_columns = gdf_columns

    # Prediction depends only on the frame, so do it once per frame up front.
    predictions = {
        df_name: log_clf.predict(df.loc[:, feature_columns])
        for df_name, df in dfss.items()
    }

    scores = {'stock': stock}
    for func_name, func in functions_to_run.items():
        for df_name, df in dfss.items():
            scores['{}_{}'.format(df_name, func_name)] = func(
                df['mid_price_indicator'], predictions[df_name])
    return scores
            
# Metric name -> scikit-learn scoring function, each called as func(y_true, y_pred).
functions_to_run = {
    'precision': metrics.precision_score,
    'roc_auc': metrics.roc_auc_score,
    'f1_score': metrics.f1_score,
    'recall': metrics.recall_score,
    'matthews': metrics.matthews_corrcoef,
    'kappa': metrics.cohen_kappa_score,
}

scores = []
for stock in stocks:
    # Hyper-parameters of the best result row for this stock.
    best_params = df_best[df_best['stock'] == int(stock)]
    clf = SVC(
        kernel='rbf',
        gamma=best_params['gamma'].values[0],
        C=best_params['C'].values[0],
    )

    stock_df = dfs[stock]
    train_scores = model.validate_model(
        clf,
        stock_df.loc[:, gdf_columns],
        stock_df['mid_price_indicator'].values.reshape(-1, 1),
    )

    # NOTE(review): `clf` is used for prediction right after `validate_model`;
    # this presumes that call leaves the estimator fitted - confirm.
    dfs_dict = {'d': stock_df, 'test': dfs_test[stock]}
    res = get_scores_dict_for_data(functions_to_run, dfs_dict, clf, stock)

    # Entries from `train_scores` win when both dicts share a key.
    res = {**res, **train_scores}
    scores.append(res)

df_scores = pd.DataFrame(scores, index=stocks)

In [8]:
# Collapse every score cell of `df_scores` to its mean.
for col in scores_columns:
    df_scores[col] = convert_scores(df_scores, col)

# Show only the columns whose name contains 'mat' (the Matthews scores).
matthews_like = [name for name in df_scores.columns if 'mat' in name]
df_scores[matthews_like]

Unnamed: 0,d_matthews,matthews,test_matthews,train_matthews
9064,0.02697,0.028951,-0.008817,0.022453
9061,0.064491,0.027098,0.000999,0.082439
9265,0.02866,0.034561,0.000458,0.031279


In [9]:
# Best row per stock again, for side-by-side comparison with the scores above:
# sort by `matthews` descending and keep the first row of each `stock` group.
(
    df_scores_all
    .sort_values(by='matthews', ascending=False)
    .groupby('stock')
    .head(1)
)

Unnamed: 0,C,K,f1,gamma,kappa,matthews,method,precision,r,recall,roc_auc,s,stock,train_f1,train_kappa,train_matthews,train_precision,train_recall,train_roc_auc
1822,10000,50,0.376846,100,0.018853,0.034561,svm_rbf,0.516538,1.0,0.448246,0.509489,0.1,9265,0.399704,0.02864,0.031279,0.464086,0.391779,0.514267
268,1000,50,0.528482,1000,0.019708,0.028951,svm_rbf,0.526985,0.1,0.649098,0.510002,0.01,9064,0.648165,0.017494,0.022453,0.515761,0.878371,0.508629
868,1000,50,0.417016,1000,0.021158,0.027098,svm_rbf,0.52325,0.05,0.402623,0.510714,1.0,9061,0.443466,0.076661,0.082439,0.548516,0.38663,0.537946


In [10]:
# Validate an RBF SVC with fixed gamma=1, C=1 on a single stock.
# `stock` is not assigned in this cell; it carries the value left behind by the
# last `for stock in stocks` loop that was executed.
selected_df = dfs[stock]
train_X = selected_df.loc[:, gdf_columns]
train_Y = selected_df['mid_price_indicator']

clf = SVC(C=1, gamma=1, kernel='rbf')
train_scores = model.validate_model(clf, train_X, train_Y)
train_scores



{'precision': [0.0,
  0.0,
  0.0,
  0.5,
  0.51010101010101,
  0.6530612244897959,
  0.0,
  0.5064220183486239,
  0.4981651376146789],
 'f1': [0.0,
  0.0,
  0.0,
  0.11674347158218125,
  0.6056971514242878,
  0.10110584518167455,
  0.0,
  0.6723507917174179,
  0.6650336803429271],
 'recall': [0.0,
  0.0,
  0.0,
  0.06608695652173913,
  0.7453874538745388,
  0.0547945205479452,
  0.0,
  1.0,
  1.0],
 'roc_auc': [0.5,
  0.5,
  0.5,
  0.49615027437737436,
  0.5186791283971234,
  0.5105988413016406,
  0.5,
  0.5,
  0.5],
 'kappa': [0.0,
  0.0,
  0.0,
  -0.007328950963532144,
  0.037264160448137074,
  0.019799664316036192,
  0.0,
  0.0,
  0.0],
 'matthews': [0.0,
  0.0,
  0.0,
  -0.015092859753538267,
  0.04190885569182589,
  0.051020729311544735,
  0.0,
  0.0,
  0.0],
 'train_precision': [0.0,
  0.0,
  0.0,
  0.8,
  0.5229309435951502,
  0.5197761194029851,
  0.512668918918919,
  0.5087155963302752,
  0.508460754332314],
 'train_f1': [0.0,
  0.0,
  0.0,
  0.0037157454714352067,
  0.4294372