In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import metrics

# from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model, roc_results
from lob_data_utils.svm_calculation import lob_svm
import os


sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

In [2]:
data_length = 10000
r = 1.0
s = 0.1
stocks = list(roc_results.result_cv_10000.keys())

In [3]:
def get_mean_scores(scores: dict) -> dict:
    mean_scores = {}
    for k, v in scores.items():
        mean_scores[k] = np.mean(v)
    return mean_scores

def get_score_for_clf(clf, df_test):
    x_test = df_test[['queue_imbalance']]
    y_test = df_test['mid_price_indicator'].values
    return model.test_model(clf, x_test, y_test)

def get_logistic_regression(stock, data_length):
    df, df_test = lob.load_prepared_data(
        stock, data_dir='../gaussian_filter/data', cv=False, length=data_length)
    clf = LogisticRegression()
    train_x = df[['queue_imbalance']]

    scores = model.validate_model(clf, train_x, df['mid_price_indicator'])
    res = {
        **get_mean_scores(scores),
        'stock': stock,
        'kernel': 'logistic',
    }
    test_scores = get_score_for_clf(clf, df_test)
    return {**res, **test_scores}

In [4]:
df_res = pd.DataFrame()
for stock in stocks:
    #pd.read_csv('svm_features_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s))
    filename = 'svm_pca_only_gdf_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s)
    if os.path.exists(filename):
        df_res = df_res.append(pd.read_csv(filename))
#df_res.drop(columns=['Unnamed: 0'], inplace=True)
columns = ['C', 'f1', 'features', 'gamma', 'kappa',
           'matthews', 'roc_auc', 'stock',
       'test_f1', 'test_kappa', 'test_matthews', 'test_roc_auc']
df_res[columns].sort_values(by='matthews', ascending=False).groupby('stock').head(1)

Unnamed: 0,C,f1,features,gamma,kappa,matthews,roc_auc,stock,test_f1,test_kappa,test_matthews,test_roc_auc
16,0.1,0.601535,pca_gdf_que_prev5,0.1,0.18439,0.186328,0.592055,11946,0.639307,0.203649,0.204979,0.601357
16,0.1,0.594169,pca_gdf_que5,0.1,0.181196,0.182546,0.590864,3879,0.579099,0.122366,0.123059,0.561245
16,0.1,0.590991,pca_gdf_que_prev3,0.1,0.167183,0.168202,0.583564,3035,0.573466,0.100702,0.100834,0.550283
22,1.0,0.583461,pca_gdf_que_prev3,0.01,0.157029,0.165084,0.578596,4320,0.616865,0.157942,0.158483,0.578643
37,100.0,0.591563,pca_gdf_que4,0.1,0.1546,0.160703,0.577466,1956,0.59852,0.132072,0.134023,0.566075
35,100.0,0.612237,pca_gdf_que_prev3,0.001,0.158109,0.159497,0.578839,10484,0.571845,0.119215,0.119889,0.55975
44,1000.0,0.585406,pca_gdf_que_prev6,0.1,0.156813,0.158284,0.578586,9761,0.592979,0.140533,0.141033,0.570193
23,1.0,0.518663,pca_gdf_que_prev8,0.1,0.153319,0.157957,0.57647,7858,0.46671,0.159639,0.171967,0.578549
37,100.0,0.568037,pca_gdf_que_prev3,0.1,0.152282,0.157892,0.576448,12417,0.587604,0.155238,0.155676,0.577692
36,100.0,0.620314,pca_gdf_que10,0.01,0.150366,0.156159,0.575421,13061,0.630616,0.094236,0.098849,0.546355


In [5]:
log_res = []
for stock in stocks:
    log_res.append(get_logistic_regression(stock, data_length))
df_log_res = pd.DataFrame(log_res)
df_log_res['stock'] = df_log_res['stock'].values.astype(np.int)
df_log_res.index = df_log_res['stock'].values.astype(np.int)

In [6]:
df_gdf_best = df_res[columns].sort_values(by='test_matthews', ascending=False).groupby('stock').head(1)
df_gdf_best['stock'] = df_gdf_best['stock'].values.astype(np.int)
df_gdf_best.index = df_gdf_best['stock'].values.astype(np.int)

In [7]:
df_all = pd.merge(df_gdf_best, df_log_res, on='stock', suffixes=['_svm', '_log'])

In [8]:
all_columns = [ 'features', 'matthews_svm', 'matthews_log',  'test_matthews_svm',  'test_matthews_log',
       'roc_auc_svm', 'roc_auc_log', 'test_roc_auc_svm',  'test_roc_auc_log', 'stock', 
               'f1_svm', 'f1_log', 'test_f1_svm', 'test_f1_log', 'stock']
df_all[all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
0,pca_gdf_que_prev5,0.178755,0.186824,0.209901,0.203627,0.587738,0.592373,0.603152,0.601087,11946,0.597635,0.595737,0.647217,0.634056,11946
1,pca_gdf_que_prev8,0.14311,0.146384,0.188654,0.171235,0.570644,0.572852,0.589621,0.584512,7858,0.541327,0.551977,0.508227,0.539665,7858
2,pca_gdf_que5,0.125702,0.142499,0.186497,0.162155,0.562138,0.570582,0.590626,0.580693,10508,0.588936,0.582458,0.627486,0.595089,10508
3,pca_gdf_que_prev3,0.160138,0.156657,0.175396,0.163789,0.574493,0.577752,0.586201,0.581178,4320,0.573369,0.610483,0.634821,0.620721,4320
4,pca_gdf_que1,0.141919,0.137272,0.17284,0.168692,0.57081,0.567489,0.585921,0.583011,3161,0.582392,0.579932,0.611111,0.621412,3161
5,pca_gdf_que_prev7,0.132748,0.130301,0.170344,0.146515,0.563191,0.564461,0.584524,0.573201,1113,0.485531,0.534083,0.56168,0.591563,1113
6,pca_gdf_que1,0.068016,0.125634,0.167827,0.162023,0.530527,0.56234,0.582087,0.58089,2651,0.385594,0.553881,0.627187,0.59779,2651
7,pca_gdf_que10,0.121055,0.131577,0.160446,0.15053,0.558791,0.564686,0.573209,0.570781,2602,0.601326,0.612748,0.654351,0.641621,2602
8,pca_gdf_que_prev2,0.117601,0.129177,0.159735,0.150086,0.556259,0.562399,0.576013,0.573388,3022,0.608947,0.609023,0.655518,0.635929,3022
9,pca_gdf_que_prev6,0.037995,0.1447,0.158639,0.132214,0.515451,0.571695,0.574695,0.564652,9761,0.66436,0.593399,0.639693,0.609361,9761


In [9]:
len(df_all[df_all['matthews_svm'] > df_all['matthews_log']][all_columns]), len(df_all)

(18, 53)

In [10]:
len(df_all[df_all['roc_auc_svm'] > df_all['roc_auc_log']][all_columns]), len(df_all)

(12, 53)

In [11]:
df_all[df_all['test_matthews_svm'] < df_all['test_matthews_log']][all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
36,pca_gdf_que_prev5,0.13183,0.130253,0.117491,0.13486,0.565512,0.564948,0.558665,0.567422,9086,0.555166,0.554918,0.562963,0.567149,9086
42,pca_gdf_que1,0.112584,0.113968,0.103896,0.106202,0.556183,0.556876,0.551929,0.553061,9269,0.546062,0.546408,0.564477,0.567554,9269
50,pca_gdf_que_prev3,0.110439,0.111922,0.078955,0.086004,0.554532,0.554976,0.539452,0.543004,9063,0.525435,0.525106,0.532725,0.545274,9063


In [12]:
df_all[df_all['test_roc_auc_svm'] < df_all['test_roc_auc_log']][all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
36,pca_gdf_que_prev5,0.13183,0.130253,0.117491,0.13486,0.565512,0.564948,0.558665,0.567422,9086,0.555166,0.554918,0.562963,0.567149,9086
41,pca_gdf_que_prev9,0.029188,0.132153,0.104188,0.094105,0.511745,0.565317,0.53855,0.547029,12456,0.666193,0.596622,0.663947,0.561743,12456
42,pca_gdf_que1,0.112584,0.113968,0.103896,0.106202,0.556183,0.556876,0.551929,0.553061,9269,0.546062,0.546408,0.564477,0.567554,9269
43,pca_gdf_que1,0.106286,0.109434,0.103763,0.103686,0.546975,0.549733,0.550479,0.550556,11867,0.389519,0.412946,0.471014,0.473273,11867
50,pca_gdf_que_prev3,0.110439,0.111922,0.078955,0.086004,0.554532,0.554976,0.539452,0.543004,9063,0.525435,0.525106,0.532725,0.545274,9063


In [13]:
df_all[df_all['matthews_svm'] < df_all['matthews_log']]['features'].value_counts()

pca_gdf_que4         5
pca_gdf_que2         4
pca_gdf_que10        4
pca_gdf_que_prev3    3
pca_gdf_que1         3
pca_gdf_que5         3
pca_gdf_que_prev2    2
pca_gdf_que_prev5    2
pca_gdf_que_prev9    2
pca_gdf_que3         2
pca_gdf_que_prev8    1
pca_gdf_que7         1
pca_gdf_que8         1
pca_gdf_que6         1
pca_gdf_que_prev6    1
Name: features, dtype: int64

In [14]:
df_all[df_all['matthews_svm'] > df_all['matthews_log']]['features'].value_counts()

pca_gdf_que_prev3    4
pca_gdf_que1         4
pca_gdf_que5         2
pca_gdf_que_prev8    1
pca_gdf_que_prev7    1
pca_gdf_que2         1
pca_gdf_que4         1
pca_gdf_que9         1
pca_gdf_que_prev5    1
pca_gdf_que3         1
pca_gdf_que_prev2    1
Name: features, dtype: int64