In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import metrics

# from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model, roc_results
from lob_data_utils.svm_calculation import lob_svm
import os


# Use seaborn's 'whitegrid' style for any plots drawn in this notebook.
sns.set_style('whitegrid')
# NOTE(review): with no category/module arguments this silences *every* warning for the
# rest of the session, including deprecation warnings raised by pandas and NumPy calls
# in the cells below. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Experiment configuration. `data_length` is passed to the data loader and, together
# with `r` and `s`, is interpolated into the result-file names read further down.
# NOTE(review): `r` and `s` look like parameters of the feature-generation step that
# produced those files -- confirm their meaning against that step.
data_length = 10000
r, s = 1.0, 0.1

# Stock identifiers are the keys of the precomputed `result_cv_10000` mapping.
stocks = [*roc_results.result_cv_10000.keys()]

In [3]:
def get_mean_scores(scores: dict) -> dict:
    """Collapse every entry of ``scores`` to the mean of its values.

    Args:
        scores: mapping from a score name to a collection of numeric values.

    Returns:
        A new dict with the same keys, each mapped to ``np.mean`` of its values.
    """
    return {name: np.mean(values) for name, values in scores.items()}

def get_score_for_clf(clf, df_test):
    """Score ``clf`` on the test frame using the queue imbalance as the only feature.

    Args:
        clf: classifier handed to ``model.test_model``.
        df_test: frame with the columns ``queue_imbalance`` and ``mid_price_indicator``.

    Returns:
        Whatever ``model.test_model`` returns for these inputs; the caller in this
        notebook unpacks it as a dict.
    """
    features = df_test[['queue_imbalance']]  # double brackets keep a 2-D DataFrame
    labels = df_test['mid_price_indicator'].values
    return model.test_model(clf, features, labels)

def get_logistic_regression(stock, data_length):
    """Build the logistic-regression baseline result row for one stock.

    Loads the prepared train/test frames for ``stock``, runs
    ``model.validate_model`` with a default ``LogisticRegression`` on the
    ``queue_imbalance`` feature, averages the returned scores and adds the
    test-set scores.

    Args:
        stock: stock identifier understood by ``lob.load_prepared_data``.
        data_length: forwarded as ``length`` to ``lob.load_prepared_data``.

    Returns:
        dict with the mean validation scores, the keys ``stock`` and ``kernel``
        (always ``'logistic'``), and the entries returned by ``get_score_for_clf``.
    """
    df_train, df_holdout = lob.load_prepared_data(
        stock, data_dir='../gaussian_filter/data', cv=False, length=data_length)
    classifier = LogisticRegression()

    validation_scores = model.validate_model(
        classifier, df_train[['queue_imbalance']], df_train['mid_price_indicator'])

    # get_mean_scores returns a fresh dict, so it can be extended in place.
    result = get_mean_scores(validation_scores)
    result['stock'] = stock
    result['kernel'] = 'logistic'

    # NOTE(review): the classifier is never fitted explicitly here; presumably
    # validate_model or test_model fits it -- confirm in lob_data_utils.model.
    result.update(get_score_for_clf(classifier, df_holdout))
    return result

In [4]:
# Load the per-stock SVM result files that exist in the working directory; stocks
# without a file are skipped. The frames are collected in a list and concatenated
# once: DataFrame.append was removed in pandas 2.0, and appending inside a loop
# copies the accumulated frame on every iteration.
svm_result_frames = []
for stock in stocks:
    filename = 'svm_pca_gdf_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s)
    if os.path.exists(filename):
        svm_result_frames.append(pd.read_csv(filename))

# pd.concat raises on an empty list, so fall back to the empty frame the original
# loop would have produced when no result file is present.
df_res = pd.concat(svm_result_frames) if svm_result_frames else pd.DataFrame()

columns = ['C', 'f1', 'features', 'gamma', 'kappa',
           'matthews', 'roc_auc', 'stock',
           'test_f1', 'test_kappa', 'test_matthews', 'test_roc_auc']

# Best row per stock according to the validation Matthews score.
df_res[columns].sort_values(by='matthews', ascending=False).groupby('stock').head(1)

Unnamed: 0,C,f1,features,gamma,kappa,matthews,roc_auc,stock,test_f1,test_kappa,test_matthews,test_roc_auc
16,0.1,0.601535,pca_gdf_que_prev5,0.1,0.18439,0.186328,0.592055,11946,0.639307,0.203649,0.204979,0.601357
16,0.1,0.594169,pca_gdf_que5,0.1,0.181196,0.182546,0.590864,3879,0.579099,0.122366,0.123059,0.561245
16,0.1,0.590991,pca_gdf_que_prev3,0.1,0.167183,0.168202,0.583564,3035,0.573466,0.100702,0.100834,0.550283
22,1.0,0.583461,pca_gdf_que_prev3,0.01,0.157029,0.165084,0.578596,4320,0.616865,0.157942,0.158483,0.578643
37,100.0,0.590645,gdf_20_30_que,0.1,0.155573,0.161973,0.577967,1956,0.601017,0.137075,0.139123,0.568578
25,1.0,0.580106,gdf_0-50_que,10.0,0.158407,0.161125,0.579487,9761,0.588349,0.143871,0.144074,0.571888
35,100.0,0.612569,gdf_0-50_que_prev,0.001,0.159432,0.160949,0.579515,10484,0.570874,0.117218,0.11788,0.558749
37,100.0,0.568224,pca_gdf_que_prev3,0.1,0.152565,0.158177,0.576587,12417,0.587604,0.155238,0.155676,0.577692
36,100.0,0.620409,pca_gdf_que10,0.01,0.150373,0.156206,0.575425,13061,0.630878,0.095307,0.099938,0.546884
42,1000.0,0.585798,gdf_0-50_que_prev,0.001,0.152273,0.155985,0.576308,1472,0.635294,0.112329,0.115936,0.555214


In [5]:
# Logistic-regression baseline: one result row per stock.
log_res = [get_logistic_regression(stock, data_length) for stock in stocks]
df_log_res = pd.DataFrame(log_res)

# Store the stock id as an integer and use it as the index as well.
# `np.int` was only an alias of the builtin `int`; it was deprecated in NumPy 1.20
# and removed in 1.24, so the builtin is used directly.
df_log_res['stock'] = df_log_res['stock'].values.astype(int)
df_log_res.index = df_log_res['stock'].values.astype(int)

In [6]:
# Keep, for every stock, the SVM configuration with the highest test Matthews score.
# NOTE(review): the row is selected on a *test* metric, so the test-set comparison
# against logistic regression further down is optimistically biased in favour of
# the SVM; selecting on the validation 'matthews' column would avoid that.
df_gdf_best = (
    df_res[columns]
    .sort_values(by='test_matthews', ascending=False)
    .groupby('stock')
    .head(1)
    .copy()  # own the data before assigning columns below
)

# `np.int` was removed in NumPy 1.24; it was an alias of the builtin `int`.
df_gdf_best['stock'] = df_gdf_best['stock'].values.astype(int)
df_gdf_best.index = df_gdf_best['stock'].values.astype(int)

In [7]:
# Put the best SVM row and the logistic-regression row of each stock side by side.
# Columns present in both frames get the '_svm' / '_log' suffix; with the default
# inner join only stocks found in both frames are kept.
df_all = df_gdf_best.merge(df_log_res, on='stock', suffixes=['_svm', '_log'])

In [13]:
# Columns shown in the comparison tables below. 'stock' is listed once: the previous
# list named it twice, which rendered a redundant duplicate column in every table.
all_columns = [
    'features',
    'matthews_svm', 'matthews_log', 'test_matthews_svm', 'test_matthews_log',
    'roc_auc_svm', 'roc_auc_log', 'test_roc_auc_svm', 'test_roc_auc_log',
    'stock',
    'f1_svm', 'f1_log', 'test_f1_svm', 'test_f1_log',
]
df_all[all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
0,pca_gdf_que_prev5,0.179081,0.186824,0.209901,0.203627,0.587891,0.592373,0.603152,0.601087,11946,0.597725,0.595737,0.647217,0.634056,11946
1,gdf_20_30_que_prev,0.128854,0.146384,0.191942,0.171235,0.565109,0.572852,0.59324,0.584512,7858,0.488197,0.551977,0.531778,0.539665,7858
2,pca_gdf_que_prev3,0.160138,0.156657,0.175396,0.163789,0.574493,0.577752,0.586201,0.581178,4320,0.573369,0.610483,0.634821,0.620721,4320
3,que,0.141919,0.137272,0.171835,0.168692,0.57081,0.567489,0.585412,0.583011,3161,0.582392,0.579932,0.610824,0.621412,3161
4,gdf_23-27_que,0.120206,0.142499,0.170629,0.162155,0.558879,0.570582,0.584491,0.580693,10508,0.589271,0.582458,0.60586,0.595089,10508
5,pca_gdf_que_prev7,0.132432,0.130301,0.170344,0.146515,0.563045,0.564461,0.584524,0.573201,1113,0.485222,0.534083,0.56168,0.591563,1113
6,pca_gdf_que1,0.068016,0.125634,0.167827,0.162023,0.530527,0.56234,0.582087,0.58089,2651,0.385594,0.553881,0.627187,0.59779,2651
7,gdf_0-50_que_prev,0.1152,0.129177,0.161254,0.150086,0.554974,0.562399,0.57718,0.573388,3022,0.611541,0.609023,0.653474,0.635929,3022
8,pca_gdf_que7,0.131436,0.131577,0.159355,0.15053,0.564687,0.564686,0.578104,0.570781,2602,0.589369,0.612748,0.62354,0.641621,2602
9,pca_gdf_que_prev3,0.156204,0.150172,0.157082,0.154446,0.575773,0.573432,0.578299,0.576707,12417,0.57068,0.591476,0.591481,0.596958,12417


In [14]:
# (stocks where the SVM beats logistic regression on validation Matthews, all stocks)
(
    len(df_all.loc[df_all['matthews_svm'] > df_all['matthews_log'], all_columns]),
    len(df_all),
)

(20, 53)

In [10]:
# (stocks where the SVM beats logistic regression on validation ROC AUC, all stocks)
(
    len(df_all.loc[df_all['roc_auc_svm'] > df_all['roc_auc_log'], all_columns]),
    len(df_all),
)

(19, 53)

In [16]:
# Stocks where the SVM scores below logistic regression on the test Matthews score.
df_all.loc[df_all['test_matthews_svm'] < df_all['test_matthews_log'], all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
15,gdf_20_30_que_prev,0.15011,0.145601,0.147751,0.153016,0.574493,0.572127,0.573867,0.576301,11869,0.579905,0.582713,0.571859,0.561531,11869
36,pca_gdf_que_prev5,0.13183,0.130253,0.117491,0.13486,0.565512,0.564948,0.558665,0.567422,9086,0.555166,0.554918,0.562963,0.567149,9086
43,que,0.112584,0.113968,0.103896,0.106202,0.556183,0.556876,0.551929,0.553061,9269,0.546062,0.546408,0.564477,0.567554,9269
51,gdf_24-26_que_prev,0.111322,0.111922,0.076066,0.086004,0.555039,0.554976,0.537993,0.543004,9063,0.524973,0.525106,0.529052,0.545274,9063


In [12]:
# Stocks where the SVM scores below logistic regression on the test ROC AUC.
df_all.loc[df_all['test_roc_auc_svm'] < df_all['test_roc_auc_log'], all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log
15,gdf_20_30_que_prev,0.15011,0.145601,0.147751,0.153016,0.574493,0.572127,0.573867,0.576301,11869,0.579905,0.582713,0.571859,0.561531
36,pca_gdf_que_prev5,0.13183,0.130253,0.117491,0.13486,0.565512,0.564948,0.558665,0.567422,9086,0.555166,0.554918,0.562963,0.567149
42,pca_gdf_que_prev9,0.029188,0.132153,0.104188,0.094105,0.511745,0.565317,0.53855,0.547029,12456,0.666193,0.596622,0.663947,0.561743
43,que,0.112584,0.113968,0.103896,0.106202,0.556183,0.556876,0.551929,0.553061,9269,0.546062,0.546408,0.564477,0.567554
44,pca_gdf_que1,0.106286,0.109434,0.103763,0.103686,0.546975,0.549733,0.550479,0.550556,11867,0.389519,0.412946,0.471014,0.473273
51,gdf_24-26_que_prev,0.111322,0.111922,0.076066,0.086004,0.555039,0.554976,0.537993,0.543004,9063,0.524973,0.525106,0.529052,0.545274
