In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import metrics

# from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model, roc_results
from lob_data_utils.svm_calculation import lob_svm
import os


sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

In [2]:
data_length = 10000
r = 0.1
s = 0.1
stocks = list(roc_results.result_cv_10000.keys())

In [3]:
def get_mean_scores(scores: dict) -> dict:
    mean_scores = {}
    for k, v in scores.items():
        mean_scores[k] = np.mean(v)
    return mean_scores

def get_score_for_clf(clf, df_test):
    x_test = df_test[['queue_imbalance']]
    y_test = df_test['mid_price_indicator'].values
    return model.test_model(clf, x_test, y_test)

def get_logistic_regression(stock, data_length):
    df, df_test = lob.load_prepared_data(
        stock, data_dir='../gaussian_filter/data', cv=False, length=data_length)
    clf = LogisticRegression()
    train_x = df[['queue_imbalance']]

    scores = model.validate_model(clf, train_x, df['mid_price_indicator'])
    res = {
        **get_mean_scores(scores),
        'stock': stock,
        'kernel': 'logistic',
    }
    test_scores = get_score_for_clf(clf, df_test)
    return {**res, **test_scores}

In [4]:
df_res = pd.DataFrame()
for stock in stocks:
    #pd.read_csv('svm_features_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s))
    filename = 'svm_pca_only_gdf_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s)
    if os.path.exists(filename):
        df_res = df_res.append(pd.read_csv(filename))
#df_res.drop(columns=['Unnamed: 0'], inplace=True)
columns = ['C', 'f1', 'features', 'gamma', 'kappa',
           'matthews', 'roc_auc', 'stock',
       'test_f1', 'test_kappa', 'test_matthews', 'test_roc_auc']
df_res[columns].sort_values(by='matthews', ascending=False).groupby('stock').head(1)

Unnamed: 0,C,f1,features,gamma,kappa,matthews,roc_auc,stock,test_f1,test_kappa,test_matthews,test_roc_auc
18,0.1,0.58157,pca_gdf_que_prev6,10.0,0.182965,0.186542,0.591616,11946,0.631963,0.188646,0.189814,0.593902
16,0.1,0.595059,pca_gdf_que3,0.1,0.183312,0.184655,0.591923,3879,0.576923,0.120317,0.120923,0.560216
16,0.1,0.591444,pca_gdf_que_prev3,0.1,0.16838,0.169412,0.584164,3035,0.573466,0.100702,0.100834,0.550283
22,1.0,0.583637,pca_gdf_que_prev3,0.01,0.157676,0.165721,0.578921,4320,0.616583,0.156892,0.157443,0.578116
35,100.0,0.611687,pca_gdf_que_prev3,0.001,0.158354,0.159676,0.57899,10484,0.571152,0.11534,0.116068,0.557816
23,1.0,0.618558,pca_gdf_que10,0.1,0.151856,0.157183,0.576134,13061,0.630737,0.106897,0.111082,0.552668
37,100.0,0.567604,pca_gdf_que_prev3,0.1,0.151612,0.157168,0.576109,12417,0.586914,0.154228,0.154651,0.577185
17,0.1,0.597497,pca_gdf_que6,1.0,0.149205,0.15698,0.574688,1956,0.616347,0.141303,0.145818,0.570713
42,1000.0,0.585711,pca_gdf_que7,0.001,0.152738,0.156651,0.576624,1472,0.635364,0.119467,0.122784,0.55879
23,1.0,0.516834,pca_gdf_que_prev7,0.1,0.151659,0.156468,0.575622,7858,0.466019,0.159565,0.172041,0.578505


In [5]:
log_res = []
for stock in stocks:
    log_res.append(get_logistic_regression(stock, data_length))
df_log_res = pd.DataFrame(log_res)
df_log_res['stock'] = df_log_res['stock'].values.astype(np.int)
df_log_res.index = df_log_res['stock'].values.astype(np.int)

In [6]:
df_gdf_best = df_res[columns].sort_values(by='test_matthews', ascending=False).groupby('stock').head(1)
df_gdf_best['stock'] = df_gdf_best['stock'].values.astype(np.int)
df_gdf_best.index = df_gdf_best['stock'].values.astype(np.int)

In [7]:
df_all = pd.merge(df_gdf_best, df_log_res, on='stock', suffixes=['_svm', '_log'])

In [15]:
all_columns = [ 'features', 'matthews_svm', 'matthews_log',  'test_matthews_svm',  'test_matthews_log',
       'roc_auc_svm', 'roc_auc_log', 'test_roc_auc_svm',  'test_roc_auc_log', 'stock', 
               'f1_svm', 'f1_log', 'test_f1_svm', 'test_f1_log', 'stock']
df_all[all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
0,pca_gdf_que_prev6,0.181962,0.186824,0.208721,0.203627,0.590317,0.592373,0.603896,0.601087,11946,0.596373,0.595737,0.63212,0.634056,11946
1,pca_gdf_que_prev7,0.131391,0.146384,0.190906,0.171235,0.566007,0.572852,0.59276,0.584512,7858,0.489334,0.551977,0.531469,0.539665,7858
2,pca_gdf_que5,0.122443,0.142499,0.181417,0.162155,0.560214,0.570582,0.588135,0.580693,10508,0.590181,0.582458,0.625395,0.595089,10508
3,pca_gdf_que_prev3,0.160155,0.156657,0.175396,0.163789,0.57451,0.577752,0.586201,0.581178,4320,0.573269,0.610483,0.634821,0.620721,4320
4,pca_gdf_que1,0.141919,0.137272,0.17284,0.168692,0.57081,0.567489,0.585921,0.583011,3161,0.582392,0.579932,0.611111,0.621412,3161
5,pca_gdf_que_prev7,0.133594,0.130301,0.172216,0.146515,0.562816,0.564461,0.585276,0.573201,1113,0.480175,0.534083,0.55873,0.591563,1113
6,pca_gdf_que1,0.068016,0.125634,0.167827,0.162023,0.530527,0.56234,0.582087,0.58089,2651,0.385594,0.553881,0.627187,0.59779,2651
7,pca_gdf_que1,0.047616,0.138016,0.163453,0.138375,0.521425,0.568685,0.578631,0.568627,1431,0.65704,0.585431,0.628471,0.593882,1431
8,pca_gdf_que3,0.129238,0.129177,0.16245,0.150086,0.563101,0.562399,0.580794,0.573388,3022,0.60367,0.609023,0.624036,0.635929,3022
9,pca_gdf_que7,0.128263,0.131577,0.160744,0.15053,0.562496,0.564686,0.573943,0.570781,2602,0.599646,0.612748,0.652336,0.641621,2602


In [16]:
len(df_all[df_all['matthews_svm'] > df_all['matthews_log']][all_columns]), len(df_all)

(18, 53)

In [17]:
len(df_all[df_all['roc_auc_svm'] > df_all['roc_auc_log']][all_columns]), len(df_all)

(15, 53)

In [18]:
df_all[df_all['test_matthews_svm'] < df_all['test_matthews_log']][all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
15,pca_gdf_que_prev10,0.144513,0.145601,0.147667,0.153016,0.57178,0.572127,0.573833,0.576301,11869,0.57925,0.582713,0.573574,0.561531,11869
37,pca_gdf_que_prev5,0.126252,0.130253,0.114973,0.13486,0.561881,0.564948,0.557523,0.567422,9086,0.539466,0.554918,0.544799,0.567149,9086
41,pca_gdf_que1,0.112584,0.113968,0.103896,0.106202,0.556183,0.556876,0.551929,0.553061,9269,0.546062,0.546408,0.564477,0.567554,9269
50,pca_gdf_que_prev3,0.114035,0.111922,0.076066,0.086004,0.555615,0.554976,0.537993,0.543004,9063,0.515005,0.525106,0.529052,0.545274,9063


In [12]:
df_all[df_all['test_roc_auc_svm'] < df_all['test_roc_auc_log']][all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log
15,pca_gdf_que_prev10,0.144513,0.145601,0.147667,0.153016,0.57178,0.572127,0.573833,0.576301,11869,0.57925,0.582713,0.573574,0.561531
37,pca_gdf_que_prev5,0.126252,0.130253,0.114973,0.13486,0.561881,0.564948,0.557523,0.567422,9086,0.539466,0.554918,0.544799,0.567149
39,pca_gdf_que_prev10,0.029977,0.132153,0.105444,0.094105,0.512057,0.565317,0.539063,0.547029,12456,0.666342,0.596622,0.664193,0.561743
41,pca_gdf_que1,0.112584,0.113968,0.103896,0.106202,0.556183,0.556876,0.551929,0.553061,9269,0.546062,0.546408,0.564477,0.567554
42,pca_gdf_que1,0.105879,0.109434,0.103763,0.103686,0.546818,0.549733,0.550479,0.550556,11867,0.38881,0.412946,0.471014,0.473273
44,pca_gdf_que6,0.118763,0.136666,0.10075,0.096997,0.556252,0.567394,0.546941,0.547942,4060,0.624887,0.59291,0.608472,0.573308
49,pca_gdf_que2,0.083963,0.132624,0.087992,0.086034,0.539619,0.563571,0.537586,0.53853,13003,0.450012,0.479875,0.377309,0.409548
50,pca_gdf_que_prev3,0.114035,0.111922,0.076066,0.086004,0.555615,0.554976,0.537993,0.543004,9063,0.515005,0.525106,0.529052,0.545274
51,pca_gdf_que_prev10,0.015351,0.117283,0.074153,0.074043,0.505328,0.557431,0.532388,0.537,9058,0.223283,0.53941,0.623742,0.528993


In [13]:
df_all[df_all['matthews_svm'] < df_all['matthews_log']]['features'].value_counts()

pca_gdf_que2          5
pca_gdf_que1          5
pca_gdf_que5          4
pca_gdf_que_prev10    3
pca_gdf_que6          2
pca_gdf_que10         2
pca_gdf_que_prev5     2
pca_gdf_que4          2
pca_gdf_que_prev3     2
pca_gdf_que3          2
pca_gdf_que_prev2     1
pca_gdf_que_prev7     1
pca_gdf_que8          1
pca_gdf_que_prev8     1
pca_gdf_que_prev6     1
pca_gdf_que7          1
Name: features, dtype: int64

In [14]:
df_all[df_all['matthews_svm'] > df_all['matthews_log']]['features'].value_counts()

pca_gdf_que3         3
pca_gdf_que_prev3    3
pca_gdf_que1         3
pca_gdf_que_prev7    2
pca_gdf_que_prev4    2
pca_gdf_que5         1
pca_gdf_que_prev2    1
pca_gdf_que8         1
pca_gdf_que_prev5    1
pca_gdf_que7         1
Name: features, dtype: int64