In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import metrics

# from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model, roc_results
from lob_data_utils.svm_calculation import lob_svm
import os


sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

In [2]:
data_length = 10000
r = 1.0
s = 1.0
stocks = list(roc_results.result_cv_10000.keys())

In [3]:
def get_mean_scores(scores: dict) -> dict:
    mean_scores = {}
    for k, v in scores.items():
        mean_scores[k] = np.mean(v)
    return mean_scores

def get_score_for_clf(clf, df_test):
    x_test = df_test[['queue_imbalance']]
    y_test = df_test['mid_price_indicator'].values
    return model.test_model(clf, x_test, y_test)

def get_logistic_regression(stock, data_length):
    df, df_test = lob.load_prepared_data(
        stock, data_dir='../gaussian_filter/data', cv=False, length=data_length)
    clf = LogisticRegression()
    train_x = df[['queue_imbalance']]

    scores = model.validate_model(clf, train_x, df['mid_price_indicator'])
    res = {
        **get_mean_scores(scores),
        'stock': stock,
        'kernel': 'logistic',
    }
    test_scores = get_score_for_clf(clf, df_test)
    return {**res, **test_scores}

In [4]:
df_res = pd.DataFrame()
for stock in stocks:
    #pd.read_csv('svm_features_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s))
    filename = 'svm_pca_only_gdf_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s)
    if os.path.exists(filename):
        df_res = df_res.append(pd.read_csv(filename))
#df_res.drop(columns=['Unnamed: 0'], inplace=True)
columns = ['C', 'f1', 'features', 'gamma', 'kappa',
           'matthews', 'roc_auc', 'stock',
       'test_f1', 'test_kappa', 'test_matthews', 'test_roc_auc']
df_res[columns].sort_values(by='matthews', ascending=False).groupby('stock').head(1)

Unnamed: 0,C,f1,features,gamma,kappa,matthews,roc_auc,stock,test_f1,test_kappa,test_matthews,test_roc_auc
37,100.0,0.603789,pca_gdf_que4,0.1,0.189756,0.191654,0.594763,11946,0.626234,0.201771,0.201996,0.600693
28,10.0,0.593088,pca_gdf_que4,0.001,0.180103,0.181585,0.590329,3879,0.566337,0.123816,0.123915,0.561932
38,100.0,0.600282,pca_gdf_que1,1.0,0.169434,0.171004,0.584663,3035,0.575083,0.100511,0.100688,0.550177
17,0.1,0.601108,pca_gdf_que4,1.0,0.157448,0.164031,0.578763,1956,0.601738,0.129149,0.131675,0.564618
35,100.0,0.569319,pca_gdf_que_prev9,0.001,0.162968,0.16401,0.581638,7858,0.532731,0.164913,0.166582,0.581953
22,1.0,0.583532,pca_gdf_que_prev3,0.01,0.154999,0.163069,0.577577,4320,0.617216,0.157856,0.158424,0.578592
30,10.0,0.587315,pca_gdf_que_prev5,0.1,0.157832,0.159166,0.579142,9761,0.603878,0.139908,0.141347,0.56983
28,10.0,0.570876,pca_gdf_que2,0.001,0.153234,0.158249,0.576935,12417,0.594021,0.1585,0.159298,0.579349
35,100.0,0.610708,pca_gdf_que_prev3,0.001,0.155426,0.156886,0.57747,10484,0.564303,0.109927,0.110385,0.555076
42,1000.0,0.58432,pca_gdf_que6,0.001,0.151211,0.15449,0.575766,1472,0.634867,0.116089,0.119465,0.557107


In [5]:
log_res = []
for stock in stocks:
    log_res.append(get_logistic_regression(stock, data_length))
df_log_res = pd.DataFrame(log_res)
df_log_res['stock'] = df_log_res['stock'].values.astype(np.int)
df_log_res.index = df_log_res['stock'].values.astype(np.int)

In [6]:
df_gdf_best = df_res[columns].sort_values(by='test_matthews', ascending=False).groupby('stock').head(1)
df_gdf_best['stock'] = df_gdf_best['stock'].values.astype(np.int)
df_gdf_best.index = df_gdf_best['stock'].values.astype(np.int)

In [7]:
df_all = pd.merge(df_gdf_best, df_log_res, on='stock', suffixes=['_svm', '_log'])

In [15]:
all_columns = [ 'features', 'matthews_svm', 'matthews_log',  'test_matthews_svm',  'test_matthews_log',
       'roc_auc_svm', 'roc_auc_log', 'test_roc_auc_svm',  'test_roc_auc_log', 'stock', 
               'f1_svm', 'f1_log', 'test_f1_svm', 'test_f1_log', 'stock']
df_all[all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
0,pca_gdf_que4,0.189098,0.186824,0.204618,0.203627,0.593677,0.592373,0.60181,0.601087,11946,0.603092,0.595737,0.630941,0.634056,11946
1,pca_gdf_que_prev3,0.161749,0.156657,0.177985,0.163789,0.579902,0.577752,0.586768,0.581178,4320,0.620289,0.610483,0.642073,0.620721,4320
2,pca_gdf_que5,0.141114,0.142499,0.174959,0.162155,0.567432,0.570582,0.584089,0.580693,10508,0.622321,0.582458,0.627886,0.595089,10508
3,pca_gdf_que_prev9,0.134107,0.146384,0.173381,0.171235,0.567367,0.572852,0.583913,0.584512,7858,0.491723,0.551977,0.517343,0.539665,7858
4,pca_gdf_que3,0.108373,0.137272,0.167922,0.168692,0.553111,0.567489,0.583824,0.583011,3161,0.539567,0.579932,0.599518,0.621412,3161
5,pca_gdf_que1,0.113993,0.125634,0.166685,0.162023,0.556786,0.56234,0.583233,0.58089,2651,0.501255,0.553881,0.599229,0.59779,2651
6,pca_gdf_que_prev7,0.125957,0.130301,0.162579,0.146515,0.560368,0.564461,0.5808,0.573201,1113,0.49068,0.534083,0.561001,0.591563,1113
7,pca_gdf_que10,0.116676,0.131577,0.161298,0.15053,0.555966,0.564686,0.575496,0.570781,2602,0.593411,0.612748,0.647331,0.641621,2602
8,pca_gdf_que1,0.046888,0.138016,0.160484,0.138375,0.521126,0.568685,0.577134,0.568627,1431,0.656601,0.585431,0.627641,0.593882,1431
9,pca_gdf_que2,0.155395,0.150172,0.160438,0.154446,0.575317,0.573432,0.579881,0.576707,12417,0.573145,0.591476,0.595479,0.596958,12417


In [9]:
len(df_all[df_all['matthews_svm'] > df_all['matthews_log']][all_columns]), len(df_all)

(19, 53)

In [10]:
len(df_all[df_all['roc_auc_svm'] > df_all['roc_auc_log']][all_columns]), len(df_all)

(16, 53)

In [16]:
df_all[df_all['test_matthews_svm'] < df_all['test_matthews_log']][all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
4,pca_gdf_que3,0.108373,0.137272,0.167922,0.168692,0.553111,0.567489,0.583824,0.583011,3161,0.539567,0.579932,0.599518,0.621412,3161
12,pca_gdf_que_prev10,0.148575,0.145601,0.150493,0.153016,0.572621,0.572127,0.575244,0.576301,11869,0.591002,0.582713,0.579495,0.561531,11869
27,pca_gdf_que_prev2,0.123121,0.130253,0.127163,0.13486,0.559384,0.564948,0.563468,0.567422,9086,0.522086,0.554918,0.539195,0.567149,9086
49,pca_gdf_que2,0.092872,0.132624,0.085228,0.086034,0.544369,0.563571,0.536509,0.53853,13003,0.325912,0.479875,0.377632,0.409548,13003
50,pca_gdf_que2,0.105739,0.111922,0.080212,0.086004,0.551588,0.554976,0.540072,0.543004,9063,0.515458,0.525106,0.551926,0.545274,9063
52,pca_gdf_que2,0.095108,0.117283,0.072545,0.074043,0.546711,0.557431,0.536262,0.537,9058,0.475113,0.53941,0.530633,0.528993,9058


In [17]:
df_all[df_all['test_roc_auc_svm'] < df_all['test_roc_auc_log']][all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
3,pca_gdf_que_prev9,0.134107,0.146384,0.173381,0.171235,0.567367,0.572852,0.583913,0.584512,7858,0.491723,0.551977,0.517343,0.539665,7858
12,pca_gdf_que_prev10,0.148575,0.145601,0.150493,0.153016,0.572621,0.572127,0.575244,0.576301,11869,0.591002,0.582713,0.579495,0.561531,11869
16,pca_gdf_que_prev5,0.093566,0.12563,0.145583,0.143575,0.544434,0.561209,0.569429,0.570249,1907,0.534906,0.599786,0.646465,0.63052,1907
24,pca_gdf_que6,0.146598,0.14217,0.131533,0.130087,0.571515,0.570308,0.561857,0.562736,12255,0.560795,0.561546,0.626228,0.614298,12255
27,pca_gdf_que_prev2,0.123121,0.130253,0.127163,0.13486,0.559384,0.564948,0.563468,0.567422,9086,0.522086,0.554918,0.539195,0.567149,9086
39,pca_gdf_que10,0.104658,0.113968,0.106215,0.106202,0.551626,0.556876,0.553005,0.553061,9269,0.521679,0.546408,0.542199,0.567554,9269
41,pca_gdf_que_prev2,0.030198,0.132153,0.104572,0.094105,0.512215,0.565317,0.538501,0.547029,12456,0.666428,0.596622,0.664444,0.561743,12456
49,pca_gdf_que2,0.092872,0.132624,0.085228,0.086034,0.544369,0.563571,0.536509,0.53853,13003,0.325912,0.479875,0.377632,0.409548,13003
50,pca_gdf_que2,0.105739,0.111922,0.080212,0.086004,0.551588,0.554976,0.540072,0.543004,9063,0.515458,0.525106,0.551926,0.545274,9063
52,pca_gdf_que2,0.095108,0.117283,0.072545,0.074043,0.546711,0.557431,0.536262,0.537,9058,0.475113,0.53941,0.530633,0.528993,9058


In [13]:
df_all[df_all['matthews_svm'] < df_all['matthews_log']]['features'].value_counts()

pca_gdf_que2          5
pca_gdf_que1          4
pca_gdf_que_prev5     3
pca_gdf_que10         3
pca_gdf_que_prev2     3
pca_gdf_que3          3
pca_gdf_que_prev7     2
pca_gdf_que_prev9     2
pca_gdf_que_prev4     2
pca_gdf_que_prev3     2
pca_gdf_que_prev10    1
pca_gdf_que9          1
pca_gdf_que5          1
pca_gdf_que4          1
pca_gdf_que7          1
Name: features, dtype: int64

In [14]:
df_all[df_all['matthews_svm'] > df_all['matthews_log']]['features'].value_counts()

pca_gdf_que2          3
pca_gdf_que4          3
pca_gdf_que_prev10    2
pca_gdf_que1          2
pca_gdf_que6          2
pca_gdf_que_prev2     1
pca_gdf_que5          1
pca_gdf_que3          1
pca_gdf_que_prev8     1
pca_gdf_que8          1
pca_gdf_que_prev3     1
pca_gdf_que9          1
Name: features, dtype: int64