In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import metrics

# from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model, roc_results
from lob_data_utils.svm_calculation import lob_svm
import os


sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide settings read by the cells below.
# Passed as `length` to lob.load_prepared_data and embedded in the result file names.
data_length = 10000
# r and s are only used here to build the 'svm_pca_gdf_..._r{}_s{}.csv' file names.
# NOTE(review): presumably the GDF parameters the stored SVM results were computed with - confirm.
r = 0.1
s = 0.1
# Stock identifiers: the keys of roc_results.result_cv_10000.
stocks = list(roc_results.result_cv_10000.keys())

In [3]:
def get_mean_scores(scores: dict) -> dict:
    """Collapse every entry of ``scores`` to the mean of its values.

    :param scores: mapping from a score name to a collection of numbers
    :return: new dict with the same keys (same order), each mapped to
        ``np.mean`` of the corresponding values
    """
    return {name: np.mean(values) for name, values in scores.items()}

def get_score_for_clf(clf, df_test):
    """Score ``clf`` on ``df_test`` using only the ``queue_imbalance`` column.

    :param clf: classifier handed unchanged to ``model.test_model``
    :param df_test: frame with ``queue_imbalance`` and ``mid_price_indicator`` columns
    :return: whatever ``model.test_model`` returns for these inputs
    """
    features = df_test[['queue_imbalance']]  # double brackets keep a one-column DataFrame
    labels = df_test['mid_price_indicator'].values
    return model.test_model(clf, features, labels)

def get_logistic_regression(stock, data_length):
    """Validate and test a logistic regression on queue imbalance for one stock.

    The prepared frames are loaded from '../gaussian_filter/data' with
    ``cv=False``. The classifier goes through ``model.validate_model`` on the
    first frame and is then scored on the second one via ``get_score_for_clf``.

    :param stock: stock identifier passed to ``lob.load_prepared_data``
    :param data_length: forwarded as ``length`` to ``lob.load_prepared_data``
    :return: dict holding the mean of every validation score, ``'stock'``,
        ``'kernel'`` (always ``'logistic'``) and the test scores; on a key
        collision the test score wins
    """
    df_train, df_test = lob.load_prepared_data(
        stock, data_dir='../gaussian_filter/data', cv=False, length=data_length)
    log_clf = LogisticRegression()
    queue_imbalance = df_train[['queue_imbalance']]

    validation_scores = model.validate_model(
        log_clf, queue_imbalance, df_train['mid_price_indicator'])
    result = get_mean_scores(validation_scores)
    result['stock'] = stock
    result['kernel'] = 'logistic'
    # NOTE(review): the same classifier instance is scored here; whether it is
    # fitted at this point depends on model.validate_model - confirm.
    result.update(get_score_for_clf(log_clf, df_test))
    return result

In [4]:
# Gather the stored SVM result tables, one CSV per stock; stocks without a
# result file are skipped.
result_frames = []
for stock in stocks:
    # Alternative input: 'svm_features_{}_len{}_r{}_s{}.csv'
    filename = 'svm_pca_gdf_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s)
    if os.path.exists(filename):
        result_frames.append(pd.read_csv(filename))
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; concatenate once instead. Row labels are kept as read
# (append's default ignore_index=False behaved the same way).
df_res = pd.concat(result_frames) if result_frames else pd.DataFrame()
# Optional cleanup: df_res.drop(columns=['Unnamed: 0'])
columns = ['C', 'f1', 'features', 'gamma', 'kappa',
           'matthews', 'roc_auc', 'stock',
           'test_f1', 'test_kappa', 'test_matthews', 'test_roc_auc']
# Best row per stock, ranked by the non-test Matthews score.
df_res[columns].sort_values(by='matthews', ascending=False).groupby('stock').head(1)

Unnamed: 0,C,f1,features,gamma,kappa,matthews,roc_auc,stock,test_f1,test_kappa,test_matthews,test_roc_auc
18,0.1,0.58157,pca_gdf_que_prev6,10.0,0.182965,0.186542,0.591616,11946,0.631963,0.188646,0.189814,0.593902
16,0.1,0.595059,pca_gdf_que3,0.1,0.183312,0.184655,0.591923,3879,0.576923,0.120317,0.120923,0.560216
16,0.1,0.591444,pca_gdf_que_prev3,0.1,0.16838,0.169412,0.584164,3035,0.573466,0.100702,0.100834,0.550283
22,1.0,0.583637,pca_gdf_que_prev3,0.01,0.157676,0.165721,0.578921,4320,0.616583,0.156892,0.157443,0.578116
37,100.0,0.590636,gdf_20_30_que,0.1,0.154976,0.161223,0.577663,1956,0.600648,0.137069,0.139069,0.568575
35,100.0,0.611209,gdf_0-50_que_prev,0.001,0.157204,0.158681,0.57839,10484,0.571567,0.118236,0.118917,0.559261
23,1.0,0.618727,pca_gdf_que10,0.1,0.152157,0.157528,0.576282,13061,0.630471,0.105829,0.110006,0.552138
37,100.0,0.567604,pca_gdf_que_prev3,0.1,0.151612,0.157168,0.576109,12417,0.586914,0.154228,0.154651,0.577185
42,1000.0,0.585711,pca_gdf_que7,0.001,0.152738,0.156651,0.576624,1472,0.635364,0.119467,0.122784,0.55879
36,100.0,0.508729,pca_gdf_que_prev7,0.01,0.1502,0.156395,0.574876,7858,0.437838,0.149984,0.167144,0.573579


In [5]:
# Logistic regression baseline: one result dict per stock.
log_res = [get_logistic_regression(stock, data_length) for stock in stocks]
df_log_res = pd.DataFrame(log_res)
# np.int was deprecated in NumPy 1.20 and removed in 1.24; it was only an
# alias for the builtin int, so the resulting dtype is unchanged.
df_log_res['stock'] = df_log_res['stock'].values.astype(int)
# Index by the integer stock id as well.
df_log_res.index = df_log_res['stock'].values.astype(int)

In [6]:
# Best SVM row per stock, ranked by the Matthews score on the test set.
# NOTE(review): picking the configuration by its test score makes the SVM test
# numbers optimistic compared with the logistic baseline; the earlier cell
# ranks by 'matthews' instead - confirm which one is intended.
df_gdf_best = (
    df_res[columns]
    .sort_values(by='test_matthews', ascending=False)
    .groupby('stock')
    .head(1)
    .copy()  # explicit copy so the assignments below cannot raise SettingWithCopyWarning
)
# np.int was removed in NumPy 1.24; it was an alias for the builtin int.
df_gdf_best['stock'] = df_gdf_best['stock'].values.astype(int)
df_gdf_best.index = df_gdf_best['stock'].values.astype(int)

In [7]:
# Put the best SVM row and the logistic baseline side by side for each stock.
# Columns present in both frames get the '_svm' / '_log' suffixes; with the
# default inner join only stocks present in both frames are kept.
df_all = df_gdf_best.merge(df_log_res, on='stock', suffixes=['_svm', '_log'])

In [13]:
# Columns shown in the comparison tables below. 'stock' is listed once only:
# a second entry would select the column twice and render a duplicate
# 'stock' column in every table built from this list.
all_columns = ['features', 'matthews_svm', 'matthews_log', 'test_matthews_svm', 'test_matthews_log',
               'roc_auc_svm', 'roc_auc_log', 'test_roc_auc_svm', 'test_roc_auc_log', 'stock',
               'f1_svm', 'f1_log', 'test_f1_svm', 'test_f1_log']
df_all[all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
0,pca_gdf_que_prev6,0.181962,0.186824,0.208721,0.203627,0.590317,0.592373,0.603896,0.601087,11946,0.596373,0.595737,0.63212,0.634056,11946
1,pca_gdf_que_prev7,0.131059,0.146384,0.189878,0.171235,0.565843,0.572852,0.592237,0.584512,7858,0.489117,0.551977,0.530612,0.539665,7858
2,pca_gdf_que_prev3,0.160155,0.156657,0.175396,0.163789,0.57451,0.577752,0.586201,0.581178,4320,0.573269,0.610483,0.634821,0.620721,4320
3,pca_gdf_que_prev7,0.134182,0.130301,0.172099,0.146515,0.563109,0.564461,0.585243,0.573201,1113,0.480728,0.534083,0.559197,0.591563,1113
4,que,0.141919,0.137272,0.171835,0.168692,0.57081,0.567489,0.585412,0.583011,3161,0.582392,0.579932,0.610824,0.621412,3161
5,gdf_23-27_que,0.123376,0.142499,0.171688,0.162155,0.560414,0.570582,0.585003,0.580693,10508,0.591168,0.582458,0.606519,0.595089,10508
6,pca_gdf_que1,0.068016,0.125634,0.167827,0.162023,0.530527,0.56234,0.582087,0.58089,2651,0.385594,0.553881,0.627187,0.59779,2651
7,que,0.047276,0.138016,0.162363,0.138375,0.521266,0.568685,0.578129,0.568627,1431,0.656961,0.585431,0.627866,0.593882,1431
8,pca_gdf_que7,0.127954,0.131577,0.161787,0.15053,0.562341,0.564686,0.574458,0.570781,2602,0.599395,0.612748,0.652605,0.641621,2602
9,gdf_0-50_que_prev,0.131866,0.129177,0.160774,0.150086,0.563613,0.562399,0.578991,0.573388,3022,0.609968,0.609023,0.636803,0.635929,3022


In [9]:
# (number of stocks where the SVM 'matthews' score exceeds the logistic one, number of stocks)
len(df_all.loc[df_all['matthews_svm'].gt(df_all['matthews_log']), all_columns]), len(df_all)

(22, 53)

In [10]:
# (number of stocks where the SVM 'roc_auc' score exceeds the logistic one, number of stocks)
len(df_all.loc[df_all['roc_auc_svm'].gt(df_all['roc_auc_log']), all_columns]), len(df_all)

(18, 53)

In [14]:
# Stocks where the SVM scores below logistic regression on the test Matthews score.
df_all.loc[df_all['test_matthews_svm'].lt(df_all['test_matthews_log']), all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
15,gdf_20_30_que_prev,0.15079,0.145601,0.147729,0.153016,0.574836,0.572127,0.573858,0.576301,11869,0.580123,0.582713,0.572289,0.561531,11869
37,pca_gdf_que_prev5,0.126252,0.130253,0.114973,0.13486,0.561881,0.564948,0.557523,0.567422,9086,0.539466,0.554918,0.544799,0.567149,9086
50,gdf_24-26_que_prev,0.114318,0.111922,0.076066,0.086004,0.555757,0.554976,0.537993,0.543004,9063,0.515207,0.525106,0.529052,0.545274,9063


In [12]:
# Stocks where the SVM scores below logistic regression on the test ROC AUC.
df_all.loc[df_all['test_roc_auc_svm'].lt(df_all['test_roc_auc_log']), all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log
15,gdf_20_30_que_prev,0.15079,0.145601,0.147729,0.153016,0.574836,0.572127,0.573858,0.576301,11869,0.580123,0.582713,0.572289,0.561531
37,pca_gdf_que_prev5,0.126252,0.130253,0.114973,0.13486,0.561881,0.564948,0.557523,0.567422,9086,0.539466,0.554918,0.544799,0.567149
39,gdf_20_30_que_prev,0.032287,0.113968,0.108182,0.106202,0.514008,0.556876,0.551246,0.553061,9269,0.263009,0.546408,0.468235,0.567554
41,pca_gdf_que_prev10,0.029977,0.132153,0.105444,0.094105,0.512057,0.565317,0.539063,0.547029,12456,0.666342,0.596622,0.664193,0.561743
43,que,0.105879,0.109434,0.103763,0.103686,0.546818,0.549733,0.550479,0.550556,11867,0.38881,0.412946,0.471014,0.473273
49,pca_gdf_que2,0.08429,0.132624,0.087992,0.086034,0.539783,0.563571,0.537586,0.53853,13003,0.450285,0.479875,0.377309,0.409548
50,gdf_24-26_que_prev,0.114318,0.111922,0.076066,0.086004,0.555757,0.554976,0.537993,0.543004,9063,0.515207,0.525106,0.529052,0.545274
51,gdf_20_30_que_prev,0.015351,0.117283,0.07525,0.074043,0.505328,0.557431,0.532888,0.537,9058,0.223283,0.53941,0.623994,0.528993
