In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import metrics

# from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model, roc_results
from lob_data_utils.svm_calculation import lob_svm
import os


# Apply seaborn's 'whitegrid' plot style.
sns.set_style('whitegrid')
# NOTE(review): this silences every warning for the rest of the notebook,
# including library deprecation warnings -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Parameters shared by the cells below.
data_length = 10000  # forwarded as `length` to lob.load_prepared_data and embedded in the result file names
r = 0.1  # embedded in the result file names (`_r{}`); meaning not visible here -- TODO confirm
s = 1.0  # embedded in the result file names (`_s{}`); meaning not visible here -- TODO confirm
# Stock identifiers: the keys of roc_results.result_cv_10000.
stocks = list(roc_results.result_cv_10000.keys())

In [3]:
def get_mean_scores(scores: dict) -> dict:
    """Collapse every entry of ``scores`` to its arithmetic mean.

    :param scores: mapping from a score name to a collection of values
    :return: new dict with the same keys (same order), each mapped to
        ``np.mean`` of the corresponding values
    """
    return {name: np.mean(values) for name, values in scores.items()}

def get_score_for_clf(clf, df_test):
    """Score ``clf`` on ``df_test`` through ``model.test_model``.

    The ``queue_imbalance`` column (kept as a one-column DataFrame) is passed
    as the features and the raw values of ``mid_price_indicator`` as the
    labels.

    :param clf: classifier handed to ``model.test_model``
    :param df_test: frame providing ``queue_imbalance`` and ``mid_price_indicator``
    :return: whatever ``model.test_model`` returns
    """
    return model.test_model(
        clf,
        df_test[['queue_imbalance']],
        df_test['mid_price_indicator'].values,
    )

def get_logistic_regression(stock, data_length):
    """Evaluate a queue-imbalance logistic regression for one stock.

    Loads the prepared train/test frames for ``stock``, runs
    ``model.validate_model`` on the training part, averages the returned
    scores and then scores the same classifier object on the test frame.

    :param stock: stock identifier passed to ``lob.load_prepared_data``
    :param data_length: forwarded as ``length`` to ``lob.load_prepared_data``
    :return: dict holding the mean validation scores, ``'stock'``,
        ``'kernel'`` (always ``'logistic'``) and, merged in last, the test
        scores
    """
    df_train, df_test = lob.load_prepared_data(
        stock, data_dir='../gaussian_filter/data', cv=False, length=data_length)
    log_clf = LogisticRegression()

    # NOTE(review): the test scoring below presumably relies on validate_model
    # leaving log_clf fitted -- confirm in lob_data_utils.model.
    cv_scores = model.validate_model(
        log_clf, df_train[['queue_imbalance']], df_train['mid_price_indicator'])

    summary = dict(get_mean_scores(cv_scores))
    summary['stock'] = stock
    summary['kernel'] = 'logistic'

    test_scores = get_score_for_clf(log_clf, df_test)
    return {**summary, **test_scores}

In [4]:
# Read every per-stock SVM result file that exists and stack them in one go.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on each iteration; collecting the frames and calling pd.concat once
# yields the same rows (original per-file indices are kept).
result_frames = []
for stock in stocks:
    filename = 'svm_pca_only_gdf_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s)
    if os.path.exists(filename):
        result_frames.append(pd.read_csv(filename))
# pd.concat raises on an empty list, so keep the previous "empty frame" result
# when no result file was found.
df_res = pd.concat(result_frames) if result_frames else pd.DataFrame()
columns = ['C', 'f1', 'features', 'gamma', 'kappa',
           'matthews', 'roc_auc', 'stock',
           'test_f1', 'test_kappa', 'test_matthews', 'test_roc_auc']
# Show, for each stock, the row with the highest `matthews` value.
df_res[columns].sort_values(by='matthews', ascending=False).groupby('stock').head(1)

Unnamed: 0,C,f1,features,gamma,kappa,matthews,roc_auc,stock,test_f1,test_kappa,test_matthews,test_roc_auc
42,1000.0,0.601738,pca_gdf_que_prev8,0.001,0.19102,0.193847,0.595327,11946,0.626921,0.195091,0.195535,0.59728
16,0.1,0.594436,pca_gdf_que4,0.1,0.182123,0.183503,0.59133,3879,0.575435,0.122216,0.122692,0.561159
42,1000.0,0.563819,pca_gdf_que_prev10,0.001,0.166019,0.167706,0.583163,7858,0.516934,0.176826,0.18178,0.58753
22,1.0,0.58773,pca_gdf_que_prev3,0.01,0.165907,0.166645,0.582957,3035,0.570746,0.099963,0.100044,0.549928
31,10.0,0.624959,pca_gdf_que_prev3,1.0,0.164088,0.165984,0.581748,4320,0.627731,0.15633,0.157988,0.577598
37,100.0,0.590188,pca_gdf_que5,0.1,0.157928,0.16345,0.579142,1956,0.588512,0.125956,0.127068,0.563007
22,1.0,0.571797,pca_gdf_que_prev3,0.01,0.153445,0.158677,0.577,12417,0.590909,0.154439,0.155125,0.57731
42,1000.0,0.58041,pca_gdf_que_prev9,0.001,0.156176,0.158632,0.578438,9761,0.61046,0.133328,0.135927,0.566502
36,100.0,0.626485,pca_gdf_que9,0.01,0.148728,0.157256,0.574547,13061,0.639511,0.094729,0.101326,0.546465
35,100.0,0.611943,pca_gdf_que3,0.001,0.155394,0.156893,0.577441,10484,0.567673,0.113104,0.113672,0.556679


In [5]:
# Evaluate the logistic-regression baseline for every stock and tabulate it.
log_res = [get_logistic_regression(stock, data_length) for stock in stocks]
df_log_res = pd.DataFrame(log_res)
# `np.int` was removed in NumPy 1.24; it was only an alias for the builtin
# `int`, so the resulting dtype is unchanged.
df_log_res['stock'] = df_log_res['stock'].values.astype(int)
# Index the frame by the integer stock id as well.
df_log_res.index = df_log_res['stock'].values.astype(int)

In [6]:
# Keep, for each stock, the SVM row with the highest `test_matthews`.
# NOTE(review): the row is chosen by its *test* score, so the SVM numbers
# compared against logistic regression further down are selected on the test
# set -- confirm this is intended (the earlier cell ranks by `matthews`).
df_gdf_best = (
    df_res[columns]
    .sort_values(by='test_matthews', ascending=False)
    .groupby('stock')
    .head(1)
    .copy()  # own the data before assigning columns on it
)
# `np.int` was removed in NumPy 1.24; the builtin `int` is what it aliased.
df_gdf_best['stock'] = df_gdf_best['stock'].values.astype(int)
df_gdf_best.index = df_gdf_best['stock'].values.astype(int)

In [7]:
# Join the best SVM row and the logistic-regression row of each stock; columns
# present in both frames get the `_svm` / `_log` suffix respectively.
df_all = df_gdf_best.merge(df_log_res, on='stock', suffixes=('_svm', '_log'))

In [8]:
# Columns shown in the comparison tables: SVM vs. logistic regression for each
# metric. 'stock' used to be listed twice, which rendered a duplicate
# `stock` column in every table built from this list.
all_columns = [
    'features',
    'matthews_svm', 'matthews_log', 'test_matthews_svm', 'test_matthews_log',
    'roc_auc_svm', 'roc_auc_log', 'test_roc_auc_svm', 'test_roc_auc_log',
    'stock',
    'f1_svm', 'f1_log', 'test_f1_svm', 'test_f1_log',
]
df_all[all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
0,pca_gdf_que_prev8,0.159626,0.186824,0.208474,0.203627,0.578849,0.592373,0.603612,0.601087,11946,0.526995,0.595737,0.634508,0.634056,11946
1,pca_gdf_que_prev10,0.167706,0.146384,0.18178,0.171235,0.583163,0.572852,0.58753,0.584512,7858,0.563819,0.551977,0.516934,0.539665,7858
2,pca_gdf_que_prev3,0.160092,0.156657,0.1764,0.163789,0.574586,0.577752,0.586677,0.581178,4320,0.57319,0.610483,0.635431,0.620721,4320
3,pca_gdf_que1,0.14238,0.137272,0.171827,0.168692,0.571041,0.567489,0.585445,0.583011,3161,0.583146,0.579932,0.61009,0.621412,3161
4,pca_gdf_que_prev7,0.134408,0.130301,0.170011,0.146515,0.562815,0.564461,0.583777,0.573201,1113,0.475626,0.534083,0.549462,0.591563,1113
5,pca_gdf_que1,0.129062,0.125634,0.168666,0.162023,0.564221,0.56234,0.584213,0.58089,2651,0.561284,0.553881,0.600577,0.59779,2651
6,pca_gdf_que5,0.075843,0.142499,0.16684,0.162155,0.535618,0.570582,0.58078,0.580693,10508,0.630194,0.582458,0.620845,0.595089,10508
7,pca_gdf_que10,0.12865,0.131577,0.165576,0.15053,0.563957,0.564686,0.579491,0.570781,2602,0.591001,0.612748,0.638925,0.641621,2602
8,pca_gdf_que1,0.122678,0.129177,0.163151,0.150086,0.55972,0.562399,0.580864,0.573388,3022,0.608659,0.609023,0.628982,0.635929,3022
9,pca_gdf_que1,0.048215,0.138016,0.162463,0.138375,0.521724,0.568685,0.578132,0.568627,1431,0.657545,0.585431,0.628194,0.593882,1431


In [9]:
# Number of stocks whose `matthews_svm` exceeds `matthews_log`, next to the
# total number of stocks.
svm_better_matthews = df_all['matthews_svm'] > df_all['matthews_log']
len(df_all.loc[svm_better_matthews, all_columns]), len(df_all)

(20, 53)

In [10]:
# Number of stocks whose `roc_auc_svm` exceeds `roc_auc_log`, next to the
# total number of stocks.
svm_better_roc_auc = df_all['roc_auc_svm'] > df_all['roc_auc_log']
len(df_all.loc[svm_better_roc_auc, all_columns]), len(df_all)

(15, 53)

In [11]:
# Stocks where the SVM's test Matthews score is below logistic regression's.
svm_worse_test_matthews = df_all['test_matthews_svm'] < df_all['test_matthews_log']
df_all.loc[svm_worse_test_matthews, all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
28,pca_gdf_que_prev2,0.124144,0.130253,0.125377,0.13486,0.559962,0.564948,0.562432,0.567422,9086,0.523951,0.554918,0.531972,0.567149,9086
49,pca_gdf_que6,0.075776,0.132624,0.085385,0.086034,0.534466,0.563571,0.537391,0.53853,13003,0.273337,0.479875,0.392788,0.409548,13003
50,pca_gdf_que2,0.109044,0.111922,0.076066,0.086004,0.552626,0.554976,0.537993,0.543004,9063,0.498788,0.525106,0.529052,0.545274,9063
51,pca_gdf_que2,0.09229,0.117283,0.070548,0.074043,0.545322,0.557431,0.535261,0.537,9058,0.475188,0.53941,0.528666,0.528993,9058


In [12]:
# Stocks where the SVM's test ROC AUC is below logistic regression's.
svm_worse_test_roc_auc = df_all['test_roc_auc_svm'] < df_all['test_roc_auc_log']
df_all.loc[svm_worse_test_roc_auc, all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log,stock.1
28,pca_gdf_que_prev2,0.124144,0.130253,0.125377,0.13486,0.559962,0.564948,0.562432,0.567422,9086,0.523951,0.554918,0.531972,0.567149,9086
42,pca_gdf_que1,0.105285,0.109434,0.103763,0.103686,0.54667,0.549733,0.550479,0.550556,11867,0.388778,0.412946,0.471014,0.473273,11867
49,pca_gdf_que6,0.075776,0.132624,0.085385,0.086034,0.534466,0.563571,0.537391,0.53853,13003,0.273337,0.479875,0.392788,0.409548,13003
50,pca_gdf_que2,0.109044,0.111922,0.076066,0.086004,0.552626,0.554976,0.537993,0.543004,9063,0.498788,0.525106,0.529052,0.545274,9063
51,pca_gdf_que2,0.09229,0.117283,0.070548,0.074043,0.545322,0.557431,0.535261,0.537,9058,0.475188,0.53941,0.528666,0.528993,9058


In [13]:
# Frequency of each feature set among stocks where `matthews_svm` is below
# `matthews_log`.
svm_lower_matthews = df_all['matthews_svm'] < df_all['matthews_log']
df_all.loc[svm_lower_matthews, 'features'].value_counts()

pca_gdf_que3         6
pca_gdf_que2         5
pca_gdf_que1         4
pca_gdf_que10        2
pca_gdf_que_prev3    2
pca_gdf_que_prev2    2
pca_gdf_que5         2
pca_gdf_que9         2
pca_gdf_que_prev9    2
pca_gdf_que4         2
pca_gdf_que6         2
pca_gdf_que_prev8    1
pca_gdf_que8         1
Name: features, dtype: int64

In [14]:
# Frequency of each feature set among stocks where `matthews_svm` is above
# `matthews_log`.
svm_higher_matthews = df_all['matthews_svm'] > df_all['matthews_log']
df_all.loc[svm_higher_matthews, 'features'].value_counts()

pca_gdf_que1          5
pca_gdf_que_prev3     3
pca_gdf_que10         2
pca_gdf_que_prev7     2
pca_gdf_que_prev2     2
pca_gdf_que9          2
pca_gdf_que_prev10    2
pca_gdf_que2          1
pca_gdf_que5          1
Name: features, dtype: int64