In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import metrics

# from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model, roc_results
from lob_data_utils.svm_calculation import lob_svm
import os


# Plot styling for every figure produced by this notebook.
sns.set_style(style='whitegrid')
# Silence all warnings for the whole session. Note that this blanket filter
# also hides deprecation warnings coming from pandas / numpy / sklearn.
warnings.filterwarnings(action='ignore')

In [2]:
# Experiment configuration shared by the cells below.
data_length = 10000  # passed as `length=` when loading data and used in the result file names
r = 0.1              # used in the 'r{}' part of the result file names
s = 0.1              # used in the 's{}' part of the result file names
# Stock identifiers are the keys of the stored cross-validation results.
stocks = [stock_id for stock_id in roc_results.result_cv_10000.keys()]

In [3]:
def get_mean_scores(scores: dict) -> dict:
    """Average every entry of a score dictionary.

    Parameters
    ----------
    scores : dict
        Maps a score name to a collection of values.

    Returns
    -------
    dict
        Same keys, in the same order, each mapped to ``np.mean`` of its values.
    """
    return {name: np.mean(values) for name, values in scores.items()}

def get_score_for_clf(clf, df_test):
    """Score a classifier on the test frame.

    The single feature is the ``queue_imbalance`` column (kept as a
    one-column DataFrame) and the target is the ``mid_price_indicator``
    column taken as a plain array.

    Returns whatever ``model.test_model`` returns; the caller in this file
    unpacks it with ``**``, so it is expected to be a mapping.
    """
    features = df_test[['queue_imbalance']]
    target = df_test['mid_price_indicator'].values
    return model.test_model(clf, features, target)

def get_logistic_regression(stock, data_length):
    """Build the logistic-regression baseline row for one stock.

    Loads the prepared train/test frames for ``stock``, validates a default
    ``LogisticRegression`` on the ``queue_imbalance`` feature, and scores
    the same classifier object on the test frame.

    Returns
    -------
    dict
        Mean validation scores, then ``'stock'`` and ``'kernel'``
        (always ``'logistic'``), then the test scores.
    """
    train_df, test_df = lob.load_prepared_data(
        stock, data_dir='../gaussian_filter/data', cv=False, length=data_length)
    classifier = LogisticRegression()

    validation_scores = model.validate_model(
        classifier, train_df[['queue_imbalance']], train_df['mid_price_indicator'])

    result = dict(get_mean_scores(validation_scores))
    result['stock'] = stock
    result['kernel'] = 'logistic'
    # NOTE(review): test scoring relies on model.validate_model leaving
    # `classifier` fitted -- confirm against lob_data_utils.model.
    result.update(get_score_for_clf(classifier, test_df))
    return result

In [4]:
# Gather the per-stock SVM result files that exist on disk. Frames are
# collected in a list and concatenated once: DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop is quadratic anyway.
result_frames = []
for stock in stocks:
    filename = 'svm_pca_only_gdf_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s)
    if os.path.exists(filename):
        result_frames.append(pd.read_csv(filename))
# Original row indices are kept (no ignore_index), as before.
# Fall back to an empty frame when no result file was found.
df_res = pd.concat(result_frames) if result_frames else pd.DataFrame()

# Columns kept for the analysis; any other column in the CSVs is left out.
columns = ['C', 'f1', 'features', 'gamma', 'kappa',
           'matthews', 'roc_auc', 'stock',
           'test_f1', 'test_kappa', 'test_matthews', 'test_roc_auc']
# Best configuration per stock, ranked by the validation Matthews score.
df_res[columns].sort_values(by='matthews', ascending=False).groupby('stock').head(1)

Unnamed: 0,C,f1,features,gamma,kappa,matthews,roc_auc,stock,test_f1,test_kappa,test_matthews,test_roc_auc
42,1000.0,0.585711,pca_gdf_que7,0.001,0.152738,0.156651,0.576624,1472,0.635364,0.119467,0.122784,0.55879
45,1000.0,0.520271,pca_gdf_que5,1.0,0.135072,0.139166,0.567845,9270,0.52762,0.085164,0.085445,0.542615
31,10.0,0.561681,pca_gdf_que_prev2,1.0,0.121475,0.125003,0.560711,9094,0.588732,0.121901,0.122494,0.560852
42,1000.0,0.515607,pca_gdf_que_prev3,0.001,0.113407,0.117054,0.557108,9063,0.522809,0.069043,0.069199,0.534544
24,1.0,0.563332,pca_gdf_que2,1.0,0.110648,0.113999,0.555375,9069,0.596476,0.080342,0.082867,0.540038


In [5]:
# Logistic-regression baseline: one result row per stock.
log_res = [get_logistic_regression(stock, data_length) for stock in stocks]
df_log_res = pd.DataFrame(log_res)
# Cast stock ids to integers so they can be merged with the SVM results.
# The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
df_log_res['stock'] = df_log_res['stock'].values.astype(int)
# Index by stock id; the index is left unnamed so that `on='stock'` in a
# later merge is not ambiguous between index level and column.
df_log_res.index = df_log_res['stock'].values

In [6]:
# Best SVM configuration per stock.
# NOTE(review): the winner is chosen by its *test* Matthews score, while the
# summary table built from df_res above ranks by the validation 'matthews'
# column. Selecting on test scores makes the later test-set comparison
# against logistic regression optimistic -- confirm this is intended.
df_gdf_best = (
    df_res[columns]
    .sort_values(by='test_matthews', ascending=False)
    .groupby('stock')
    .head(1)
    .copy()  # explicit copy so the column assignment below does not target a slice
)
# The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
df_gdf_best['stock'] = df_gdf_best['stock'].values.astype(int)
# Index by stock id; the index is left unnamed so that `on='stock'` in a
# later merge is not ambiguous between index level and column.
df_gdf_best.index = df_gdf_best['stock'].values

In [7]:
# Join SVM and logistic-regression results per stock; metric columns that
# appear in both frames get the '_svm' / '_log' suffixes.
df_all = df_gdf_best.merge(
    df_log_res,
    how='inner',
    on='stock',
    suffixes=['_svm', '_log'],
)

In [8]:
# Column order for the side-by-side comparison tables shown below.
all_columns = [
    'features',
    'matthews_svm', 'matthews_log',
    'test_matthews_svm', 'test_matthews_log',
    'roc_auc_svm', 'roc_auc_log',
    'test_roc_auc_svm', 'test_roc_auc_log',
    'stock',
    'f1_svm', 'f1_log',
    'test_f1_svm', 'test_f1_log',
]
df_all.loc[:, all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log
0,pca_gdf_que_prev2,0.123814,0.120071,0.136019,0.129138,0.560401,0.5592,0.56796,0.564415,9094,0.541219,0.556465,0.580457,0.583333
1,pca_gdf_que7,0.149064,0.1486,0.128257,0.118223,0.573251,0.572888,0.561548,0.557251,1472,0.58076,0.583823,0.636402,0.627148
2,pca_gdf_que2,0.087958,0.107065,0.121065,0.115421,0.542579,0.552507,0.559619,0.556709,9069,0.545622,0.556665,0.598353,0.598182
3,pca_gdf_que5,0.126075,0.129431,0.101088,0.070337,0.557459,0.563917,0.548588,0.534968,9270,0.437023,0.532698,0.477784,0.561171
4,pca_gdf_que_prev3,0.114035,0.111922,0.076066,0.086004,0.555615,0.554976,0.537993,0.543004,9063,0.515005,0.525106,0.529052,0.545274


In [9]:
# (stocks where the SVM has the higher validation Matthews score, total stocks)
(
    df_all.loc[df_all['matthews_svm'] > df_all['matthews_log'], all_columns].shape[0],
    df_all.shape[0],
)

(3, 5)

In [10]:
# (stocks where the SVM has the higher validation ROC AUC, total stocks)
(
    df_all.loc[df_all['roc_auc_svm'] > df_all['roc_auc_log'], all_columns].shape[0],
    df_all.shape[0],
)

(3, 5)

In [11]:
# Stocks where the SVM scores below logistic regression on the test Matthews score.
df_all.loc[df_all['test_matthews_svm'] < df_all['test_matthews_log'], all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log
4,pca_gdf_que_prev3,0.114035,0.111922,0.076066,0.086004,0.555615,0.554976,0.537993,0.543004,9063,0.515005,0.525106,0.529052,0.545274


In [12]:
# Stocks where the SVM scores below logistic regression on the test ROC AUC.
df_all.loc[df_all['test_roc_auc_svm'] < df_all['test_roc_auc_log'], all_columns]

Unnamed: 0,features,matthews_svm,matthews_log,test_matthews_svm,test_matthews_log,roc_auc_svm,roc_auc_log,test_roc_auc_svm,test_roc_auc_log,stock,f1_svm,f1_log,test_f1_svm,test_f1_log
4,pca_gdf_que_prev3,0.114035,0.111922,0.076066,0.086004,0.555615,0.554976,0.537993,0.543004,9063,0.515005,0.525106,0.529052,0.545274


In [13]:
# Feature sets of the stocks where the SVM loses on the validation Matthews score.
df_all.loc[df_all['matthews_svm'] < df_all['matthews_log'], 'features'].value_counts()

pca_gdf_que5    1
pca_gdf_que2    1
Name: features, dtype: int64

In [14]:
# Feature sets of the stocks where the SVM wins on the validation Matthews score.
df_all.loc[df_all['matthews_svm'] > df_all['matthews_log'], 'features'].value_counts()

pca_gdf_que_prev2    1
pca_gdf_que_prev3    1
pca_gdf_que7         1
Name: features, dtype: int64