In [2]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
 
from sklearn import metrics

from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model
from lob_data_utils.svm_calculation import lob_svm
import os


# Use seaborn's 'whitegrid' style for every plot drawn in this notebook.
sns.set_style('whitegrid')
# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings - consider narrowing the filter.
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Matches the `len15000` part of the result file names and the `length` passed to the data loader below.
data_length = 15000
# Stock identifiers (kept as strings); used to build file names and as dictionary keys.
stocks = ['9064', '9061', '9265']

In [4]:
def convert_scores(df, column):
    """Return the mean score for every row of ``df[column]``.

    A cell may hold a number, a sequence of numbers, or the string
    representation of either (for example ``"[0.1, 0.2]"`` read back from a
    CSV file).  String cells are parsed with ``ast.literal_eval`` and cast to
    ``float64`` before averaging; all other cells go to ``np.mean`` as-is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to reduce.

    Returns
    -------
    list
        One ``np.mean`` result per row, in row order.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    scores = []
    for value in df[column]:
        # Test for strings explicitly instead of wrapping np.mean in a bare
        # `except:`, which would also swallow KeyboardInterrupt and real bugs.
        if isinstance(value, str):
            value = np.array(literal_eval(value)).astype(np.float64)
        scores.append(np.mean(value))
    return scores
# Score columns to reduce with convert_scores: each base metric name,
# followed by the same names with the 'train_' prefix.
scores_columns = [
    prefix + metric_name
    for prefix in ('', 'train_')
    for metric_name in ('f1', 'kappa', 'matthews', 'precision', 'recall', 'roc_auc')
]

In [83]:
# Directory holding the per-stock result CSVs, and directory holding the GDF feature files.
data_results_dir = '../gaussian_filter/data_res_gdf_feature_scaling' # '../gaussian_filter/data_res_logistic/'
data_dir = '../gaussian_filter/data_gdf_feature_scaling/'# '../gaussian_filter/data_gdf/'
# Range of GDF feature indices; used for the feature column names and the result file name.
gdf_start = 0
gdf_end = 50
gdf_columns = ['gdf_{}'.format(i) for i in range(gdf_start, gdf_end)]

# Collect one frame per stock and concatenate once at the end.
# Growing a DataFrame with `.append` in a loop is quadratic, and
# `DataFrame.append` was removed in pandas 2.0.
stock_frames = []
for s in stocks:
    # The length part of the file name comes from `data_length` rather than a
    # duplicated literal, so the two cannot drift apart.
    results_filename = 'res_log_{}_len{}_K{}-{}.csv'.format(s, data_length, gdf_start, gdf_end)
    df = pd.read_csv(os.path.join(data_results_dir, results_filename))
    # Drop incomplete rows and the index column written by `to_csv`.
    df = df.dropna().drop(columns=['Unnamed: 0'])
    # Reduce every score cell to a single mean value.
    for col in scores_columns:
        df[col] = convert_scores(df, col)
    stock_frames.append(df)

df_scores_all = pd.concat(stock_frames, ignore_index=True)
# Show, for every stock, the row with the highest 'matthews' value.
df_scores_all.sort_values(by='matthews', ascending=False).groupby('stock').head(1)

Unnamed: 0,K,f1,kappa,matthews,method,precision,r,recall,roc_auc,s,stock,train_f1,train_kappa,train_matthews,train_precision,train_recall,train_roc_auc
71,50,0.478407,0.014423,0.023956,logistic,0.462356,1.0,0.615298,0.507238,0.05,9265,0.382824,0.024347,0.026759,0.462397,0.363776,0.512131
15,50,0.67339,0.001584,0.005694,logistic,0.508763,0.5,0.996191,0.500785,0.01,9064,0.658159,0.012255,0.021582,0.513983,0.920251,0.506038
28,50,0.311043,-0.003277,0.004709,logistic,0.511443,0.01,0.294051,0.498551,0.5,9061,0.311658,0.015889,0.02381,0.455225,0.304225,0.507871


In [84]:
# For every stock keep the single row with the highest 'matthews' value.
df_best = df_scores_all.sort_values(by='matthews', ascending=False).groupby('stock').head(1)

# Train and test frames keyed by stock id, loaded from the GDF file that
# matches each stock's best (r, s) parameters.
dfs = {}
dfs_test = {}
for stock in stocks:
    # The 'stock' column is compared against integers, while `stocks` holds strings.
    best_row = df_best[df_best['stock'] == int(stock)]
    r = best_row['r'].values[0]
    s = best_row['s'].values[0]
    print(r,s, stock)
    # NOTE(review): 'K50' is hard-coded here; presumably it should follow the K used above - confirm.
    gdf_filename = 'gdf_{}_r{}_s{}_K50_feature_scaling'.format(stock, r, s)
    print(gdf_filename)
    # Use the notebook-level `data_length` instead of repeating the literal 15000.
    dfs[stock], dfs_test[stock] = lob.load_prepared_data(
        gdf_filename, data_dir=data_dir, cv=False, length=data_length)

0.5 0.01 9064
gdf_9064_r0.5_s0.01_K50_feature_scaling
0.01 0.5 9061
gdf_9061_r0.01_s0.5_K50_feature_scaling
1.0 0.05 9265
gdf_9265_r1.0_s0.05_K50_feature_scaling


In [88]:
# Preview the training frame of the last stock. The key is spelled out explicitly
# instead of relying on the loop variable `stock` leaking from another cell.
dfs[stocks[-1]].head()

Unnamed: 0.1,Unnamed: 0,datetime,gdf_0,gdf_1,gdf_10,gdf_11,gdf_12,gdf_13,gdf_14,gdf_15,...,gdf_48,gdf_49,gdf_5,gdf_6,gdf_7,gdf_8,gdf_9,mid_price,mid_price_indicator,queue_imbalance
3000,3000,2013-09-16 09:45:00,0.0,0.0,2.947292e-195,4.555155e-170,1.2894519999999999e-146,6.685429e-125,6.3485630000000005e-105,1.10419e-86,...,4.555155e-170,2.947292e-195,0.0,2.194442e-313,3.013809e-281,7.581053e-251,3.492733e-222,3340.75,1.0,
3001,3001,2013-09-16 09:46:00,0.0,0.0,2.947292e-195,4.555155e-170,1.2894519999999999e-146,6.685429e-125,6.3485630000000005e-105,1.10419e-86,...,4.555155e-170,2.947292e-195,0.0,2.194442e-313,3.013809e-281,7.581053e-251,3.492733e-222,3341.0,1.0,
3002,3002,2013-09-16 09:48:00,0.0,0.0,2.947292e-195,4.555155e-170,1.2894519999999999e-146,6.685429e-125,6.3485630000000005e-105,1.10419e-86,...,4.555155e-170,2.947292e-195,0.0,2.194442e-313,3.013809e-281,7.581053e-251,3.492733e-222,3341.75,0.0,
3003,3003,2013-09-16 09:49:00,0.0,0.0,2.947292e-195,4.555155e-170,1.2894519999999999e-146,6.685429e-125,6.3485630000000005e-105,1.10419e-86,...,4.555155e-170,2.947292e-195,0.0,2.194442e-313,3.013809e-281,7.581053e-251,3.492733e-222,3341.25,0.0,
3004,3004,2013-09-16 09:50:00,0.0,0.0,2.947292e-195,4.555155e-170,1.2894519999999999e-146,6.685429e-125,6.3485630000000005e-105,1.10419e-86,...,4.555155e-170,2.947292e-195,0.0,2.194442e-313,3.013809e-281,7.581053e-251,3.492733e-222,3341.0,1.0,


In [85]:
def get_scores_dict_for_data(functions_to_run, dfs, log_clf, stock, feature_columns=None):
    """Score a classifier on several data frames with several metrics.

    Parameters
    ----------
    functions_to_run : dict
        Maps a metric name to a callable invoked as ``func(y_true, y_pred)``.
    dfs : dict
        Maps a data-set name to a DataFrame that contains the feature columns
        and a ``'mid_price_indicator'`` target column.
    log_clf : object
        Classifier exposing ``predict(X)``.
    stock : str
        Stock identifier, stored under the ``'stock'`` key of the result.
    feature_columns : list of str, optional
        Columns passed to ``log_clf.predict``.  Defaults to the notebook-level
        ``gdf_columns``.

    Returns
    -------
    dict
        ``{'stock': stock, '<df_name>_<func_name>': score, ...}``.
    """
    if feature_columns is None:
        feature_columns = gdf_columns

    # Predictions depend only on the data frame, not on the metric, so compute
    # them once per frame instead of once per (metric, frame) pair.
    predictions = {
        df_name: log_clf.predict(df.loc[:, feature_columns])
        for df_name, df in dfs.items()
    }

    scores = {'stock': stock}
    # Metric-major iteration keeps the key insertion order of the result stable.
    for func_name, func in functions_to_run.items():
        for df_name, df in dfs.items():
            scores['{}_{}'.format(df_name, func_name)] = func(
                df['mid_price_indicator'], predictions[df_name])
    return scores
            
# Metric name -> sklearn metric callable; get_scores_dict_for_data calls each as func(y_true, y_pred).
functions_to_run = {'precision': metrics.precision_score, 'roc_auc': metrics.roc_auc_score,
                   'f1_score': metrics.f1_score, 'recall': metrics.recall_score, 
                   'matthews': metrics.matthews_corrcoef, 'kappa': metrics.cohen_kappa_score}
# One result dict per stock, in the order of `stocks`.
scores = []
for stock in stocks:
    clf = LogisticRegression()
    # NOTE(review): presumably validate_model fits `clf` as a side effect, since
    # `clf.predict` is used right after without an explicit fit here - confirm.
    # The target is passed as a column vector of shape (-1, 1).
    train_scores = model.validate_model(
        clf, dfs[stock].loc[:, gdf_columns], dfs[stock]['mid_price_indicator'].values.reshape(-1, 1))
    # 'd' is the frame the classifier was validated on; 'test' is the matching test frame.
    dfs_dict = {'d': dfs[stock], 'test': dfs_test[stock], }
    res = get_scores_dict_for_data(functions_to_run, dfs_dict, clf, stock)
    # Merge both dicts; on a key collision the value from train_scores wins.
    res = {**res, **train_scores}
    scores.append(res)
# One row per stock, indexed by the stock id.
df_scores = pd.DataFrame(scores, index=stocks)

In [86]:
# Reduce every score cell of df_scores to a single mean value.
for col in scores_columns:
    df_scores[col] = convert_scores(df_scores, col)

# Display only the columns whose name contains 'mat' (the Matthews scores).
df_scores.filter(like='mat')

Unnamed: 0,d_matthews,matthews,test_matthews,train_matthews
9064,0.012039,0.005694,0.0,0.021582
9061,0.011897,0.004709,0.00328,0.02381
9265,0.02,0.023956,0.003681,0.026759


In [87]:
# Best row per stock: sort by 'matthews' (descending) and keep the first row of each stock.
(
    df_scores_all
    .sort_values(by='matthews', ascending=False)
    .groupby('stock')
    .head(1)
)

Unnamed: 0,K,f1,kappa,matthews,method,precision,r,recall,roc_auc,s,stock,train_f1,train_kappa,train_matthews,train_precision,train_recall,train_roc_auc
71,50,0.478407,0.014423,0.023956,logistic,0.462356,1.0,0.615298,0.507238,0.05,9265,0.382824,0.024347,0.026759,0.462397,0.363776,0.512131
15,50,0.67339,0.001584,0.005694,logistic,0.508763,0.5,0.996191,0.500785,0.01,9064,0.658159,0.012255,0.021582,0.513983,0.920251,0.506038
28,50,0.311043,-0.003277,0.004709,logistic,0.511443,0.01,0.294051,0.498551,0.5,9061,0.311658,0.015889,0.02381,0.455225,0.304225,0.507871
