In [25]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
 
from sklearn import metrics

from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model
from lob_data_utils.svm_calculation import lob_svm
import os


# Use seaborn's 'whitegrid' style for the figures drawn in this notebook.
sns.set_style('whitegrid')
# NOTE(review): this suppresses *every* warning for the rest of the session,
# including deprecation and convergence warnings raised by the libraries used
# below; consider restricting the filter to specific warning categories.
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [75]:
# Stock identifiers analysed below; kept as strings because they are
# interpolated into the result/data file names.
stocks = ['9064', '9061', '9265']
# Sample count: part of the result/data file names and also passed as the
# `length` argument when the prepared data is loaded.
data_length = 15000

In [27]:
def convert_scores(df, column):
    """Reduce every cell of ``df[column]`` to a single mean score.

    A cell may hold a number, a sequence of numbers, or a string containing
    the Python literal of such a sequence (presumably how list-valued score
    columns come back from a CSV file). Strings are parsed with
    ``ast.literal_eval`` and cast to ``float64`` before averaging.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the score column.
    column : str
        Label of the column to convert.

    Returns
    -------
    list
        One mean value per row of ``df``, in row order.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    scores = []
    for value in df[column]:
        # Decide by type instead of a bare ``except``: only text needs parsing,
        # and genuine errors are no longer swallowed and retried.
        if isinstance(value, str):
            value = np.array(literal_eval(value)).astype(np.float64)
        scores.append(np.mean(value))
    return scores
# Score columns that convert_scores() reduces to one mean value per row:
# first the plain metric names, then their 'train_' counterparts.
scores_columns = (
    ['f1', 'kappa', 'matthews', 'precision', 'recall', 'roc_auc']
    + ['train_f1', 'train_kappa', 'train_matthews',
       'train_precision', 'train_recall', 'train_roc_auc']
)

In [88]:
# Locations of the per-stock logistic-regression result CSVs and of the
# prepared GDF data sets.
data_results_dir = '../gaussian_filter/data_res_logistic'
data_dir = '../gaussian_filter/data_gdf/'
# Half-open index range of the GDF feature columns fed to the model
# (gdf_10 ... gdf_39).
gdf_start = 10
gdf_end = 40
gdf_columns = ['gdf_{}'.format(i) for i in range(gdf_start, gdf_end)]

# Read one result file per stock, drop incomplete rows and the unnamed first
# column (presumably the index written when the CSV was saved), and reduce the
# score columns to a single mean value each.
# The loop variable is named `stock` rather than `s` because `s` is also the
# name of a column in these results.
frames = []
for stock in stocks:
    results_path = os.path.join(
        data_results_dir,
        'res_log_{}_len{}_K{}-{}.csv'.format(stock, data_length, gdf_start, gdf_end))
    df = pd.read_csv(results_path).dropna().drop(columns=['Unnamed: 0'])
    for col in scores_columns:
        df[col] = convert_scores(df, col)
    frames.append(df)

# Concatenate once instead of DataFrame.append in the loop: append copied the
# accumulated frame on every iteration and was removed in pandas 2.0.
# pd.concat raises on an empty list, so an empty `stocks` still yields an
# empty frame as before.
df_scores_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Best row (highest Matthews score) for each stock.
df_scores_all.sort_values(by='matthews', ascending=False).groupby('stock').head(1)

Unnamed: 0,K,f1,kappa,matthews,method,precision,r,recall,roc_auc,s,stock,train_f1,train_kappa,train_matthews,train_precision,train_recall,train_roc_auc
61,50,0.424283,0.065802,0.069011,logistic,0.502541,0.1,0.418753,0.532867,0.05,9265,0.430767,0.071787,0.085059,0.611539,0.433418,0.535708
11,50,0.674517,0.010733,0.040177,logistic,0.511043,0.1,0.992647,0.50523,0.05,9064,0.673699,0.009983,0.027602,0.51313,0.981713,0.504947
31,50,0.238618,0.02741,0.039906,logistic,0.480211,0.05,0.25196,0.513834,0.05,9061,0.229274,0.025229,0.032124,0.373936,0.226723,0.51253


In [89]:
# For every stock keep the single result row with the highest Matthews score.
df_best = (
    df_scores_all
    .sort_values(by='matthews', ascending=False)
    .groupby('stock')
    .head(1)
)

# Load the prepared data sets that match each stock's best (r, s) parameters.
dfs = {}
dfs_test = {}
for stock in stocks:
    # Stocks are strings in `stocks` but compared as integers against the
    # 'stock' column of the results.
    best_for_stock = df_best[df_best['stock'] == int(stock)]
    r = best_for_stock['r'].values[0]
    s = best_for_stock['s'].values[0]
    data_name = 'gdf_{}_len{}_r{}_s{}_K50'.format(stock, data_length, r, s)
    dfs[stock], dfs_test[stock] = lob.load_prepared_data(
        data_name, data_dir=data_dir, cv=False, length=data_length)

In [90]:
def get_scores_dict_for_data(functions_to_run, dfs, log_clf, stock, feature_columns=None):
    """Score a classifier on several data sets with several metrics.

    Parameters
    ----------
    functions_to_run : dict
        Maps a metric name to a callable ``func(y_true, y_pred)``.
    dfs : dict
        Maps a data-set name to a DataFrame that contains the feature columns
        and a 'mid_price_indicator' column used as the true labels.
    log_clf : object
        Classifier exposing ``predict(X)``.
    stock : object
        Stock identifier, stored under the 'stock' key of the result.
    feature_columns : list, optional
        Columns passed to ``log_clf.predict``. Defaults to the notebook-level
        ``gdf_columns`` when omitted.

    Returns
    -------
    dict
        ``{'stock': stock, '<data-set name>_<metric name>': value, ...}``,
        with the metric keys inserted metric-by-metric, data set by data set.
    """
    columns = gdf_columns if feature_columns is None else feature_columns
    # Predict once per data set; the predictions do not depend on the metric,
    # so there is no need to recompute them inside the metric loop.
    predictions = {
        df_name: log_clf.predict(df.loc[:, columns]) for df_name, df in dfs.items()
    }
    scores = {'stock': stock}
    for func_name, func in functions_to_run.items():
        for df_name, df in dfs.items():
            scores['{}_{}'.format(df_name, func_name)] = func(
                df['mid_price_indicator'], predictions[df_name])
    return scores
            
# Metric name -> scikit-learn scoring function, each called as func(y_true, y_pred).
functions_to_run = {
    'precision': metrics.precision_score,
    'roc_auc': metrics.roc_auc_score,
    'f1_score': metrics.f1_score,
    'recall': metrics.recall_score,
    'matthews': metrics.matthews_corrcoef,
    'kappa': metrics.cohen_kappa_score,
}

# One logistic-regression model per stock: validate it on the first loaded
# frame, then score it on that frame ('d') and on the test frame ('test').
scores = []
for stock in stocks:
    X_train = dfs[stock].loc[:, gdf_columns]
    y_train = dfs[stock]['mid_price_indicator'].values.reshape(-1, 1)
    clf = LogisticRegression()
    train_scores = model.validate_model(clf, X_train, y_train)
    # NOTE(review): the predictions below assume model.validate_model leaves
    # `clf` fitted - confirm against lob_data_utils.model.
    dfs_dict = {'d': dfs[stock], 'test': dfs_test[stock]}
    res = {
        **get_scores_dict_for_data(functions_to_run, dfs_dict, clf, stock),
        **train_scores,
    }
    scores.append(res)
df_scores = pd.DataFrame(scores, index=stocks)

In [91]:
# Reduce every score column of df_scores to one mean value per stock
# (presumably they hold per-fold values coming from validate_model - confirm).
for col in scores_columns:
    df_scores[col] = convert_scores(df_scores, col)

# Show only the Matthews-related columns (every column whose name contains 'mat').
matthews_columns = [c for c in df_scores.columns if 'mat' in c]
df_scores[matthews_columns]

Unnamed: 0,d_matthews,matthews,test_matthews,train_matthews
9064,0.038161,0.040177,0.005955,0.027602
9061,0.069891,0.039906,0.049699,0.032124
9265,0.081119,0.069011,0.079383,0.085059


In [92]:
# Best stored result row (highest Matthews score) for each stock.
(
    df_scores_all
    .sort_values(by='matthews', ascending=False)
    .groupby('stock')
    .head(1)
)

Unnamed: 0,K,f1,kappa,matthews,method,precision,r,recall,roc_auc,s,stock,train_f1,train_kappa,train_matthews,train_precision,train_recall,train_roc_auc
61,50,0.424283,0.065802,0.069011,logistic,0.502541,0.1,0.418753,0.532867,0.05,9265,0.430767,0.071787,0.085059,0.611539,0.433418,0.535708
11,50,0.674517,0.010733,0.040177,logistic,0.511043,0.1,0.992647,0.50523,0.05,9064,0.673699,0.009983,0.027602,0.51313,0.981713,0.504947
31,50,0.238618,0.02741,0.039906,logistic,0.480211,0.05,0.25196,0.513834,0.05,9061,0.229274,0.025229,0.032124,0.373936,0.226723,0.51253
