In [9]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import metrics

# from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model, roc_results, stocks
from lob_data_utils.svm_calculation import lob_svm
import os


sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
data_length = 10000
rs_params = [(1.0, 1.0),(0.1, 0.1), (0.1, 1.0), (1.0, 0.1)]
stocks = stocks.all_stocks #list(roc_results.result_cv_10000.keys())
should_csv = True

In [11]:
def get_mean_scores(scores: dict) -> dict:
    mean_scores = {}
    for k, v in scores.items():
        mean_scores[k] = np.mean(v)
    return mean_scores

def get_score_for_clf(clf, df_test):
    x_test = df_test[['queue_imbalance']]
    y_test = df_test['mid_price_indicator'].values
    return model.test_model(clf, x_test, y_test)

def get_logistic_regression(stock, data_length):
    df, df_test = lob.load_prepared_data(
        stock, data_dir='../data/prepared', length=data_length)
    clf = LogisticRegression()
    train_x = df[['queue_imbalance']]

    scores = model.validate_model(clf, train_x, df['mid_price_indicator'])
    res = {
        **get_mean_scores(scores),
        'stock': stock,
        'kernel': 'logistic',
    }
    test_scores = get_score_for_clf(clf, df_test)
    return {**res, **test_scores}

In [13]:
log_res = []
for stock in stocks:
    try:
        res = get_logistic_regression(stock, data_length)
    except Exception as e:
        print(stock, e)
    # print(stock, res)
    
    log_res.append(res)
df_log_res = pd.DataFrame(log_res)
df_log_res['stock'] = df_log_res['stock'].values.astype(np.int)
df_log_res.index = df_log_res['stock'].values.astype(np.int)

if should_csv:
    df_log_res.to_csv('res_log_que.csv')

11390 File b'../data/prepared/11390.csv' does not exist
4695 File b'../data/prepared/4695.csv' does not exist
7843 File b'../data/prepared/7843.csv' does not exist


In [21]:
df_res = pd.DataFrame()
for stock in stocks:
    filename = 'res_svm_pca/svm_sigmoid_{}_len{}.csv_partial'.format(stock, data_length)
    if os.path.exists(filename):
        df_temp = pd.read_csv(filename)
        df_res = df_res.append(df_temp)
columns = ['C', 'f1', 'features', 'gamma', 'kappa',
           'matthews', 'roc_auc', 'stock',
       'test_f1', 'test_kappa', 'test_matthews', 'test_roc_auc', 'r', 's']
df_res[columns].sort_values(by='matthews', ascending=False).groupby('stock').head(1)

KeyError: "['C' 'f1' 'features' 'gamma' 'kappa' 'matthews' 'roc_auc' 'stock'\n 'test_f1' 'test_kappa' 'test_matthews' 'test_roc_auc' 'r' 's'] not in index"

In [None]:
df_gdf_best = df_res[columns].sort_values(by='matthews', ascending=False).groupby('stock').head(1)
df_gdf_best['stock'] = df_gdf_best['stock'].values.astype(np.int)
df_gdf_best.index = df_gdf_best['stock'].values.astype(np.int)

In [None]:
df_all = pd.merge(df_gdf_best, df_log_res, on='stock', suffixes=['_svm', '_log'])

In [None]:
all_columns = [ 'features', 'matthews_svm', 'matthews_log',  'test_matthews_svm',  'test_matthews_log',
       'roc_auc_svm', 'roc_auc_log', 'test_roc_auc_svm',  'test_roc_auc_log', 'stock', 
               'f1_svm', 'f1_log', 'test_f1_svm', 'test_f1_log', ]
df_all[all_columns]

In [None]:
df_all['matthews_diff'] = df_all['matthews_svm'] -  df_all['matthews_log']
df_all['matthews_test_diff'] = df_all['test_matthews_svm'] - df_all['test_matthews_log']

In [None]:
df_all['matthews_diff'].sum(), df_all['matthews_test_diff'].sum()

In [None]:
sns.distplot(df_all['matthews

In [None]:
sns.distplot(df_all['matthews_diff'], label='training')
sns.distplot(df_all['matthews_test_diff'], label='testing')
plt.legend()

In [None]:
sns.distplot(df_all['matthews_svm'], label='svm')
sns.distplot(df_all['matthews_log'], label='log')
plt.legend()

In [None]:
len(df_all[df_all['matthews_svm'] > df_all['matthews_log']][all_columns]),len(df_all[df_all['test_matthews_svm'] < df_all['test_matthews_log']]), len(df_all)

In [None]:
df_all[df_all['test_matthews_svm'] < df_all['test_matthews_log']][all_columns]

In [None]:
df_all[df_all['matthews_svm'] > df_all['matthews_log']][all_columns]['features'].value_counts()

In [None]:
len(df_all[df_all['roc_auc_svm'] > df_all['roc_auc_log']][all_columns]), len(df_all)

In [None]:
df_all[df_all['test_matthews_svm'] < df_all['test_matthews_log']][all_columns]

In [None]:
df_all[df_all['test_roc_auc_svm'] < df_all['test_roc_auc_log']][all_columns]