In [2]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import metrics

# from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict
from sklearn.utils import class_weight

from lob_data_utils import lob, db_result, model, roc_results, stocks
from lob_data_utils.svm_calculation import lob_svm
import os


sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

In [4]:
data_length = 24000
stocks = stocks.all_stocks
should_csv = True

In [11]:
def get_mean_scores(scores: dict) -> dict:
    mean_scores = {}
    for k, v in scores.items():
        mean_scores[k] = np.mean(v)
    return mean_scores

def get_score_for_clf(clf, df_test):
    x_test = df_test[['queue_imbalance']]
    y_test = df_test['mid_price_indicator'].values
    return model.test_model(clf, x_test, y_test)

def get_score_for_clf_prev(clf, df_test):
    x_test = df_test[['queue_imbalance', 'prev_queue_imbalance']]
    y_test = df_test['mid_price_indicator'].values
    return model.test_model(clf, x_test, y_test)

def get_logistic_regression(stock, data_length):
    df, df_test = lob.load_prepared_data(
        stock, data_dir='../data/prepared', length=data_length)

    train_x = df[['queue_imbalance']]
    y_train = df['mid_price_indicator']
    a = np.unique(y_train)
    class_weight_list = class_weight.compute_class_weight('balanced', a, y_train)
    class_weights = {}
    class_weights[a[0]] = class_weight_list[0]
    class_weights[a[1]] = class_weight_list[1]
    clf = LogisticRegression(class_weight=class_weights)
    scores = model.validate_model(clf, train_x, y_train, print_debug=False)
    res = {
        **get_mean_scores(scores),
        'stock': stock,
        'kernel': 'logistic',
        'class_weights': class_weights
    }
    test_scores = get_score_for_clf(clf, df_test)
    return {**res, **test_scores}

def get_logistic_regression_prev(stock, data_length):
    df, df_test = lob.load_prepared_data(
        stock, data_dir='../data/prepared', length=data_length)
    df['prev_queue_imbalance'] = df['queue_imbalance'].shift()
    df.dropna(inplace=True)
    df_test['prev_queue_imbalance'] = df_test['queue_imbalance'].shift()
    df_test.dropna(inplace=True)
    train_x = df[['queue_imbalance', 'prev_queue_imbalance']]
    y_train = df['mid_price_indicator']
    a = np.unique(y_train)
    class_weight_list = class_weight.compute_class_weight('balanced', a, y_train)
    class_weights = {}
    class_weights[a[0]] = class_weight_list[0]
    class_weights[a[1]] = class_weight_list[1]
    clf = LogisticRegression(class_weight=class_weights)
    scores = model.validate_model(clf, train_x, y_train, print_debug=False)
    res = {
        **get_mean_scores(scores),
        'stock': stock,
        'kernel': 'logistic',
        'class_weights': class_weights
    }
    test_scores = get_score_for_clf_prev(clf, df_test)
    return {**res, **test_scores}

In [6]:
log_res = []
for stock in stocks:
    try:
        res = get_logistic_regression(stock, data_length)

    except Exception as e:
        print(stock, e)
    log_res.append(res)
df_log_res = pd.DataFrame(log_res)
df_log_res['stock'] = df_log_res['stock'].values.astype(np.int)
df_log_res.index = df_log_res['stock'].values.astype(np.int)

if should_csv:
    df_log_res.to_csv('res_log_que.csv')

11390 File b'../data/prepared/11390.csv' does not exist
4695 File b'../data/prepared/4695.csv' does not exist
7843 File b'../data/prepared/7843.csv' does not exist


In [7]:
df_log_res

Unnamed: 0,class_weights,f1,kappa,kernel,matthews,precision,recall,roc_auc,stock,test_f1,...,train_matthews,train_precision,train_recall,train_roc_auc,train_val_f1,train_val_kappa,train_val_matthews,train_val_precision,train_val_recall,train_val_roc_auc
9269,"{0.0: 0.836091273297335, 1.0: 1.2438455558434827}",0.510800,0.093214,logistic,0.100470,0.435052,0.627165,0.550638,9269,0.456369,...,0.094565,0.448586,0.555066,0.548218,0.507599,0.086723,0.089307,0.449844,0.582545,0.545330
3022,"{0.0: 0.7298160255435608, 1.0: 1.587826662256037}",0.440221,0.102841,logistic,0.113407,0.357526,0.581350,0.560990,3022,0.464854,...,0.115502,0.368437,0.586173,0.562168,0.451454,0.099532,0.107777,0.366223,0.588595,0.557860
2051,"{0.0: 0.5945376850188889, 1.0: 3.144448083851949}",0.308374,0.096571,logistic,0.118968,0.213634,0.558961,0.579171,2051,0.314469,...,0.122543,0.208199,0.593842,0.583414,0.304148,0.097964,0.123845,0.206687,0.575964,0.584607
3879,"{0.0: 0.7008834051252099, 1.0: 1.7445029983645...",0.428990,0.124127,logistic,0.137354,0.337460,0.591021,0.576551,3879,0.436452,...,0.136157,0.350998,0.584954,0.575203,0.448760,0.126081,0.139540,0.354127,0.613801,0.576524
10166,"{0.0: 0.8430666549574076, 1.0: 1.2287213618328...",0.512189,0.119137,logistic,0.120872,0.470075,0.563449,0.561400,10166,0.473590,...,0.108777,0.460605,0.563420,0.555355,0.504377,0.102871,0.104679,0.458109,0.561199,0.553257
1080,"{0.0: 0.587299645173131, 1.0: 3.3637000700770847}",0.302637,0.104771,logistic,0.133942,0.205121,0.597464,0.591948,1080,0.317611,...,0.123861,0.198550,0.585144,0.586375,0.300672,0.098006,0.128519,0.200387,0.602122,0.589466
12059,"{0.0: 0.8266597778351847, 1.0: 1.265322261763543}",0.504627,0.126795,logistic,0.128926,0.458536,0.561676,0.565861,12059,0.470273,...,0.115285,0.452363,0.563859,0.558947,0.493497,0.105659,0.107477,0.447144,0.550615,0.554980
1907,"{0.0: 0.6993516427478692, 1.0: 1.7540654120226...",0.441177,0.126031,logistic,0.140511,0.349804,0.607673,0.576996,1907,0.435686,...,0.145256,0.354628,0.585237,0.580300,0.435055,0.134979,0.144274,0.353651,0.565326,0.579732
2050,"{0.0: 0.6291369028114555, 1.0: 2.435929967013448}",0.362010,0.122572,logistic,0.139848,0.268306,0.557974,0.585798,2050,0.415750,...,0.135503,0.265290,0.586653,0.583516,0.364684,0.116598,0.136887,0.264960,0.585481,0.584427
2050,"{0.0: 0.6291369028114555, 1.0: 2.435929967013448}",0.362010,0.122572,logistic,0.139848,0.268306,0.557974,0.585798,2050,0.415750,...,0.135503,0.265290,0.586653,0.583516,0.364684,0.116598,0.136887,0.264960,0.585481,0.584427


In [12]:
log_res = []
for stock in stocks:
    try:
        res = get_logistic_regression_prev(stock, data_length)

    except Exception as e:
        print(stock, e)
    log_res.append(res)
df_log_res = pd.DataFrame(log_res)
df_log_res['stock'] = df_log_res['stock'].values.astype(np.int)
df_log_res.index = df_log_res['stock'].values.astype(np.int)

if should_csv:
    df_log_res.to_csv('res_log_prev_que.csv')

11390 File b'../data/prepared/11390.csv' does not exist
4695 File b'../data/prepared/4695.csv' does not exist
7843 File b'../data/prepared/7843.csv' does not exist
