In [8]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import metrics

# from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict
from sklearn.utils import class_weight

from lob_data_utils import lob, db_result, model, roc_results, stocks_numbers
from lob_data_utils.svm_calculation import lob_svm
import os


sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
data_length = 6929
stocks = stocks_numbers.chosen_stocks
should_csv = True

In [29]:
def get_mean_scores(scores: dict) -> dict:
    mean_scores = {}
    for k, v in scores.items():
        mean_scores[k] = np.mean(v)
    return mean_scores

def get_score_for_clf(clf, df_test):
    x_test = df_test[['queue_imbalance']]
    y_test = df_test['mid_price_indicator'].values
    return model.test_model(clf, x_test, y_test)

def get_score_for_clf_prev(clf, df_test):
    x_test = df_test[['queue_imbalance', 'prev_queue_imbalance']]
    y_test = df_test['mid_price_indicator'].values
    return model.test_model(clf, x_test, y_test)

class NullHyposthesisClassifier():
    def __init__(self):
        pass
    def fit(self, a, b):
        pass
    def predict(self, df):
        return np.array(np.ones(len(df)) * 1/2)

class FakeClassifier():
    def __init__(self):
        pass
    def fit(self, a, b):
        pass
    def predict(self, df):
        pred = []
        train_x = df['queue_imbalance'].values
        for x in train_x:
            if x <= 0.0:
                pred.append(0)
            else:
                pred.append(1)
        return np.array(pred)
    
def get_null_hyposthesis_classifier(stock, data_length):
    
    df, df_test = lob.load_prepared_data(
    stock, data_dir='../data/prepared_balanced', length=data_length)

    train_x = df[['queue_imbalance']]
    y_train = df['mid_price_indicator']
    a = np.unique(y_train)

    clf = NullHyposthesisClassifier()
    scores = model.validate_model(clf, train_x, y_train, print_debug=False)
    res = {
        **get_mean_scores(scores),
        'stock': stock,
        'kernel': 'null-hypothesis',
    }
    test_scores = get_score_for_clf(clf, df_test)
    return {**res, **test_scores}

def get_fake_classifier(stock, data_length):
    
    df, df_test = lob.load_prepared_data(
    stock, data_dir='../data/prepared_balanced', length=data_length)

    train_x = df[['queue_imbalance']]
    y_train = df['mid_price_indicator']
    a = np.unique(y_train)

    clf = FakeClassifier()
    scores = model.validate_model(clf, train_x, y_train, print_debug=False)
    res = {
        **get_mean_scores(scores),
        'stock': stock,
        'kernel': 'fake',
    }
    test_scores = get_score_for_clf(clf, df_test)
    return {**res, **test_scores}

def get_logistic_regression(stock, data_length):
    df, df_test = lob.load_prepared_data(
        stock, data_dir='../data/prepared_balanced', length=data_length)

    train_x = df[['queue_imbalance']]
    y_train = df['mid_price_indicator']
    a = np.unique(y_train)

    clf = LogisticRegression()
    scores = model.validate_model(clf, train_x, y_train, print_debug=False)
    res = {
        **get_mean_scores(scores),
        'stock': stock,
        'kernel': 'logistic-balanced',
    }
    test_scores = get_score_for_clf(clf, df_test)
    return {**res, **test_scores}

def get_logistic_regression_prev(stock, data_length):
    df, df_test = lob.load_prepared_data(
        stock, data_dir='../data/prepared_balanced', length=data_length)
    df['prev_queue_imbalance'] = df['queue_imbalance'].shift()
    df.dropna(inplace=True)
    df_test['prev_queue_imbalance'] = df_test['queue_imbalance'].shift()
    df_test.dropna(inplace=True)
    train_x = df[['queue_imbalance', 'prev_queue_imbalance']]
    y_train = df['mid_price_indicator']
    a = np.unique(y_train)
    clf = LogisticRegression()
    scores = model.validate_model(clf, train_x, y_train, print_debug=False)
    res = {
        **get_mean_scores(scores),
        'stock': stock,
        'kernel': 'logistic-balanced',
    }
    test_scores = get_score_for_clf_prev(clf, df_test)
    return {**res, **test_scores}

In [15]:
log_res = []
for stock in stocks:
    try:
        res = get_logistic_regression(stock, data_length)

    except Exception as e:
        print(stock, e)
    log_res.append(res)
df_log_res = pd.DataFrame(log_res)
df_log_res['stock'] = df_log_res['stock'].values.astype(np.int)
df_log_res.index = df_log_res['stock'].values.astype(np.int)

if should_csv:
    df_log_res.to_csv('res_log_balanced_que.csv')

2748 Classification metrics can't handle a mix of continuous and binary targets


In [16]:
log_res = []
for stock in stocks:
    try:
        res = get_logistic_regression_prev(stock, data_length)

    except Exception as e:
        print(stock, e)
    log_res.append(res)
df_log_res = pd.DataFrame(log_res)
df_log_res['stock'] = df_log_res['stock'].values.astype(np.int)
df_log_res.index = df_log_res['stock'].values.astype(np.int)

if should_csv:
    df_log_res.to_csv('res_log_balanced_prev_que.csv')

In [33]:
log_res = []
for stock in stocks:
    try:
        res = get_fake_classifier(stock, data_length)

    except Exception as e:
        print(stock, e)
    log_res.append(res)
df_log_res = pd.DataFrame(log_res)
df_log_res['stock'] = df_log_res['stock'].values.astype(np.int)
df_log_res.index = df_log_res['stock'].values.astype(np.int)

if should_csv:
    df_log_res.to_csv('res_fake_balanced_que.csv')

2748 Classification metrics can't handle a mix of continuous and binary targets


In [37]:
log_res = []
for stock in stocks:
    try:
        res = get_null_hyposthesis_classifier(stock, data_length)
        log_res.append(res)
    except Exception as e:
        print(stock, e)
    
df_log_res = pd.DataFrame(log_res)
df_log_res['stock'] = df_log_res['stock'].values.astype(np.int)
df_log_res.index = df_log_res['stock'].values.astype(np.int)

if should_csv:
    df_log_res.to_csv('res_null_hypothesis_balanced.csv')

9061 Classification metrics can't handle a mix of binary and continuous targets
3459 Classification metrics can't handle a mix of binary and continuous targets
4549 Classification metrics can't handle a mix of binary and continuous targets
9761 Classification metrics can't handle a mix of binary and continuous targets
4851 Classification metrics can't handle a mix of binary and continuous targets
9062 Classification metrics can't handle a mix of binary and continuous targets
11869 Classification metrics can't handle a mix of binary and continuous targets
12255 Classification metrics can't handle a mix of binary and continuous targets
2748 Classification metrics can't handle a mix of binary and continuous targets
4320 Classification metrics can't handle a mix of binary and continuous targets
11583 Classification metrics can't handle a mix of binary and continuous targets
4799 Classification metrics can't handle a mix of binary and continuous targets
9268 Classification metrics can't han

KeyError: 'stock'