In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn import metrics

# from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model, roc_results
from lob_data_utils.svm_calculation import lob_svm
import os


# Use seaborn's 'whitegrid' style for the plots in this notebook.
sns.set_style('whitegrid')
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Length setting: part of the result-file names read later and passed as
# `length` when prepared data is loaded.
data_length = 10000

# (r, s) parameter pairs; one result file per stock is looked up for each pair.
rs_params = [
    (1.0, 1.0),
    (0.1, 0.1),
]

# Stock identifiers are the keys of roc_results.result_cv_10000.
stocks = [*roc_results.result_cv_10000.keys()]

In [3]:
def get_mean_scores(scores: dict) -> dict:
    """Collapse every entry of ``scores`` to the mean of its values.

    Args:
        scores: mapping from a score name to a collection of numeric values.

    Returns:
        A new dict with the same keys (in the same order), each mapped to
        ``np.mean`` of the corresponding values.
    """
    return {name: np.mean(values) for name, values in scores.items()}

def get_score_for_clf(clf, df_test):
    """Score ``clf`` on ``df_test`` with queue imbalance as the only feature.

    Args:
        clf: classifier object handed to ``model.test_model``.
        df_test: frame providing the ``queue_imbalance`` and
            ``mid_price_indicator`` columns.

    Returns:
        Whatever ``model.test_model`` returns for these inputs.
    """
    return model.test_model(
        clf,
        # List selection keeps the feature as a one-column frame.
        df_test[['queue_imbalance']],
        # Target is passed as a raw array rather than a labelled column.
        df_test['mid_price_indicator'].values,
    )

def get_logistic_regression(stock, data_length):
    """Validate and test a default logistic regression on queue imbalance.

    Args:
        stock: stock identifier forwarded to ``lob.load_prepared_data``.
        data_length: forwarded to the loader as ``length``.

    Returns:
        A dict holding the mean of every validation score, the ``stock`` and
        ``kernel`` ('logistic') labels, and the entries returned by
        ``get_score_for_clf`` for the test frame. On key clashes the later
        entries win.
    """
    df_train, df_test = lob.load_prepared_data(
        stock, data_dir='../gaussian_filter/data', cv=False, length=data_length)
    clf = LogisticRegression()

    # Validation runs before the test scoring, on the same classifier object.
    validation_scores = model.validate_model(
        clf, df_train[['queue_imbalance']], df_train['mid_price_indicator'])

    summary = dict(get_mean_scores(validation_scores))
    summary['stock'] = stock
    summary['kernel'] = 'logistic'

    test_scores = get_score_for_clf(clf, df_test)
    return {**summary, **test_scores}

In [4]:
# Gather every available per-stock / per-(r, s) result file into one frame.
# Frames are collected in a list and concatenated once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and growing a frame inside a
# loop re-copies all previously loaded rows on every iteration.
result_frames = []
for stock in stocks:
    for r, s in rs_params:
        filename = 'svm_features_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s)
        # Missing combinations are skipped silently, as before.
        if os.path.exists(filename):
            result_frames.append(pd.read_csv(filename))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# result file was found. Each file's original row index is kept.
df_res = pd.concat(result_frames) if result_frames else pd.DataFrame()

In [7]:
# Inspect which columns the combined result frame provides.
df_res.columns

Index(['Unnamed: 0', 'f1', 'features', 'matthews', 'roc_auc', 'stock'], dtype='object')

In [11]:
# Best row per stock: order all rows by ROC AUC (highest first), then keep
# the first row of every stock group.
df_best = (
    df_res
    .sort_values(by='roc_auc', ascending=False)
    .groupby('stock')
    .head(1)
)

In [16]:
# Stocks whose top-scoring row uses the 'gdf_0-50_que_prev' feature set.
df_best.loc[df_best['features'] == 'gdf_0-50_que_prev']

Unnamed: 0.1,Unnamed: 0,f1,features,matthews,roc_auc,stock
13,13,0.616044,gdf_0-50_que_prev,0.160525,0.579122,10484
13,13,0.582263,gdf_0-50_que_prev,0.153776,0.57561,1472
13,13,0.555019,gdf_0-50_que_prev,0.137578,0.568531,9268
13,13,0.592979,gdf_0-50_que_prev,0.136065,0.566648,1907
13,13,0.613321,gdf_0-50_que_prev,0.131917,0.56373,3022
13,13,0.563877,gdf_0-50_que_prev,0.114953,0.557299,12059
