In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
 
from sklearn import metrics

from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

from sklearn.decomposition import PCA
import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model
from lob_data_utils.svm_calculation import lob_svm
import os


sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

In [2]:
data_length = 10000
stocks = ['9064', '9061', '9265']

In [3]:
def convert_scores(df, column):
    scores = []
    for i, row in df.iterrows():
        try:
            scores.append(np.mean(row[column]))
        except:
            scores.append(np.mean(np.array(literal_eval(row[column])).astype(np.float64)))
    return scores
scores_columns = ['f1', 'kappa', 'matthews', 'precision', 'recall', 'roc_auc', 'train_f1', 'train_kappa',
       'train_matthews', 'train_precision', 'train_recall', 'train_roc_auc']

In [4]:
dfs = {}
dfs_test = {}
dfs_reg = {}
dfs_reg_test = {}
data_dir='../gaussian_filter/data_gdf'
for stock in stocks:
    r = 1.0
    s = 1.0
    gdf_filename = 'gdf_{}_len{}_r{}_s{}_K50'.format(stock, data_length, r, s)
    reg_filename = '{}'.format(stock)
    print(gdf_filename)
    dfs[stock], dfs_test[stock] = lob.load_prepared_data(
        gdf_filename, data_dir=data_dir, cv=False, length=data_length)
    dfs_reg[stock], dfs_reg_test[stock] = lob.load_prepared_data(
        reg_filename, data_dir='../gaussian_filter/data', cv=False, length=data_length)

gdf_9064_len10000_r1.0_s1.0_K50
gdf_9061_len10000_r1.0_s1.0_K50
gdf_9265_len10000_r1.0_s1.0_K50


In [5]:
for stock in stocks:
    dfs[stock]['queue_imbalance'] = dfs_reg[stock]['queue_imbalance']
    dfs[stock]['prev_queue_imbalance'] = dfs[stock]['queue_imbalance'].shift()
    dfs[stock].dropna(inplace=True)
    dfs_test[stock]['queue_imbalance'] = dfs_reg_test[stock]['queue_imbalance']
    dfs_test[stock]['prev_queue_imbalance'] = dfs_test[stock]['queue_imbalance'].shift()
    dfs_test[stock].dropna(inplace=True)

In [6]:
feature_columns_dict = {
    'gdf_24-26_que_prev': ['gdf_24', 'gdf_25', 'queue_imbalance', 'prev_queue_imbalance'], 
    'pca_gdf_que3': ['gdf_{}'.format(i) for i in range(0, 50)] + ['queue_imbalance'],
    'pca_gdf_que7': ['gdf_{}'.format(i) for i in range(0, 50)] + ['queue_imbalance'],
}

In [35]:
def get_number_of_pca_components(feature_name: str):
    if 'pca_gdf_que_prev' in feature_name:
        return int(feature_name.replace('pca_gdf_que_prev', ''))
    if 'pca_gdf_que' in feature_name:
        return int(feature_name.replace('pca_gdf_que', ''))
    return None

def get_pca(dfs, stock, feature_name):
    df = dfs[stock][feature_columns_dict[feature_name]]
    train_x = dfs[stock][feature_columns_dict[feature_name]]
    n_components = get_number_of_pca_components(feature_name)
    pca = None
    if n_components:
        pca = PCA(n_components=n_components)
        pca.fit(train_x)
        train_x = pca.transform(train_x)
    else:
        return None
    return {k: v for v, k in zip(pca.transform(np.identity(df.shape[1])), df.columns)}
#     return pd.DataFrame(
#         pca.components_, columns=dfs[stock][feature_columns_dict[feature_name]].columns, 
#         index=['PC_{}'.format(i) for i in range(pca.n_components)])

In [36]:
pca1 = get_pca(dfs, stock='9265', feature_name='pca_gdf_que3')
pca2 = get_pca(dfs, stock='9061', feature_name='gdf_24-26_que_prev')
pca3 = get_pca(dfs, stock='9064', feature_name='pca_gdf_que7')

In [39]:
pca3

{'gdf_0': array([ 0.01634983, -0.6006981 , -0.23097131,  0.20585547,  0.22575358,
        -0.23938899,  0.21677998]),
 'gdf_1': array([ 0.01647498, -0.59795531, -0.22972273,  0.19596229,  0.2112014 ,
        -0.20428797,  0.17703729]),
 'gdf_2': array([ 0.01655608, -0.59523115, -0.22747879,  0.18282889,  0.18906579,
        -0.15995478,  0.12612774]),
 'gdf_3': array([ 0.01657997, -0.59249857, -0.22425028,  0.16612836,  0.15977412,
        -0.10834544,  0.06396481]),
 'gdf_4': array([ 0.01657915, -0.5897468 , -0.21965481,  0.14462753,  0.12284755,
        -0.04836414, -0.00733333]),
 'gdf_5': array([ 0.01655214, -0.58706691, -0.21426254,  0.12211703,  0.0850272 ,
         0.01031724, -0.07518165]),
 'gdf_6': array([ 0.01650224, -0.58475028, -0.2079043 ,  0.09899451,  0.04667467,
         0.06486314, -0.1300321 ]),
 'gdf_7': array([ 0.01645215, -0.58258217, -0.2011979 ,  0.07567671,  0.00919847,
         0.11196672, -0.16699912]),
 'gdf_8': array([ 0.01643133, -0.58056317, -0.19419508, 