In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
 
from sklearn import metrics

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from ast import literal_eval

from sklearn.decomposition import PCA
import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model
from lob_data_utils.svm_calculation import lob_svm
import os


sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

In [2]:
data_length = 10000
stocks = ['9064', '9061', '9265']

In [3]:
def convert_scores(df, column):
    scores = []
    for i, row in df.iterrows():
        try:
            scores.append(np.mean(row[column]))
        except:
            scores.append(np.mean(np.array(literal_eval(row[column])).astype(np.float64)))
    return scores
scores_columns = ['f1', 'kappa', 'matthews', 'precision', 'recall', 'roc_auc', 'train_f1', 'train_kappa',
       'train_matthews', 'train_precision', 'train_recall', 'train_roc_auc']

In [4]:
dfs = {}
dfs_test = {}
dfs_reg = {}
dfs_reg_test = {}
data_dir='../gaussian_filter/data_gdf'
for stock in stocks:
    r = 1.0
    s = 1.0
    gdf_filename = 'gdf_{}_len{}_r{}_s{}_K50'.format(stock, data_length, r, s)
    reg_filename = '{}'.format(stock)
    print(gdf_filename)
    dfs[stock], dfs_test[stock] = lob.load_prepared_data(
        gdf_filename, data_dir=data_dir, cv=False, length=data_length)
    dfs_reg[stock], dfs_reg_test[stock] = lob.load_prepared_data(
        reg_filename, data_dir='../gaussian_filter/data', cv=False, length=data_length)

gdf_9064_len10000_r1.0_s1.0_K50
gdf_9061_len10000_r1.0_s1.0_K50
gdf_9265_len10000_r1.0_s1.0_K50


In [5]:
for stock in stocks:
    dfs[stock]['queue_imbalance'] = dfs_reg[stock]['queue_imbalance']
    dfs[stock]['prev_queue_imbalance'] = dfs[stock]['queue_imbalance'].shift()
    dfs[stock].dropna(inplace=True)
    dfs_test[stock]['queue_imbalance'] = dfs_reg_test[stock]['queue_imbalance']
    dfs_test[stock]['prev_queue_imbalance'] = dfs_test[stock]['queue_imbalance'].shift()
    dfs_test[stock].dropna(inplace=True)

In [43]:
feature_columns_dict = {
    'gdf_24-26_que_prev': ['gdf_24', 'gdf_25', 'queue_imbalance', 'prev_queue_imbalance'], 
    'pca_gdf_que3': ['gdf_{}'.format(i) for i in range(0, 50)] + ['queue_imbalance'],
    'pca_gdf_que7': ['gdf_{}'.format(i) for i in range(0, 50)] + ['queue_imbalance'],
    'pca_gdf_que1': ['gdf_{}'.format(i) for i in range(0, 50)] + ['queue_imbalance'],
    'pca_gdf_que_mle': ['gdf_{}'.format(i) for i in range(0, 50)] + ['queue_imbalance'],
}

In [65]:
def get_number_of_pca_components(feature_name: str):
    if 'mle' in feature_name:
        return 'mle'
    if 'pca_gdf_que_prev' in feature_name:
        return int(feature_name.replace('pca_gdf_que_prev', ''))
    if 'pca_gdf_que' in feature_name:
        return int(feature_name.replace('pca_gdf_que', ''))
    return None

def get_pca(dfs, stock, feature_name):
    df = dfs[stock][feature_columns_dict[feature_name]]
    train_x = dfs[stock][feature_columns_dict[feature_name]]
    n_components = get_number_of_pca_components(feature_name)
    pca = None
    if n_components:
        pca = PCA(n_components=n_components, svd_solver='full', whiten=True)
        pca.fit(train_x)
        train_x = pca.transform(train_x)
        print(len(train_x[0]))
    else:
        return None
    return pca
#     return pd.DataFrame(
#         pca.components_, columns=dfs[stock][feature_columns_dict[feature_name]].columns, 
#         index=['PC_{}'.format(i) for i in range(pca.n_components)])

In [86]:
pca_9061 = {}
pca = get_pca(dfs, stock='9064', feature_name='pca_gdf_que1')
print(pca.n_components_)
d = {k: v for v, k in zip(pca.components_[0], dfs[stock][feature_columns_dict['pca_gdf_que3']].columns)}
pca_9061 = d
sorted(pca_9061.items(), key=lambda x: (x[1],x[0]))

1
1


[('queue_imbalance', -0.99997647814160162),
 ('gdf_0', -0.0016180957165767267),
 ('gdf_8', -0.0015366001113582312),
 ('gdf_7', -0.0015157789227477629),
 ('gdf_9', -0.0015034843288352856),
 ('gdf_1', -0.0014929511767705905),
 ('gdf_10', -0.0014695220446824305),
 ('gdf_6', -0.0014656822699881846),
 ('gdf_11', -0.0014287235479413288),
 ('gdf_5', -0.0014157825158893876),
 ('gdf_2', -0.0014118435040223476),
 ('gdf_4', -0.0013887749324736824),
 ('gdf_3', -0.0013879607480006134),
 ('gdf_12', -0.0013701486126352321),
 ('gdf_13', -0.0013266580672073617),
 ('gdf_14', -0.0013012460853458366),
 ('gdf_15', -0.0012778257961710612),
 ('gdf_16', -0.0012313079678227411),
 ('gdf_17', -0.0011886137972627513),
 ('gdf_18', -0.001135364319109903),
 ('gdf_19', -0.0010823085851116511),
 ('gdf_20', -0.001012177011642043),
 ('gdf_21', -0.00091213080413764139),
 ('gdf_22', -0.00081819616699830709),
 ('gdf_23', -0.00072902213081048881),
 ('gdf_24', -0.00064619320070085013),
 ('gdf_47', -0.00060496764622002441),
 

In [40]:
pca_9061 = {}
for k, v in get_pca(dfs, stock='9064', feature_name='pca_gdf_que1').items():
    pca_9061[k] = v[0]
sorted(pca_9061.items(), key=lambda x: (x[1],x[0]))

[('queue_imbalance', -0.9820085513213842),
 ('gdf_0', 0.016349831103641355),
 ('gdf_8', 0.016431326708859835),
 ('gdf_7', 0.01645214789747031),
 ('gdf_9', 0.016464442491382781),
 ('gdf_1', 0.016474975643447437),
 ('gdf_10', 0.016498404775535633),
 ('gdf_6', 0.016502244550229885),
 ('gdf_11', 0.016539203272276738),
 ('gdf_5', 0.016552144304328685),
 ('gdf_2', 0.016556083316195829),
 ('gdf_4', 0.016579151887744385),
 ('gdf_3', 0.016579966072217456),
 ('gdf_12', 0.016597778207582837),
 ('gdf_13', 0.0166412687530107),
 ('gdf_14', 0.016666680734872229),
 ('gdf_15', 0.016690101024046989),
 ('gdf_16', 0.016736618852395319),
 ('gdf_17', 0.016779313022955308),
 ('gdf_18', 0.016832562501108157),
 ('gdf_19', 0.016885618235106412),
 ('gdf_20', 0.016955749808576014),
 ('gdf_21', 0.017055796016080413),
 ('gdf_22', 0.017149730653219744),
 ('gdf_23', 0.017238904689407565),
 ('gdf_24', 0.0173217336195172),
 ('gdf_47', 0.017362959173998004),
 ('gdf_46', 0.017368237705898907),
 ('gdf_28', 0.0174097839460