In [2]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import functools
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, roc_curve

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers
from keras.layers import LSTM
from keras.utils import plot_model
from keras import backend as K
import keras.metrics
import tensorflow as tf


from keras.models import Sequential
from keras.layers import Dense, TimeDistributed
from keras.layers import LSTM, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence


import warnings
import numpy as np
from collections import OrderedDict

import os

from lob_data_utils import lob, db_result, gdf_pca
from lob_data_utils.svm_calculation import lob_svm
from numpy.random import seed
seed(1)


sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
data_length = 24000
stock = '9761'
gdf_filename_pattern = 'gdf_{}_r{}_s{}_K50'
gdf_parameters = [(0.1, 0.1), (0.1, 0.5), (0.01, 0.5), (0.01, 0.1)]

In [4]:
df_log = pd.read_csv('../gdf_pca/res_log_que.csv')   
df_log = df_log[df_log['stock'] == int(stock)]
columns = [c for c in df_log.columns if 'matthews' in c or 'roc_auc' in c]
df_log[columns]

Unnamed: 0,matthews,roc_auc,test_matthews,test_roc_auc,train_matthews,train_roc_auc,train_val_matthews,train_val_roc_auc
35,0.146427,0.579966,0.107514,0.558467,0.125276,0.568539,0.125167,0.568139


In [5]:
def convert_scores_to_df(scores):
    scores2 = []
    for l in scores:
        res = {}
        for k, v in l.items():
            if isinstance(v, list):
                res[k] = np.mean(v)
            else:
                res[k] = v
        scores2.append(res)
    return pd.DataFrame(scores2)

In [6]:
def as_keras_metric(method):
    @functools.wraps(method)
    def wrapper(self, args, **kwargs):
        """ Wrapper for turning tensorflow metrics into keras metrics """
        value, update_op = method(self, args, **kwargs)
        K.get_session().run(tf.local_variables_initializer())
        with tf.control_dependencies([update_op]):
            value = tf.identity(value)
        return value

    return wrapper

auc_roc = as_keras_metric(tf.metrics.auc)

In [7]:
gdf_dfs = []
for r, s in gdf_parameters:
    gdf_dfs.append(gdf_pca.SvmGdfResults(
        stock,  r=r, s=s, data_length=data_length, data_dir='../gaussian_filter/data_gdf_not_synced',
        gdf_filename_pattern=gdf_filename_pattern))

## Model 1

In [8]:
def get_model(shape):
    model = Sequential()
    unit = 20
    model.add(LSTM(unit, input_shape=(1, shape)))
    # model.add(Dropout(0.5, input_shape=(unit, unit)))
    model.add(Dense(1, input_shape=(unit, 1), activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc_roc])
    return model

In [9]:
scores = []
epochs = 3
batch_size = 10
for i in range(len(gdf_dfs)):
    print(i)
    pca = gdf_dfs[i].get_pca('pca_n_gdf_que_prev')
    model = get_model(pca.n_components)
    score = gdf_dfs[i].train_lstm(model, feature_name='pca_n_gdf_que_prev', 
                           fit_kwargs={
                               'epochs': epochs, 'batch_size': batch_size, 'verbose': 1,
                               'class_weight': {0: 0.71002219, 1: 1.69035041}, 'shuffle': False}, 
                                  plot_name='here.png',
                                 compile_kwargs=
                                  { 'loss': 'binary_crossentropy', 'optimizer': 'adam', 'metrics': [auc_roc]})
    scores.append({**score, 'r': gdf_dfs[i].r, 's': gdf_dfs[i].r, 'stock': gdf_dfs[i].stock, 
                  'epochs': epochs, 'batch_size': batch_size})

0
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
1
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
2
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3


Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [12]:
df_res = convert_scores_to_df(scores)
df_res[columns + ['r', 's']]

Unnamed: 0,matthews,roc_auc,test_matthews,test_roc_auc,train_matthews,train_roc_auc,train_val_matthews,train_val_roc_auc,r,s
0,0.131803,0.571018,0.102864,0.555548,0.135669,0.573999,0.134683,0.573061,0.1,0.1
1,0.143261,0.576531,0.105268,0.556925,0.137048,0.574942,0.139955,0.575303,0.1,0.1
2,0.140087,0.575627,0.103954,0.556505,0.126269,0.569162,0.13,0.57003,0.01,0.01
3,0.142044,0.576819,0.111925,0.560666,0.130832,0.571423,0.131406,0.570894,0.01,0.01


In [13]:
df_res.sort_values(by='matthews', ascending=False)[columns]

Unnamed: 0,matthews,roc_auc,test_matthews,test_roc_auc,train_matthews,train_roc_auc,train_val_matthews,train_val_roc_auc
1,0.143261,0.576531,0.105268,0.556925,0.137048,0.574942,0.139955,0.575303
3,0.142044,0.576819,0.111925,0.560666,0.130832,0.571423,0.131406,0.570894
2,0.140087,0.575627,0.103954,0.556505,0.126269,0.569162,0.13,0.57003
0,0.131803,0.571018,0.102864,0.555548,0.135669,0.573999,0.134683,0.573061
