In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, roc_curve

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn import metrics

from keras.models import Sequential
from keras.layers import Dense, Dropout, regularizers
from keras.layers import LSTM


from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence


import warnings
import numpy as np
from collections import OrderedDict

import os

from lob_data_utils import lob, db_result, gdf_pca, model
from lob_data_utils.svm_calculation import lob_svm


# Global notebook setup: seaborn grid theme for every figure.
sns.set_style(style='whitegrid')
# Installs a blanket "ignore" filter, so no warning is shown for the rest of
# the session (including deprecation warnings from the ML libraries).
warnings.filterwarnings(action='ignore')

Using TensorFlow backend.


In [2]:
# Experiment configuration shared by the cells below.
data_length = 24000
stock = '9062'

# Template for the GDF file names; it has three positional `{}` slots that are
# filled in by the loader that receives this pattern.
gdf_filename_pattern = 'gdf_{}_r{}_s{}_K50'

# (r, s) pairs handed to gdf_pca.SvmGdfResults, enumerated as a small grid:
# s is the outer axis and r the inner one, which yields
# (0.1, 0.1), (0.01, 0.1), (0.1, 0.5), (0.01, 0.5) in that order.
gdf_parameters = [(r, s) for s in (0.1, 0.5) for r in (0.1, 0.01)]

In [3]:
# Load the previously saved results table and keep only the rows for the
# stock under study (the CSV stores the stock id as a number, hence int()).
df_log = pd.read_csv('../../gdf_pca/res_log_que.csv')
df_log = df_log.loc[df_log['stock'] == int(stock)]

# Display just the score columns: anything named after Matthews or ROC AUC.
columns = [
    c for c in df_log.columns
    if any(metric in c for metric in ('matthews', 'roc_auc'))
]
df_log[columns]

Unnamed: 0,matthews,roc_auc,test_matthews,test_roc_auc,train_matthews,train_roc_auc,train_val_matthews,train_val_roc_auc
29,0.057241,0.528676,0.047951,0.524117,0.062033,0.531175,0.066864,0.533577


In [5]:
# One SvmGdfResults object per (r, s) pair, in the same order as
# `gdf_parameters`, so gdf_dfs[i] corresponds to gdf_parameters[i].
gdf_dfs = [
    gdf_pca.SvmGdfResults(
        stock,
        r=r,
        s=s,
        data_length=data_length,
        data_dir='../../data/data_gdf',
        reg_data_dir='../../data/prepared',
        gdf_filename_pattern=gdf_filename_pattern,
    )
    for r, s in gdf_parameters
]

In [6]:
def as_keras_metric(method):
    """Wrap a TensorFlow metric function so it can be used as a Keras metric.

    `method` must return a `(value, update_op)` pair, because its result is
    unpacked that way. The returned callable:

    * forwards its two positional arguments and any keyword arguments to
      `method`,
    * runs `tf.local_variables_initializer()` in the current Keras session,
    * returns `tf.identity(value)` created under a control dependency on
      `update_op`.

    The wrapper takes over the name and docstring of `method`.
    """
    import functools
    from keras import backend as K
    import tensorflow as tf

    def wrapper(self, args, **kwargs):
        # Despite their names, `self` and `args` are simply the first and
        # second positional arguments passed straight through to `method`.
        metric_value, update_op = method(self, args, **kwargs)
        K.get_session().run(tf.local_variables_initializer())
        # Tie the returned tensor to the update op via a control dependency.
        with tf.control_dependencies([update_op]):
            return tf.identity(metric_value)

    return functools.update_wrapper(wrapper, method)
import tensorflow as tf
# AUC metric in Keras-compatible form: `tf.metrics.auc` wrapped by
# as_keras_metric so it can be listed in `metrics=[auc_roc]` when compiling
# the models below.
auc_roc = as_keras_metric(tf.metrics.auc)

In [7]:
# Take the first parameter set (gdf_dfs[0]) as the reference: its train/test
# frames, the number of PCA components for the 'pca_n_gdf_que' feature set,
# and the class weights.
# NOTE(review): the training cells below call train_lstm on gdf_dfs[1] but
# reuse `class_weights` (and the component count implied by the LSTM input
# shape) computed here from gdf_dfs[0] - confirm both are identical for the
# two parameter sets.
df, df_test = gdf_dfs[0].df, gdf_dfs[0].df_test
n_components = gdf_dfs[0].get_pca('pca_n_gdf_que').n_components_
class_weights = gdf_dfs[0].get_classes_weights()
print(n_components, class_weights)

1 {0.0: 0.9082694673100578, 1.0: 1.11234067207416}


In [8]:
## With validation

In [9]:
# Baseline network: one LSTM layer (32 units) over windows of 5 time steps
# with 1 feature each, followed by a single sigmoid output unit.
# NOTE(review): binding the name `model` here shadows the `model` module
# imported from lob_data_utils in the first cell.
model = Sequential([
    LSTM(32, input_shape=(5, 1)),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc_roc])

In [10]:
# Train the baseline model on the second parameter set (gdf_dfs[1]).
# n_steps=5 must stay in sync with the LSTM input_shape=(5, 1) of `model`.
epochs, batch_size = 10, 300
score = gdf_dfs[1].train_lstm(
    model,
    feature_name='pca_n_gdf_que',
    plot_name='here.png',
    n_steps=5,
    fit_kwargs=dict(epochs=epochs, batch_size=batch_size, verbose=0, shuffle=False),
    class_weight=class_weights,
    compile_kwargs=dict(loss='binary_crossentropy', optimizer='adam', metrics=[auc_roc]),
)

Training fold  5 19195
train:  0 8725
val: 8725 12215
Training fold  6 19195
train:  1745 10470
val: 10470 13960
Training fold  7 19195
train:  3490 12215
val: 12215 15705
Training fold  8 19195
train:  5235 13960
val: 13960 17450
Training fold  9 19195
train:  6980 15705
val: 15705 19195


In [11]:
# Matthews scores from the `score` mapping returned by train_lstm, shown as a tuple.
tuple(score[key] for key in ('matthews', 'test_matthews'))

(0.05956143947966545, 0.05203252263954407)

In [13]:
# Extend the metrics returned by train_lstm with the run configuration
# (architecture JSON, batch size, window length, epochs and the r/s of the
# data set used) and write the single-row result to CSV.
score = dict(
    score,
    arch=model.to_json(),
    batch_size=batch_size,
    n_steps=5,
    epochs=epochs,
    r=gdf_dfs[1].r,
    s=gdf_dfs[1].s,
)
pd.DataFrame([score]).to_csv(
    '../../gdf_pca/res_lstm_weird/res_lstm_pca_que_{}_len24000_r{}_s{}.csv'.format(
        stock, gdf_dfs[1].r, gdf_dfs[1].s))

In [None]:
# Deep variant: LSTM(128) over windows of 2 time steps, then a funnel of tanh
# Dense layers halving in width from 64 down to 2, and a sigmoid output unit.
model = Sequential(
    [LSTM(128, input_shape=(2, 1))]
    + [Dense(units, activation='tanh') for units in (64, 32, 16, 8, 4, 2)]
    + [Dense(1, activation='sigmoid')]
)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc_roc])

In [None]:
# Short training run with small batches on gdf_dfs[1].
# n_steps=2 must stay in sync with the LSTM input_shape=(2, 1) of `model`.
epochs, batch_size = 3, 25
score = gdf_dfs[1].train_lstm(
    model,
    feature_name='pca_n_gdf_que',
    plot_name='here.png',
    n_steps=2,
    fit_kwargs=dict(epochs=epochs, batch_size=batch_size, verbose=1, shuffle=False),
    class_weight=class_weights,
    compile_kwargs=dict(loss='binary_crossentropy', optimizer='adam', metrics=[auc_roc]),
)

In [None]:
# Matthews and ROC-AUC entries of the latest `score`, plain and test_-prefixed, as a tuple.
tuple(score[key] for key in ('matthews', 'test_matthews', 'roc_auc', 'test_roc_auc'))

In [None]:
# Widest variant: LSTM(256) over windows of 2 time steps, then tanh Dense
# layers halving in width from 128 down to 2, and a sigmoid output unit.
model = Sequential(
    [LSTM(256, input_shape=(2, 1))]
    + [Dense(units, activation='tanh') for units in (128, 64, 32, 16, 8, 4, 2)]
    + [Dense(1, activation='sigmoid')]
)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc_roc])

In [None]:
# Short training run with batches of 300 on gdf_dfs[1].
# n_steps=2 must stay in sync with the LSTM input_shape=(2, 1) of `model`.
epochs, batch_size = 3, 300
score = gdf_dfs[1].train_lstm(
    model,
    feature_name='pca_n_gdf_que',
    plot_name='here.png',
    n_steps=2,
    fit_kwargs=dict(epochs=epochs, batch_size=batch_size, verbose=1, shuffle=False),
    class_weight=class_weights,
    compile_kwargs=dict(loss='binary_crossentropy', optimizer='adam', metrics=[auc_roc]),
)

In [None]:
# Matthews and ROC-AUC entries of the latest `score`, plain and test_-prefixed, as a tuple.
tuple(score[key] for key in ('matthews', 'test_matthews', 'roc_auc', 'test_roc_auc'))

In [None]:
# Extend the metrics returned by train_lstm with the run configuration.
# n_steps=2 mirrors the window length used to build and train this model.
score = dict(
    score,
    arch=model.to_json(),
    batch_size=batch_size,
    n_steps=2,
    epochs=epochs,
    r=gdf_dfs[1].r,
    s=gdf_dfs[1].s,
)

In [None]:
# Persist the single-row result for this run.
# The output directory uses the same '../../gdf_pca' prefix as the other
# reads/writes in this notebook (the previous '../gdf_pca' prefix pointed one
# directory level off), and the stock id comes from `stock` instead of being
# hard-coded.
# NOTE(review): the file name depends only on stock, r and s, so this
# overwrites results saved earlier for the same data set by other
# architectures in this notebook.
pd.DataFrame([score]).to_csv(
    '../../gdf_pca/res_lstm_weird/res_lstm_pca_que_{}_len24000_r{}_s{}.csv'.format(
        stock, gdf_dfs[1].r, gdf_dfs[1].s))

In [None]:
# Smallest variant: LSTM(8) over windows of 2 time steps with 1 feature each,
# followed by a single sigmoid output unit.
model = Sequential([
    LSTM(8, input_shape=(2, 1)),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc_roc])

In [None]:
# Train the small model on gdf_dfs[1].
# n_steps=2 must stay in sync with the LSTM input_shape=(2, 1) of `model`.
epochs, batch_size = 10, 300
score = gdf_dfs[1].train_lstm(
    model,
    feature_name='pca_n_gdf_que',
    plot_name='here.png',
    n_steps=2,
    fit_kwargs=dict(epochs=epochs, batch_size=batch_size, verbose=1, shuffle=False),
    class_weight=class_weights,
    compile_kwargs=dict(loss='binary_crossentropy', optimizer='adam', metrics=[auc_roc]),
)

In [None]:
# Matthews and ROC-AUC entries of the latest `score`, plain and test_-prefixed, as a tuple.
tuple(score[key] for key in ('matthews', 'test_matthews', 'roc_auc', 'test_roc_auc'))

In [None]:
# Extend the metrics returned by train_lstm with the run configuration and
# write the single-row result to CSV.
# n_steps is recorded as 2: this model was built with input_shape=(2, 1) and
# trained with n_steps=2 (the previous value 5 was copied from the first
# experiment and mislabelled the saved result).
score = {**score, 'arch': model.to_json(), 'batch_size': batch_size, 'n_steps': 2, 'epochs': epochs,
         'r': gdf_dfs[1].r, 's': gdf_dfs[1].s}
# The output directory uses the same '../../gdf_pca' prefix as the other
# reads/writes in this notebook (the previous '../gdf_pca' prefix pointed one
# directory level off).
# NOTE(review): the file name depends only on stock, r and s, so this
# overwrites results saved earlier for the same data set by other
# architectures in this notebook.
pd.DataFrame([score]).to_csv(
    '../../gdf_pca/res_lstm_weird/res_lstm_pca_que_{}_len24000_r{}_s{}.csv'.format(
        stock, gdf_dfs[1].r, gdf_dfs[1].s))