In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, roc_curve

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

import warnings
import numpy as np
from collections import OrderedDict

import os

from lob_data_utils import lob, db_result, gdf_pca, model
from lob_data_utils.svm_calculation import lob_svm


sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

In [2]:
data_length = 15000
stock = '9064'
gdf_filename_pattern = 'gdf_{}_' + 'len{}'.format(data_length) + '_r{}_s{}_K50'
gdf_parameters = [(1.0, 1.0), (0.1, 0.1), (0.1, 1.0), (1.0, 0.1), (0.01, 0.1)]

In [3]:
gdf_dfs = []
for r, s in gdf_parameters:
    gdf_dfs.append(gdf_pca.SvmGdfResults(
        stock,  r=r, s=s, data_length=data_length, 
        gdf_filename_pattern=gdf_filename_pattern))

In [4]:
def get_kernel_from_method(method):
    return method.split('_')[1]

class CFR():
    def __init__(self, clf, feature_name, is_svm=True):
        self.clf = clf
        self.feature_name = feature_name
        self.is_svm = is_svm

    def get_result(self, gdf_df, should_validate=False, method=None):
        if self.is_svm:
            return gdf_df.train_svm(C=self.clf.C, gamma=self.clf.gamma, coef0=self.clf.coef0, 
                                    kernel=self.clf.kernel, 
                                    feature_name=self.feature_name, should_validate=should_validate)
        else:
            res = gdf_df.train_clf(self.clf, self.feature_name, should_validate=should_validate)
            if method:
                res['kernel'] = method
            return res

In [5]:
from sklearn.neural_network import MLPClassifier
alphas = [0.0001, 0.001, 0.01, 0.1, 1.0]
activations = ['tanh']
solvers = ['adam']
hidden_layer_sizes = [(8), (16), (8, 8), (16, 8), (8, 16), (16, 16)]
features = ['pca_gdf_que10', 'pca_gdf_que_prev10', 'pca_gdf10']
res1 = []

for i in range(len(gdf_dfs)):
    for feature in features:
        for hidden_layer_size in hidden_layer_sizes:
            for solver in solvers:
                for activation in activations:
                    for alpha in alphas:
                        clf = MLPClassifier(solver=solver, alpha=alpha, activation=activation,
                                                  hidden_layer_sizes=hidden_layer_size, random_state=1)

                        c = CFR(clf=clf, feature_name=feature, is_svm=False)
                        r = c.get_result(gdf_dfs[i], should_validate=True, method='mlp')
                        res1.append({**r, 'hidden_layer_size': hidden_layer_size, 
                                     'r': gdf_dfs[i].r,
                                     's': gdf_dfs[i].s,
                                    'alpha': alpha, 'activation': activation, 'solver': solver})
df_res = pd.DataFrame(res1).sort_values(by='matthews', ascending=False)

In [10]:
df_res.columns

Index(['activation', 'alpha', 'f1', 'features', 'hidden_layer_size', 'kappa',
       'kernel', 'matthews', 'precision', 'recall', 'roc_auc', 'solver',
       'stock', 'test_f1', 'test_kappa', 'test_matthews', 'test_precision',
       'test_recall', 'test_roc_auc', 'train_f1', 'train_kappa',
       'train_matthews', 'train_precision', 'train_recall', 'train_roc_auc'],
      dtype='object')

In [8]:
columns = ['alpha', 'features', 'hidden_layer_size',
       'kernel', 'matthews', 'r', 'roc_auc', 's',
       'stock', 'test_matthews', 'test_roc_auc', 'train_matthews', 
       'train_roc_auc']

df_res[columns].groupby(['r', 's']).head(1)

Unnamed: 0,alpha,features,hidden_layer_size,kernel,matthews,r,roc_auc,s,stock,test_matthews,test_roc_auc,train_matthews,train_roc_auc
192,0.01,pca_gdf_que10,"(8, 8)",mlp,0.094845,0.1,0.547098,1.0,9064,0.094068,0.546258,0.077814,0.538364
38,0.1,pca_gdf_que_prev10,16,mlp,0.09314,1.0,0.546029,1.0,9064,0.089992,0.544162,0.08237,0.540617
410,0.0001,pca_gdf_que_prev10,"(8, 16)",mlp,0.092657,0.01,0.54593,0.1,9064,0.106084,0.550487,0.076905,0.53778
305,0.0001,pca_gdf_que_prev10,16,mlp,0.092487,1.0,0.545801,0.1,9064,0.110373,0.554192,0.088152,0.543344
112,0.01,pca_gdf_que10,"(8, 16)",mlp,0.091724,0.1,0.545627,0.1,9064,0.095771,0.546206,0.077608,0.5385


In [10]:
df_res[df_res['r'] == 0.01][columns]

Unnamed: 0,alpha,features,hidden_layer_size,kernel,matthews,r,roc_auc,s,stock,test_matthews,test_roc_auc,train_matthews,train_roc_auc
410,0.0001,pca_gdf_que_prev10,"(8, 16)",mlp,0.092657,0.01,0.545930,0.1,9064,0.106084,0.550487,0.076905,0.537780
411,0.0010,pca_gdf_que_prev10,"(8, 16)",mlp,0.092261,0.01,0.545733,0.1,9064,0.103970,0.549492,0.076875,0.537766
417,0.0100,pca_gdf_que_prev10,"(16, 16)",mlp,0.091888,0.01,0.545608,0.1,9064,0.107198,0.552165,0.080277,0.539726
412,0.0100,pca_gdf_que_prev10,"(8, 16)",mlp,0.091488,0.01,0.545337,0.1,9064,0.102590,0.548813,0.076542,0.537610
380,0.0001,pca_gdf_que10,"(8, 16)",mlp,0.091367,0.01,0.545352,0.1,9064,0.093006,0.544872,0.072958,0.536132
381,0.0010,pca_gdf_que10,"(8, 16)",mlp,0.090938,0.01,0.545145,0.1,9064,0.093006,0.544872,0.072900,0.536105
382,0.0100,pca_gdf_que10,"(8, 16)",mlp,0.089210,0.01,0.544259,0.1,9064,0.095020,0.545937,0.073395,0.536363
415,0.0001,pca_gdf_que_prev10,"(16, 16)",mlp,0.089196,0.01,0.544203,0.1,9064,0.109907,0.553535,0.082155,0.540624
416,0.0010,pca_gdf_que_prev10,"(16, 16)",mlp,0.088769,0.01,0.543995,0.1,9064,0.109907,0.553535,0.082154,0.540624
395,0.0001,pca_gdf_que_prev10,16,mlp,0.087995,0.01,0.543643,0.1,9064,0.098814,0.548646,0.078910,0.538975


In [12]:
df_res.to_csv('res_9064_mlp.csv')

In [None]:
from sklearn.neural_network import MLPClassifier
alphas = [0.01, 0.1, 1.0]
activations = ['tanh']
solvers = ['adam']
hidden_layer_sizes = [(8), (20), (14), (8, 8), (14, 8), (8, 14), (20, 8), (8, 20), (14, 14), (20, 20), (14, 20), (20, 14), (8,8,8)]
features = ['pca_gdf_que10']
res1 = []

for feature in features:
    for hidden_layer_size in hidden_layer_sizes:
        for solver in solvers:
            for activation in activations:
                for alpha in alphas:
                    clf = MLPClassifier(solver=solver, alpha=alpha, activation=activation,
                                              hidden_layer_sizes=hidden_layer_size, random_state=1)

                    c = CFR(clf=clf, feature_name=feature, is_svm=False)
                    r = c.get_result(gdf_dfs[0], should_validate=True, method='mlp')
                    res1.append({**r, 'hidden_layer_size': hidden_layer_size, 
                                'alpha': alpha, 'activation': activation, 'solver': solver})
df_res = pd.DataFrame(res1).sort_values(by='matthews', ascending=False)

#### Let's check if I need more than 1 hidden layer

In [8]:
from sklearn.neural_network import MLPClassifier
alphas = [0.01, 0.1, 1.0]
activations = ['tanh']
solvers = ['adam']
hidden_layer_sizes = [(8), (20), (14), (8, 8), (14, 8), (8, 14), (20, 8), (8, 20), (14, 14), (20, 20), (14, 20), (20, 14), (8,8,8)]
features = ['pca_gdf_que10']
res1 = []

for feature in features:
    for hidden_layer_size in hidden_layer_sizes:
        for solver in solvers:
            for activation in activations:
                for alpha in alphas:
                    clf = MLPClassifier(solver=solver, alpha=alpha, activation=activation,
                                              hidden_layer_sizes=hidden_layer_size, random_state=1)

                    c = CFR(clf=clf, feature_name=feature, is_svm=False)
                    r = c.get_result(gdf_dfs[0], should_validate=True, method='mlp')
                    res1.append({**r, 'hidden_layer_size': hidden_layer_size, 
                                'alpha': alpha, 'activation': activation, 'solver': solver})
df_res = pd.DataFrame(res1).sort_values(by='matthews', ascending=False)

Unnamed: 0,activation,alpha,f1,features,hidden_layer_size,kappa,kernel,matthews,precision,recall,...,test_matthews,test_precision,test_recall,test_roc_auc,train_f1,train_kappa,train_matthews,train_precision,train_recall,train_roc_auc
36,tanh,0.01,0.518217,pca_gdf_que10,"(8, 8, 8)",0.102548,mlp,0.106582,0.558486,0.499931,...,0.074544,0.532637,0.548387,0.537264,0.520045,0.090609,0.09117,0.538047,0.506219,0.545171
17,tanh,1.0,0.493565,pca_gdf_que10,"(8, 14)",0.100969,mlp,0.106574,0.564105,0.458924,...,0.067264,0.533235,0.485215,0.533475,0.490843,0.089062,0.090967,0.543894,0.453363,0.544293
32,tanh,1.0,0.497601,pca_gdf_que10,"(14, 20)",0.100274,mlp,0.105559,0.561371,0.468079,...,0.065408,0.530398,0.510081,0.532671,0.49574,0.088056,0.089731,0.541961,0.462789,0.543798
25,tanh,0.1,0.501713,pca_gdf_que10,"(14, 14)",0.099924,mlp,0.104131,0.559801,0.469447,...,0.071955,0.535793,0.487903,0.535811,0.50029,0.090848,0.09198,0.542749,0.466328,0.545233
3,tanh,0.01,0.505607,pca_gdf_que10,20,0.099595,mlp,0.104114,0.559184,0.479552,...,0.077992,0.53863,0.49664,0.538856,0.50557,0.090697,0.09166,0.541562,0.476837,0.545165
29,tanh,1.0,0.505381,pca_gdf_que10,"(20, 20)",0.098819,mlp,0.104055,0.56062,0.48056,...,0.068648,0.533094,0.497984,0.534233,0.504631,0.090906,0.092231,0.541723,0.47804,0.545247
23,tanh,1.0,0.490621,pca_gdf_que10,"(8, 20)",0.097841,mlp,0.103967,0.562717,0.457549,...,0.065241,0.53293,0.473118,0.53239,0.492233,0.091057,0.093015,0.544823,0.455007,0.545279
16,tanh,0.1,0.514099,pca_gdf_que10,"(8, 14)",0.100711,mlp,0.103554,0.554902,0.49219,...,0.075379,0.536068,0.509409,0.53763,0.509425,0.089387,0.090245,0.539964,0.485025,0.544532
2,tanh,1.0,0.502781,pca_gdf_que10,8,0.09919,mlp,0.103372,0.558323,0.473333,...,0.067928,0.53393,0.481183,0.533775,0.503773,0.093015,0.09423,0.543391,0.472919,0.546301
12,tanh,0.01,0.490724,pca_gdf_que10,"(14, 8)",0.098645,mlp,0.103283,0.560597,0.453383,...,0.080716,0.539007,0.510753,0.540287,0.491986,0.088086,0.089718,0.542778,0.45373,0.543821


In [10]:
df_res

Unnamed: 0,activation,alpha,f1,features,hidden_layer_size,kappa,kernel,matthews,precision,recall,...,test_matthews,test_precision,test_recall,test_roc_auc,train_f1,train_kappa,train_matthews,train_precision,train_recall,train_roc_auc
36,tanh,0.01,0.518217,pca_gdf_que10,"(8, 8, 8)",0.102548,mlp,0.106582,0.558486,0.499931,...,0.074544,0.532637,0.548387,0.537264,0.520045,0.090609,0.09117,0.538047,0.506219,0.545171
17,tanh,1.0,0.493565,pca_gdf_que10,"(8, 14)",0.100969,mlp,0.106574,0.564105,0.458924,...,0.067264,0.533235,0.485215,0.533475,0.490843,0.089062,0.090967,0.543894,0.453363,0.544293
32,tanh,1.0,0.497601,pca_gdf_que10,"(14, 20)",0.100274,mlp,0.105559,0.561371,0.468079,...,0.065408,0.530398,0.510081,0.532671,0.49574,0.088056,0.089731,0.541961,0.462789,0.543798
25,tanh,0.1,0.501713,pca_gdf_que10,"(14, 14)",0.099924,mlp,0.104131,0.559801,0.469447,...,0.071955,0.535793,0.487903,0.535811,0.50029,0.090848,0.09198,0.542749,0.466328,0.545233
3,tanh,0.01,0.505607,pca_gdf_que10,20,0.099595,mlp,0.104114,0.559184,0.479552,...,0.077992,0.53863,0.49664,0.538856,0.50557,0.090697,0.09166,0.541562,0.476837,0.545165
29,tanh,1.0,0.505381,pca_gdf_que10,"(20, 20)",0.098819,mlp,0.104055,0.56062,0.48056,...,0.068648,0.533094,0.497984,0.534233,0.504631,0.090906,0.092231,0.541723,0.47804,0.545247
23,tanh,1.0,0.490621,pca_gdf_que10,"(8, 20)",0.097841,mlp,0.103967,0.562717,0.457549,...,0.065241,0.53293,0.473118,0.53239,0.492233,0.091057,0.093015,0.544823,0.455007,0.545279
16,tanh,0.1,0.514099,pca_gdf_que10,"(8, 14)",0.100711,mlp,0.103554,0.554902,0.49219,...,0.075379,0.536068,0.509409,0.53763,0.509425,0.089387,0.090245,0.539964,0.485025,0.544532
2,tanh,1.0,0.502781,pca_gdf_que10,8,0.09919,mlp,0.103372,0.558323,0.473333,...,0.067928,0.53393,0.481183,0.533775,0.503773,0.093015,0.09423,0.543391,0.472919,0.546301
12,tanh,0.01,0.490724,pca_gdf_que10,"(14, 8)",0.098645,mlp,0.103283,0.560597,0.453383,...,0.080716,0.539007,0.510753,0.540287,0.491986,0.088086,0.089718,0.542778,0.45373,0.543821
