In [1]:
import os, sys
import copy
from collections import Counter
from datetime import datetime

sys.path.append('../src/')

from joblib import dump, load
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, SelectFromModel, RFECV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, PolynomialFeatures
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import data.dataset as dtst
import features.preprocessing as prep
import visualization.visualize as vis
import models.evaluate_model as evl

c:\users\weldl\miniconda3\envs\i2a2-fm\lib\site-packages\numpy\.libs\libopenblas.4SP5SUA7CBGXUEOC35YP2ASOICYYEQZZ.gfortran-win_amd64.dll
c:\users\weldl\miniconda3\envs\i2a2-fm\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll


In [2]:
# Loading the dataset
# `dtst` is the project module `data.dataset`, importable because '../src/' was appended to
# sys.path in the imports cell. The helper's result is unpacked into a train and a test frame.
folderpath = '../data/processed'
df_train, df_test = dtst.load_stocks_data(folderpath)

In [3]:
# Peek at the training frame (the notebook display truncates it to the first and last rows).
df_train

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,macd,signal,histogram,...,pref_ema21_perc,pref_ema50_perc,ema9gt21,ema9gt50,ema21gt50,open_close_diff_perc,open_close_diff_ratio,min_max_diff_ratio,y_target,ticker
2000-01-03,2000-01-03,99.724503,100.157043,96.408363,96.840897,71.915710,1291300.0,4.450243,4.348044,0.102199,...,0.054779,0.132978,1,1,1,-0.028916,-0.029777,0.038710,1.0,AA
2000-01-04,2000-01-04,96.840897,97.946281,96.360298,97.561798,72.451111,1859900.0,4.341700,4.346775,-0.005075,...,0.056615,0.135117,1,1,1,0.007444,0.007389,0.016256,2.0,AA
2000-01-05,2000-01-05,97.561798,103.713478,97.321503,103.329002,76.733902,2598000.0,4.667242,4.410869,0.256374,...,0.107091,0.192759,1,1,1,0.059113,0.055814,0.061860,2.0,AA
2000-01-06,2000-01-06,103.329002,103.617363,101.550781,101.598839,75.449089,3740800.0,4.731090,4.474913,0.256177,...,0.079860,0.164894,1,1,1,-0.016744,-0.017029,0.020341,2.0,AA
2000-01-07,2000-01-07,101.598839,104.001839,101.358543,101.406601,75.306320,3774100.0,4.711863,4.522303,0.189560,...,0.070246,0.155319,1,1,1,-0.001892,-0.001896,0.026066,1.0,AA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-26,2020-03-26,10.230000,11.430000,10.230000,11.100000,11.100000,189500.0,-0.252450,-0.125091,-0.127359,...,0.008091,0.022480,0,0,1,0.085044,0.078378,0.108108,2.0,ZYXI
2020-03-27,2020-03-27,10.700000,10.980000,10.060000,10.300000,10.300000,145000.0,-0.272685,-0.154610,-0.118076,...,-0.059041,-0.049303,0,0,1,-0.037383,-0.038835,0.089320,2.0,ZYXI
2020-03-30,2020-03-30,10.160000,11.060000,10.160000,10.800000,10.800000,162300.0,-0.245546,-0.172797,-0.072749,...,-0.012164,-0.003029,0,0,1,0.062992,0.059259,0.083333,2.0,ZYXI
2020-03-31,2020-03-31,10.680000,11.140000,10.590000,11.070000,11.070000,280400.0,-0.199946,-0.178227,-0.021719,...,0.011380,0.021019,0,0,1,0.036517,0.035230,0.049684,1.0,ZYXI


In [4]:
# Peek at the test frame (the notebook display truncates it to the first and last rows).
df_test

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,macd,signal,histogram,...,pref_ema21_perc,pref_ema50_perc,ema9gt21,ema9gt50,ema21gt50,open_close_diff_perc,open_close_diff_ratio,min_max_diff_ratio,y_target,ticker
2000-01-31,2000-01-31,48.327969,48.372677,46.316166,47.344421,40.718338,1040300.0,2.146637,2.865437,-0.718800,...,0.004712,0.116332,1,1,1,-0.020352,-0.020774,0.043437,1.0,A
2000-02-01,2000-02-01,47.389126,51.502148,47.389126,50.786839,43.678993,1404200.0,2.180930,2.728536,-0.547606,...,0.070199,0.188297,1,1,1,0.071698,0.066901,0.080986,2.0,A
2000-02-02,2000-02-02,51.412731,54.721031,51.189198,54.721031,47.062561,1945100.0,2.496782,2.682185,-0.185403,...,0.137273,0.266425,1,1,1,0.064348,0.060458,0.064543,2.0,A
2000-02-03,2000-02-03,53.782188,55.615166,52.798641,55.615166,47.831566,1779500.0,2.787118,2.703172,0.083947,...,0.139707,0.272787,1,1,1,0.034081,0.032958,0.050643,2.0,A
2000-02-04,2000-02-04,55.615166,55.615166,53.648067,54.542202,46.908756,1145600.0,2.897235,2.741984,0.155251,...,0.105885,0.236198,1,1,1,-0.019293,-0.019672,0.036066,1.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-26,2020-03-26,6.830000,7.460000,6.640000,7.370000,7.370000,349000.0,-1.255083,-1.218581,-0.036502,...,-0.071352,-0.217503,0,0,0,0.079063,0.073270,0.111262,2.0,ZTR
2020-03-27,2020-03-27,7.040000,7.640000,7.000000,7.470000,7.470000,157400.0,-1.129932,-1.200851,0.070919,...,-0.053697,-0.200399,0,0,0,0.061080,0.057564,0.085676,2.0,ZTR
2020-03-30,2020-03-30,7.370000,7.560000,7.210000,7.390000,7.390000,262700.0,-1.025384,-1.165758,0.140374,...,-0.058367,-0.202426,0,0,0,0.002714,0.002706,0.047361,1.0,ZTR
2020-03-31,2020-03-31,7.180000,7.450000,7.140000,7.200000,7.200000,259800.0,-0.946944,-1.121995,0.175051,...,-0.075638,-0.216079,0,0,0,0.002786,0.002778,0.043056,0.0,ZTR


In [5]:
# Summary statistics of the training frame, transposed so each described column is a row.
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Open,7305243.0,10545.09,824334.8,0.001,8.6,16.67,33.98,191406300.0
High,7305243.0,10917.66,850981.4,0.001,8.75,16.9,34.439999,191406300.0
Low,7305243.0,10083.62,788957.8,0.001,8.45,16.43,33.490002,185156300.0
Close,7305243.0,10463.5,817020.2,0.001,8.6,16.67,33.98,187500000.0
Adj Close,7305243.0,10458.24,817020.2,-1.2029,6.121684,12.85848,28.19797,187500000.0
Volume,7305243.0,1400163.0,7340627.0,0.0,19600.0,137200.0,703800.0,1855410000.0
macd,7305243.0,-177.8518,43820.9,-9982415.0,-0.173774,0.01475,0.24378,11758480.0
signal,7305243.0,-177.0724,41441.01,-8893624.0,-0.162481,0.014977,0.232655,9118558.0
histogram,7305243.0,-0.77937,12775.06,-3348873.0,-0.060053,0.000698,0.062878,5941457.0
williams_r,7305243.0,-48.32798,30.7139,-1540.0,-75.0,-47.555584,-21.176497,9150.0


In [6]:
# Index range, column names and dtypes of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7305243 entries, 2000-01-03 to 2020-04-01
Data columns (total 34 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Date                   object 
 1   Open                   float64
 2   High                   float64
 3   Low                    float64
 4   Close                  float64
 5   Adj Close              float64
 6   Volume                 float64
 7   macd                   float64
 8   signal                 float64
 9   histogram              float64
 10  williams_r             float64
 11  sma9                   float64
 12  sma21                  float64
 13  sma50                  float64
 14  pref_sma9_perc         float64
 15  pref_sma21_perc        float64
 16  pref_sma50_perc        float64
 17  sma9gt21               int32  
 18  sma9gt50               int32  
 19  sma21gt50              int32  
 20  ema9                   float64
 21  ema21                  float64
 22  ema

In [7]:
# Summary statistics of the test frame, transposed so each described column is a row.
df_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Open,7586904.0,9213983.0,918136800.0,0.002,9.21,18.389999,36.799999,182498400000.0
High,7586904.0,9394641.0,934622700.0,0.002,9.3725,18.65,37.310001,182498400000.0
Low,7586904.0,8984557.0,895611100.0,0.00167,9.05216,18.110001,36.27,170478000000.0
Close,7586904.0,9174038.0,913768400.0,0.00167,9.21,18.389999,36.806343,173502000000.0
Adj Close,7586904.0,9174032.0,913768400.0,-1.529784,6.401956,14.0,30.455268,173502000000.0
Volume,7586904.0,1314177.0,20046500.0,0.0,17300.0,124400.0,679100.0,4483504000.0
macd,7586904.0,-93195.88,38687630.0,-13866990000.0,-0.189418,0.015269,0.259857,12602680000.0
signal,7586904.0,-87999.35,36644300.0,-12539740000.0,-0.176813,0.015538,0.247893,10588800000.0
histogram,7586904.0,-5196.528,12810830.0,-5143802000.0,-0.064806,0.000759,0.067672,4440019000.0
williams_r,7586904.0,-48.40207,30.40703,-149.9999,-75.000025,-47.701151,-21.186452,94.99999


In [8]:
# Index range, column names and dtypes of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7586904 entries, 2000-01-31 to 2020-04-01
Data columns (total 34 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Date                   object 
 1   Open                   float64
 2   High                   float64
 3   Low                    float64
 4   Close                  float64
 5   Adj Close              float64
 6   Volume                 float64
 7   macd                   float64
 8   signal                 float64
 9   histogram              float64
 10  williams_r             float64
 11  sma9                   float64
 12  sma21                  float64
 13  sma50                  float64
 14  pref_sma9_perc         float64
 15  pref_sma21_perc        float64
 16  pref_sma50_perc        float64
 17  sma9gt21               int32  
 18  sma9gt50               int32  
 19  sma21gt50              int32  
 20  ema9                   float64
 21  ema21                  float64
 22  ema

# Data Exploration


In [9]:
# Keep the transposed summary table and remember which columns it describes.
df_train_desc = df_train.describe().T
features_list = df_train_desc.index.tolist()
df_train_desc

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Open,7305243.0,10545.09,824334.8,0.001,8.6,16.67,33.98,191406300.0
High,7305243.0,10917.66,850981.4,0.001,8.75,16.9,34.439999,191406300.0
Low,7305243.0,10083.62,788957.8,0.001,8.45,16.43,33.490002,185156300.0
Close,7305243.0,10463.5,817020.2,0.001,8.6,16.67,33.98,187500000.0
Adj Close,7305243.0,10458.24,817020.2,-1.2029,6.121684,12.85848,28.19797,187500000.0
Volume,7305243.0,1400163.0,7340627.0,0.0,19600.0,137200.0,703800.0,1855410000.0
macd,7305243.0,-177.8518,43820.9,-9982415.0,-0.173774,0.01475,0.24378,11758480.0
signal,7305243.0,-177.0724,41441.01,-8893624.0,-0.162481,0.014977,0.232655,9118558.0
histogram,7305243.0,-0.77937,12775.06,-3348873.0,-0.060053,0.000698,0.062878,5941457.0
williams_r,7305243.0,-48.32798,30.7139,-1540.0,-75.0,-47.555584,-21.176497,9150.0


In [10]:
# vis.plot_features_hist(df_train, features_list)

# Data Preparation

In [11]:
# Work on the first 100000 rows only (positional slice).
# NOTE(review): the displayed frame looks ordered by ticker, so this presumably keeps only the
# first few tickers rather than a representative sample — confirm this is intended.
df_train = df_train.iloc[:100000]

In [12]:
# Name of the label column, reused by the cells below.
Y_TARGET = 'y_target'

# Class balance of the training frame.
df_train.loc[:, Y_TARGET].value_counts()

2.0    34940
0.0    32898
1.0    32162
Name: y_target, dtype: int64

In [13]:
# Separate the label column from the remaining columns, for both frames.
X_train, y_train = df_train.drop(columns=Y_TARGET), df_train[Y_TARGET]

X_test, y_test = df_test.drop(columns=Y_TARGET), df_test[Y_TARGET]

In [14]:
# Report how many samples of each class the two label vectors contain.
for template, labels in (('Train dataset %s', y_train), ('Test dataset  %s', y_test)):
    print(template % Counter(labels))

Train dataset Counter({2.0: 34940, 0.0: 32898, 1.0: 32162})
Test dataset  Counter({1.0: 2644610, 2.0: 2551487, 0.0: 2390807})


# Preprocessing

In [15]:
# List every column name; the feature groups in the next cell are picked from these.
df_train.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'macd',
       'signal', 'histogram', 'williams_r', 'sma9', 'sma21', 'sma50',
       'pref_sma9_perc', 'pref_sma21_perc', 'pref_sma50_perc', 'sma9gt21',
       'sma9gt50', 'sma21gt50', 'ema9', 'ema21', 'ema50', 'pref_ema9_perc',
       'pref_ema21_perc', 'pref_ema50_perc', 'ema9gt21', 'ema9gt50',
       'ema21gt50', 'open_close_diff_perc', 'open_close_diff_ratio',
       'min_max_diff_ratio', 'y_target', 'ticker'],
      dtype='object')

In [16]:
# Feature columns grouped by the kind of preprocessing they receive.
numeric_features = [
    'macd', 'signal', 'histogram', 'williams_r',
    'pref_sma9_perc', 'pref_sma21_perc', 'pref_sma50_perc',
    'pref_ema9_perc', 'pref_ema21_perc', 'pref_ema50_perc',
    'open_close_diff_perc', 'open_close_diff_ratio', 'min_max_diff_ratio',
]
boolean_features = [
    'sma9gt21', 'sma9gt50', 'sma21gt50',
    'ema9gt21', 'ema9gt50', 'ema21gt50',
]
# No ordinal or categorical columns are used at the moment.
ordinary_features = []
categoric_features = []

feature_groups = (numeric_features, boolean_features, ordinary_features, categoric_features)
print(sum(len(group) for group in feature_groups), 'features selected')

19 features selected


In [17]:
# Features encoding
# Numeric columns: fill gaps with the median, then rescale with MinMaxScaler.
numeric_pipeline = Pipeline([
    ('num imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('num scaling', MinMaxScaler()),
])
# Boolean flag columns: fill gaps with the most frequent value, no scaling.
boolean_pipeline = Pipeline([
    ('bool imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
])
# Every column not listed in one of the two groups is dropped.
features_encoder = ColumnTransformer(
    transformers=[
        ("numeric features", numeric_pipeline, numeric_features),
        ("boolean features", boolean_pipeline, boolean_features),
    ],
    remainder='drop',
)

In [18]:
# Features creation
# With degree=1 and no bias column this step emits the encoded columns unchanged; raise
# `degree` to generate actual interaction terms.
features_creator = PolynomialFeatures(
    degree=1,
    interaction_only=True,
    include_bias=False,
)

In [19]:
# Features selection
# Recursive feature elimination with 5-fold cross-validation, dropping one feature per step
# and using a random forest as the ranking estimator. The forest is seeded so that the
# selected feature set is reproducible across runs (same seed as the hyper-parameter
# searches later in the notebook).
# estimator = SVC(kernel="linear")
estimator = RandomForestClassifier(random_state=42)
features_selector = RFECV(estimator, step=1, cv=5, n_jobs=-1)
# features_selector = VarianceThreshold(threshold=0)

In [20]:
# Pre-processing pipeline: encode -> create -> select.
# Each stage is deep-copied so fitting the pipeline leaves the template objects unfitted.
preprocessor = Pipeline(steps=[
    ('features encoding', copy.deepcopy(features_encoder)),
    ('features creation', copy.deepcopy(features_creator)),
    ('features selection', copy.deepcopy(features_selector)),
])

In [21]:
# Fit the preprocessing pipeline on the training data only, then apply the fitted
# transformation to the test data.
# NOTE(review): the saved run of this cell ended in a MemoryError. The test frame has about
# 7.6M rows and the feature selector runs with n_jobs=-1 — presumably one of the two exhausts
# RAM; confirm which before re-running (e.g. by transforming the test frame in chunks).
X_train_encoded = preprocessor.fit_transform(X_train, y_train)
X_test_encoded = preprocessor.transform(X_test)

MemoryError: could not allocate 3670016 bytes

In [None]:
# Wrap the encoded training matrix in a DataFrame for display.
pd.DataFrame(X_train_encoded)

In [None]:
# Wrap the encoded test matrix in a DataFrame for display.
pd.DataFrame(X_test_encoded)

# Model Training

In [None]:
# Training settings shared by every hyper-parameter search below.
n_jobs, n_iter, cv = 4, 100, 10  # parallel workers, sampled candidates, CV folds
scoring = 'balanced_accuracy'

# Aliases for the training matrix and labels that the searches are fitted on.
X_train_, y_train_ = X_train_encoded, y_train

In [None]:
# DecisionTreeClassifier:
# Integer `max_features` candidates larger than the number of columns kept by the feature
# selector make the tree raise at fit time (the search then records a NaN score), so only
# the candidates that fit the encoded matrix are offered to the search.
n_encoded_features = X_train_.shape[1]
param_grid = {'criterion': ["gini", "entropy"],
              'max_depth': [3,4,5,8,10,15],
              'min_samples_split': [2,4,6,8,10,15,20],
              # "auto" is left out: for classifiers it is an alias of "sqrt" (a duplicate
              # candidate) and it is no longer accepted by scikit-learn >= 1.3.
              'max_features': [k for k in (3, 5, 8, 10) if k <= n_encoded_features] + ["sqrt", "log2"],
             }

# Randomized search over the grid above, scored with `scoring` under `cv`-fold CV.
# The tree is seeded like the search itself so a re-run reproduces the same scores.
tree_clf = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), param_distributions=param_grid, n_iter=n_iter, cv=cv,
                              scoring=scoring, n_jobs=n_jobs, verbose=1, random_state=42)
tree_clf.fit(X_train_, y_train_)

In [None]:
# RandomForest:
# Integer `max_features` candidates larger than the number of columns kept by the feature
# selector make the forest raise at fit time (the search then records a NaN score), so only
# the candidates that fit the encoded matrix are offered to the search.
n_encoded_features = X_train_.shape[1]
param_grid = {'n_estimators': [1,3,5,8,10,12,15,20],
              'criterion': ["gini", "entropy"],
              'max_depth': [3,4,5,8,10,15],
              'min_samples_split': [2,4,6,8,10],
              # "auto" is left out: for classifiers it is an alias of "sqrt" (a duplicate
              # candidate) and it is no longer accepted by scikit-learn >= 1.3.
              'max_features': [k for k in (3, 5, 8, 10) if k <= n_encoded_features] + ["sqrt", "log2"],
             }

# Randomized search over the grid above, scored with `scoring` under `cv`-fold CV.
# The forest is seeded like the search itself so a re-run reproduces the same scores.
rf_clf = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_distributions=param_grid, n_iter=n_iter, cv=cv,
                             scoring=scoring, n_jobs=n_jobs, verbose=1, random_state=42)
rf_clf.fit(X_train_, y_train_)

In [None]:
# # KNeighbors:
# param_grid = {'n_neighbors': [1, 3, 5, 10],
#               'weights': ["uniform", "distance"],
#               'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#               'leaf_size': [3, 5, 8, 10, 20, 30, 40],
#              }

# knn_clf = RandomizedSearchCV(KNeighborsClassifier(), param_distributions=param_grid, n_iter=n_iter, cv=cv,
#                              scoring=scoring, n_jobs=n_jobs, verbose=1, random_state=42)
# knn_clf.fit(X_train_, y_train_)

In [None]:
# # MLP:
# param_grid = {'hidden_layer_sizes': [4, 8, 16, 32, 64],
#               'activation': ['identity', 'logistic', 'tanh'],
#               'solver': ['lbfgs', 'sgd', 'adam'],
#               'alpha': [1e-5, 1e-4, 1e-3, 1e-2],
#               'learning_rate': ['constant', 'invscaling', 'adaptive'],
#              }

# mlp_clf = RandomizedSearchCV(MLPClassifier(), param_distributions=param_grid, n_iter=n_iter, cv=cv,
#                              scoring=scoring, n_jobs=n_jobs, verbose=1, random_state=42)
# mlp_clf.fit(X_train_, y_train_)

In [None]:
# Collect the fitted searches so they can be compared and the best one picked below.
# NOTE(review): `evl.plot_estimators_cvperf` is a project helper not shown here — presumably
# it plots each search's cross-validation scores for `scoring`; confirm against its source.
estimators_list = [tree_clf, rf_clf]
evl.plot_estimators_cvperf(estimators_list, error_metric=scoring)

In [None]:
# Selecting the best model and saving it
# Best (NaN-ignoring) mean CV score reached by each search; the search with the highest
# one provides the winning estimator.
best_cv_scores = [np.nanmax(search.cv_results_['mean_test_score']) for search in estimators_list]
estimator = estimators_list[int(np.argmax(best_cv_scores))].best_estimator_
# NOTE(review): the searches use the default refit=True, so `best_estimator_` should already
# be fitted on the full training data and this re-fit looks redundant — confirm.
estimator.fit(X_train_, y_train_)

# Bundle a copy of the fitted preprocessor with the estimator so the model accepts raw frames.
model = Pipeline(steps=[
    ('preprocessor', copy.deepcopy(preprocessor)),
    ('estimator', estimator),
])

In [None]:
# Persist the preprocessing + estimator pipeline to disk.
# (`dump` / `load` are imported together with the other dependencies in the first cell.)
modelpath = '../models/model_enhanced_v4.joblib'
dump(model, modelpath)

# Model Evaluation

In [None]:
# Reload the pipeline from `modelpath`; the evaluation cells below use this reloaded object.
# joblib files are pickle-based, so only load files from a trusted source.
model = load(modelpath)

In [None]:
# Row-normalised confusion matrix on the training data.
# NOTE: `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2; on newer
# versions the equivalent call is `ConfusionMatrixDisplay.from_estimator(...)`.
plot_confusion_matrix(model, X_train, y_train, cmap='viridis', normalize='true')
plt.show()

In [None]:
# Per-class precision / recall / F1 on the training data.
# The labels come from `y_train`, the counterpart of the `X_train` frame being predicted,
# rather than from the `y_train_` search alias — this mirrors the test-set report and keeps
# the cell correct even if the alias is later pointed at a subsample.
y_train_pred = model.predict(X_train)
print(classification_report(y_train, y_train_pred))

In [None]:
# Row-normalised confusion matrix on the test data.
# NOTE: `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2; on newer
# versions the equivalent call is `ConfusionMatrixDisplay.from_estimator(...)`.
plot_confusion_matrix(model, X_test, y_test, cmap='viridis', normalize='true')
plt.show()

In [None]:
# Per-class precision / recall / F1 on the test data.
y_test_pred = model.predict(X_test)
print(classification_report(y_test, y_test_pred))