In [4]:
import numpy as np
import pandas as pd
from importlib import reload
import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import shared
import config
import provider_yfinance as provider
import model
import plot

reload(shared)
reload(config)
reload(provider)
reload(model)
reload(plot)

plot.Notebook()

mpl.rcParams['figure.figsize'] = (16, 9)
mpl.rcParams['axes.grid'] = False
mpl.rcParams['axes.unicode_minus']=False
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)

# most recent data
cfg = config.get_config('^GDAXI')

# select model to validate against 
mdl_cfg = config.get_config('^GDAXI')
config.overwrite_end_dt(mdl_cfg, '2020-01-07')
# overwrite download_end_dt: use cached data
# config.save_config(cfg)

config> created config from file: './config.json'
config> config
        - base:
            - config_file_path: /mnt/c/notebooks/sandbox/config.json
        - datasets:
            - stocks: 30
            - benchmarks: 69
        - prepare:
            - data_start_dt: 2018-02-16
            - data_end_dt: 2020-01-16
            - cache_dir: /mnt/c/notebooks/sandbox/cache/20200116/
        - train:            
            - window_trading_days: [3, 5, 21, 35, 50]
            - lag_trading_days: [1, 2, 3, 4, 5]
            - label_max_high_weight: 3.0
            - label_max_close_weight: 1.0
            - settings: 12
        - model:
            - max_samples: 40
            - batch_size: 32
            - learning_rate: 0.1
            - learning_rate_decay: 0.75
            - lstm_hidden_size: 256
            - early_stopping_patience: 10
            - validation_monitor: val_mean_squared_error
            - max_epochs: 1000
            - base_dir: /mnt/c/notebooks/sandbox/model/20

In [2]:
import csv
import os

In [10]:
eval_result = {}
verbose=0
for submodel_settings in cfg.train.settings:
    print(f'============\n {submodel_settings.id}\n ============')
    rs = {}
    mdl_data = provider.prepare_submodel_data(cfg, submodel_settings)
    tickers = mdl_data.ticker.unique().tolist()
    for ticker_name in tickers:        
        ticker_data = mdl_data[(mdl_data.ticker==ticker_name) & (mdl_data.date==mdl_cfg.train.end_dt)]
        base_date = str(ticker_data.date[-1:].tolist()[0].date())
        print(f'eval> {submodel_settings.id} - {ticker_name} - {base_date} ...')
        mdl = model.create_model(mdl_cfg, submodel_settings, ticker_data, ticker_name, train_mode=False)
        mdl0 = model.create_model(mdl_cfg, submodel_settings, ticker_data, train_mode=False)
        num_samples = ticker_data.shape[0]    
        num_features = len(ticker_data.X.head(1).tolist()[0][0][0][0])
        input_dim = num_features    
        input_length = submodel_settings.lookback_days
        output_dim = 1
        X = np.hstack(np.asarray(ticker_data.X)).reshape(num_samples, input_length, input_dim)[-1:]
        y = np.hstack(np.asarray(ticker_data.y)).reshape(num_samples, output_dim)[-1:]
        X0 = np.hstack(np.asarray(ticker_data.X)).reshape(num_samples, input_length, input_dim)[-1:]
        y0 = np.hstack(np.asarray(ticker_data.y)).reshape(num_samples, output_dim)[-1:]
        mdl_metrics = dict(zip(mdl.metrics_names, mdl.evaluate(X, y, verbose=verbose)))
        mdl0_metrics = dict(zip(mdl.metrics_names, mdl0.evaluate(X0, y0, verbose=verbose)))
        rs[ticker_name] = {
            'date': [base_date],
            'metrics': [
                mdl_metrics['loss'], mdl_metrics['mean_absolute_error'], mdl_metrics['mean_squared_error'],
                mdl0_metrics['loss'], mdl0_metrics['mean_absolute_error'], mdl0_metrics['mean_squared_error'],
            ],
            'y': [round(mdl.predict(X)[0][0]*100)/100, round(y[0][0]*100)/100]
        }
    eval_result[submodel_settings.id] = rs

 lookback_3-label_1
eval> lookback_3-label_1 - 1COV.DE - 2020-01-07 ...
eval> lookback_3-label_1 - ADS.DE - 2020-01-07 ...
eval> lookback_3-label_1 - ALV.DE - 2020-01-07 ...
eval> lookback_3-label_1 - BAS.DE - 2020-01-07 ...
eval> lookback_3-label_1 - BAYN.DE - 2020-01-07 ...
eval> lookback_3-label_1 - BEI.DE - 2020-01-07 ...
eval> lookback_3-label_1 - BMW.DE - 2020-01-07 ...
eval> lookback_3-label_1 - CON.DE - 2020-01-07 ...
eval> lookback_3-label_1 - DAI.DE - 2020-01-07 ...
eval> lookback_3-label_1 - DB1.DE - 2020-01-07 ...
eval> lookback_3-label_1 - DBK.DE - 2020-01-07 ...
eval> lookback_3-label_1 - DPW.DE - 2020-01-07 ...
eval> lookback_3-label_1 - DTE.DE - 2020-01-07 ...
eval> lookback_3-label_1 - EOAN.DE - 2020-01-07 ...
eval> lookback_3-label_1 - FME.DE - 2020-01-07 ...
eval> lookback_3-label_1 - FRE.DE - 2020-01-07 ...
eval> lookback_3-label_1 - HEI.DE - 2020-01-07 ...
eval> lookback_3-label_1 - HEN3.DE - 2020-01-07 ...
eval> lookback_3-label_1 - IFX.DE - 2020-01-07 ...
eval> l

In [11]:
csv_output_stocks = []
rs = ['ticker_name']
for submodel_settings in cfg.train.settings:
    prefix = submodel_settings.id + '_'
    rs += [
        'date', 'y_predicted', 'y_actual', prefix + 'mdl_loss', prefix + 'mdl_mae', prefix + 'mdl_mse', prefix + 'mdl0_loss', prefix + 'mdl0_mae', prefix + 'mdl0_mse'
    ]
csv_output_stocks.append(rs)
for ticker_name in cfg.datasets.raw.stocks:
    rs = [ticker_name]
    for submodel_settings in cfg.train.settings:
        if ticker_name in eval_result[submodel_settings.id]:
            ticker_result = eval_result[submodel_settings.id][ticker_name]
            rs += ticker_result['date']
            rs += ticker_result['y']
            rs += ticker_result['metrics']
        else:
            rs += [None] * 9
    csv_output_stocks.append(rs)  
    
with open(os.path.join(cfg.model.base_dir, 'model_eval_pivot.tsv'), 'w', newline='\n', encoding='utf-8') as fp:
    writer = csv.writer(fp, delimiter='\t')
    for rs in csv_output_stocks:
        writer.writerow(rs)


In [12]:
csv_output = [
    ['ticker_name', 'submodel', 'date', 'y_predicted', 'y_actual', 'mdl_loss', 'mdl_mae', 'mdl_mse', 'mdl0_loss', 'mdl0_mae', 'mdl0_mse']
]
for ticker_name in cfg.datasets.raw.stocks:    
    for submodel_settings in cfg.train.settings:
        rs = [ticker_name, submodel_settings.id]
        if ticker_name in eval_result[submodel_settings.id]:
            ticker_result = eval_result[submodel_settings.id][ticker_name]
            rs += ticker_result['date']
            rs += ticker_result['y']
            rs += ticker_result['metrics']
        else:
            rs += [None] * 9
        csv_output.append(rs)
    
with open(os.path.join(cfg.model.base_dir, 'model_eval.tsv'), 'w', newline='\n', encoding='utf-8') as fp:
    writer = csv.writer(fp, delimiter='\t')
    for rs in csv_output:
        writer.writerow(rs)


In [53]:
# rank models by performance
df_eval = pd.read_csv(os.path.join(cfg.model.base_dir, 'model_eval.tsv'), sep='\t', low_memory=False)
# ticker model scores
scores = [10,8,5,3,2,1]
s_scores = None
for ticker_name in cfg.datasets.raw.stocks:        
    idx = df_eval[df_eval.ticker_name==ticker_name].sort_values(by='mdl_mae').index    
    scores = scores + [0] * (len(cfg.train.settings) - len(scores))
    if s_scores is None:
        s_scores = pd.Series(scores, index=idx)
    else:
        s_scores = pd.concat([s_scores, pd.Series(scores, index=idx)])
df_eval['scores'] = s_scores
# overall model scores
scores0 = np.array(scores) / 2
s_scores0 = None
for ticker_name in cfg.datasets.raw.stocks:        
    idx = df_eval[df_eval.ticker_name==ticker_name].sort_values(by='mdl0_mae').index    
    scores = scores + [0] * (len(cfg.train.settings) - len(scores))
    if s_scores0 is None:
        s_scores0 = pd.Series(scores, index=idx)
    else:
        s_scores0 = pd.concat([s_scores0, pd.Series(scores, index=idx)])
df_eval['scores0'] = s_scores0
df_eval['scores_sum'] = df_eval.scores + df_eval.scores0
df_eval['ensemble_weight'] = df_eval.submodel.apply(lambda x: [s for s in cfg.train.settings if x==s.id][0].ensemble_weight)
df_eval['scores_weighted'] = df_eval.scores_sum * df_eval.ensemble_weight
df_eval.groupby(['submodel']).agg(sum).sort_values('scores_weighted', ascending=False)[['scores_weighted']]

Unnamed: 0_level_0,scores_weighted
submodel,Unnamed: 1_level_1
lookback_5-label_1,743.4
lookback_7-label_1,424.2
lookback_3-label_1,399.1
lookback_5-label_3,262.0
lookback_21-label_5,229.0
lookback_3-label_2,151.5
lookback_100-label_7,146.0
lookback_7-label_3,57.8
lookback_14-label_3,56.0
lookback_10-label_3,17.0


In [40]:
df = df_eval[df_eval.submodel.isin([
    'lookback_3-label_1',
    'lookback_3-label_2',
    'lookback_5-label_1',
    'lookback_21-label_5',
    'lookback_100-label_7'    
])].groupby('ticker_name').agg('mean').sort_values('y_predicted', ascending=False)[:10]
df = df[df.y_predicted > 1.]
top3_stocks = df.index.tolist()
df

Unnamed: 0_level_0,y_predicted,y_actual,mdl_loss,mdl_mae,mdl_mse,mdl0_loss,mdl0_mae,mdl0_mse,scores,scores0,scores_sum
ticker_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
WDI.DE,2.752,5.24,37.508652,3.542301,37.508651,41.3756,4.182084,41.3756,3.0,1.4,4.4
1COV.DE,2.372,2.93,3.391878,1.564816,3.391878,2.069866,1.103788,2.069866,4.6,5.0,9.6
FME.DE,2.1,2.154,0.887311,0.712088,0.887311,1.74134,1.055166,1.74134,5.4,3.0,8.4
HEN3.DE,1.83,1.822,1.409984,0.949741,1.409984,1.117052,0.753138,1.117052,4.2,4.4,8.6
HEI.DE,1.754,1.536,2.382038,1.442502,2.382038,1.434784,1.161316,1.434784,3.0,1.8,4.8
BMW.DE,1.646,0.886,2.023732,0.98921,2.023732,2.442053,1.305365,2.442053,4.6,2.2,6.8
IFX.DE,1.594,3.366,5.309187,2.121759,5.309186,3.15344,1.451604,3.15344,5.4,5.2,10.6
SAP.DE,1.552,2.292,1.304195,0.981571,1.304195,1.289976,0.896865,1.289976,5.4,4.8,10.2
CON.DE,1.444,3.702,6.07903,2.262204,6.07903,4.836563,1.888777,4.836563,4.6,5.0,9.6
LIN.DE,1.44,1.884,0.665589,0.57001,0.665589,1.261204,0.825652,1.261204,5.2,3.8,9.0


In [42]:
df = df_eval[df_eval.submodel.isin([
    'lookback_3-label_1',
    'lookback_3-label_2',
    'lookback_5-label_1',
]) & df_eval.ticker_name.isin(top3_stocks)].groupby('ticker_name').agg('max').sort_values('y_predicted', ascending=False)
df = df[df.y_predicted > 1.]
df

Unnamed: 0_level_0,submodel,date,y_predicted,y_actual,mdl_loss,mdl_mae,mdl_mse,mdl0_loss,mdl0_mae,mdl0_mse,scores,scores0,scores_sum
ticker_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
WDI.DE,lookback_5-label_1,2020-01-07,2.21,1.92,3.615715,1.901503,3.615715,5.493834,2.343893,5.493834,10,5,12
HEN3.DE,lookback_5-label_1,2020-01-07,1.23,2.05,3.375018,1.837122,3.375018,3.827623,1.956431,3.827623,10,8,18
FME.DE,lookback_5-label_1,2020-01-07,1.06,2.39,3.648036,1.909983,3.648036,4.933257,2.221094,4.933257,5,8,13
1COV.DE,lookback_5-label_1,2020-01-07,1.01,3.47,8.10773,2.847408,8.10773,7.129764,2.670162,7.129764,8,8,16
