In [42]:
import datetime
import pathlib
import pickle
from importlib import reload

import munch
import numpy as np
import pandas as pd

import shared
import config
import provider_yfinance as provider

# Re-import the local project modules so that edits made on disk are picked up
# without restarting the kernel.
reload(shared)
reload(config)
reload(provider)

# Build the configuration object for '^GDAXI'; the summary it prints is shown
# in the cell output.
cfg = config.get_config('^GDAXI')

# Optional: overwrite download_end_dt so that cached data is used, then persist
# the changed config. Uncomment both lines to enable.
# config.overwrite_end_dt(cfg, '2019-12-19')
# config.save_config(cfg)

config> created config from file: './config.json'
config> config
        - base:
            - config_file_path: /mnt/c/notebooks/sandbox/config.json
        - datasets:
            - stocks: 30
            - benchmarks: 69
        - prepare:
            - data_start_dt: 2018-02-08
            - data_end_dt: 2020-01-07
            - cache_dir: /mnt/c/notebooks/sandbox/cache/20200107/
        - train:            
            - window_trading_days: [3, 5, 21, 35, 50]
            - lag_trading_days: [1, 2, 3, 4, 5]
            - label_max_high_weight: 3.0
            - label_max_close_weight: 1.0
            - settings: 12
        - model:
            - max_samples: 40
            - batch_size: 200
            - max_epochs: 1000
            - base_dir: /mnt/c/notebooks/sandbox/model/20200109/            
        


In [39]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import LSTM, Dense, BatchNormalization, Masking
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K


In [43]:
# Select the last entry of the configured submodel settings.
submodel_settings = cfg.train.settings[-1]
print(f"sm-{submodel_settings.id}> training submodel ...")
# Prepare the data for that submodel; it is passed to create_model and
# train_model further down.
model_data = provider.prepare_submodel_data(cfg, submodel_settings)

sm-nr_12-lookback_100-label_7> training submodel ...


In [44]:
def load_weights(cfg, submodel_settings, model):
    """Restore previously saved model and optimizer weights, if present.

    Looks in ``<cfg.model.base_dir>/<submodel_settings.id>`` for the files
    named by ``cfg.model.model_weights_file_name`` and
    ``cfg.model.optimizer_weights_file_name``. Each file is optional: when it
    does not exist the corresponding weights are left untouched.

    Args:
        cfg: Configuration object exposing ``model.base_dir`` and the two
            weight file names.
        submodel_settings: Settings object whose ``id`` names the submodel
            directory.
        model: Keras model (with an optimizer attached) that is updated in
            place.

    Returns:
        None.
    """
    pth_submodel = pathlib.Path(f"{cfg.model.base_dir}/{submodel_settings.id}")
    f_model_weights = pth_submodel.joinpath(cfg.model.model_weights_file_name)
    f_optimizer_weights = pth_submodel.joinpath(cfg.model.optimizer_weights_file_name)
    shared.mkdirs(pth_submodel)

    if f_model_weights.is_file():
        # Pass a plain string: some older Keras/h5py releases do not accept
        # pathlib.Path objects as file names.
        model.load_weights(str(f_model_weights))
        print(f"model> loaded model weights from '{f_model_weights.resolve()}'")

    if f_optimizer_weights.is_file():
        # Presumably required so that the optimizer's weight variables exist
        # before set_weights is called (private Keras API - verify on upgrade).
        model._make_train_function()
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # optimizer weight files that this notebook wrote itself.
        with open(f_optimizer_weights.resolve(), 'rb') as f:
            optimizer_weights = pickle.load(f)
        model.optimizer.set_weights(optimizer_weights)
        print(f"model> loaded optimizer weights from '{f_optimizer_weights.resolve()}'")

def save_weights(cfg, submodel_settings, model):
    """Persist the model weights and the optimizer state of a submodel.

    Both files are written to ``<cfg.model.base_dir>/<submodel_settings.id>``
    using the file names from ``cfg.model.model_weights_file_name`` and
    ``cfg.model.optimizer_weights_file_name``. The optimizer weights are
    stored as a pickle of the values returned by ``K.batch_get_value``.

    Args:
        cfg: Configuration object exposing ``model.base_dir`` and the two
            weight file names.
        submodel_settings: Settings object whose ``id`` names the submodel
            directory.
        model: Keras model whose weights and optimizer state are saved.

    Returns:
        None.
    """
    pth_submodel = pathlib.Path(f"{cfg.model.base_dir}/{submodel_settings.id}")
    f_model_weights = pth_submodel.joinpath(cfg.model.model_weights_file_name)
    f_optimizer_weights = pth_submodel.joinpath(cfg.model.optimizer_weights_file_name)
    shared.mkdirs(pth_submodel)

    # Pass a plain string: some older Keras/h5py releases do not accept
    # pathlib.Path objects as file names.
    model.save_weights(str(f_model_weights))
    print(f"model> saved model weights to '{f_model_weights.resolve()}'")

    # Fetch the values before opening the file so that a failure here cannot
    # leave an empty or truncated optimizer file behind.
    optimizer_weights = K.batch_get_value(model.optimizer.weights)
    with open(f_optimizer_weights.resolve(), 'wb') as f:
        pickle.dump(optimizer_weights, f)
    print(f"model> saved optimizer weights to '{f_optimizer_weights.resolve()}'")

def create_model(cfg, submodel_settings, model_data):
    """Build and compile the stacked-LSTM regression model for one submodel.

    The network is BatchNormalization -> Masking -> three LSTM layers ->
    Dense(1), compiled with mean squared error loss and the Adam optimizer.
    After compilation, any weights previously saved for this submodel are
    restored through ``load_weights``.

    Args:
        cfg: Configuration object, forwarded to ``load_weights``.
        submodel_settings: Settings object; ``lookback_days`` defines the
            input sequence length.
        model_data: DataFrame with an ``X`` column; the feature count is read
            from its first row.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # Feature count is the length of the innermost sequence of the first X
    # entry (the nesting depth is dictated by how model_data was prepared).
    num_features = len(model_data.X.head(1).tolist()[0][0][0][0])
    input_length = submodel_settings.lookback_days
    input_dim = num_features
    # Single regression target, matching the label reshape in train_model.
    output_dim = 1

    model = Sequential()
    model.add(BatchNormalization(input_shape=(input_length, input_dim)))
    # NOTE(review): Masking() follows BatchNormalization, so padded timesteps
    # may no longer equal the mask value once normalized - confirm that this
    # layer order is intended.
    model.add(Masking())
    model.add(LSTM(input_dim, dropout=.3, return_sequences=True, activation="softsign"))
    model.add(LSTM(input_dim//2, dropout=.3, return_sequences=True, activation="softsign"))
    model.add(LSTM(input_dim//2, dropout=.3, activation="softsign"))
    model.add(Dense(output_dim))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_absolute_error', 'mean_squared_error'])

    # model.summary() prints the table itself and returns None, so it must
    # not be interpolated into the message.
    print('model> model created:')
    model.summary()
    load_weights(cfg, submodel_settings, model)
    return model

def _mean_monitor_window(values, center, radius):
    """Average ``values[center - radius:center + radius]`` with bounds clamped at 0.

    Returns None when the window holds no values.
    """
    start = max(center - radius, 0)
    stop = max(center + radius, 0)
    window = values[start:stop]
    if len(window) == 0:
        return None
    return float(np.mean(window))


def train_model(cfg, submodel_settings, model, model_data):
    """Fit the model on the prepared submodel data and save its weights.

    The ``X`` and ``y`` columns of ``model_data`` are stacked into arrays of
    shape ``(num_samples, lookback_days, num_features)`` and
    ``(num_samples, 1)``. Training uses a 10% validation split, early
    stopping, and a checkpoint callback that keeps the best weights in the
    submodel directory. A KeyboardInterrupt stops training without losing the
    weights: they are still saved before returning.

    Args:
        cfg: Configuration object; reads ``model.base_dir``,
            ``model.batch_size``, ``model.max_epochs``,
            ``model.early_stopping_patience`` and the monitored metric name.
        submodel_settings: Settings object providing ``id`` and
            ``lookback_days``.
        model: Compiled Keras model to train in place.
        model_data: DataFrame with ``X`` and ``y`` columns.

    Returns:
        The Keras ``History`` object of the run, or None if training was
        interrupted before a history object existed.
    """
    num_samples = model_data.shape[0]
    num_features = len(model_data.X.head(1).tolist()[0][0][0][0])
    input_length = submodel_settings.lookback_days
    input_dim = num_features
    output_dim = 1
    X = np.hstack(np.asarray(model_data.X)).reshape(num_samples, input_length, input_dim)
    y = np.hstack(np.asarray(model_data.y)).reshape(num_samples, output_dim)

    pth_submodel = f"{cfg.model.base_dir}/{submodel_settings.id}"
    shared.mkdirs(pth_submodel)

    # NOTE(review): 'validaion_monitor' looks like a misspelling of
    # 'validation_monitor'; it is kept because the key is defined by the
    # config module, which is not visible here.
    monitor = cfg.model.validaion_monitor
    patience = cfg.model.early_stopping_patience
    fit_params = {
        "batch_size": cfg.model.batch_size,
        "epochs": cfg.model.max_epochs,
        "verbose": 1,
        "validation_split": 0.1,
        "shuffle": True,
        "callbacks": [
            EarlyStopping(verbose=True, patience=patience, monitor=monitor),
            # Doubled braces keep '{epoch:02d}' and '{<monitor>:.4f}' as
            # placeholders in the file name handed to ModelCheckpoint.
            ModelCheckpoint(f"{pth_submodel}/best_model_weights_{{epoch:02d}}_{{{monitor}:.4f}}.hdf5", monitor=monitor, verbose=1, save_best_only=True)
        ]
    }

    print('model> fitting ... (Hit CTRL-C to stop early)')
    try:
        history = model.fit(X, y, **fit_params)
    except KeyboardInterrupt:
        print('model> training stopped early!')
        # The model may not have a history yet if the interrupt came very early.
        history = getattr(model, 'history', None)

    # Report the mean of the monitored metric around (last epoch - patience).
    # That index is the best epoch only when early stopping ended the run.
    window_radius = 10
    monitor_values = history.history.get(monitor, []) if history is not None else []
    epochs_run = list(history.epoch) if history is not None else []
    if epochs_run and len(monitor_values) > 0:
        center = int(np.max(epochs_run)) - patience
        mean_value = _mean_monitor_window(monitor_values, center, window_radius)
        if mean_value is not None:
            print(f"model> average +-{window_radius} around best epoch: {mean_value}")

    save_weights(cfg, submodel_settings, model)
    return history

# Build the model for the selected submodel (create_model also restores any
# saved weights), then fit it on the prepared data.
model = create_model(cfg, submodel_settings, model_data)
history = train_model(cfg, submodel_settings, model, model_data)
# TODO(review): the two notes below are placeholders left without code -
# presumably planned runs for a single stock and for all stocks; confirm.
# single stock
# all stocks

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_10 (Batc (None, 100, 1319)         5276      
_________________________________________________________________
masking_10 (Masking)         (None, 100, 1319)         0         
_________________________________________________________________
lstm_26 (LSTM)               (None, 100, 1319)         13923364  
_________________________________________________________________
lstm_27 (LSTM)               (None, 100, 659)          5216644   
_________________________________________________________________
lstm_28 (LSTM)               (None, 659)               3476884   
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 660       
Total params: 22,622,828
Trainable params: 22,620,190
Non-trainable params: 2,638
____________________________________________________________

TypeError: Unrecognized keyword arguments: {'show_accuracy': True}

In [None]:
# X_train / y_train are not defined by any earlier cell, so rebuild the
# training arrays here the same way train_model does. This keeps the cell
# runnable after Restart & Run All.
num_samples = model_data.shape[0]
X_train = np.hstack(np.asarray(model_data.X)).reshape(
    num_samples, submodel_settings.lookback_days, -1
)
y_train = np.hstack(np.asarray(model_data.y)).reshape(num_samples, 1)

# Compare the predictions (column 'X') with the labels (column 'y').
df_test = pd.DataFrame({
    'X': pd.Series(model.predict(X_train).flatten()),
    'y': y_train.flatten()
})
df_test['diff'] = df_test.X - df_test.y
df_test

https://stackoverflow.com/questions/39674713/neural-network-lstm-input-shape-from-dataframe
https://stackoverflow.com/questions/49803503/lstm-preprocessing-build-3d-arrays-from-pandas-data-frame-based-on-id