In [1]:
import numpy as np
import pandas as pd

from nbeats_keras.model import NBeatsNet as NBeatsKeras
from kerasbeats import prep_time_series, NBeatsModel
from tensorflow import keras

2023-11-25 22:25:39.181724: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import pandas as pd
import tscv

def generate_label(df, threshold = 0.002):
    """Attach a three-class ``label`` column computed from ``target_15m``.

    Classes:
        0 -- target strictly between ``-threshold`` and ``threshold`` (or missing)
        1 -- target at or below ``-threshold``
        2 -- target at or above ``threshold``

    The frame is modified in place and the same object is returned.
    """
    target = df['target_15m']
    is_down = target <= -1 * threshold
    is_up = target >= threshold

    df['label'] = 0
    # The "up" class is written last, so it wins if both masks match
    # (only possible when threshold <= 0).
    df.loc[is_down, 'label'] = 1
    df.loc[is_up, 'label'] = 2
    return df

def get_na_features(df, train_features):
    """Group the features that contain missing values by their null count.

    Args:
        df: frame holding the feature columns.
        train_features: column names to inspect.

    Returns:
        dict mapping a null count (> 0) to the list of feature names that
        have exactly that many missing values. Features without missing
        values are left out.
    """
    null_counts = df[train_features].isnull().sum()
    missing = (
        null_counts[null_counts > 0]
        .rename_axis('feat')
        .reset_index(name = 'cnt')
        .sort_values('cnt')
    )
    return dict(missing.groupby('cnt')['feat'].agg(list))

def normalize_float_columns(df, features):
  """Impute and standardise the float columns among ``features`` in place.

  Missing values are forward-filled, and any gap that remains (e.g. at the
  start of the frame) is filled with the column mean. Each float column is
  then shifted by its mean and divided by its standard deviation. Non-float
  columns are left untouched.

  Args:
      df: frame to normalise; it is modified in place.
      features: candidate column names; only the float-typed ones are used.

  Returns:
      (df, means, std) where ``means`` and ``std`` are float32 Series indexed
      by the normalised columns, so the same transform can be re-applied to
      other data. ``std`` holds the scale actually used for the division.
  """
  float_cols = df[features].select_dtypes(include = [float]).columns
  means = df[float_cols].mean().astype('float32')
  std = df[float_cols].std().astype('float32')
  # A constant column has std == 0 and a single-row column has std == NaN;
  # dividing by either would turn the whole column into NaN/inf, so fall
  # back to a unit scale (the centred column then simply becomes 0).
  std = std.where(std > 0, 1.0)
  df[float_cols] = df[float_cols].ffill().fillna(means)
  df[float_cols] = (df[float_cols] - means) / std
  return df, means, std

In [3]:
# Target window (in minutes) that selects which feature table to load.
window = 30
file = f'target_{window}m_feature.csv'

# The first CSV row is the header and the first column becomes the index.
corr = pd.read_csv(
    f'../output/feature_corr/{file}',
    header = 0,
    index_col = 0,
)

In [4]:
df = pd.read_feather('../data/df_btc_with_features_5m_spot.feather')

# One-row percentage change of the close; rows without a defined change
# (e.g. the very first one) are dropped.
df['target_5m'] = df['close'].pct_change(1)
df = df.dropna(subset = ['target_5m'], axis = 0)

open_time = df['open_time']
start_time, end_time = open_time.min(), open_time.max()

# Split on distinct timestamps: first 70% train, next 20% validation, last 10% test.
# NOTE(review): `unique()` keeps order of first appearance, so this assumes the
# frame is already sorted by open_time -- confirm.
dates = open_time.unique()
n = len(dates)
train_idx, valid_idx = int(0.7 * n), int(0.9 * n)
train_end, valid_end = dates[train_idx], dates[valid_idx]

in_train = open_time < train_end
in_valid = (train_end <= open_time) & (open_time < valid_end)
in_test = open_time >= valid_end

train_df = df.loc[in_train].reset_index(drop=True)
valid_df = df.loc[in_valid].reset_index(drop=True)

test_df = df.loc[in_test].reset_index(drop=True)

In [5]:
# One integer group id per calendar day, numbered in order of first appearance.
stamp = train_df['open_time'].dt
day_key = stamp.day.astype(str) + '_' + stamp.month.astype(str) + '_' + stamp.year.astype(str)
groups = pd.factorize(day_key)[0]

# Five splits; group_gap is expressed in groups, i.e. presumably 31 days kept
# between the training and validation part of each split -- confirm against tscv.
cv = tscv.PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=31)

In [6]:
# Window configuration shared by the data preparation and the model:
# `horizon` steps are forecast, and the model's time input is declared with
# lookback * horizon values.
horizon = 6
lookback = 10

In [7]:
# Run tag used in the checkpoint file names. The output sub-directory is
# derived from it so the two values cannot drift apart between runs.
date = '11_25'
directory = f'spot_data_{date}'

In [8]:
import tensorflow as tf
tf.random.set_seed(42)
# Import from the public `tensorflow.keras` namespace. `tensorflow.python.*`
# is a private module path, and the models in this notebook are built with
# `tf.keras`, so the backend helper and callbacks must come from the same API.
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

def create_nbeat_mlp(num_columns, num_labels, lookback, horizon, hidden_units, dropout_rates, ls=1e-2, lr=1e-3):
    """Build and compile a two-input network: N-BEATS branch + feature MLP.

    The first input (``lookback * horizon`` values) goes through a generic
    kerasbeats N-BEATS layer; its output is concatenated with the second
    input (``num_columns`` values) and passed through a stack of
    Dense -> BatchNorm -> swish -> Dropout blocks.

    Args:
        num_columns: width of the second (tabular) input.
        num_labels: number of units of the final ``action`` Dense layer.
        lookback, horizon: forwarded to ``NBeatsModel``; their product is the
            width of the first input.
        hidden_units: ``hidden_units[0]`` is the N-BEATS neuron count, the
            remaining entries are the Dense layer widths.
        dropout_rates: ``dropout_rates[0]`` is applied right after the
            concatenation, ``dropout_rates[i]`` after the i-th Dense block.
        ls: accepted for interface compatibility; not used in the body.
        lr: learning rate for both the N-BEATS model and the Adam optimizer.

    Returns:
        A compiled Keras model with inputs ``[time_input, xcons]``, MSE loss
        and an MSE metric named ``mse`` on the ``action`` output.
    """
    kl = tf.keras.layers

    # Time-series branch.
    nbeats = NBeatsModel(
        model_type = 'generic',
        lookback = lookback,
        horizon = horizon,
        learning_rate = lr,
        batch_size = 4096,
        num_generic_neurons = hidden_units[0],
    )
    nbeats.build_layer()
    time_input = kl.Input(shape = (lookback * horizon, ))
    x_nb = nbeats.model_layer(time_input)

    # Tabular branch, merged with the N-BEATS output.
    xcons = kl.Input(shape = (num_columns, ))
    x = kl.Concatenate()([xcons, x_nb])
    x = kl.BatchNormalization()(x)
    x = kl.Dropout(dropout_rates[0])(x)

    # hidden_units[0] was consumed by the N-BEATS branch, so start at 1.
    for idx, units in enumerate(hidden_units[1:], start = 1):
        x = kl.Dense(units)(x)
        x = kl.BatchNormalization()(x)
        x = kl.Activation('swish')(x)
        x = kl.Dropout(dropout_rates[idx])(x)

    out = kl.Dense(num_labels, name = 'action')(x)
    model = tf.keras.models.Model(inputs = [time_input, xcons], outputs = out)
    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate = lr),
        loss = {'action' : tf.keras.losses.MeanSquaredError()},
        metrics = {'action' : tf.metrics.MeanSquaredError(name = 'mse')},
    )
    return model

In [9]:
# Quick sanity check: (rows, columns) of the training split.
train_df.shape

(460235, 143)

In [10]:
# Tabular inputs: the index labels of the first 50 rows of the correlation table.
train_features_test = corr.iloc[:50].index.tolist()

params = {'num_columns': len(train_features_test),
          # NOTE(review): the training targets printed by the CV loop have
          # `horizon` columns while the head emits a single value, so the MSE
          # is computed with broadcasting -- confirm whether this should be
          # `horizon` instead of 1.
          'num_labels': 1,
          # Reuse the notebook-level window settings so the model input width
          # always matches the windows built for training.
          'lookback' : lookback,
          'horizon' : horizon,
          'hidden_units': [896, 448, 448, 256],
          # create_nbeat_mlp reads one rate per hidden_units entry; the extra
          # trailing values are ignored.
          'dropout_rates': [0.42409238408801436, 0.10431484318345882,
                            0.49230389137187497, 0.32024444956111164, 0.2716856145683449, 0.4379233941604448],
          'ls': 0,
          'lr': 1e-3,
          }

In [11]:
def _finite_features(frame, fill_values):
    """Return a copy of ``frame`` with NaN/inf removed.

    +/-inf become NaN, gaps are forward-filled (past rows only), remaining
    gaps take ``fill_values`` (per-column), and anything still missing is 0.
    """
    return (
        frame.replace([np.inf, -np.inf], np.nan)
        .ffill()
        .fillna(fill_values)
        .fillna(0.0)
    )


scores = []
batch_size = 4096
# With a single output, Keras logs the metric without the output-name prefix
# (the progress bar shows `mse`), so the validation key is `val_mse`. The
# previous `val_action_mse` key does not exist: checkpointing and early
# stopping never saw a value and reading the history raised a KeyError.
monitor = 'val_mse'

for fold, (train_idx, val_idx) in enumerate(cv.split(train_df, train_df['target_5m'], groups)):
    x_train, x_valid = train_df['target_5m'].iloc[train_idx], train_df['target_5m'].iloc[val_idx]

    # Time span covered by each part of the fold.
    train_times, valid_times = train_df['open_time'].iloc[train_idx], train_df['open_time'].iloc[val_idx]
    min_train, max_train = train_times.min().to_pydatetime(), train_times.max().to_pydatetime()
    min_valid, max_valid = valid_times.min().to_pydatetime(), valid_times.max().to_pydatetime()

    x_tr, y_tr = prep_time_series(x_train, lookback = lookback, horizon = horizon)
    x_val, y_val = prep_time_series(x_valid, lookback = lookback, horizon = horizon)

    # Rows lost to windowing = input window + target window - 1.
    cutoff_tr, cutoff_val = x_train.shape[0] - x_tr.shape[0], x_valid.shape[0] - x_val.shape[0]
    # Assuming sample i is built from rows i .. i + cutoff (TODO confirm against
    # kerasbeats.prep_time_series), row i + cutoff is the LAST target step.
    # Taking the tabular features from there leaks the future into the inputs,
    # so step back by the target width to the last observed row instead.
    first_tr, first_val = cutoff_tr - y_tr.shape[1], cutoff_val - y_val.shape[1]
    x_tr_const = train_df[train_features_test].iloc[train_idx].iloc[first_tr:first_tr + len(x_tr), :]
    x_val_const = train_df[train_features_test].iloc[val_idx].iloc[first_val:first_val + len(x_val), :]

    # The raw features were fed to the network as-is and the run showed
    # `loss: nan`; missing values in the inputs are the likely cause. Impute
    # with statistics of the training part only, so nothing from the
    # validation part influences training.
    fill_values = x_tr_const.replace([np.inf, -np.inf], np.nan).mean()
    x_tr_const = _finite_features(x_tr_const, fill_values)
    x_val_const = _finite_features(x_val_const, fill_values)

    print(f'Shape of X_const is {x_tr_const.shape}, x_tr is {x_tr.shape}, y_tr is {y_tr.shape}')

    ckp_path = f'../output/{directory}/NBEATS_MSE_{fold}_returns{horizon}m_{lookback}m_{date}.hdf5'
    model = create_nbeat_mlp(**params)
    # Callbacks come from tf.keras, the same API the model is built with.
    ckp = tf.keras.callbacks.ModelCheckpoint(ckp_path, monitor=monitor, verbose=0,
                                             save_best_only=True, save_weights_only=True, mode='min')
    # NOTE(review): min_delta=1e-4 is large if the targets are small fractional
    # returns (MSE may be far below 1e-4, so no epoch would count as an
    # improvement) -- confirm against the observed loss scale.
    es = tf.keras.callbacks.EarlyStopping(monitor=monitor, min_delta=1e-4, patience=10, mode='min',
                                          baseline=None, restore_best_weights=True, verbose=0)

    history = model.fit([x_tr, x_tr_const.values.astype('float32')], y_tr,
                        validation_data = ([x_val, x_val_const.values.astype('float32')], y_val),
                        epochs = 100, batch_size = batch_size, callbacks = [ckp, es])

    hist = pd.DataFrame(history.history)
    score = hist[monitor].min()
    print(f'Fold {fold} MSE:\t', score)
    scores.append(score)
    # Release the graph/state of this fold's model before building the next one.
    tf.keras.backend.clear_session()

Shape of X_const is (68187, 50), x_tr is (68187, 60), y_tr is (68187, 6)


2023-11-25 22:26:13.865844: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/100
Epoch 2/100
 1/17 [>.............................] - ETA: 1:17 - loss: nan - mse: nan

KeyboardInterrupt: 