In [None]:
!pip install tensorflow-addons keras-beats joblib

In [1]:
from kerasbeats import prep_time_series, NBeatsModel
from tensorflow import keras
import gc

2023-12-02 19:21:58.839164: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import pandas as pd
import tscv

def generate_label(df, threshold = 0.002):
    """Add a three-class ``label`` column derived from ``target_15m``.

    The frame is modified in place and also returned.

    Labels:
        0 -- default (value strictly between ``-threshold`` and ``threshold``, or NaN)
        1 -- ``target_15m <= -threshold``
        2 -- ``target_15m >= threshold``

    The "2" rule is applied last, so it wins whenever both conditions hold.
    """
    target = df['target_15m']
    is_down = target <= -threshold
    is_up = target >= threshold

    df['label'] = 0
    df.loc[is_down, 'label'] = 1
    df.loc[is_up, 'label'] = 2
    return df

def get_na_features(df, train_features):
    """Group the columns of ``train_features`` that contain NaNs by their NaN count.

    Returns a dict mapping each distinct (non-zero) NaN count to the list of
    column names that have exactly that many missing values. Columns without
    missing values are omitted.
    """
    null_counts = pd.DataFrame(df[train_features].isnull().sum())
    with_nulls = null_counts.loc[null_counts[0] > 0].reset_index()
    with_nulls.columns = ['feat', 'cnt']
    by_count = with_nulls.sort_values('cnt').groupby('cnt')['feat']
    return dict(by_count.agg(lambda names: list(names)))

def normalize_float_columns(df, features):
  """Impute and z-score the float columns among ``features`` (in place).

  Column means and standard deviations are computed on the data *before*
  imputation and cast to float32. Missing values are then forward-filled,
  any remaining gaps are filled with the column mean, and every float column
  is rescaled as ``(x - mean) / std``.

  Returns the (mutated) frame together with the ``means`` and ``std`` Series
  so the same transformation can be re-applied elsewhere.
  """
  float_cols = df[features].select_dtypes(include = [float]).columns
  float_block = df[float_cols]

  means = float_block.mean().astype('float32')
  std = float_block.std().astype('float32')

  imputed = float_block.ffill().fillna(means)
  df[float_cols] = (imputed - means) / std
  return df, means, std

In [3]:
# NOTE(review): `window` is not referenced by any other cell visible here.
window = 30

# Feature-correlation table: the first CSV column becomes the index, and the
# index labels are later used as feature names.
file = 'mean_corr.csv'
corr = pd.read_csv(
    f'../output/feature_corr_1m/{file}',
    index_col = 0,
    header = 0,
)

In [4]:
df = pd.read_feather('../data/df_btc_with_features_1m_spot.feather')

# Target is the one-step percentage change of the close price; the first row
# has no previous close, so its NaN target is dropped.
df['target'] = df['close'].pct_change(1)
df = df.dropna(subset = ['target'], axis = 0)
start_time = df['open_time'].min()
end_time = df['open_time'].max()

# Split boundaries are taken at 70% and 90% of the distinct timestamps.
# assumes the rows are already ordered by open_time — TODO confirm
dates = df['open_time'].unique()
n = len(dates)
train_idx = int(0.7 * n)
valid_idx = int(0.9 * n)
train_end = dates[train_idx]
valid_end = dates[valid_idx]

train_df = df.loc[df['open_time'] < train_end].reset_index(drop=True)
valid_df = df.loc[(train_end <= df['open_time']) & (df['open_time'] < valid_end)].reset_index(drop=True)

# The first 90% (train + validation slices) is merged into one training frame
# that is later split again by the cross-validator. ignore_index=True gives the
# merged frame a fresh 0..n-1 index; without it both halves keep their own
# 0-based labels and the result carries duplicate index values.
train_df = pd.concat([train_df, valid_df], axis = 0, ignore_index = True)

# The final 10% is used both as the test frame and as the validation frame.
test_df = df.loc[(df['open_time'] >= valid_end)].reset_index(drop=True)
valid_df = test_df.copy()

In [None]:
# One integer group id per calendar day: build a "day_month_year" key for every
# row and factorize it, so all rows of the same day share a group.
open_time_parts = train_df['open_time'].dt
day_keys = (
    open_time_parts.day.astype(str)
    + '_' + open_time_parts.month.astype(str)
    + '_' + open_time_parts.year.astype(str)
)
groups = pd.factorize(day_keys)[0]

# Grouped time-series splitter: 5 splits with a gap of 31 groups (days).
cv = tscv.PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=31)

In [None]:
import tensorflow as tf
# Set TensorFlow's global random seed to a fixed value.
tf.random.set_seed(42)

def create_nbeat_mlp(num_columns, num_labels, lookback, horizon, hidden_units, dropout_rates, batch_size, ls=1e-2, lr=1e-3, ):
    """Build and compile a two-input model: a generic N-BEATS branch feeding an MLP head.

    Args:
        num_columns: Width of the second (tabular) input.
        num_labels: Number of units in the final linear ``action`` layer.
        lookback: Forwarded to ``NBeatsModel``; together with ``horizon`` it
            fixes the width of the time-series input (``lookback * horizon``).
        horizon: Forwarded to ``NBeatsModel``.
        hidden_units: ``hidden_units[0]`` is the neuron count of the N-BEATS
            generic blocks; ``hidden_units[1:]`` are the widths of the Dense
            layers of the MLP head.
        dropout_rates: ``dropout_rates[0]`` is applied right after the
            concatenation; ``dropout_rates[i]`` follows the i-th Dense layer.
            Must be at least as long as ``hidden_units``.
        batch_size: Forwarded to ``NBeatsModel``.
        ls: Accepted for interface compatibility; not used by this function.
        lr: Learning rate for both ``NBeatsModel`` and the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` model taking ``[time_input, xcons]`` and
        producing ``num_labels`` linear outputs, trained with Huber loss.
    """
    nbeats = NBeatsModel(model_type = 'generic', lookback = lookback, horizon = horizon,
                         learning_rate = lr, batch_size = batch_size,
                         num_generic_neurons = hidden_units[0]) # set as default
    nbeats.build_layer()

    # Branch 1: the raw series window, passed through the N-BEATS layer.
    time_input = keras.layers.Input(shape = (lookback * horizon, ))
    x_nb = nbeats.model_layer(time_input)

    # Branch 2: the tabular features, concatenated with the N-BEATS output.
    xcons = keras.layers.Input(shape = (num_columns, ))
    x = keras.layers.Concatenate()([xcons, x_nb])
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(dropout_rates[0])(x)

    # MLP head: Dense -> BatchNorm -> swish -> Dropout for every remaining width.
    for i in range(1, len(hidden_units)):
        x = tf.keras.layers.Dense(hidden_units[i])(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation('swish')(x)
        x = tf.keras.layers.Dropout(dropout_rates[i])(x)

    out = tf.keras.layers.Dense(num_labels, name = 'action')(x)
    model = tf.keras.models.Model(inputs = [time_input, xcons], outputs = out)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                  loss = {'action' : tf.keras.losses.Huber(name = 'huber')},
                  # The metric is R^2, so it is named accordingly (it used to be
                  # registered as 'mse', which mislabelled it in the logs and
                  # did not match callbacks that monitor 'val_r2score').
                  metrics = {'action' : tf.keras.metrics.R2Score(name = 'r2score')})
    return model

In [None]:
from pathlib import Path

directory = 'spot_data_1130_nbeats_1m'
date = '11_27'
# Path.mkdir() returns None, so keep the Path object itself in `lib` and create
# the directory in a separate call.
lib = Path(f"../output/{directory}")
lib.mkdir(parents=True, exist_ok=True)

In [None]:
# Candidate features: the index labels of the first 50 rows of the correlation table.
train_features = corr.index[:50].tolist()

In [None]:
# Impute and z-score the float features of the training frame; keep the fitted
# statistics so they can be persisted.
train_df, means, std = normalize_float_columns(train_df, train_features)

# Any feature that still contains NaNs after normalisation is removed from the
# feature list (the list is edited in place).
nan_features = get_na_features(train_df, train_features)
for feats_with_nans in nan_features.values():
  for feat in feats_with_nans:
    train_features.remove(feat)

In [None]:
from joblib import load, dump
# Values used in the artifact file names written below.
lookback, horizon = 3, 500

# NOTE: this binds a second name to the *same* list object, so any in-place
# change made through `train_features_test` is also visible via `train_features`.
train_features_test = train_features

In [None]:
# Persist the normalisation statistics and the feature list next to the model outputs.
for artifact, artifact_path in (
    (means, f'../output/{directory}/means_nbeats_huber_l={lookback}_h={horizon}_50feats.joblib'),
    (std, f'../output/{directory}/std_return_nbeats_l={lookback}_h={horizon}_50feats.joblib'),
    (train_features_test, f'../output/{directory}/train_features_test_return_nbeats_l={lookback}_h={horizon}_50feats.joblib'),
):
    dump(artifact, artifact_path)

In [None]:
batch_size = 2048
lookback = 2
horizon = 100
# Add the target column to the tabular features exactly once. A plain
# `+= ['target']` appended another copy every time this cell was re-run,
# which silently inflated `num_columns`.
if 'target' not in train_features_test:
    train_features_test.append('target')
params = {'num_columns': len(train_features_test),
          # The output layer must emit one value per forecast step, so it is
          # tied to `horizon` instead of repeating the literal 100.
          'num_labels': horizon,
          'lookback' : lookback,
          'horizon' : horizon,
          'batch_size' : batch_size,
          # First entry sizes the N-BEATS blocks, the rest the Dense head.
          'hidden_units': [200, 200, 300, 256],
          'dropout_rates': [0.6, 0.5,
                            0.6, 0.5],
          'ls': 0,
          'lr': 1e-3,
          }
# train_df[train_features_test] = train_df[train_features_test].ffill().fillna(0)

In [None]:
scores = []
for fold, (train_idx, val_idx) in enumerate(cv.split(train_df, train_df['target'], groups)):
    # Only splits with index >= 4 are processed (the last one when n_splits is 5).
    if fold >= 4:
        x_train, x_valid = train_df['target'].iloc[train_idx], train_df['target'].iloc[val_idx]

        # Vectorised Series.min()/.max() instead of the Python builtins, which
        # iterate the datetime column one element at a time.
        train_times = train_df['open_time'].iloc[train_idx]
        valid_times = train_df['open_time'].iloc[val_idx]
        min_train, max_train = train_times.min().to_pydatetime(), train_times.max().to_pydatetime()
        min_valid, max_valid = valid_times.min().to_pydatetime(), valid_times.max().to_pydatetime()

        print(f'{fold} : Train Date is from {min_train} - {max_train}')
        print(f'{fold} : Valid Date is from {min_valid} - {max_valid}')

        # Turn the target series into (input window, forecast window) pairs.
        x_tr, y_tr = prep_time_series(x_train, lookback = lookback, horizon = horizon)
        x_val, y_val = prep_time_series(x_valid, lookback = lookback, horizon = horizon)

        # Number of rows lost to windowing in each split.
        cutoff_tr, cutoff_val = x_train.shape[0] - x_tr.shape[0], x_valid.shape[0] - x_val.shape[0]
        del x_train, x_valid
        gc.collect()

        # x_tr_const, x_val_const = train_df[train_features_test].iloc[train_idx], train_df[train_features_test].iloc[val_idx]
        # x_tr_const, x_val_const = x_tr_const.iloc[(lookback * horizon) - 1:-horizon, :], x_val_const.iloc[(lookback * horizon) - 1:-horizon, :]

        # print(f'Shape of X_const is {x_tr_const.shape}, x_tr is {x_tr.shape}, y_tr is {y_tr.shape}')

        # ckp_path = f'../output/{directory}/NBEATS_HUBER_{fold}_returns{horizon}m_{lookback}m_{date}_allfeats_highdropout.hdf5'
        # model = create_nbeat_mlp(**params)
        # ckp = ModelCheckpoint(ckp_path, monitor='val_r2score', verbose=0,
        #                             save_best_only=True, save_weights_only=True, mode='max')
        # es = EarlyStopping(monitor='val_r2score', min_delta=1e-4, patience=10, mode='max',
        #                     baseline=None, restore_best_weights=True, verbose=0)


        # history = model.fit([x_tr, x_tr_const.values], y_tr,
        #                     validation_data = ([x_val, x_val_const.values], y_val),
        #                     epochs = 100, batch_size = batch_size, callbacks = [ckp, es])

        # hist = pd.DataFrame(history.history)
        # score = hist['val_r2score'].max()
        # print(f'Fold {fold} R2:\t', score)
        # scores.append(score)

        # del x_tr, y_tr, x_val, y_val
        # gc.collect()
        # K.clear_session()

# custom tensorflow metrics


In [None]:
def weighted_mse(true, pred, weights):
    """Weighted mean squared error: ``sum(w * (true - pred)^2) / sum(w)``.

    Args:
        true: Ground-truth values.
        pred: Predicted values, broadcast-compatible with ``true``.
        weights: Non-negative weights, broadcast-compatible with ``true - pred``.

    Returns:
        A scalar tensor holding the weighted MSE.

    The previous version took a square root of the numerator only, which is
    neither a weighted MSE nor a weighted RMSE and disagreed with the formula
    used by ``custom_mse``.
    """
    sum_weights = tf.reduce_sum(weights)
    weighted_sq_err = tf.reduce_sum(weights * tf.square(true - pred))
    return weighted_sq_err / sum_weights

In [None]:
from tensorflow.python.keras import backend

def custom_mse(class_weights):
    """Create a Keras-compatible weighted-MSE loss bound to ``class_weights``.

    Args:
        class_weights: Array-like or tensor of weights, broadcast-compatible
            with ``gt - pred``.

    Returns:
        A function ``weighted_mse(gt, pred)`` that returns the scalar
        ``sum(w * (gt - pred)^2) / sum(w)``.
    """
    def weighted_mse(gt, pred):
        # Formula:
        # w_1*(y_1-y'_1)^2 + ... + w_100*(y_100-y'_100)^2 / sum(weights)
        # Cast the weights to the prediction dtype first: NumPy weights default
        # to float64, which would otherwise produce a float64 denominator
        # against a float32 numerator.
        w = backend.cast(class_weights, pred.dtype)
        return backend.sum(w * backend.square(gt - pred)) / backend.sum(w)
    return weighted_mse

# NOTE(review): no earlier cell visible in this notebook assigns `model`, and
# `weights` is only created in a later cell, so on a fresh kernel this line
# raises NameError — confirm the intended cell order.
model.compile(loss=custom_mse(weights))


In [6]:
import numpy as np

# Weight vector with 2427199 entries: every fifth entry (0, 5, 10, ...) is 1, the rest 0.
# NOTE(review): the length is a hard-coded literal — presumably a row count; confirm.
weights = np.zeros(2427199)
weights[0::5] = 1

In [11]:
# NOTE(review): np.zeros requires a `shape` argument, so this call raises
# TypeError as written; the intended shape is not stated here.
weights = np.zeros()