List of Ideas:
- See bookmarks:
- Automating Feature Engineering (Part II)
- Use LightGBM for feature importance (Crypto Forecasting - lgbm feval+feature importance)
- Crypto Forecasting - Common Factors
- Correlation as a loss function? https://www.kaggle.com/competitions/open-problems-multimodal/discussion/347595#1916337
-

In [4]:
import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.python.keras.backend as K
import tensorflow.python.keras.layers as layers
from tensorflow.python.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping


def create_ae_mlp(num_columns, num_labels, hidden_units, dropout_rates, ls=1e-2, lr=1e-3):
    """Build and compile the autoencoder + MLP classifier.

    The model has three outputs, in this order:

    * ``decoder``   - a ``num_columns``-wide linear output (MSE loss),
    * ``ae_action`` - auxiliary class scores computed from the decoder output,
    * ``action``    - main softmax prediction computed from the concatenation
      of the batch-normalised input and the encoder activations.

    Args:
        num_columns: Number of input features.
        num_labels: Number of target classes.
        hidden_units: Layer widths. ``[0]`` is the encoder width, ``[1]`` the
            auxiliary head width, ``[2:]`` the widths of the main MLP stack.
            At least two entries are required.
        dropout_rates: ``[0]`` is the stddev of the Gaussian noise applied to
            the normalised input; ``[1]``, ``[2]`` and ``[3]`` are the dropout
            rates before the decoder, in the auxiliary head and on the
            concatenated MLP input; ``[i + 2]`` follows MLP layer
            ``hidden_units[i]``. At least ``len(hidden_units) + 2`` entries
            are required.
        ls: Label-smoothing factor applied to both cross-entropy losses
            (``0`` disables smoothing).
        lr: Learning rate of the Adam optimizer.

    Returns:
        The compiled ``tf.keras`` model.

    Raises:
        IndexError: If ``hidden_units`` or ``dropout_rates`` is too short for
            the architecture described above.
    """
    # Fail early with a readable message instead of deep inside graph building.
    if len(hidden_units) < 2:
        raise IndexError(
            f'hidden_units needs at least 2 entries, got {len(hidden_units)}')
    required_rates = len(hidden_units) + 2
    if len(dropout_rates) < required_rates:
        raise IndexError(
            f'dropout_rates needs at least {required_rates} entries '
            f'(len(hidden_units) + 2), got {len(dropout_rates)}')

    inp = tf.keras.layers.Input(shape=(num_columns,))
    x0 = tf.keras.layers.BatchNormalization()(inp)

    # Encoder: noisy input -> dense bottleneck.
    encoder = tf.keras.layers.GaussianNoise(dropout_rates[0])(x0)
    encoder = tf.keras.layers.Dense(hidden_units[0])(encoder)
    encoder = tf.keras.layers.BatchNormalization()(encoder)
    encoder = tf.keras.layers.Activation('swish')(encoder)

    # Decoder: linear projection back to the input width.
    decoder = tf.keras.layers.Dropout(dropout_rates[1])(encoder)
    decoder = tf.keras.layers.Dense(num_columns, name='decoder')(decoder)

    # Auxiliary classifier fed by the decoder output.
    x_ae = tf.keras.layers.Dense(hidden_units[1])(decoder)
    x_ae = tf.keras.layers.BatchNormalization()(x_ae)
    x_ae = tf.keras.layers.Activation('swish')(x_ae)
    x_ae = tf.keras.layers.Dropout(dropout_rates[2])(x_ae)

    # NOTE(review): sigmoid scores are not normalised across classes although
    # the loss is categorical cross-entropy. Left unchanged because existing
    # checkpoints were presumably trained with it - confirm before switching.
    out_ae = tf.keras.layers.Dense(num_labels, activation='sigmoid', name='ae_action')(x_ae)

    # Main classifier sees both the normalised input and its encoding.
    x = tf.keras.layers.Concatenate()([x0, encoder])
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(dropout_rates[3])(x)

    for i in range(2, len(hidden_units)):
        x = tf.keras.layers.Dense(hidden_units[i])(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation('swish')(x)
        x = tf.keras.layers.Dropout(dropout_rates[i + 2])(x)

    out = tf.keras.layers.Dense(num_labels, activation='softmax', name='action')(x)

    model = tf.keras.models.Model(inputs=inp, outputs=[decoder, out_ae, out])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                  loss={'decoder': tf.keras.losses.MeanSquaredError(),
                        # `ls` used to be accepted but ignored; it now drives
                        # label smoothing on both classification losses.
                        'ae_action': tf.keras.losses.CategoricalCrossentropy(label_smoothing=ls),
                        'action': tf.keras.losses.CategoricalCrossentropy(label_smoothing=ls),
                        },
                  metrics={'decoder': tf.keras.metrics.MeanAbsoluteError(name='MAE'),
                           'ae_action': tf.keras.metrics.AUC(name='AUC'),
                           'action': tf.keras.metrics.AUC(name='AUC'),
                           },
                  )

    return model

In [5]:
import tensorflow_addons as tfa
def create_model(n_in, n_out, layers, dropout_rate, optimizer, metrics):
    """Build and compile a feed-forward softmax classifier.

    Each hidden block is BatchNorm -> Dropout -> Dense -> ReLU. The first
    block uses a fixed dropout of 0.01; every later block uses
    ``dropout_rate``.

    Args:
        n_in: Number of input features.
        n_out: Number of output classes.
        layers: Iterable of hidden-layer widths, one block per entry.
        dropout_rate: Dropout rate for every block after the first.
        optimizer: Optimizer handed to ``model.compile``.
        metrics: Metrics handed to ``model.compile``.

    Returns:
        The compiled ``tf.keras`` model whose output layer is named ``action``.
    """
    # The `layers` argument shadows the module imported under that name,
    # so the Keras layer namespace is reached through `tf` explicitly.
    keras_layers = tf.keras.layers

    net_input = keras_layers.Input(shape=(n_in,))

    hidden = net_input
    for depth, width in enumerate(layers):
        hidden = keras_layers.BatchNormalization()(hidden)
        rate = dropout_rate if depth > 0 else .01
        hidden = keras_layers.Dropout(rate)(hidden)
        hidden = keras_layers.Dense(width)(hidden)
        hidden = keras_layers.Activation('relu')(hidden)

    probabilities = keras_layers.Dense(n_out, activation='softmax', name='action')(hidden)

    model = tf.keras.models.Model(inputs=net_input, outputs=probabilities)
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=metrics,
    )
    return model

In [7]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from itertools import islice
import tscv

def generate_label(df, threshold = 0.002):
    """Add a three-class ``label`` column derived from ``target_15m``.

    ``label`` is 1 where ``target_15m <= -threshold``, 2 where
    ``target_15m >= threshold`` and 0 otherwise. The frame is modified in
    place and also returned.
    """
    target = df['target_15m']
    df['label'] = 0
    # Applied in this order so the "up" class wins if both masks ever overlap.
    class_masks = (
        (1, target <= -1 * threshold),
        (2, target >= threshold),
    )
    for class_id, mask in class_masks:
        df.loc[mask, 'label'] = class_id
    return df

def get_na_features(df, train_features):
    """Group the features that contain nulls by their null count.

    Args:
        df: Frame holding the feature columns.
        train_features: Column names to inspect.

    Returns:
        Dict mapping a null count to the list of features that have exactly
        that many nulls. Features without nulls are omitted.
    """
    null_counts = df[train_features].isnull().sum().to_frame()
    with_nulls = null_counts.loc[null_counts[0] > 0].reset_index()
    with_nulls.columns = ['feat', 'cnt']
    ordered = with_nulls.sort_values('cnt')
    features_by_count = ordered.groupby('cnt')['feat'].agg(lambda names: list(names))
    return dict(features_by_count)

def normalize_float_columns(df, features):
    """Z-score the float columns of ``features`` within each ``token`` group.

    Infinite and missing results (e.g. from a zero standard deviation) are
    replaced with 0.0. The frame is modified in place and also returned.
    """
    def _zscore(values):
        return (values - values.mean()) / (values.std())

    float_cols = df[features].select_dtypes(include = [float]).columns
    per_token = df.groupby(['token'])
    for name in float_cols:
        df[name] = per_token[name].transform(_zscore)

    finite_only = df[float_cols].replace([np.inf, -np.inf], np.nan)
    df[float_cols] = finite_only.fillna(0.0)
    return df

class Params:
    """Empty attribute container; later cells attach hyperparameters to it."""


param = Params()

In [8]:
# Load the feature table from a Feather file (path is relative to the working directory).
df = pd.read_feather('../data/df_btc_eth_with_features.feather')

In [10]:
# One-column frame of the column names, kept for inspection.
cols = pd.DataFrame(df.columns)

In [11]:
# Display the column listing.
cols

Unnamed: 0,0
0,open_time
1,open
2,high
3,low
4,close
...,...
155,sma_diff_12
156,sma_diff_192
157,sma_diff_768
158,sma_diff_3072


In [39]:
# NOTE(review): `train` is not read by any cell visible here - confirm it is still needed.
train = False
# Reload the feature table so this cell starts from the on-disk data.
df = pd.read_feather('../data/df_btc_eth_with_features.feather')
# Columns that must not be used as model inputs.
# NOTE(review): 'hi gh' looks like a typo for 'high'; as written it presumably
# matches no column, so 'high' stays in the feature list. Correcting it changes
# the number of model inputs, which would no longer match checkpoints trained
# with the current list - confirm against the training notebook before fixing.
cols_to_drop = ['open_time', 'close_time', 'ignore',
                'create_time', 'symbol', 'returns', 'returns_5m',
                'open', 'hi gh', 'low', 'close', 'target_15m', 'label']

# Chronological order; generate_label adds the 0/1/2 `label` column.
df = df.sort_values(by='open_time', ignore_index=True)
df = generate_label(df, threshold=0.002)

start_time = df['open_time'].min()
end_time = df['open_time'].max()
# Split points are taken at 70% and 90% of the distinct timestamps.
dates = df['open_time'].unique()
n = len(dates)
train_idx = int(0.7 * n)
valid_idx = int(0.9 * n)
train_end = dates[train_idx]
valid_end = dates[valid_idx]

# train: before train_end; valid: [train_end, valid_end); test: from valid_end on.
train_df = df.loc[df['open_time'] < train_end].reset_index(drop=True)
valid_df = df.loc[(train_end <= df['open_time']) & (df['open_time'] < valid_end)].reset_index(drop=True)
test_df = df.loc[(df['open_time'] >= valid_end)].reset_index(drop=True)

In [41]:
# Candidate model inputs: every column that is not explicitly excluded.
train_features = [x for x in df.columns if (x not in cols_to_drop)]

# Encode the token as integer category codes and cast object-typed features to float.
valid_df['token'] = valid_df['token'].astype('category').cat.codes
object_cols = valid_df[train_features].select_dtypes(include=object).columns
valid_df[object_cols] = valid_df[object_cols].astype(float)

# Fill missing values per token: forward-fill first, then 0.0 for leading gaps.
# NOTE(review): despite its name, `grouped_train` groups the validation frame.
nan_features = get_na_features(valid_df, train_features)
grouped_train = valid_df.groupby(['token'])
for k, v in nan_features.items():
    for value in v:
        valid_df[value] = grouped_train[value].transform(lambda x: x.ffill().fillna(0.0))

# Keep only float-typed features; non-float columns are removed from the list.
feature_cols = pd.DataFrame(train_features)
dtype_df = pd.DataFrame(valid_df[train_features].select_dtypes(exclude=[float]).columns)
train_features = [x for x in train_features if x not in dtype_df.values]

# params = {'num_columns': len(train_features_test),
#           'num_labels': 3,
#           'hidden_units': [96, 96, 896, 448, 448, 256],
#           'dropout_rates': [0.03527936123679956, 0.038424974585075086, 0.42409238408801436, 0.10431484318345882,
#                             0.49230389137187497, 0.32024444956111164, 0.2716856145683449, 0.4379233941604448],
#           'ls': 0,
#           'lr': 1e-3,
#           }


In [12]:
# 1) get rid of 0's
# 2) get rid of [-np.inf, np.inf]

In [42]:
### model parameters
# Hidden-layer widths and dropout rate passed to create_model.
param.layers = [500,350,200]
param.dropout_rate = 0.35

###training parameters
# bs and epochs are only referenced by the commented-out fit call;
# lr and wd configure the LAMB optimizer.
param.bs = 8192
param.lr = 0.002
param.epochs = 30
param.wd = 0.02

In [43]:
# One integer group id per calendar day of train_df ("day_month_year" strings, factorized).
groups = pd.factorize(
    train_df['open_time'].dt.day.astype(str) + '_' + train_df['open_time'].dt.month.astype(str) + '_' + train_df[
        'open_time'].dt.year.astype(str))[0]

# Group-aware time-series splitter from the `tscv` module.
# NOTE(review): neither `groups` nor `cv` is consumed by the cells visible here.
cv = tscv.PurgedGroupTimeSeriesSplit(
    n_splits=5,
    group_gap=31,
)

In [44]:
# Alias of the final feature list used as model input columns.
train_features_test = train_features
# Per-token z-scoring of the float features; the helper modifies valid_df in place.
valid_df = normalize_float_columns(valid_df, train_features_test)

In [45]:
def get_weights(weights):
    """Turn class proportions into inverse-frequency weights that sum to 1.

    Args:
        weights: Array of per-class proportions.

    Returns:
        Array of the same shape holding ``(1 / p) / sum(1 / p)``.
    """
    reciprocal = 1/weights
    total = reciprocal.sum()
    return reciprocal / total

In [46]:
### adding overall AuC as a metric
### for early stopping I only look at resp and resp_sum because they start overfitting earlier
# NOTE(review): "resp" / "resp_sum" do not appear in any visible cell; the
# remark above was presumably carried over from another project.
use_weights = True
metrics =  [tf.keras.metrics.CategoricalCrossentropy(name='loss'),
            tf.keras.metrics.AUC(name='AUC')]
            # tf.keras.metrics.AUC(name='AUC')]

scores = []
batch_size = 4096

In [51]:
scores = []
batch_size = 512
df_pred = []

# Only the validation split is scored in this cell, so the "train" and
# "valid" names below intentionally refer to the same rows of valid_df.
min_train, max_train = min(valid_df['open_time']).to_pydatetime(), max(
    valid_df['open_time']).to_pydatetime()
# Defined here so the cell no longer depends on a later cell having been run.
min_valid, max_valid = min_train, max_train

x_train, x_val = valid_df[train_features_test], valid_df[train_features_test]

print(f'Train Date is from {min_train} - {max_train}')
print(f'Valid Date is from {min_valid} - {max_valid}')

y_train, y_val = valid_df['label'].values, valid_df['label'].values

print(f'Shape of Xtrain is {x_train.shape}, Shape of yTrain is {y_train.shape}')

if use_weights:
    # Inverse-frequency class weights, keyed by the class id itself so the
    # mapping stays correct even if a class is absent from y_train.
    classes = np.unique(y_train)
    proportions = []
    for val in classes:
        prop = (y_train == val).sum() / y_train.shape[0]
        print(f'Class {val}: train: {prop}')
        proportions.append(prop)
    loss_weights = get_weights(np.array(proportions))
    weights = {int(val): weight for val, weight in zip(classes, loss_weights)}

y_train = tf.one_hot(y_train, depth = 3)
y_val = tf.one_hot(y_val, depth = 3)

# Rebuild the architecture, then restore the trained weights from disk.
ckp_path = '../output/MLP_4.hdf5'
model = create_model(len(train_features_test), 3, param.layers, param.dropout_rate,
                    optimizer=tfa.optimizers.Lookahead(
                        tfa.optimizers.LAMB(learning_rate=param.lr, weight_decay_rate=param.wd)
                    ),
                    metrics=metrics)
model.load_weights(ckp_path)

# Class probabilities, joined positionally with the columns needed for back-testing.
predictions = model.predict(x_val.values)
ypred = pd.DataFrame(predictions, columns = [f'prob_{i}' for i in range(3)])
cols_to_keep = ['open_time', 'label', 'target_15m', 'token', 'close', 'open']
x_df = valid_df[cols_to_keep].reset_index(drop = True)
for i in range(3):
    x_df[f'prob_{i}'] = ypred[f'prob_{i}']
df_pred.append(x_df)

# cbs = [tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.3,
#                                             patience=3, verbose=1),
#         tf.keras.callbacks.EarlyStopping(
#             monitor='val_AUC', patience=10, verbose=1,
#             mode='max', restore_best_weights=True, min_delta = 1e-4)
#         ]
#
# history = model.fit(x_train.values, y_train, validation_data=(x_val.values, y_val),
#                     epochs=param.epochs,
#                     batch_size=param.bs, callbacks=[ckp, cbs], class_weight = weights)
# hist = pd.DataFrame(history.history)
# hist.to_csv(f'../output/AEMLP_{fold}_training_history.csv')
# # hist.head(50)
# score = hist['val_AUC'].max()
# print(f'Fold {fold} ACC:\t', score)
# scores.append(score)
# K.clear_session()

Train Date is from 2022-08-16 07:12:00 - 2023-05-17 02:23:00
Valid Date is from 2022-08-16 07:12:00 - 2023-05-17 02:23:00
Shape of Xtrain is (788544, 147), Shape of yTrain is (788544,)
Class 0: train: 0.7189617827286746
Class 0: train: 0.138683446960474
Class 0: train: 0.1423547703108514


In [56]:
# Stack the collected prediction frames and take the most probable class as the predicted label.
df_pred_test = pd.concat(df_pred, axis = 0)
prob_cols = [f'prob_{i}' for i in range(3)]
df_pred_test['pred_label'] = np.argmax(df_pred_test[prob_cols].values, axis = 1)

In [57]:
# Persist the prediction frame as a Feather file.
df_pred_test.to_feather('../data/back_test_output_valid.feather')

In [58]:
### model parameters
# NOTE(review): these assignments repeat the values set by the earlier hyperparameter cell.
param.layers = [500,350,200]
param.dropout_rate = 0.35

###training parameters
param.bs = 8192
param.lr = 0.002
param.epochs = 30
param.wd = 0.02

In [53]:
# One integer group id per calendar day of valid_df ("day_month_year" strings, factorized).
groups = pd.factorize(
    valid_df['open_time'].dt.day.astype(str) + '_' + valid_df['open_time'].dt.month.astype(str) + '_' + valid_df[
        'open_time'].dt.year.astype(str))[0]

# Group-aware time-series splitter from the `tscv` module.
# NOTE(review): neither `groups` nor `cv` is consumed by the cells visible here.
cv = tscv.PurgedGroupTimeSeriesSplit(
    n_splits=5,
    group_gap=31,
)

In [11]:
# Alias of the final feature list used as model input columns.
train_features_test = train_features
# NOTE(review): an earlier cell already z-scores valid_df in place; when the
# notebook is run top to bottom this normalises the same frame a second time.
valid_df = normalize_float_columns(valid_df, train_features_test)

def get_weights(weights):
    """Return inverse-frequency weights normalised to sum to 1.

    ``weights`` holds per-class proportions; each result entry is
    ``(1 / p_i) / sum_j(1 / p_j)``.
    """
    inverse = 1/weights
    normaliser = inverse.sum()
    return inverse / normaliser

In [59]:
### adding overall AuC as a metric
### for early stopping I only look at resp and resp_sum because they start overfitting earlier
# NOTE(review): "resp" / "resp_sum" do not appear in any visible cell; the
# remark above was presumably carried over from another project.
use_weights = True
metrics =  [tf.keras.metrics.CategoricalCrossentropy(name='loss'),
            tf.keras.metrics.AUC(name='AUC')]
            # tf.keras.metrics.AUC(name='AUC')]

scores = []
batch_size = 4096

In [60]:
# Keyword arguments for create_ae_mlp: 6 hidden widths and the 8 matching
# noise/dropout rates (len(hidden_units) + 2).
params = {'num_columns': len(train_features_test),
          'num_labels': 3,
          'hidden_units': [96, 96, 896, 448, 448, 256],
          'dropout_rates': [0.03527936123679956, 0.038424974585075086, 0.42409238408801436, 0.10431484318345882,
                            0.49230389137187497, 0.32024444956111164, 0.2716856145683449, 0.4379233941604448],
          'ls': 0,
          'lr': 1e-3,
          }

In [63]:
scores = []
batch_size = 4096
df_pred = []

# Only the validation split is scored in this cell, so the "train" and
# "valid" names below intentionally refer to the same rows of valid_df.
min_train, max_train = min(valid_df['open_time']).to_pydatetime(), max(valid_df['open_time']).to_pydatetime()
min_valid, max_valid = min_train, max_train

x_train, x_val = valid_df[train_features_test], valid_df[train_features_test]
y_train, y_val = valid_df['label'].values, valid_df['label'].values

print(f'Train Date is from {min_train} - {max_train}')
print(f'Valid Date is from {min_valid} - {max_valid}')
print(f'Shape of Xtrain is {x_train.shape}, Shape of yTrain is {y_train.shape}')

if use_weights:
    # Inverse-frequency class weights, keyed by the class id itself so the
    # mapping stays correct even if a class is absent from y_train.
    classes = np.unique(y_train)
    proportions = []
    for val in classes:
        prop = (y_train == val).sum() / y_train.shape[0]
        print(f'Class {val}: train: {prop}')
        proportions.append(prop)
    loss_weights = get_weights(np.array(proportions))
    weights = {int(val): weight for val, weight in zip(classes, loss_weights)}

# One-hot encoding is independent of class weighting, so it is applied
# unconditionally rather than only when use_weights is set.
y_train = tf.one_hot(y_train, depth = 3)
y_val = tf.one_hot(y_val, depth = 3)

# Rebuild the architecture, then restore the trained weights from disk.
model = create_ae_mlp(**params)
model.load_weights(f'../output/AEMLP_4_{batch_size}.hdf5')

# The model returns [decoder, ae_action, action]; the last entry is the main prediction.
predictions = model.predict(x_val.values)
preds = predictions[-1]
ypred = pd.DataFrame(preds, columns = [f'prob_{i}' for i in range(3)])
cols_to_keep = ['open_time', 'label', 'target_15m', 'token', 'close', 'open']
x_df = valid_df[cols_to_keep].reset_index(drop = True)
for i in range(3):
    x_df[f'prob_{i}'] = ypred[f'prob_{i}']

df_pred.append(x_df)


Train Date is from 2022-08-16 07:12:00 - 2023-05-17 02:23:00
Valid Date is from 2022-08-16 07:12:00 - 2023-05-17 02:23:00
Shape of Xtrain is (788544, 147), Shape of yTrain is (788544,)
Class 0: train: 0.7189617827286746
Class 0: train: 0.138683446960474
Class 0: train: 0.1423547703108514


In [68]:
# Stack the collected prediction frames and take the most probable class as the predicted label.
df_pred_test = pd.concat(df_pred, axis = 0)
prob_cols = [f'prob_{i}' for i in range(3)]
df_pred_test['pred_label'] = np.argmax(df_pred_test[prob_cols].values, axis = 1)

In [69]:
# Persist the AE-MLP prediction frame as a Feather file.
df_pred_test.to_feather('../data/back_test_output_valid_aemlp.feather')