In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Raw minute bars and the asset metadata shipped with the competition data.
crypto_df = pd.read_csv("../input/g-research-crypto-forecasting/" + 'train.csv')
assets = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')

# Asset_ID -> Asset_Name lookup.
assets_names = {asset_id: asset_name for asset_id, asset_name in zip(assets.Asset_ID, assets.Asset_Name)}

# Asset_ID -> position of that id among the first 14 rows of the supplemental file.
assets_order = pd.read_csv('../input/g-research-crypto-forecasting/supplemental_train.csv').Asset_ID[:14]
assets_order = {asset_id: position for position, asset_id in enumerate(assets_order)}

In [None]:
# Work on a timestamp-indexed copy of the raw bars.
train = crypto_df.copy().set_index("timestamp")

# 'assets' = number of rows that share the row's timestamp (sum of a column of ones).
train['assets'] = 1
train['assets'] = train.groupby(level = 0)['assets'].transform('sum')

train['asset_name'] = train.Asset_ID.map(assets_names)
train['asset_name'].value_counts()

# Rows whose timestamp carries exactly 14 rows, restricted to the columns of interest.
all_same_time = train.loc[
    train['assets'] == 14,
    ['Asset_ID', 'Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'asset_name','VWAP','Target'],
]

In [None]:
import os
import gc
import traceback
import numpy as np
import pandas as pd
import seaborn as sns
import gresearch_crypto #note: this notebook have to run on Kaggle, as this package is provided by the competition organizor and only available on Kaggle
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import pandas as pd, numpy as np
from tensorflow.keras import layers
import tensorflow_probability as tfp
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
# Remove the cap on how many columns pandas shows when displaying a DataFrame.
pd.set_option('display.max_columns', None)

In [None]:
# --- Run-wide settings -------------------------------------------------------
DEVICE, SEED = "GPU", 50

# --- Training settings -------------------------------------------------------
EPOCHS = 1000           # epoch cap handed to model.fit (early stopping may end sooner)
DEBUG = False
N_ASSETS = 14           # assets per timestamp; second axis of the reshaped arrays
WINDOW_SIZE = 15        # time steps per input window
BATCH_SIZE = 1024
PCT_VALIDATION = 10     # used as a divisor: roughly the last 1/PCT_VALIDATION of samples is held out

In [None]:
# Current tf.distribute strategy and the values derived from it.
strategy = tf.distribute.get_strategy()
REPLICAS = strategy.num_replicas_in_sync
AUTO     = tf.data.experimental.AUTOTUNE

In [None]:
# Directory prefix used by later cells when reading the example files.
data_path = '../input/g-research-crypto-forecasting/'

# Main and supplemental training tables.
orig_df_train = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/train.csv')
supp_df_train = pd.read_csv('../input/g-research-crypto-forecasting/supplemental_train.csv')

# Asset metadata, ordered by Asset_ID.
df_asset_details = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/asset_details.csv')
df_asset_details = df_asset_details.sort_values(by = "Asset_ID")


def load_training_data_for_asset(asset_id):
    """Return the rows of ``orig_df_train`` that belong to one asset, ordered by time.

    Args:
        asset_id: value matched against the ``Asset_ID`` column.

    Returns:
        A copy of the matching rows with an extra ``date`` column (``timestamp``
        interpreted as seconds and converted with ``pd.to_datetime``), sorted by
        that column.
    """
    asset_rows = orig_df_train.loc[orig_df_train["Asset_ID"] == asset_id].copy()
    asset_rows['date'] = pd.to_datetime(asset_rows['timestamp'], unit = 's')
    return asset_rows.sort_values('date')

def load_data_for_all_assets():
    """Concatenate the per-asset training frames for asset ids ``0 .. N_ASSETS - 1``.

    The asset count comes from the notebook-level ``N_ASSETS`` constant rather
    than a repeated literal, so the loader stays in step with the rest of the
    pipeline.

    Returns:
        One DataFrame holding the frames from ``load_training_data_for_asset``
        stacked in asset-id order.
    """
    return pd.concat([load_training_data_for_asset(asset_id) for asset_id in range(N_ASSETS)])

# Stack all assets, order the rows by timestamp, then move the timestamp into the index.
train = load_data_for_all_assets()
train = train.sort_values('timestamp').set_index("timestamp")

In [None]:
# Example inputs/outputs of the submission API.
test = pd.read_csv(data_path + 'example_test.csv')
sample_prediction_df = pd.read_csv(data_path + 'example_sample_submission.csv')

# Asset metadata.
assets = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')

# Asset_ID -> position of that id among the first N_ASSETS rows of the supplemental file.
assets_order = pd.read_csv('../input/g-research-crypto-forecasting/supplemental_train.csv').Asset_ID[:N_ASSETS]
assets_order = {asset_id: position for position, asset_id in enumerate(assets_order)}

In [None]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to the narrowest fitting dtype.

    Columns whose dtype name is ``object``, ``category`` or a ``datetime64[ns]``
    variant are skipped. Columns whose dtype name starts with ``int`` are cast to
    the first of int8/int16/int32/int64 whose open range ``(min, max)`` contains
    the column's values. Every other column is cast to float16, float32 or
    float64 by the same rule.

    Args:
        df: DataFrame to shrink; its columns are overwritten.

    Returns:
        The same DataFrame object, after the casts.

    Side effects:
        Prints memory usage before and after, plus the relative reduction.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    skipped_types = ['object', 'category', 'datetime64[ns, UTC]', 'datetime64[ns]']
    for col in df.columns:
        col_type = df[col].dtype.name
        if col_type in skipped_types:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Narrowest integer type first. The previous version cast the int8
            # case to int16, so small-valued columns used twice the memory.
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(int_type)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached when min/max are NaN, because every comparison is False.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [None]:
def get_features(df):
    """Append six candle-derived feature columns to ``df`` and return it.

    The frame is modified in place. Columns are added in this order:
    ``Upper_Shadow``, ``Lower_Shadow``, ``spread``, ``mean_trade``,
    ``log_price_change``, ``close_vwap_diff``.

    Args:
        df: frame with ``Open``, ``High``, ``Low``, ``Close``, ``Volume``,
            ``Count`` and ``VWAP`` columns.

    Returns:
        The same frame object, now carrying the extra columns.
    """
    open_px, close_px = df['Open'], df['Close']
    high_px, low_px = df['High'], df['Low']

    # Every new column depends only on the raw inputs, so all of them can be
    # computed before the first one is written back.
    derived = {
        'Upper_Shadow': high_px - np.maximum(close_px, open_px),
        'Lower_Shadow': np.minimum(close_px, open_px) - low_px,
        'spread': high_px - low_px,
        'mean_trade': df['Volume'] / df['Count'],
        'log_price_change': np.log(close_px / open_px),
        'close_vwap_diff': (close_px - df['VWAP']).abs(),
    }
    for column_name, values in derived.items():
        df[column_name] = values
    return df

# Store the numeric bar columns and the target as float32.
train = train.astype({
    column: np.float32
    for column in ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Target']
})
print(train.shape)

# Missing targets become 0.
train['Target'] = train['Target'].fillna(0)

# Replace +/-inf in VWAP with the largest/smallest finite VWAP (nan_to_num also maps NaN to 0).
VWAP_max = train.VWAP[np.isfinite(train.VWAP)].max()
VWAP_min = train.VWAP[np.isfinite(train.VWAP)].min()
train['VWAP'] = np.nan_to_num(train.VWAP, posinf=VWAP_max, neginf=VWAP_min)

# `times` numbers the distinct timestamps in order of first appearance;
# `ids` holds one "<timestamp number>_<Asset_ID>" string per existing row.
id_frame = train[['Asset_ID', 'Target']].copy()
times = {timestamp: position for position, timestamp in enumerate(id_frame.index.unique())}
id_frame['id'] = id_frame.index.map(times)
id_frame['id'] = id_frame['id'].astype(str) + '_' + id_frame['Asset_ID'].astype(str)
ids = id_frame.id.copy()
del id_frame

# Add the engineered columns and record which columns count as model inputs.
train = get_features(train)
train_features = [
    column for column in train.columns
    if column not in ['Target', 'date', 'timestamp', 'Asset_ID', 'groups']
]

In [None]:
# Order rows by the timestamp index; `ind` keeps the distinct timestamps, whose
# first and last entries bound the grid built in `reindex`.
train = train.sort_index()
ind = train.index.unique()
def reindex(df):
    """Put one asset's rows onto a regular 60-unit timestamp grid.

    The grid runs from ``ind[0]`` to ``ind[-1]`` inclusive in steps of 60, where
    ``ind`` is the notebook-level index of distinct timestamps. Grid points with
    no matching row take the nearest existing row. Any values still missing are
    filled forward, then backward.

    Uses ``.ffill()`` / ``.bfill()`` in place of ``fillna(method=...)``, which is
    deprecated in recent pandas releases; the result is the same.

    Args:
        df: timestamp-indexed frame for a single asset.

    Returns:
        The re-gridded frame.
    """
    grid = range(ind[0], ind[-1] + 60, 60)
    regular = df.reindex(grid, method='nearest')
    return regular.ffill().bfill()
# Re-grid each asset independently, drop the group level that apply() adds,
# and restore timestamp order.
train = (
    train
    .groupby('Asset_ID')
    .apply(reindex)
    .reset_index(0, drop=True)
    .sort_index()
)
gc.collect()
train.shape

In [None]:
# Number each row by its timestamp's position in `times`; rows whose timestamp
# is not in `times` are dropped.
train['group_num'] = train.index.map(times)
train = train.dropna(subset=['group_num'])
train['group_num'] = train['group_num'].astype('int')

# is_real = 1 when the "<group_num>_<Asset_ID>" key is among `ids`, else 0.
row_keys = train['group_num'].astype(str) + '_' + train['Asset_ID'].astype(str)
train['is_real'] = row_keys.isin(ids) * 1
del row_keys

In [None]:
# Zero every numeric column (Target included) on rows flagged is_real == 0.
# 'date' is left out: writing the float 0. into a datetime column forces pandas
# to upcast it, and a later cell drops 'date' before modelling anyway.
features = train.columns.drop(['Asset_ID', 'group_num', 'is_real', 'date'])
train.loc[train.is_real == 0, features] = 0.

In [None]:
# Attach each asset's position, order rows by (timestamp number, asset position),
# then shrink dtypes.
train['asset_order'] = train['Asset_ID'].map(assets_order)
train = reduce_mem_usage(train.sort_values(by=['group_num', 'asset_order']))
train.head(20)
gc.collect()

In [None]:
# Targets: one row per timestamp group, one column per asset.
targets = train['Target'].to_numpy().reshape(-1, N_ASSETS)

# Inputs: the remaining columns, reshaped to (groups, N_ASSETS, n_features).
features = train.columns.drop(['Asset_ID', 'Target', 'group_num', 'is_real', 'date'])
train = train[features].values.reshape(-1, N_ASSETS, len(features))

In [None]:
class item_generate(keras.utils.Sequence):
    """Keras ``Sequence`` yielding sliding windows over ``x_set`` with their labels.

    Sample ``s`` is ``x_set[s : s + length]`` paired with ``y_set[s + length - 1]``,
    so there are ``len(x_set) - length + 1`` samples in total.

    Args:
        x_set: indexable inputs; the first axis is the one that is windowed.
        y_set: labels, indexable along the same first axis.
        batch_size: maximum number of windows per batch.
        length: number of consecutive rows in one window.
    """

    def __init__(self, x_set, y_set, batch_size, length):
        super().__init__()
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.length = length
        self.size = len(x_set)
        # Number of complete windows that fit into the data (0 if it is too short).
        self.n_windows = max(0, self.size - self.length + 1)

    def __len__(self):
        # Count batches from the number of windows, not the number of rows.
        # Counting rows made the final batch short or even empty, because the
        # last `length - 1` rows cannot start a window.
        return int(np.ceil(self.n_windows / float(self.batch_size)))

    def __getitem__(self, idx):
        first = self.batch_size * idx
        last = min(first + self.batch_size, self.n_windows)
        batch_x = [self.x[start : start + self.length] for start in range(first, last)]
        batch_y = [self.y[start + self.length - 1] for start in range(first, last)]
        return np.array(batch_x), np.array(batch_y)

In [None]:
# Chronological hold-out: everything from `val_start` (a negative offset) onward
# is the validation part.
val_start = -len(train)//PCT_VALIDATION
X_train, y_train = train[:val_start], targets[:val_start]
X_test, y_test = train[val_start:], targets[val_start:]

In [None]:
# Windowed batch generators for the two splits.
train_generator = item_generate(X_train, y_train, batch_size = BATCH_SIZE, length = WINDOW_SIZE)
val_generator = item_generate(X_test, y_test, batch_size = BATCH_SIZE, length = WINDOW_SIZE)

# Report the shapes of the first training batch.
first_x, first_y = train_generator[0]
print(f'Sample shape: {first_x.shape}')
print(f'Target shape: {first_y.shape}')

In [None]:
import math
import tensorflow as tf
from keras.models import Model
from keras import backend as K
from tensorflow.keras.losses import MSE
from keras.layers import Dense, Activation, Dropout, Layer, LayerNormalization, Conv1D, Input, Concatenate, Permute, Add, GlobalAvgPool1D

class FixedPositionalEncoding(Layer):
    """Adds a fixed sinusoidal position table to a sequence-first input, then applies dropout.

    The table has shape ``(max_len, 1, d_model)``, so it broadcasts over axis 1
    of the input and indexes positions along axis 0.

    Args:
        d_model: size of the last input axis.
        dropout: rate of the dropout applied after the addition.
        max_len: number of positions precomputed.
        scale_factor: multiplier applied to the whole table.
        batch_size: stored on the layer; not used by ``call``.
    """

    def __init__(self, d_model, dropout = 0.1, max_len = 1024, scale_factor = 1.0, batch_size = BATCH_SIZE):
        super(FixedPositionalEncoding, self).__init__()
        self.dropout = Dropout(dropout)
        self.batch_size = batch_size

        position = np.arange(0, max_len, dtype = np.float32)[:, None]
        # The frequency scale belongs inside the exponent: exp(2i * -ln(10000) / d_model).
        # The earlier form exponentiated the raw index and scaled afterwards, which
        # overflows for realistic d_model.
        div_term = np.exp(np.arange(0, d_model, 2, dtype = np.float32) * (-np.log(10000.0) / d_model))

        table = np.zeros((max_len, d_model), dtype = np.float32)
        table[:, 0::2] = np.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns.
        table[:, 1::2] = np.cos(position * div_term[: d_model // 2])
        self.pe = tf.constant((scale_factor * np.expand_dims(table, 1)).astype(np.float32))

    def call(self, x):
        # Use only as many positions as the input has along axis 0, in the input's dtype.
        x = x + tf.cast(self.pe[: tf.shape(x)[0]], x.dtype)
        return self.dropout(x)

class LearnablePositionalEncoding(Layer):
    """Adds a trainable position table to a sequence-first input, then applies dropout.

    The table is a weight of shape ``(max_len, 1, d_model)`` initialised
    uniformly in ``[-0.02, 0.02]``. It replaces the earlier body, which mixed
    PyTorch calls (``nn.init.uniform_``, ``Tensor.size``) into TensorFlow code
    and built a ``tf.Variable`` whose *value* was the shape tuple.

    Args:
        d_model: size of the last input axis.
        dropout: rate of the dropout applied after the addition.
        max_len: number of positions available.
    """

    def __init__(self, d_model, dropout = 0.1, max_len = 1024):
        super(LearnablePositionalEncoding, self).__init__()
        self.dropout = Dropout(dropout)
        self.pe = self.add_weight(
            name = 'pe',
            shape = (max_len, 1, d_model),
            initializer = tf.keras.initializers.RandomUniform(minval = -0.02, maxval = 0.02),
            trainable = True,
        )

    def call(self, inputs, *args, **kwargs):
        # Slice the table to the number of positions present along axis 0.
        x = inputs + self.pe[: tf.shape(inputs)[0], :]
        return self.dropout(x)
    
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention with bias-free linear projections.

    Args:
        d_model: width of the projections and of the output.
        num_heads: number of heads; must divide ``d_model``.

    Raises:
        ValueError: if ``d_model`` is not a multiple of ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        if d_model % num_heads != 0:
            raise ValueError(f'd_model ({d_model}) must be divisible by num_heads ({num_heads})')
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // self.num_heads
        self.wq = tf.keras.layers.Dense(d_model, kernel_initializer='glorot_uniform', use_bias=False)
        self.wk = tf.keras.layers.Dense(d_model, kernel_initializer='glorot_uniform', use_bias=False)
        self.wv = tf.keras.layers.Dense(d_model, kernel_initializer='glorot_uniform', use_bias=False)
        self.ff = tf.keras.layers.Dense(d_model, kernel_initializer='glorot_uniform', use_bias=False)

    def split_heads(self, x, batch_size):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, num_heads, seq, depth)``."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs, *args, **kwargs):
        """Run attention.

        Accepts either one ``(q, k, v, mask)`` sequence as ``inputs`` or the
        positional form ``layer(q, k, v[, mask])``. The second form is how
        ``TransformerLayer`` in this file invokes the layer; previously it
        failed while unpacking ``inputs``.

        Returns:
            ``(output, attention_weights)``.
        """
        if args:
            q = inputs
            k, v = args[0], args[1]
            mask = args[2] if len(args) > 2 else None
        else:
            q, k, v, mask = inputs
        batch_size = tf.shape(q)[0]
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        # Move heads back next to depth and merge them into d_model.
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.ff(concat_attention)
        return output, attention_weights

def scaled_dot_product_attention(q, k, v, mask):
    """Softmax attention over the last two axes of ``q``, ``k`` and ``v``.

    Args:
        q, k, v: query, key and value tensors.
        mask: optional tensor; where given, ``mask * -1e9`` is added to the
            logits before the softmax.

    Returns:
        ``(output, attention_weights)``: the weighted values and the softmax weights.
    """
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)
    if mask is not None:
        logits += (mask * -1e9)
    weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(weights, v), weights

class PointwiseFeedforward(tf.keras.layers.Layer):
    """Two Dense layers of width ``d_model`` with a leaky-ReLU in between."""

    def __init__(self, d_model):
        super(PointwiseFeedforward, self).__init__()
        self.ff1 = tf.keras.layers.Dense(d_model, kernel_initializer='glorot_uniform')
        self.ff2 = tf.keras.layers.Dense(d_model, kernel_initializer='glorot_uniform')

    def call(self, x):
        hidden = tf.nn.leaky_relu(self.ff1(x))
        return self.ff2(hidden)

class TransformerLayer(tf.keras.layers.Layer):
    """Attention block: multi-head attention and a feed-forward net, each followed by add & layer-norm.

    Args:
        d_model: model width.
        num_heads: number of attention heads.
    """

    def __init__(self, d_model, num_heads):
        super(TransformerLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = PointwiseFeedforward(d_model)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, x, y, mask=None):
        """Attend from ``x`` (queries) to ``y`` (keys and values).

        ``MultiHeadAttention.call`` unpacks a single ``(q, k, v, mask)``
        argument, so the four values are passed as one tuple rather than as
        separate positionals.
        """
        attn_output, _ = self.mha((x, y, y, mask))
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

class SingleAttention(Layer):
    """One attention head: Dense projections for q/k/v, then scaled dot-product attention.

    Args:
        d_k: width of the query and key projections.
        d_v: width of the value projection (and of the output).
    """

    def __init__(self, d_k, d_v):
        super(SingleAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v

    def build(self, input_shape):
        # The Dense layers infer their input width on first call. The former
        # `input_shape=` keyword received the list of three shapes, which is not
        # a valid single-input shape, so it is no longer passed.
        self.query = Dense(self.d_k, kernel_initializer = 'glorot_uniform', bias_initializer = 'glorot_uniform')
        self.key   = Dense(self.d_k, kernel_initializer = 'glorot_uniform', bias_initializer = 'glorot_uniform')
        self.value = Dense(self.d_v, kernel_initializer = 'glorot_uniform', bias_initializer = 'glorot_uniform')

    def call(self, inputs):
        """``inputs`` is a sequence ``(query_source, key_source, value_source)``."""
        q = self.query(inputs[0])
        k = self.key(inputs[1])
        v = self.value(inputs[2])
        attn_weights = tf.matmul(q, k, transpose_b = True)
        # A single broadcast division replaces tf.map_fn, which looped over the
        # leading axis just to divide by a constant.
        attn_weights = attn_weights / float(np.sqrt(self.d_k))
        attn_weights = tf.nn.softmax(attn_weights, axis = -1)
        attn_out = tf.matmul(attn_weights, v)
        return attn_out

class MultiAttention(Layer):
    """Runs ``n_heads`` independent ``SingleAttention`` heads and projects their concatenation.

    Args:
        d_k: query/key width of each head.
        d_v: value width of each head.
        n_heads: number of heads.
    """

    def __init__(self, d_k, d_v, n_heads):
        super(MultiAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v
        self.n_heads = n_heads
        self.attn_heads = list()

    def build(self, input_shape):
        # Rebuild the list instead of appending, so a second build() cannot
        # leave more than n_heads heads behind.
        self.attn_heads = [SingleAttention(self.d_k, self.d_v) for _ in range(self.n_heads)]
        # Project back to the width of the first input.
        self.linear = Dense(input_shape[0][-1], kernel_initializer = 'glorot_uniform', bias_initializer = 'glorot_uniform')

    def call(self, inputs):
        attn = [head(inputs) for head in self.attn_heads]
        # tf.concat avoids creating a fresh Concatenate layer on every call.
        concat_attn = tf.concat(attn, axis = 2)
        multi_linear = self.linear(concat_attn)
        return multi_linear

class TransformerEncoder(Layer):
    """Encoder block: multi-head self-attention and a two-layer Conv1D feed-forward part.

    Args:
        d_k: per-head query/key/value width.
        n_heads: number of attention heads.
        ff_dim: number of filters in the first feed-forward convolution.
        dropout: rate used after attention and after the feed-forward part.
        **kwargs: ``name``, ``trainable`` and ``dtype`` are forwarded to the base
            ``Layer``. Any other keyword (callers in this file pass
            ``activation``) is accepted and ignored, as before; the feed-forward
            activation is fixed to ReLU.
    """

    def __init__(self, d_k, n_heads, ff_dim, dropout = 0.1, **kwargs):
        base_kwargs = {key: kwargs[key] for key in ('name', 'trainable', 'dtype') if key in kwargs}
        super(TransformerEncoder, self).__init__(**base_kwargs)
        self.d_k = d_k
        self.ff_dim = ff_dim
        self.n_heads = n_heads
        self.attn_heads = list()
        self.dropout_rate = dropout

    def build(self, input_shape):
        self.attn_multi = MultiAttention(self.d_k, self.d_k, self.n_heads)
        self.attn_dropout = Dropout(self.dropout_rate)
        self.attn_normalize = LayerNormalization(epsilon = 1e-6)
        self.ff_conv1D_1 = Conv1D(filters = self.ff_dim, kernel_size = 1, activation = 'relu', padding = 'same')
        # The second convolution restores the input width so the residual add is valid.
        self.ff_conv1D_2 = Conv1D(filters = input_shape[-1], kernel_size = 1, padding = 'same')
        self.ff_dropout = Dropout(self.dropout_rate)
        self.ff_normalize = LayerNormalization(epsilon = 1e-6)
        self.addition = Add()
        self.addition_2 = Add()

    def call(self, inputs):
        attn_layer = self.attn_multi([inputs, inputs, inputs])
        attn_layer = self.attn_dropout(attn_layer)
        attn_layer = self.attn_normalize(self.addition_2([inputs, attn_layer]))
        ff_layer = self.ff_conv1D_1(attn_layer)
        ff_layer = self.ff_conv1D_2(ff_layer)
        ff_layer = self.ff_dropout(ff_layer)
        # NOTE(review): the second residual adds the block *input*, not the
        # attention output (`attn_layer`). Kept as is, since changing it alters
        # the model; confirm whether it is intended.
        ff_layer = self.ff_normalize(self.addition([inputs, ff_layer]))
        return ff_layer

    def get_config(self):
        # Only constructor arguments are serialised, under the constructor's own
        # parameter names. The former version read a non-existent `self.d_v`
        # (AttributeError) and emitted keys that __init__ does not take.
        config = super().get_config().copy()
        config.update({'d_k': self.d_k, 'n_heads': self.n_heads, 'ff_dim': self.ff_dim, 'dropout': self.dropout_rate})
        return config

def batch_generator(batch_size):
    """Endlessly yield ``(x_batch, y_batch)`` pairs with noise-masked entries set to -1.0.

    Reads the notebook-level names ``train_features``, ``targets``,
    ``masking_ratio`` and ``mean_mask_length``.
    NOTE(review): ``masking_ratio`` and ``mean_mask_length`` are not defined in
    the visible notebook, and ``train_features`` is built as a list of column
    names in an earlier cell; confirm what this generator is meant to iterate
    over before using it.

    Args:
        batch_size: number of leading-axis items per batch.

    Yields:
        Tuples ``(x_batch, y_batch)`` of freshly copied arrays.
    """
    while True:
        x_train, y_train = train_features, targets
        for i in range(0, len(x_train), batch_size):
            # Copy the slices: masking writes in place, and a plain slice of an
            # ndarray is a view, so the source data would be overwritten more
            # on every epoch.
            x_batch = np.array(x_train[i:i + batch_size])
            y_batch = np.array(y_train[i:i + batch_size])
            for idx in range(x_batch.shape[0]):
                # The mask is drawn for the sample being masked (was always x_batch[0]).
                mask = noise_mask(x_batch[idx], masking_ratio, mean_mask_length, 'separate', 'geometric', None)
                x_batch[idx][np.where(mask)] = -1.0
                y_batch[idx][np.where(np.sum(mask, axis = -1))] = -1.0
            yield (x_batch, y_batch)

class TSTransformer(Model):
    """Transformer encoder over batch-first sequences.

    Pipeline: Dense projection to ``d_model`` (scaled by ``sqrt(d_model)``),
    fixed positional encoding, ``num_layers`` ``TransformerEncoder`` blocks,
    ReLU, dropout, and a Dense projection to ``feat_dim``.

    Args:
        feat_dim: width of the output projection.
        max_len: number of positions in the positional table.
        d_model: model width.
        n_heads: attention heads per encoder block.
        num_layers: number of encoder blocks.
        dim_feedforward: ``ff_dim`` of each encoder block.
        dropout: dropout rate; the positional/encoder rate is multiplied by
            ``(1.0 - freeze)``.
        pos_encoding, norm: accepted but not used by this implementation.
        activation: forwarded to ``TransformerEncoder`` as a keyword.
        freeze: when truthy, disables dropout in the positional encoding and
            the encoder blocks.
    """

    def __init__(self, feat_dim, max_len, d_model, n_heads, num_layers, dim_feedforward, dropout = 0.1, pos_encoding = 'fixed', activation = 'gelu', norm = 'BatchNorm', freeze = False):
        super(TSTransformer, self).__init__()
        self.max_len     = max_len
        self.d_model     = d_model
        self.n_heads     = n_heads
        self.project_inp = Dense(d_model)
        self.pos_enc     = FixedPositionalEncoding(d_model, dropout = dropout * (1.0 - freeze), max_len = max_len)
        self.encoder_layers = [TransformerEncoder(d_model, self.n_heads, dim_feedforward, dropout * (1.0 - freeze), activation = activation) for i in range(num_layers)]
        self.output_layer = Dense(feat_dim)
        self.act          = Activation('relu')
        self.dropout1     = Dropout(dropout)
        self.feat_dim     = feat_dim

    def call(self, X):
        inp = self.project_inp(X) * math.sqrt(self.d_model)
        # The positional table has shape (max_len, 1, d_model), so positions must
        # sit on axis 0 while it is added.
        inp = K.permute_dimensions(inp, (1, 0, 2))
        inp = self.pos_enc(inp)
        # Back to batch-first before the encoder blocks. Their attention mixes
        # along axis 1, and leaving the tensor sequence-first (as before) made
        # samples of a batch attend to each other instead of time steps.
        x = K.permute_dimensions(inp, (1, 0, 2))
        for lay in self.encoder_layers: x = lay(x)
        output = self.act(x)
        output = self.dropout1(output)
        output = self.output_layer(output)
        return output
def noise_mask(X, masking_ratio, lm = 3, mode = 'separate', distribution = 'geometric', exclude_feats = None):
    """Draw a random boolean mask shaped like the 2-D array ``X``.

    Args:
        X: array whose ``shape`` determines the mask shape.
        masking_ratio: probability of a ``False`` entry in the non-geometric
            branches; passed to ``geom_noise_mask_single`` in the geometric ones.
        lm: passed to ``geom_noise_mask_single`` (geometric branches only).
        mode: ``'separate'`` draws each column (or each entry) independently;
            any other value draws one column and repeats it across all columns.
        distribution: ``'geometric'`` uses ``geom_noise_mask_single``; any other
            value samples entries independently.
        exclude_feats: optional iterable of column indices left all-``True``
            (geometric + separate branch only).

    Returns:
        Boolean ndarray.
    """
    if exclude_feats is not None:
        exclude_feats = set(exclude_feats)

    separate = (mode == 'separate')

    if distribution == 'geometric':
        if not separate:
            shared_column = geom_noise_mask_single(X.shape[0], lm, masking_ratio)
            return np.tile(np.expand_dims(shared_column, 1), X.shape[1])
        mask = np.ones(X.shape, dtype = bool)
        for feat in range(X.shape[1]):
            if exclude_feats is None or feat not in exclude_feats:
                mask[:, feat] = geom_noise_mask_single(X.shape[0], lm, masking_ratio)
        return mask

    outcomes = np.array([True, False])
    probabilities = (1 - masking_ratio, masking_ratio)
    if separate:
        return np.random.choice(outcomes, size = X.shape, replace = True, p = probabilities)
    shared_column = np.random.choice(outcomes, size = (X.shape[0], 1), replace = True, p = probabilities)
    return np.tile(shared_column, X.shape[1])

def geom_noise_mask_single(L, lm, masking_ratio):
    """Sample a length-``L`` boolean vector from a two-state chain.

    State 1 writes ``True`` and state 0 writes ``False``. After each position
    the state flips with probability ``1 / lm`` when in state 0, and with
    probability ``(1 / lm) * masking_ratio / (1 - masking_ratio)`` when in
    state 1. The initial state is 1 when a uniform draw exceeds
    ``masking_ratio``.

    Args:
        L: length of the vector.
        lm: reciprocal of the state-0 flip probability.
        masking_ratio: controls the initial state and the state-1 flip probability.

    Returns:
        Boolean ndarray of shape ``(L,)``.
    """
    keep_mask = np.ones(L, dtype = bool)
    leave_masked = 1 / lm
    leave_unmasked = leave_masked * masking_ratio / (1 - masking_ratio)
    flip_probability = (leave_masked, leave_unmasked)  # indexed by the current state

    state = int(np.random.rand() > masking_ratio)
    for position in range(L):
        keep_mask[position] = state
        if np.random.rand() < flip_probability[state]:
            state = 1 - state
    return keep_mask

def padding_mask(lengths, max_len = None):
    """Boolean mask marking the valid positions of each sequence.

    Rewritten with TensorFlow ops; the previous body called PyTorch tensor
    methods (``numel``, ``type_as``, ``repeat``, ``lt``) that TensorFlow
    tensors do not have.

    Args:
        lengths: 1-D integer tensor or array of sequence lengths.
        max_len: width of the mask; when falsy, the largest length is used.

    Returns:
        ``tf.bool`` tensor of shape ``(len(lengths), max_len)`` where entry
        ``[i, j]`` is ``True`` when ``j < lengths[i]``.
    """
    lengths = tf.convert_to_tensor(lengths)
    max_len = max_len or tf.reduce_max(lengths)
    return tf.sequence_mask(lengths, maxlen = max_len)

def collate_unsuperv(data, max_len = None, mask_compensation = False):
    """Pad a list of ``(features, mask, u_out)`` samples into batch tensors.

    The batch is assembled in NumPy and converted to tensors at the end,
    because TensorFlow tensors do not support the item assignment, ``clone()``
    and positional ``tf.zeros(d0, d1, d2)`` calls the previous body relied on.

    Args:
        data: sequence of ``(features, mask, u_out)`` triples. ``features`` and
            ``mask`` are 2-D arrays with matching shapes.
            NOTE(review): ``u_out`` is assumed to broadcast to ``max_len``;
            confirm against the caller.
        max_len: padded sequence length; defaults to the longest sample.
        mask_compensation: when true, ``compensate_masking`` is applied to the
            masked inputs. NOTE(review): that helper is not defined in the
            visible notebook.

    Returns:
        ``(X, targets, target_masks, padding_masks)``: masked inputs, unmasked
        inputs, the inverted per-entry masks, and a per-position mask that is
        ``True`` where ``u_out == 0``.
    """
    batch_size = len(data)
    features, masks, u_out = zip(*data)
    lengths = [X.shape[0] for X in features]
    if max_len is None:
        max_len = max(lengths)

    feat_dim = features[0].shape[-1]
    X = np.zeros((batch_size, max_len, feat_dim), dtype = np.float32)
    target_masks = np.zeros((batch_size, max_len, feat_dim), dtype = bool)
    for i in range(batch_size):
        end = min(lengths[i], max_len)
        X[i, :end, :] = features[i][:end, :]
        target_masks[i, :end, :] = masks[i][:end, :]

    targets = X.copy()
    X = X * target_masks
    if mask_compensation: X = compensate_masking(X, target_masks)

    padding_masks = np.zeros((batch_size, max_len), dtype = bool)
    for i in range(batch_size): padding_masks[i, :] = np.where(np.asarray(u_out[i]) == 0, 1, 0)

    target_masks = ~target_masks
    return (
        tf.convert_to_tensor(X),
        tf.convert_to_tensor(targets),
        tf.convert_to_tensor(target_masks),
        tf.convert_to_tensor(padding_masks),
    )

def masked_mse(y_true, y_pred):
    """MSE in which positions where ``y_true == -1.0`` are zeroed in both tensors first.

    Note: a later cell of this notebook defines another ``masked_mse`` that
    replaces this one once it runs.
    """
    true_f32 = tf.cast(y_true, tf.float32)
    pred_f32 = tf.cast(y_pred, tf.float32)
    # 1.0 where the target is a real value, 0.0 where it equals the -1.0 marker.
    marked = K.cast(K.equal(true_f32, tf.cast(-1.0, tf.float32)), dtype = K.floatx())
    keep = tf.cast(1 - marked, tf.float32)
    return MSE(true_f32 * keep, pred_f32 * keep)

def get_transformer_model(feat_dim, max_len, d_model, n_heads, num_layers, dim_feedforward, dropout = 0.1, pos_encoding = 'fixed', activation = 'gelu', norm = 'BatchNorm', freeze = False):
    """Build a functional Keras model made of stacked ``TransformerEncoder`` blocks.

    The sequence length of the input is ``max_len``. The previous body used an
    undefined name, ``breath_steps``, and left ``max_len`` unused.
    NOTE(review): the input width is taken from the last dimension of the
    notebook-level ``train_features`` via ``np.shape``, which works whether
    that name holds an array or a list of column names; confirm it matches the
    data actually fed to the model.

    Args:
        feat_dim: width of the Dense layer applied after the encoder stack.
        max_len: number of time steps in the input.
        d_model, n_heads, dim_feedforward: encoder block sizes.
        num_layers: number of encoder blocks.
        dropout: dropout rate; the encoder rate is multiplied by ``(1.0 - freeze)``.
        pos_encoding, norm: accepted but not used.
        activation: forwarded to ``TransformerEncoder`` as a keyword.
        freeze: when truthy, disables dropout inside the encoder blocks.

    Returns:
        An uncompiled ``Model``.
    """
    input_layer = Input((max_len, np.shape(train_features)[-1]))
    x = Dense(d_model)(input_layer) * math.sqrt(d_model)
    for _ in range(num_layers):
        x = TransformerEncoder(d_model, n_heads, dim_feedforward, dropout * (1.0 - freeze), activation = activation)(x)
    output = Activation('relu')(x)
    output = Dropout(dropout)(output)
    output = Dense(feat_dim)(output)
    # Swap the last two axes, then average over what is now axis 1.
    output = Permute([2, 1])(output)
    output = GlobalAvgPool1D()(output)
    return Model(inputs = input_layer, outputs = output)


In [None]:
from tensorflow.keras.layers import Concatenate, Add, GRU, GlobalAvgPool1D, Dense, Dropout, Input, Bidirectional, LSTM, Conv1D, Multiply

def MaxCorrelation(y_true,y_pred):
    """Negated absolute correlation between predictions and targets over all elements."""
    corr = tfp.stats.correlation(y_pred, y_true, sample_axis=None, event_axis=None)
    return -tf.math.abs(corr)
def Correlation(y_true,y_pred):
    """Absolute correlation between predictions and targets over all elements."""
    corr = tfp.stats.correlation(y_pred, y_true, sample_axis=None, event_axis=None)
    return tf.math.abs(corr)

def masked_mse(y_true, y_pred):
    """Mean squared error over the positions where ``y_true`` is non-zero."""
    keep = tf.math.not_equal(y_true, 0.)
    return tf.keras.losses.mean_squared_error(
        y_true = tf.boolean_mask(y_true, keep),
        y_pred = tf.boolean_mask(y_pred, keep),
    )

def masked_mae(y_true, y_pred):
    """Mean absolute error over the positions where ``y_true`` is non-zero."""
    keep = tf.math.not_equal(y_true, 0.)
    return tf.keras.losses.mean_absolute_error(
        y_true = tf.boolean_mask(y_true, keep),
        y_pred = tf.boolean_mask(y_pred, keep),
    )

def masked_cosine(y_true, y_pred):
    """Keras cosine-similarity loss over the positions where ``y_true`` is non-zero."""
    keep = tf.math.not_equal(y_true, 0.)
    kept_true = tf.boolean_mask(y_true, keep)
    kept_pred = tf.boolean_mask(y_pred, keep)
    return tf.keras.losses.cosine_similarity(kept_true, kept_pred)

def get_transformer(x):
    """Project ``x`` to 128 features (scaled by sqrt(128)) and apply one ``TransformerEncoder`` block.

    NOTE(review): the original body also set ``num_layers = 3`` but never used
    it; only a single encoder block is applied.
    """
    d_model, n_heads, dim_feedforward = 128, 8, 256
    dropout, freeze = 0.1, False
    activation = 'gelu'

    projected = Dense(d_model)(x) * math.sqrt(d_model)
    encoder_block = TransformerEncoder(d_model, n_heads, dim_feedforward, dropout * (1.0 - freeze), activation = activation)
    return encoder_block(projected)

def get_wavenet(x):
    """Apply two stacked gated dilated-convolution blocks (16 then 32 filters) to ``x``."""

    def wave_block(x, filters, kernel_size, n):
        """One block: 1x1 conv, then ``n`` gated convs with dilation 1, 2, 4, ... summed residually."""
        x = Conv1D(filters = filters, kernel_size = 1, padding = 'same')(x)
        res_x = x
        for exponent in range(n):
            rate = 2 ** exponent
            # Gated activation: tanh branch multiplied by sigmoid branch.
            tanh_out = Conv1D(filters = filters, kernel_size = kernel_size, padding = 'same', activation = 'tanh', dilation_rate = rate)(x)
            sigm_out = Conv1D(filters = filters, kernel_size = kernel_size, padding = 'same', activation = 'sigmoid', dilation_rate = rate)(x)
            gated = Multiply()([tanh_out, sigm_out])
            x = Conv1D(filters = filters, kernel_size = 1, padding = 'same')(gated)
            res_x = Add()([res_x, x])
        return res_x

    first_block = wave_block(x, 16, 3, 8)
    return wave_block(first_block, 32, 3, 5)

def get_lstm(x):
    """Apply a 32-unit LSTM that returns the full output sequence."""
    recurrent = layers.LSTM(units=32, return_sequences=True)
    return recurrent(x)

def get_model(n_assets = 14, model_name = 'wavenet'):
    """Build and compile the multi-asset model.

    Each asset gets its own branch (slice, masking of all-zero steps, the chosen
    backbone, global average pooling). The branch outputs are concatenated and
    passed through two Dense layers to one prediction per asset.

    Args:
        n_assets: number of assets along axis 2 of the input (and of outputs).
        model_name: backbone per branch: ``'wavenet'``, ``'transformer'`` or ``'lstm'``.

    Returns:
        A compiled ``keras.Model`` (Adam, ``masked_cosine`` loss, ``Correlation`` metric).

    Raises:
        ValueError: if ``model_name`` is not one of the supported backbones.
    """
    backbones = {'wavenet': get_wavenet, 'transformer': get_transformer, 'lstm': get_lstm}
    if model_name not in backbones:
        raise ValueError(f"Unknown model_name {model_name!r}; expected one of {sorted(backbones)}")

    # Window length and feature count are read from the first training batch.
    sample_x = train_generator[0][0]
    x_input = keras.Input(shape=(sample_x.shape[1], n_assets, sample_x.shape[-1]))
    branch_outputs = []
    for i in range(n_assets):
        # The asset index is bound through `arguments`. A bare `lambda x: x[:, :, i]`
        # looks `i` up again each time the model re-runs the layer, by which time
        # the loop has finished and every branch would slice the last asset.
        a = layers.Lambda(lambda x, asset_idx: x[:, :, asset_idx], arguments = {'asset_idx': i})(x_input)
        a = layers.Masking(mask_value = 0., )(a)
        a = backbones[model_name](a)
        a = layers.GlobalAvgPool1D()(a)
        branch_outputs.append(a)
    x = layers.Concatenate()(branch_outputs)
    x = layers.Dense(units = 128)(x)
    out = layers.Dense(units = n_assets)(x)
    model = keras.Model(inputs=x_input, outputs=out)
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3), loss = masked_cosine, metrics=[Correlation])
    return model
    
# Build the network used by the training and inference cells. The previous
# lines called an undefined `wavenet_model()` and ended in an unfinished
# `transfomer = ` assignment, which is a SyntaxError for the whole cell.
wavenet = get_model(n_assets = N_ASSETS, model_name = 'wavenet')
# TODO: build with get_model(n_assets = N_ASSETS, model_name = 'transformer') when needed.
transfomer = None
# Later cells refer to the active network as `model`.
model = wavenet

In [None]:
# Draw a single-asset version of the network. The backbone is passed by keyword:
# a positional argument after `n_assets=1` is a SyntaxError.
tf.keras.utils.plot_model(get_model(n_assets=1, model_name='wavenet'), show_shapes=True)

In [None]:
print(features)

tf.random.set_seed(0)
# Stop after 7 epochs without a val_loss improvement and keep the best weights.
estop = keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 7, verbose = 0, mode = 'min',restore_best_weights = True)
scheduler = keras.optimizers.schedules.ExponentialDecay(1e-2, (0.5 * len(X_train) / BATCH_SIZE), 1e-2)
lr = keras.callbacks.LearningRateScheduler(scheduler, verbose = 1)
history = model.fit(train_generator, validation_data = (val_generator), epochs = EPOCHS, callbacks = [lr, estop])

# Learning curves: losses on the left, correlation metric on the right.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
histories = pd.DataFrame(history.history)
epochs = list(range(1,len(histories)+1))
loss = histories['loss']
val_loss = histories['val_loss']
# Stored under a separate name so the `Correlation` metric function is not
# overwritten (get_model() passes it to compile()).
train_Correlation = histories['Correlation']
val_Correlation = histories['val_Correlation']
ax[0].plot(epochs, loss, label = 'Train Loss')
ax[0].plot(epochs, val_loss, label = 'Val Loss')
ax[0].set_title('Losses')
ax[0].set_xlabel('Epoch')
ax[0].legend(loc='upper right')
ax[1].plot(epochs, train_Correlation, label = 'Train Correlation')
ax[1].plot(epochs, val_Correlation, label = 'Val Correlation')
ax[1].set_title('Correlations')
ax[1].set_xlabel('Epoch')
ax[1].legend(loc='upper right')
plt.show()
gc.collect()

predictions = model.predict(val_generator)

# Per-asset correlation between predictions and targets on the validation split.
for i in range(N_ASSETS):
    # Drop the first WINDOW_SIZE - 1 labels of y_test: no window ends on them,
    # so val_generator never yields them.
    y_true = np.squeeze(y_test[WINDOW_SIZE - 1:, i])
    y_pred = np.squeeze(predictions[:, i])
    # Only positions with a non-zero target are scored.
    real_target_ind = np.argwhere(y_true!=0)
    asset_id = list(assets_order.keys())[i]
    asset_name = assets[assets.Asset_ID == asset_id]['Asset_Name'].item()
    print(f"{asset_name}: {np.corrcoef(y_pred[real_target_ind].flatten(), y_true[real_target_ind].flatten())[0,1]:.4f}")

In [None]:
# Submission
# Seed window: the first WINDOW_SIZE timestamps' worth of rows from the supplemental
# file, reshaped to (1, WINDOW_SIZE, N_ASSETS, n_features).
sup = pd.read_csv('../input/g-research-crypto-forecasting/supplemental_train.csv')[:WINDOW_SIZE * (N_ASSETS)]
placeholder = get_features(sup)
placeholder['asset_order'] = placeholder['Asset_ID'].map(assets_order)

test_sample = np.array(placeholder[features])
test_sample = test_sample.reshape(-1, (N_ASSETS), test_sample.shape[-1])[np.newaxis, ...]

In [None]:
# for test gap filling
# One row per asset, used to outer-merge each test batch so assets missing from it
# still get a row. The slice length is the number of assets; it was written as
# WINDOW_SIZE - 1, which only equals N_ASSETS by coincidence.
example = pd.read_csv('../input/g-research-crypto-forecasting/example_test.csv')[:N_ASSETS]
example['asset_order'] = example.Asset_ID.map(assets_order)
example = example[['Asset_ID','asset_order']]

In [None]:
from backtesting import Strategy
from backtesting.lib import crossover
from backtesting.test import SMA
from backtesting.lib import plot_heatmaps

# Cash amount constant.
Cash = 100_000

def BBANDS(data, n_lookback, n_std):
    """Bollinger bands indicator

    Bands are centred on the rolling mean of ``(High + Low + Close) / 3`` and
    placed ``n_std`` rolling standard deviations above and below it.

    Args:
        data: object exposing ``High``, ``Low`` and ``Close`` series.
        n_lookback: rolling window length.
        n_std: band width in standard deviations.

    Returns:
        ``(upper, lower)`` series.
    """
    typical_price = (data.High + data.Low + data.Close) / 3
    window = typical_price.rolling(n_lookback)
    centre = window.mean()
    deviation = window.std()
    return centre + n_std*deviation, centre - n_std*deviation



class MyStrategy(Strategy):
    """Enter after the signal has kept the same sign for at least three consecutive bars.

    The signal (``self.corr``) is the ``n1``-period SMA of Close. While flat,
    the strategy buys when the latest signal is positive and the positive
    streak has reached 3, and sells when it is negative and the negative
    streak has reached 3.
    """
    n1 = 50
    n2 = 100
    n_enter = 20
    n_exit = 10

    def init(self):
        self.corr = self.I(SMA, self.data.Close, self.n1)
        self.sma_enter = self.I(SMA, self.data.Close, self.n_enter)
        self.sma_exit = self.I(SMA, self.data.Close, self.n_exit)
        # Streak counters live on the instance. As locals of next() they were
        # reset to 0 on every bar, so the ">= 3" conditions could never hold.
        self.pos_corr_count = 0
        self.neg_corr_count = 0

    def next(self):

        if not self.position:
            # Compare the latest value only; the indicator is an array and
            # cannot be used as a whole in a boolean test.
            latest = self.corr[-1]
            if latest > 0 and self.pos_corr_count >= 3:
                self.buy()
            elif latest < 0 and self.neg_corr_count >= 3:
                self.sell()
            if latest > 0:
                self.pos_corr_count += 1
                self.neg_corr_count = 0
            else:
                self.pos_corr_count = 0
                self.neg_corr_count += 1


In [None]:
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    test_df = get_features(test_df)
    # Outer-merge with the per-asset frame so assets absent from this batch still
    # get a row, then put the rows in the model's asset order.
    test_data = test_df.merge(example, how='outer', on='Asset_ID').sort_values('asset_order')
    test = np.array(test_data[features].fillna(0))
    test = test.reshape(-1, 1, N_ASSETS, test.shape[-1])
    # Append the new time step and keep only the most recent WINDOW_SIZE steps.
    test_sample = np.hstack([test_sample, test])[:,-1 * WINDOW_SIZE:]
    y_pred = model.predict(test_sample).squeeze().reshape(-1, 1).squeeze()
    test_data['Target'] = y_pred
    for _, row in test_df.iterrows():
        try:
            sample_prediction_df.loc[sample_prediction_df['row_id'] == row['row_id'], 'Target'] = test_data.loc[test_data['row_id'] == row['row_id'], 'Target'].item()
        except ValueError:
            # .item() raises ValueError unless exactly one row matches; fall back
            # to 0 in that case. A bare `except:` would also hide unrelated
            # errors and keyboard interrupts.
            sample_prediction_df.loc[sample_prediction_df['row_id'] == row['row_id'], 'Target'] = 0
    env.predict(sample_prediction_df)