In [1]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgbm
import mlflow

import sys
sys.path.append('../input/g-research-crypto-forecasting')
import gresearch_crypto


# --- Paths -------------------------------------------------------------------
TRAIN_CSV = '../input/g-research-crypto-forecasting/train.csv'
ASSET_DETAILS_CSV = '../input/g-research-crypto-forecasting/asset_details.csv'
RESULT_FOLDER = '../result/nb020'
# Make sure the output folder for the saved models exists.
os.makedirs(RESULT_FOLDER, exist_ok=True)

# --- Run options -------------------------------------------------------------
SEED = 2021
DEBUG = False  # when True, the training frame is cut to its first 100000 rows

# When True, rows from 2021-06-13 onwards are split off the training data
# and used as the hold-out frame instead.
REMOVE_LB_TEST_OVERLAPPING_DATA = True


In [2]:
def fix_all_seeds(seed):
    """Seed Python's ``random`` and NumPy's global RNG, and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value used for every generator; it is also written (as a string) to the
        ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

fix_all_seeds(SEED)

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Return a copy of ``df`` whose numeric columns use the narrowest fitting dtype.

    Each int16/int32/int64/float16/float32/float64 column is cast to the first
    candidate dtype whose (exclusive) range contains the column's min and max:
    int8 -> int16 -> int32 -> int64 for integers, float16 -> float32 -> float64
    for floats. Columns of any other dtype are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is not modified.
    verbose : bool, default True
        When True, print the resulting memory footprint and the reduction.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, in the same order, as ``df``.

    Notes
    -----
    Downcasting floats to float16/float32 loses precision; the range check only
    guards against overflow.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    converted = []
    for col in df.columns:
        series = df[col]
        col_type = series.dtypes
        if col_type not in numerics:
            converted.append(series)
            continue

        c_min = series.min()
        c_max = series.max()
        if str(col_type)[:3] == 'int':
            # No fallback cast: an integer column that fits none of the strict
            # ranges (e.g. it holds the int64 minimum, or is empty) is kept
            # as-is instead of being silently dropped from the result.
            candidates, limits, fallback = int_candidates, np.iinfo, None
        else:
            candidates, limits, fallback = float_candidates, np.finfo, np.float64

        target = next(
            (dtype for dtype in candidates
             if c_min > limits(dtype).min and c_max < limits(dtype).max),
            fallback,
        )
        converted.append(series if target is None else series.astype(target))

    # pd.concat raises on an empty list, so a frame without columns is just copied.
    df_out = pd.concat(converted, axis=1) if converted else df.copy()
    if verbose:
        end_mem = df_out.memory_usage().sum() / 1024**2
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        # Format the numbers properly; slicing their string form misreports
        # large sizes (1234.5 -> "123") and mangles small/negative percentages.
        print(f'Mem. usage decreased to {end_mem:.2f}Mb:  {reduction:.1f}% reduction')
    return df_out

In [4]:
# Read the whole training CSV into memory and preview its first rows.
df_train = pd.read_csv(TRAIN_CSV)
df_train.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


## Keep only values _before_ the LB test set

In [5]:
# Remove the future: rows at or after the cutoff become the hold-out frame
# (df_test, incomplete rows dropped); everything earlier stays in df_train.
if REMOVE_LB_TEST_OVERLAPPING_DATA:
    lb_test_start = '2021-06-13 00:00:00'
    df_train['datetime'] = pd.to_datetime(df_train['timestamp'], unit='s')
    df_test = df_train.loc[df_train['datetime'] >= lb_test_start].dropna(how="any")
    df_train = df_train.loc[df_train['datetime'] < lb_test_start]
    if DEBUG:
        # Quick-iteration mode: keep only the first rows of the training data.
        df_train = df_train.iloc[:100000]


In [6]:
# Asset metadata ordered by Asset_ID; the Weight and Asset_Name columns are
# used further down when training and scoring the per-asset models.
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")
df_asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin
3,5,1.386294,EOS.IO
5,6,5.894403,Ethereum
4,7,2.079442,Ethereum Classic
11,8,1.098612,IOTA
6,9,2.397895,Litecoin


# Training

In [7]:
# !pip install --no-index --find-links ../input/talibbinary/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl talib-binary

## Utility functions to train a model for one asset

In [12]:
import talib
from sklearn.preprocessing import StandardScaler
sys.path.append('../src')
from utils import eval_w_corr

# Two new features from the competition tutorial
def upper_shadow(df):
    """Return ``High`` minus the larger of ``Close`` and ``Open``.

    ``df`` must provide 'High', 'Close' and 'Open'.
    """
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of ``Close`` and ``Open`` minus ``Low``.

    ``df`` must provide 'Low', 'Close' and 'Open'.
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

def moving_average(a, n):
    """Trailing ``n``-point moving average of ``a`` with the same length as ``a``.

    From index ``n`` onwards each value is (cumsum[i] - cumsum[i - n]) / n.
    The first ``n`` entries are the plain running sum divided by ``n``, so the
    warm-up entries before index ``n - 1`` are not true window means.
    """
    running = np.cumsum(a, dtype=float)
    windowed = running.copy()
    windowed[n:] = running[n:] - running[:-n]
    return windowed / n

# A utility function to build features from the original df.
# It also works on frames without a 'Target' column, so it can be reused at inference time.
def get_features(df):
    """Build the model feature frame from raw per-minute candle rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP',
        'Asset_ID' and 'timestamp'. 'Target' is carried through when present
        and simply omitted when absent. Rows are used in their given order for
        every lag and rolling computation.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected input columns (same rows and index as ``df``)
        plus the engineered features; 'timestamp' and the helper column
        'ln_Close' are removed. 'Asset_ID' (and 'Target' if given) are still
        included, so callers drop them before fitting or predicting.

    Notes
    -----
    * Lag and rolling features are computed per 'Asset_ID'; the ``*_rank``
      features rank assets against each other within one 'timestamp', so the
      frame passed in should contain all assets for those ranks to be
      meaningful.
    * 'hlco_ration' divides by ``High - Low`` and is therefore inf/NaN for
      rows where both are equal.
    * Ideas tried earlier and left disabled: per-timestamp rank of realized
      volatility, MACD, ADX.
    """
    base_cols = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Asset_ID', 'Target', 'timestamp']
    # 'Target' is optional so the same function can be applied to unlabeled rows.
    df_feat = df[[col for col in base_cols if col != 'Target' or col in df.columns]].copy()

    df_feat['upper_shadow'] = upper_shadow(df_feat)
    df_feat['lower_shadow'] = lower_shadow(df_feat)

    df_feat['ln_Close'] = np.log(df_feat['Close'])

    df_feat['high_low_div'] = df_feat['High'] / df_feat['Low']
    df_feat['open_close_div'] = df_feat['Open'] / df_feat['Close']

    df_feat['hlco_ration'] = (df_feat["Open"] - df_feat["Close"]) / (df_feat["High"] - df_feat["Low"])

    # Log returns over several horizons, lagging within each asset.
    ln_close_by_asset = df_feat.groupby('Asset_ID')['ln_Close']
    for lag in (1, 5, 15, 60):
        df_feat[f'log_return_{lag}'] = df_feat['ln_Close'] - ln_close_by_asset.shift(lag)

    close_by_asset = df_feat.groupby('Asset_ID')['Close']
    volume_by_asset = df_feat.groupby('Asset_ID')['Volume']
    return_by_asset = df_feat.groupby('Asset_ID')['log_return_1']

    for window in (5, 15, 60):
        df_feat[f'realized_volatility_{window}'] = return_by_asset.transform(
            lambda x: x.rolling(window).std(ddof=0))
        df_feat[f'moving_average_{window}'] = close_by_asset.transform(
            lambda x: moving_average(x.values, window))
        df_feat[f'moving_std_{window}'] = close_by_asset.transform(
            lambda x: x.rolling(window=window, min_periods=1).std())
        df_feat[f'volume_moving_average_{window}'] = volume_by_asset.transform(
            lambda x: moving_average(x.values, window))
        df_feat[f'RSI_{window}'] = close_by_asset.transform(
            lambda x: talib.RSI(x.values.astype(np.float64), window))

        # Price / volume relative to their own moving averages.
        df_feat[f'close_div_ma_{window}'] = df_feat['Close'] / df_feat[f'moving_average_{window}']
        df_feat[f'volume_div_ma_{window}'] = df_feat['Volume'] / df_feat[f'volume_moving_average_{window}']

    # Cross-sectional ranks: position of each asset among all assets at the same timestamp.
    by_timestamp = df_feat.groupby('timestamp')
    for col in ('close_div_ma_15', 'volume_div_ma_15', 'RSI_5', 'RSI_15', 'RSI_60'):
        df_feat[f'{col}_rank'] = by_timestamp[col].transform('rank')

    df_feat = df_feat.drop(['ln_Close', 'timestamp'], axis=1)

    return df_feat


def get_model_for_asset(df_train, df_test, asset_id):
    """Train one LightGBM regressor on the rows of a single asset.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Feature frames containing 'Asset_ID' and 'Target'. Only rows with
        ``Asset_ID == asset_id`` and without any missing value are used;
        ``df_test`` serves as the validation set.
    asset_id : int
        Asset to fit.

    Returns
    -------
    lightgbm.Booster
        Model after 50 boosting rounds.

    Notes
    -----
    Reads the notebook-level ``df_asset_details`` for the asset weights and
    logs to a nested mlflow run under experiment id 5.
    """
    non_feature_cols = ['Asset_ID', 'Target']
    asset_weights = dict(zip(df_asset_details['Asset_ID'], df_asset_details['Weight']))

    train_rows = df_train[df_train["Asset_ID"] == asset_id].dropna(how="any")
    valid_rows = df_test[df_test["Asset_ID"] == asset_id].dropna(how="any")

    lgbm_train = lgbm.Dataset(train_rows.drop(non_feature_cols, axis=1), train_rows["Target"])
    lgbm_valid = lgbm.Dataset(valid_rows.drop(non_feature_cols, axis=1), valid_rows["Target"])
    # Weights ride along as a plain attribute on the Dataset objects;
    # presumably eval_w_corr reads `add_w` — confirm in ../src/utils.
    lgbm_train.add_w = train_rows['Asset_ID'].map(asset_weights)
    lgbm_valid.add_w = valid_rows['Asset_ID'].map(asset_weights)

    params = {
        "objective": "regression",
        "metric": "l2",
        "boosting_type": "gbdt",
        # 'early_stopping_rounds': 20,
        'learning_rate': 0.05,
        'num_leaves': 8,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'bagging_freq': 1,
        # 'extra_trees': True,
        'seed': 55
    }

    with mlflow.start_run(experiment_id=5, nested=True):
        mlflow.log_params(params)
        model = lgbm.train(
            params=params,
            train_set=lgbm_train,
            valid_sets=[lgbm_train, lgbm_valid],
            num_boost_round=50,
            verbose_eval=10,
            feval=eval_w_corr,
            categorical_feature=[],
        )

    return model

## Loop over all assets

In [9]:
def weighted_correlation(a, b, weights):
    """Weighted Pearson correlation between ``a`` and ``b``.

    All three inputs are flattened first; ``weights`` gives the weight of each
    element pair.
    """
    w = np.ravel(weights)
    a = np.ravel(a)
    b = np.ravel(b)
    sum_w = np.sum(w)

    def _wmean(values):
        # Weighted mean of `values` under the shared weights.
        return np.sum(values * w) / sum_w

    mean_a = _wmean(a)
    mean_b = _wmean(b)
    var_a = _wmean(np.square(a - mean_a))
    var_b = _wmean(np.square(b - mean_b))
    cov = _wmean(a * b) - mean_a * mean_b

    return cov / np.sqrt(var_a * var_b)

In [13]:

# Replace both raw frames with their engineered-feature versions.
# NOTE: this cell is not re-runnable on its own — get_features() reads the
# 'timestamp' column, which is no longer present in its output.
df_train = get_features(df_train)
df_test = get_features(df_test)
print(df_train.columns)

Index(['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Asset_ID',
       'Target', 'upper_shadow', 'lower_shadow', 'high_low_div',
       'open_close_div', 'hlco_ration', 'log_return_1', 'log_return_5',
       'log_return_15', 'log_return_60', 'realized_volatility_5',
       'moving_average_5', 'moving_std_5', 'volume_moving_average_5', 'RSI_5',
       'close_div_ma_5', 'volume_div_ma_5', 'realized_volatility_15',
       'moving_average_15', 'moving_std_15', 'volume_moving_average_15',
       'RSI_15', 'close_div_ma_15', 'volume_div_ma_15',
       'realized_volatility_60', 'moving_average_60', 'moving_std_60',
       'volume_moving_average_60', 'RSI_60', 'close_div_ma_60',
       'volume_div_ma_60', 'close_div_ma_15_rank', 'volume_div_ma_15_rank',
       'RSI_5_rank', 'RSI_15_rank', 'RSI_60_rank'],
      dtype='object')


In [14]:
Xs = {}
ys = {}
models = {}

# Asset_ID -> weight lookup; it does not change between assets.
weight_map_dict = dict(zip(df_asset_details['Asset_ID'], df_asset_details['Weight']))

y_valids = []
mlflow.lightgbm.autolog()
mlflow.set_tracking_uri('../src/mlruns/')
with mlflow.start_run(experiment_id=5):
    for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
        print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")

        model = models[asset_id] = get_model_for_asset(df_train, df_test, asset_id)

        # Hold-out rows of this asset: features for prediction, plus a small
        # frame with target, weight and prediction for the overall score.
        asset_rows = df_test[df_test['Asset_ID'] == asset_id]
        X_valid = asset_rows.drop(['Target', 'Asset_ID'], axis=1)
        y_valid = asset_rows[['Target']].assign(
            Weight=weight_map_dict[asset_id],
            Pred=model.predict(X_valid),
        )
        y_valids.append(y_valid)

        model_path = os.path.join(RESULT_FOLDER, f'model{asset_id}.lgb')
        model.save_model(model_path, num_iteration=model.best_iteration)

    y_valids = pd.concat(y_valids)

    # Weighted correlation over the predictions of all assets together.
    metric = weighted_correlation(y_valids['Pred'], y_valids['Target'], y_valids['Weight'])
    print(f"overall, wcorr: {metric}")
    mlflow.log_metric('score', metric)

del X_valid, y_valid, y_valids

Training model for Binance Coin     (ID=0 )


New categorical_feature is []


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9510
[LightGBM] [Info] Number of data points in the train set: 1765347, number of used features: 42
[LightGBM] [Info] Start training from score 0.000034
[10]	training's l2: 3.1491e-05	training's eval_wcorr: 0.141782	valid_1's l2: 7.20706e-06	valid_1's eval_wcorr: 0.027538
[20]	training's l2: 3.13392e-05	training's eval_wcorr: 0.158304	valid_1's l2: 7.20437e-06	valid_1's eval_wcorr: 0.0316982
[30]	training's l2: 3.12063e-05	training's eval_wcorr: 0.171254	valid_1's l2: 7.21594e-06	valid_1's eval_wcorr: -0.00105995
[40]	training's l2: 3.1121e-05	training's eval_wcorr: 0.179378	valid_1's l2: 7.2351e-06	valid_1's eval_wcorr: -0.033238
[50]	training's l2: 3.10454e-05	training's eval_wcorr: 0.185779	valid_1's l2: 7.23501e-06	valid_1's eval_wcorr: -0.0256211
Training model for Bitcoin          (ID=1 )
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9510
[LightGBM] [Inf

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" does not continue
# into the submission section below — confirm before removing.
raise ValueError("stop!")

ValueError: stop!

# Predict & submit

References: [Detailed API Introduction](https://www.kaggle.com/sohier/detailed-api-introduction)

Something that helped me understand this iterator was adding a pdb checkpoint inside of the for loop:

```python
import pdb; pdb.set_trace()
```

See [Python Debugging With Pdb](https://realpython.com/python-debugging-pdb/) if you want to use it but don't know how.


In [None]:
import time

history = pd.DataFrame()
max_lookback = 60
# Rows of history kept between batches: max_lookback minutes for 14 assets
# plus a margin of 100 rows.
history_rows = max_lookback * 14 + 100

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

start = time.time()
for i, (df_test, df_pred) in enumerate(iter_test):
    history = pd.concat([history, df_test]).reset_index(drop=True)

    # get_features() as originally written selects a 'Target' column, which
    # live batches presumably do not carry; a NaN placeholder keeps the call
    # valid and is dropped again before predicting.
    if 'Target' in history.columns:
        history_input = history
    else:
        history_input = history.assign(Target=np.nan)

    # Build the features once per batch on the history of ALL assets, so the
    # per-timestamp *_rank features compare assets with each other as they did
    # in training (on a single-asset history every rank would be 1.0).
    features = get_features(history_input)
    # get_features() keeps the row order, so rows can be addressed by row_id
    # (assumed unique across batches — TODO confirm).
    features.index = history_input['row_id'].values
    current = features.loc[df_test['row_id'].values]

    # Show the first feature rows to get a feeling of what they look like
    if i == 0:
        display(current)

    batch_preds = []
    for asset_id, asset_features in current.groupby('Asset_ID'):
        # The models were fitted without 'Asset_ID' and 'Target'.
        x_test = asset_features.drop(columns=['Asset_ID', 'Target'], errors='ignore')
        y_pred = models[asset_id].predict(x_test)
        batch_preds.append(pd.Series(y_pred, index=x_test.index))

    if batch_preds:
        pred_by_row_id = pd.concat(batch_preds)
        # Rows without a prediction keep the value already present in df_pred.
        df_pred['Target'] = df_pred['row_id'].map(pred_by_row_id).fillna(df_pred['Target'])

    # Display the first prediction dataframe
    if i == 0:
        display(df_pred)
    history = history.sort_values(by='row_id')
    history = history.iloc[-history_rows:]

    # Send submissions
    env.predict(df_pred)
end = time.time()
end-start

# df_train and df_test overlap

In [None]:
# df_test = pd.concat(all_df_test)
# df_test['datetime'] = pd.to_datetime(df_test['timestamp'], unit='s')
# df_train['datetime'] = pd.to_datetime(df_train['timestamp'], unit='s')

In [None]:
# df_train['datetime'].max()

In [None]:
# df_test['datetime'].min()