In [12]:
import os
import random
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
import sys
sys.path.append('../input/g-research-crypto-forecasting')
import gresearch_crypto


TRAIN_CSV = '../input/g-research-crypto-forecasting/train.csv'
ASSET_DETAILS_CSV = '../input/g-research-crypto-forecasting/asset_details.csv'
RESULT_FOLDER = '../result/nb017'
if not os.path.isdir(RESULT_FOLDER):
    os.makedirs(RESULT_FOLDER)
SEED = 2021

REMOVE_LB_TEST_OVERLAPPING_DATA = True


In [13]:
def fix_all_seeds(seed):
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

fix_all_seeds(SEED)

In [14]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2 
    dfs = []
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    dfs.append(df[col].astype(np.int8))
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    dfs.append(df[col].astype(np.int16))
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    dfs.append(df[col].astype(np.int32))
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    dfs.append(df[col].astype(np.int64) ) 
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    dfs.append(df[col].astype(np.float16))
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    dfs.append(df[col].astype(np.float32))
                else:
                    dfs.append(df[col].astype(np.float64))
        else:
            dfs.append(df[col])
    
    df_out = pd.concat(dfs, axis=1)
    if verbose:
        end_mem = df_out.memory_usage().sum() / 1024**2
        num_reduction = str(100 * (start_mem - end_mem) / start_mem)
        print(f'Mem. usage decreased to {str(end_mem)[:3]}Mb:  {num_reduction[:2]}% reduction')
    return df_out

In [15]:
df_train = pd.read_csv(TRAIN_CSV)
df_train.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


## Keep only values _before_ the LB test set

In [16]:
# Remove the future
if REMOVE_LB_TEST_OVERLAPPING_DATA:
    df_train['datetime'] = pd.to_datetime(df_train['timestamp'], unit='s')
    df_test = df_train[df_train['datetime'] >= '2021-06-13 00:00:00']
    df_test = df_test.dropna(how="any")
    df_train = df_train[df_train['datetime'] < '2021-06-13 00:00:00']


In [17]:
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")
df_asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin
3,5,1.386294,EOS.IO
5,6,5.894403,Ethereum
4,7,2.079442,Ethereum Classic
11,8,1.098612,IOTA
6,9,2.397895,Litecoin


# Training

In [18]:
# !pip install --no-index --find-links ../input/talibbinary/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl talib-binary

## Utility functions to train a model for one asset

In [19]:
import talib
from sklearn.preprocessing import StandardScaler

# Two new features from the competition tutorial
def upper_shadow(df):
    return df['High'] - np.maximum(df['Close'], df['Open'])

def lower_shadow(df):
    return np.minimum(df['Close'], df['Open']) - df['Low']

def moving_average(a, n):
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret / n

# A utility function to build features from the original df
# It works for rows to, so we can reutilize it.
def get_features(df):
    df_feat = df[['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Asset_ID', 'Target']].copy()
    df_feat['upper_shadow'] = upper_shadow(df_feat)
    df_feat['lower_shadow'] = lower_shadow(df_feat)
    
    df_feat['ln_Close'] = np.log(df['Close'])
    
    asset_group_close = df_feat.groupby('Asset_ID')['Close']
    
    df_feat['log_return_1'] = df_feat['ln_Close'] - df_feat.groupby('Asset_ID')['ln_Close'].shift(1)
    df_feat['log_return_5'] = df_feat['ln_Close'] - df_feat.groupby('Asset_ID')['ln_Close'].shift(5)
    df_feat['log_return_15'] = df_feat['ln_Close'] - df_feat.groupby('Asset_ID')['ln_Close'].shift(15)
    df_feat['log_return_60'] = df_feat['ln_Close'] - df_feat.groupby('Asset_ID')['ln_Close'].shift(60)
    
    for i in [5, 15, 60]:
        df_feat['realized_volatility_'+str(i)] = df_feat.groupby('Asset_ID').log_return_1.transform(lambda x: x.rolling(i).std(ddof=0))
#         df_data['RV_'+str(i)+'_rank'] = df_data.groupby('timestamp')['realized_volatility_'+str(i)].transform('rank')
    
        df_feat['moving_average_'+str(i)] = asset_group_close.transform(lambda x: moving_average(x.values, i))
        df_feat['moving_std_'+str(i)] = asset_group_close.transform(lambda x: x.rolling(window=i, min_periods=1).std())
        df_feat['volume_moving_average_'+str(i)] = df_feat.groupby('Asset_ID').Volume.transform(lambda x: moving_average(x.values, i))
        df_feat['RSI_'+str(i)] = asset_group_close.transform(lambda x: talib.RSI(x.values.astype(np.float64), i))
        
        df_feat['close_div_ma_'+str(i)] = df_feat['Close'] / df_feat['moving_average_'+str(i)]
        df_feat['volume_div_ma_'+str(i)] = df_feat['Volume'] / df_feat['volume_moving_average_'+str(i)]

    # df_feat['MACD'], df_feat['MACD_signal'], df_feat['MACD_hist'] = talib.MACD(df_feat.Close.values, fastperiod=12, slowperiod=26, signalperiod=9)
    # df_feat['adx'] = talib.ADX(df_feat.High, df_feat.Low, df_feat.Close, timeperiod=14)

    df_feat = df_feat.drop(['ln_Close'], axis=1)
    
    return df_feat


def get_Xy_and_model_for_asset(df, asset_id):
    df = df[df["Asset_ID"] == asset_id]
    
    df = df.dropna(how="any")
    
    X = df.drop(['Asset_ID', 'Target'], axis=1)
    y = df["Target"]
    
    # TODO: Try different models here!
#     params = {
#         "metric": "rmse", 
#         "boosting_type": "gbdt",
#         'early_stopping_rounds': 20,
#         'learning_rate': 0.05,
#         'lambda_l1': 5,
#         'lambda_l2': 5,
#         'max_depth': 3,
#         'num_leaves': 4,
#         'feature_fraction': 0.5,
#         'bagging_fraction': 0.5,
#         'extra_trees': True,
#     }
    model = LGBMRegressor(
        n_estimators=50, 
        boosting_type='gbdt',
        metric='rmse',
#         early_stopping_rounds=20,
        learning_rate=0.05,
        num_leaves=8,
        colsample_bytree=0.5,
        subsample=0.5,
        subsample_freq=1,
        seed=55
    )
    model.fit(X, y)
    return X, y, model

## Loop over all assets

In [20]:
def weighted_correlation(a, b, weights):
  w = np.ravel(weights)
  a = np.ravel(a)
  b = np.ravel(b)

  sum_w = np.sum(w)
  mean_a = np.sum(a * w) / sum_w
  mean_b = np.sum(b * w) / sum_w
  var_a = np.sum(w * np.square(a - mean_a)) / sum_w
  var_b = np.sum(w * np.square(b - mean_b)) / sum_w

  cov = np.sum((a * b * w)) / np.sum(w) - mean_a * mean_b
  corr = cov / np.sqrt(var_a * var_b)

  return corr

In [21]:
Xs = {}
ys = {}
models = {}

y_valids = []
df_train = get_features(df_train)
df_test = get_features(df_test)
print(df_train.head())

for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    
    X, y, model = get_Xy_and_model_for_asset(df_train, asset_id)
    Xs[asset_id], ys[asset_id], models[asset_id] = X, y, model
    del X, y
    
    X_valid = df_test[df_test['Asset_ID'] == asset_id].drop(['Target', 'Asset_ID'], axis=1)
    y_valid = df_test.loc[df_test['Asset_ID'] == asset_id, ['Target']]
    weight_map_dict = dict(zip(df_asset_details['Asset_ID'], df_asset_details['Weight']))
    y_valid['Weight'] = weight_map_dict[asset_id]
    y_valid['Pred'] = model.predict(X_valid)
    
    metric = weighted_correlation(y_valid['Pred'], y_valid['Target'], y_valid['Weight'])
    print(f"Asset_ID={asset_id}, wcorr: {metric}")
    y_valids.append(y_valid)
y_valids = pd.concat(y_valids)

metric = weighted_correlation(y_valids['Pred'], y_valids['Target'], y_valids['Weight'])
print(f"overall, wcorr: {metric}")

del X_valid, y_valid, y_valids

   Count        Open        High         Low       Close       Volume  \
0   40.0   2376.5800   2399.5000   2357.1400   2374.5900    19.233005   
1    5.0      8.5300      8.5300      8.5300      8.5300    78.380000   
2  229.0  13835.1940  14013.8000  13666.1100  13850.1760    31.550062   
3   32.0      7.6596      7.6596      7.6567      7.6576  6626.713370   
4    5.0     25.9200     25.9200     25.8740     25.8770   121.087310   

           VWAP  Asset_ID    Target  upper_shadow  ...  RSI_15  \
0   2373.116392         2 -0.004218        22.920  ...     NaN   
1      8.530000         0 -0.014399         0.000  ...     NaN   
2  13827.062093         1 -0.014643       163.624  ...     NaN   
3      7.657713         5 -0.013922         0.000  ...     NaN   
4     25.891363         7 -0.008264         0.000  ...     NaN   

   close_div_ma_15  volume_div_ma_15  realized_volatility_60  \
0             15.0              15.0                     NaN   
1             15.0              15.0

0

In [22]:
raise ValueError("stop!")

ValueError: stop!

# Predict & submit

References: [Detailed API Introduction](https://www.kaggle.com/sohier/detailed-api-introduction)

Something that helped me understand this iterator was adding a pdb checkpoint inside of the for loop:

```python
import pdb; pdb.set_trace()
```

See [Python Debugging With Pdb](https://realpython.com/python-debugging-pdb/) if you want to use it and you don't know how to.


In [None]:
import time
history = pd.DataFrame()
max_lookback = 60

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

start = time.time()
for i, (df_test, df_pred) in enumerate(iter_test):
    history = pd.concat([history, df_test]).reset_index(drop=True)

    for j , row in df_test.iterrows():
        model = models[row['Asset_ID']]
        x_test = get_features(history[history['Asset_ID'] == row['Asset_ID']]).reset_index(drop=True)
        y_pred = model.predict([x_test.iloc[-1]])[0]
        
        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred
        
        # Print just one sample row to get a feeling of what it looks like
        if i == 0 and j == 0:
            display(x_test)

    # Display the first prediction dataframe
    if i == 0:
        display(df_pred)
    history = history.sort_values(by='row_id')
    history = history.iloc[-(max_lookback*14+100):]
    
    # Send submissions
    env.predict(df_pred)
end = time.time()
end-start

# df_train and df_test overlap

In [None]:
# df_test = pd.concat(all_df_test)
# df_test['datetime'] = pd.to_datetime(df_test['timestamp'], unit='s')
# df_train['datetime'] = pd.to_datetime(df_train['timestamp'], unit='s')

In [None]:
# df_train['datetime'].max()

In [None]:
# df_test['datetime'].min()