## Reference:

Kaggle \
[G-Research Crypto Forecasting - baseline & FE](https://www.kaggle.com/vbmokin/g-research-crypto-forecasting-baseline-fe)\
[G-Research- Starter LGBM Pipeline(copied)](https://www.kaggle.com/yliu27/g-research-starter-lgbm-pipeline-copied)\
[[GResearch] Simple LGB Starter](https://www.kaggle.com/code1110/gresearch-simple-lgb-starter)\
[LightGBM with Sklearn pipelines](https://www.kaggle.com/paweljankiewicz/lightgbm-with-sklearn-pipelines)\
[Parameter grid search LGBM with scikit-learn](https://www.kaggle.com/bitit1994/parameter-grid-search-lgbm-with-scikit-learn)\
[Pipelines + GridSearch = Awesome ML pipelines](https://www.kaggle.com/evanmiller/pipelines-gridsearch-awesome-ml-pipelines)\
[resampling + gridsearch + lightgbm - magic](https://www.kaggle.com/mommermi/resampling-gridsearch-lightgbm-magic)\
[Crypto Forecasting - lgbm feval+feature importance](https://www.kaggle.com/lucasmorin/crypto-forecasting-lgbm-feval-feature-importance)


External \
[You Are Missing Out on LightGBM. It Crushes XGBoost in Every Aspect](https://towardsdatascience.com/how-to-beat-the-heck-out-of-xgboost-with-lightgbm-comprehensive-tutorial-5eba52195997)\
[Machine Learning Tutorial Python - 16: Hyper parameter Tuning (GridSearchCV)](https://www.youtube.com/watch?v=HdlDYng8g9s&t=16s)\
[Pipelines & Custom Transformers in scikit-learn: The step-by-step guide (with Python code)](https://www.youtube.com/watch?v=mOYJCR0IDk8)\
[Scikit-Learn Pipelines with Custom Transformer — A Step by Step Guide.](https://medium.com/analytics-vidhya/scikit-learn-pipelines-with-custom-transformer-a-step-by-step-guide-9b9b886fd2cc)\
[Get feature importance from GridSearchCV](https://stackoverflow.com/questions/48377296/get-feature-importance-from-gridsearchcv)\
[How to get the selected features in GridSearchCV in sklearn in python](https://stackoverflow.com/questions/55650782/how-to-get-the-selected-features-in-gridsearchcv-in-sklearn-in-python)\
[Feature Importance with OneHotEncoder and Pipelines in Scikit-learn](https://katstam.com/regression-feature_importance/)\
[How to extract feature importances from an Sklearn pipeline](https://stackoverflow.com/questions/38787612/how-to-extract-feature-importances-from-an-sklearn-pipeline)\
[Sklearn Pipeline: Get feature names after OneHotEncode In ColumnTransformer](https://stackoverflow.com/questions/54646709/sklearn-pipeline-get-feature-names-after-onehotencode-in-columntransformer)\
[eli5 example](https://www.kaggle.com/lopuhin/eli5-example)

# Environment Setup

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory tree and print the full path of every file
# found, so the available dataset files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/g-research-crypto-forecasting/example_sample_submission.csv
/kaggle/input/g-research-crypto-forecasting/asset_details.csv
/kaggle/input/g-research-crypto-forecasting/example_test.csv
/kaggle/input/g-research-crypto-forecasting/train.csv
/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv
/kaggle/input/g-research-crypto-forecasting/gresearch_crypto/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/g-research-crypto-forecasting/gresearch_crypto/__init__.py


In [2]:
import sys
# Put the competition dataset directory first on the import path; the
# `gresearch_crypto` package lives inside it (see the file listing above).
sys.path.insert(0, '/kaggle/input/g-research-crypto-forecasting')
# The path insertion above has to run before the competition API is imported.

import gresearch_crypto
import time
from datetime import datetime

import warnings
# Suppress every warning for the rest of the session.
warnings.simplefilter('ignore')

# Location and file names of the competition data.
dir_in = '/kaggle/input/g-research-crypto-forecasting/'
file_train = 'train.csv'
file_asset_details = 'asset_details.csv'

# Load the training rows and the per-asset metadata table into DataFrames.
df_train = pd.read_csv(os.path.join(dir_in, file_train))
df_asset_details = pd.read_csv(os.path.join(dir_in, file_asset_details))

In [3]:
import random

def fix_all_seeds(seed):
    """Seed NumPy's global RNG and the ``random`` module, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed passed to both generators; its string form is
            written to the ``PYTHONHASHSEED`` environment variable.

    NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so setting
    it here presumably only affects processes launched afterwards -- confirm
    whether that is the intent.
    """
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


# Make the notebook run reproducible.
fix_all_seeds(2021)

# Feature Engineering

In [4]:
def get_features(df):
    """Return ``df`` re-indexed by ``timestamp`` with price/volume ratio columns added.

    The input frame is not modified; ``set_index`` produces a new frame and
    the new columns are written to that one.

    Args:
        df: DataFrame with the columns ``timestamp``, ``Open``, ``High``,
            ``Low``, ``Close``, ``Volume`` and ``Count``.

    Returns:
        A DataFrame indexed by ``timestamp`` holding the remaining input
        columns followed by ``upper_shadow``, ``lower_shadow``,
        ``open2close``, ``high2low``, ``high2mean``, ``low2mean``,
        ``high2median``, ``low2median`` and ``volume2count``.
    """
    frame = df.set_index('timestamp')

    high = frame['High']
    low = frame['Low']
    body = frame[['Close', 'Open']]

    # Row-wise mean and median of the four candle prices.
    ohlc = frame[['Open', 'High', 'Low', 'Close']]
    mean_price = ohlc.mean(axis=1)
    median_price = ohlc.median(axis=1)

    # Insertion order of this dict fixes the order of the appended columns.
    engineered = {
        'upper_shadow': high / body.max(axis=1),
        'lower_shadow': body.min(axis=1) / low,
        'open2close': frame['Close'] / frame['Open'],
        'high2low': high / low,
        'high2mean': high / mean_price,
        'low2mean': low / mean_price,
        'high2median': high / median_price,
        'low2median': low / median_price,
        # +1 keeps the denominator non-zero when Count is 0.
        'volume2count': frame['Volume'] / (frame['Count'] + 1),
    }
    for column, values in engineered.items():
        frame[column] = values

    return frame

In [5]:
def get_asset_data(df_train, asset_id):
    """Split the rows of one asset into a feature frame and a target series.

    Infinite values are turned into NaN in every column; missing targets are
    then replaced by 0, while NaNs in the feature columns are left in place.
    ``df_train`` itself is not modified.

    Args:
        df_train: DataFrame containing at least ``Asset_ID`` and ``Target``.
        asset_id: Value of ``Asset_ID`` whose rows are selected.

    Returns:
        Tuple ``(X, y)``: ``X`` is the selected rows without the ``Target``
        column and ``y`` is the ``Target`` column with NaN filled by 0.
    """
    is_asset = df_train["Asset_ID"] == asset_id
    asset_rows = df_train[is_asset].replace([np.inf, -np.inf], np.nan)

    target = asset_rows['Target'].fillna(0)
    features = asset_rows.drop('Target', axis=1)
    return features, target

In [6]:
def get_corr(y_pred, y):
    """Return the Pearson correlation coefficient between ``y_pred`` and ``y``.

    Args:
        y_pred: 1-D sequence of predicted values.
        y: 1-D sequence of reference values, same length as ``y_pred``.

    Returns:
        The off-diagonal entry of the 2x2 matrix produced by ``np.corrcoef``.
    """
    corr_matrix = np.corrcoef(y_pred, y)
    return corr_matrix[0, 1]

# Pipeline

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict, GridSearchCV
from lightgbm import LGBMRegressor
# from category_encoders import OneHotEncoder

# customize class for feature transformation
from sklearn.base import BaseEstimator, TransformerMixin


class GetFeatureTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that runs ``get_features`` on its input."""

    def __init__(self):
        """Create the transformer; it has no hyper-parameters."""

    def fit(self, X, y=None):
        """Learn nothing from ``X``/``y`` and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``get_features(X)``; ``y`` is accepted but ignored."""
        return get_features(X)

In [8]:
# Column routed to the one-hot encoder in the preprocessing step.
cat_cols = ['Asset_ID']

# Columns routed to the scaler: the raw fields first, then the ratio columns
# produced by get_features.
num_cols = [
    'Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP',
    'upper_shadow', 'lower_shadow', 'open2close', 'high2low',
    'high2mean', 'low2mean', 'high2median', 'low2median',
    'volume2count',
]

[hyper parameter optimization - suggested parameter grid](https://github.com/Microsoft/LightGBM/issues/695)

>For heavily unbalanced datasets such as 1:10000:
>
>max_bin: keep it only for memory pressure, not to tune (otherwise overfitting)\
>learning rate: keep it only for training speed, not to tune (otherwise overfitting)\
>n_estimators: must be infinite (like 9999999) and use early stopping to auto-tune (otherwise overfitting)\
>num_leaves: [7, 4095]\
>max_depth: [2, 63] and infinite (I personally saw metric performance increases with such 63 depth with small number of leaves on sparse unbalanced datasets)\
>scale_pos_weight: [1, 10000] (if over 10000, something might be wrong because I never saw it that good after 5000)\
>min_child_weight: [0.01, (sample size / 1000)] if you are using logloss (think about the hessian possible value range before putting "sample size / 1000", it is dataset-dependent and loss-dependent)\
>subsample: [0.4, 1]\
>bagging_freq: only 1, keep as is (otherwise overfitting)\
>colsample_bytree: [0.4, 1]\
>is_unbalance: false (make your own weighting with scale_pos_weight)\
>USE A CUSTOM METRIC (to reflect reality without weighting, otherwise you have weights inside your metric with premade metrics like xgboost)\
>Never tune these parameters unless you have an explicit requirement to tune them:
>
>Learning rate (lower means longer to train but more accurate, higher means smaller to train but less accurate)\
>Number of boosting iterations (automatically tuned with early stopping and learning rate)\
>Maximum number of bins (RAM dependent)


In [9]:
# params = {
#     'n_estimators': 1000,
#     'objective': 'regression',
#     'metric': 'rmse',
#     'boosting_type': 'gbdt',
#     'max_depth': -1,
#     'num_leaves': 50,
#     'max_bin': 15,
#     'learning_rate': 0.01,
#     'subsample': 0.72,
#     'subsample_freq': 4,
#     'colsample_bytree': 0.4,    #'feature_fraction': 0.4,    
#     'reg_alpha': 1,    #'lambda_l1': 1,
#     'reg_lambda': 1,   #'lambda_l2': 1,   
#     'verbose': -1
# }

# Search grid for GridSearchCV. The `model__` prefix routes each entry to the
# pipeline step named 'model'. Every list currently holds a single candidate;
# the trailing comments record alternatives that were tried or considered.
params = dict(
    model__n_estimators=[100],        # [100, 150], [100, 1000]
    model__objective=['regression'],
    model__metric=['rmse'],
    model__boosting_type=['dart'],    # ['gbdt', 'dart']
    model__max_depth=[10],            # [10, 20]
    model__num_leaves=[20],           # [20, 50, 100]
    model__max_bin=[15],
    model__learning_rate=[0.01],      # [0.01, 0.05]
    model__subsample=[0.72],
    model__subsample_freq=[4],
    model__colsample_bytree=[0.4],
    model__reg_alpha=[1],             # [1, 1.2]; alias: lambda_l1
    model__reg_lambda=[1],            # [1, 1.2]; alias: lambda_l2
    model__verbose=[-1],
)

In [10]:
# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical one.
column_prep = ColumnTransformer(transformers=[
    ('tf_num', StandardScaler(), num_cols),
    ('tf_cat', OneHotEncoder(), cat_cols),
])

# Full modelling pipeline: feature engineering -> preprocessing -> LightGBM.
pipe_lgbm = Pipeline(steps=[
    ('get_feature', GetFeatureTransformer()),
    ('transform_columns', column_prep),
    ('model', LGBMRegressor()),
])

# Training

In [11]:
import time     # timer
import eli5     # explain feature importance
from IPython.display import display

In [12]:
# Grid search over `params` for the LightGBM pipeline: 5 cross-validation
# folds, candidates ranked by negative mean squared error.
grid = GridSearchCV(
    estimator=pipe_lgbm,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [13]:
from sklearn.base import clone

# Per-asset results, each dictionary keyed by Asset_ID.
X_train = {}
y_train = {}
model_lgbm = {}
y_insmpl_pred = {}
score_insmpl = {}

# To debug on a single asset, iterate over zip([10], ['Maker']) instead.
for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
    start_ts = time.time()
    print("-"*50)
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})...")

    X, y = get_asset_data(df_train, asset_id)

    # Fit an independent copy of the search for every asset. `grid.fit`
    # returns `grid` itself, so fitting the shared object would make every
    # entry of `model_lgbm` refer to the same search and, after the loop,
    # all assets would predict with the model fitted on the last asset.
    model = clone(grid).fit(X, y)

    print("Best parameters:")
    print("\n".join("{}: {}".format(k, v) for k, v in model.best_params_.items()))
    print(f"Best score: {model.best_score_:.4e}")

    # NOTE: the predictions are made on the same rows the search was fitted
    # on, so the correlation printed below is an in-sample score even though
    # the label says "Cross validation".
    y_pred = model.predict(X)
    score = get_corr(y_pred, y)
    print(f"Cross validation test score for {asset_name}: {score:.4f}")

    X_train[asset_id] = X
    y_train[asset_id] = y
    model_lgbm[asset_id] = model
    y_insmpl_pred[asset_id] = y_pred
    score_insmpl[asset_id] = score

    # Rebuild the feature names in the order the ColumnTransformer emits them
    # (numeric columns first, then the one-hot columns) for the eli5 report.
    best_pipeline = model.best_estimator_
    encoder = best_pipeline.named_steps['transform_columns'].named_transformers_['tf_cat']
    # Newer scikit-learn releases dropped `get_feature_names`; fall back to
    # `get_feature_names_out` there and keep the old call where it exists.
    get_names = getattr(encoder, 'get_feature_names', None) or encoder.get_feature_names_out
    cat_col_trf = list(get_names())
    features = num_cols + cat_col_trf
    display(eli5.explain_weights(best_pipeline.named_steps['model'], top=5, feature_names=features))

    end_ts = time.time()
    print(f"Time consumption: {(end_ts-start_ts)/60:.2f}min")

--------------------------------------------------
Training model for Bitcoin Cash     (ID=2 )...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -4.1858e-05
Cross validation test score for Bitcoin Cash: 0.0704


Weight,Feature
0.1839,Count
0.0926,High
0.0917,Close
0.0917,Volume
0.0862,VWAP
… 12 more …,… 12 more …


Time consumption: 2.55min
--------------------------------------------------
Training model for Binance Coin     (ID=0 )...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -2.9517e-05
Cross validation test score for Binance Coin: 0.0445


Weight,Feature
0.1701,volume2count
0.1515,High
0.1251,Count
0.0945,Close
0.0863,Volume
… 12 more …,… 12 more …


Time consumption: 2.46min
--------------------------------------------------
Training model for Bitcoin          (ID=1 )...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -4.0540e-06
Cross validation test score for Bitcoin: 0.0449


Weight,Feature
0.1401,Count
0.1333,volume2count
0.1289,open2close
0.1211,High
0.0748,Close
… 12 more …,… 12 more …


Time consumption: 2.82min
--------------------------------------------------
Training model for EOS.IO           (ID=5 )...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -2.3271e-05
Cross validation test score for EOS.IO: 0.0464


Weight,Feature
0.1576,high2low
0.0934,Count
0.0855,Close
0.0855,High
0.0705,volume2count
… 12 more …,… 12 more …


Time consumption: 2.77min
--------------------------------------------------
Training model for Ethereum Classic (ID=7 )...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -8.0145e-05
Cross validation test score for Ethereum Classic: 0.0549


Weight,Feature
0.1478,Count
0.1394,High
0.1064,VWAP
0.0990,Close
0.0871,volume2count
… 12 more …,… 12 more …


Time consumption: 2.29min
--------------------------------------------------
Training model for Ethereum         (ID=6 )...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -6.2255e-06
Cross validation test score for Ethereum: 0.0450


Weight,Feature
0.2457,Count
0.1321,open2close
0.1248,volume2count
0.1221,Volume
0.0662,high2low
… 12 more …,… 12 more …


Time consumption: 2.82min
--------------------------------------------------
Training model for Litecoin         (ID=9 )...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -1.2621e-05
Cross validation test score for Litecoin: 0.0397


Weight,Feature
0.1720,Count
0.1058,volume2count
0.0988,High
0.0802,Close
0.0786,high2low
… 12 more …,… 12 more …


Time consumption: 2.86min
--------------------------------------------------
Training model for Monero           (ID=11)...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -3.8037e-05
Cross validation test score for Monero: 0.0546


Weight,Feature
0.1532,Close
0.1114,Count
0.1068,VWAP
0.1062,High
0.0938,volume2count
… 12 more …,… 12 more …


Time consumption: 2.21min
--------------------------------------------------
Training model for TRON             (ID=13)...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -2.3956e-05
Cross validation test score for TRON: 0.0383


Weight,Feature
0.1667,high2low
0.1076,high2median
0.1054,High
0.0891,lower_shadow
0.0891,high2mean
… 12 more …,… 12 more …


Time consumption: 2.61min
--------------------------------------------------
Training model for Stellar          (ID=12)...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -2.6211e-05
Cross validation test score for Stellar: 0.0570


Weight,Feature
0.1537,Close
0.1423,Count
0.1056,VWAP
0.0703,high2low
0.0678,lower_shadow
… 12 more …,… 12 more …


Time consumption: 2.35min
--------------------------------------------------
Training model for Cardano          (ID=3 )...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -2.0128e-05
Cross validation test score for Cardano: 0.0336


Weight,Feature
0.1495,Count
0.1422,Close
0.1083,Volume
0.1068,VWAP
0.0933,volume2count
… 12 more …,… 12 more …


Time consumption: 2.48min
--------------------------------------------------
Training model for IOTA             (ID=8 )...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -5.6516e-05
Cross validation test score for IOTA: 0.0466


Weight,Feature
0.1370,High
0.1290,Count
0.1042,Close
0.0942,VWAP
0.0820,high2low
… 12 more …,… 12 more …


Time consumption: 2.02min
--------------------------------------------------
Training model for Maker            (ID=10)...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -3.3822e-05
Cross validation test score for Maker: 0.0472


Weight,Feature
0.1776,High
0.1177,Close
0.1013,Count
0.0776,volume2count
0.0755,Volume
… 12 more …,… 12 more …


Time consumption: 1.00min
--------------------------------------------------
Training model for Dogecoin         (ID=4 )...
Best parameters:
model__boosting_type: dart
model__colsample_bytree: 0.4
model__learning_rate: 0.01
model__max_bin: 15
model__max_depth: 10
model__metric: rmse
model__n_estimators: 100
model__num_leaves: 20
model__objective: regression
model__reg_alpha: 1
model__reg_lambda: 1
model__subsample: 0.72
model__subsample_freq: 4
model__verbose: -1
Best score: -5.7712e-05
Cross validation test score for Dogecoin: 0.0676


Weight,Feature
0.3527,volume2count
0.1653,Close
0.0972,Count
0.0680,VWAP
0.0649,High
… 12 more …,… 12 more …


Time consumption: 1.36min


In [None]:
# import traceback

# df_test_all = {}
# df_pred_all = {}

# env = gresearch_crypto.make_env()
# iter_test = env.iter_test()

In [None]:
# for i, (df_test, df_pred) in enumerate(iter_test):
    
#     # make predictions
#     for j, row in df_test.iterrows():
#         asset_id = row['Asset_ID']
#         try:
#             y_pred = model_lgbm[asset_id].predict(row.to_frame().T)[0]
#         except:
#             y_pred = 0.0
#             traceback.print_exc()
#         df_pred.loc[df_pred['row_id']==row['row_id'], 'Target'] = y_pred
        
#     # store test dataframes
#     df_test_all[i] = df_test
#     df_pred_all[i] = df_pred
    
#     # submit predictions
#     env.predict(df_pred)

In [None]:
# file_smpl_subm = 'example_sample_submission.csv'
# df_smpl_subm = pd.read_csv(os.path.join(dir_in, file_smpl_subm))

In [None]:
# df_smpl_subm.head()

In [None]:
# df_subm_wgid = pd.DataFrame(columns = df_smpl_subm.columns)
# df_subm = pd.DataFrame(columns = ['row_id', 'Target'])

In [None]:
# for group_num, df_pred in df_pred_all.items():
#     df = df_pred.copy()
    
#     # without group_num
#     df_subm = df_subm.append(df)
    
#     # with group_num
#     df['group_num'] = group_num
#     df_subm_wgid = df_subm_wgid.append(df)

In [None]:
# df_subm_wgid.head()

In [None]:
# df_subm.to_csv('submission.csv', index=False)
# df_subm_wgid.to_csv('submission_with_group_num.csv', index=False)