# Overview
This repository provides a code implementation for training Gradient Boosting Models (GBMs), a popular machine learning technique for both classification and regression tasks. GBMs are ensemble methods that combine the predictions of several base estimators to improve accuracy and generalization performance.



# Inference
[[JSR-TMDF] Gradient Boosting Models (Inference)](https://www.kaggle.com/code/takaito/jsr-tmdf-gradient-boosting-models-inference)

# Tips
## 1. CV Strategy
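The data is loaded in chronological order, so setting `kfold = KFold(n_splits=CFG.N_SPLIT, shuffle=False)` splits it into contiguous blocks of time instead of randomly sampled rows. Note that this is not a forward-chaining (walk-forward) split: for every fold except the last, the training rows also include data that comes after the validation block.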

## 2. Feature Importance
For LightGBM, the feature importance of each fold is saved to a CSV file. This lets you check which features are effective and gives you hints for removing unnecessary features or creating new ones, so please make use of it.

To be updated!! (I plan to add more hints if the number of votes increases.)

In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import polars as pl
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm
import glob

import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier

In [2]:
!mkdir oof
!mkdir models

mkdir: cannot create directory ‘oof’: File exists
mkdir: cannot create directory ‘models’: File exists


In [3]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Central configuration for the gradient-boosting training run.

    Every setting is a class attribute and is read as ``CFG.<name>``; the code
    shown here never instantiates the class.
    """

    # --- Run identification (VER and SEED are embedded in output file names) ---
    VER = 2
    AUTHOR = 'szukiyu'
    COMPETITION = 'jane-street-real-time-market-data-forecasting'

    # --- Input / output locations ---
    DATA_PATH = Path('/home/ec2-user/jane_street/data')
    OOF_DATA_PATH = Path('./oof')          # out-of-fold prediction CSVs
    MODEL_DATA_PATH = Path('./models')     # pickled models and importance CSVs

    # --- Experiment setup ---
    METHOD_LIST = ['lightgbm', 'xgboost', 'catboost']  # libraries trained in turn
    USE_GPU = torch.cuda.is_available()    # not read by the code shown here
    SEED = 42
    N_SPLIT = 5                            # number of KFold splits
    target_col = 'responder_6'             # defined once; reused in all_cols below
    metric = 'r2_score'                    # label used when printing the CV score
    metric_maximize_flag = True

    # --- Boosting schedule shared by the three libraries ---
    num_boost_round = 2500
    early_stopping_round = 10
    verbose = 50                           # evaluation logging period (rounds)

    regression_lgb_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'seed': SEED,
    }
    regression_xgb_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'learning_rate': 0.05,
        'max_depth': 7,
        'random_state': SEED,
    }
    regression_cat_params = {
        'loss_function': 'RMSE',
        'learning_rate': 0.05,
        'iterations': num_boost_round,
        'depth': 7,
        'random_seed': SEED,
    }

    # --- Column selection ---
    # The shared name lists are built once so that feature_cols and all_cols
    # cannot drift apart when one of them is edited.
    _FEATURE_COLS = [f"feature_{idx:02d}" for idx in range(79)]
    _LAG_COLS = [f"responder_{idx}_lag_1" for idx in range(9)]
    # Columns fed to the models.
    feature_cols = ["symbol_id", "time_id"] + _FEATURE_COLS + _LAG_COLS
    # Columns read from the parquet files (model inputs + weight + target).
    all_cols = ["date_id", "time_id", "symbol_id", "weight"] + _FEATURE_COLS + _LAG_COLS + [target_col]

    # Directories searched for parquet files by read_train_data().
    data_paths = [
        DATA_PATH / "training.parquet/",
    ]

In [4]:
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed Python's and NumPy's global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and written
            (as a string) to the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both global generators receive the same seed value.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
# Seed the global RNGs once, up front, with the configured seed.
seed_everything(CFG.SEED)

In [5]:
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Train a LightGBM booster on one fold and predict its validation rows.

    Args:
        x_train: Feature rows used for fitting.
        y_train: Targets aligned with ``x_train``.
        x_valid: Feature rows of the validation fold.
        y_valid: Targets aligned with ``x_valid``.

    Returns:
        Tuple ``(model, valid_pred)``: the trained booster and its predictions
        for ``x_valid``.
    """
    train_set = lgb.Dataset(x_train, y_train)
    valid_set = lgb.Dataset(x_valid, y_valid)

    # Early stopping plus periodic metric logging, both driven by CFG.
    training_callbacks = [
        lgb.early_stopping(stopping_rounds=CFG.early_stopping_round, verbose=CFG.verbose),
        lgb.log_evaluation(CFG.verbose),
    ]
    booster = lgb.train(
        params=CFG.regression_lgb_params,
        train_set=train_set,
        num_boost_round=CFG.num_boost_round,
        # The training set is listed too, so its metric is logged next to the validation one.
        valid_sets=[train_set, valid_set],
        callbacks=training_callbacks,
    )
    # Predict validation
    return booster, booster.predict(x_valid)
def xgboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Train an XGBoost booster on one fold and predict its validation rows.

    Args:
        x_train: Feature rows used for fitting.
        y_train: Targets aligned with ``x_train``.
        x_valid: Feature rows of the validation fold.
        y_valid: Targets aligned with ``x_valid``.

    Returns:
        Tuple ``(model, valid_pred)``: the trained booster and its predictions
        for ``x_valid``, made with the trees up to the best early-stopping
        iteration when that iteration is recorded on the booster.
    """
    xgb_train = xgb.DMatrix(data=x_train, label=y_train)
    xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
    model = xgb.train(
                CFG.regression_xgb_params,
                dtrain = xgb_train,
                num_boost_round = CFG.num_boost_round,
                evals = [(xgb_train, 'train'), (xgb_valid, 'eval')],
                early_stopping_rounds = CFG.early_stopping_round,
                verbose_eval = CFG.verbose
            )
    # xgb.train returns the booster from the last round, and the native
    # Booster.predict uses every tree by default. Limit prediction to the best
    # iteration so the rounds trained after it do not affect the OOF output.
    best_iteration = getattr(model, 'best_iteration', None)
    # Predict validation (the existing DMatrix is reused; labels are ignored by predict)
    if best_iteration is None:
        valid_pred = model.predict(xgb_valid)
    else:
        valid_pred = model.predict(xgb_valid, iteration_range=(0, best_iteration + 1))
    return model, valid_pred
def catboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Train a CatBoost regressor on one fold and predict its validation rows.

    Args:
        x_train: Feature rows used for fitting.
        y_train: Targets aligned with ``x_train``.
        x_valid: Feature rows of the validation fold.
        y_valid: Targets aligned with ``x_valid``.

    Returns:
        Tuple ``(model, valid_pred)``: the fitted regressor and its predictions
        for ``x_valid``.
    """
    train_pool = Pool(data=x_train, label=y_train)
    valid_pool = Pool(data=x_valid, label=y_valid)

    regressor = CatBoostRegressor(**CFG.regression_cat_params)
    # The validation pool drives early stopping; use_best_model is requested explicitly.
    fit_options = {
        'eval_set': [valid_pool],
        'early_stopping_rounds': CFG.early_stopping_round,
        'verbose': CFG.verbose,
        'use_best_model': True,
    }
    regressor.fit(train_pool, **fit_options)
    # Predict validation
    return regressor, regressor.predict(x_valid)

def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list):
    """Run K-fold training for one GBM library and persist its artifacts.

    For every fold the model is trained, pickled under ``CFG.MODEL_DATA_PATH``
    and its validation predictions are stored in an out-of-fold (OOF) array.
    For LightGBM the per-fold feature importance is also written as CSV.
    After all folds, the weighted R2 of the OOF predictions is printed and the
    OOF predictions are written as CSV under ``CFG.OOF_DATA_PATH``.

    Args:
        method: One of ``'lightgbm'``, ``'xgboost'`` or ``'catboost'``.
        train_df: Training data containing ``features``, ``CFG.target_col``
            and a ``'weight'`` column.
        features: Names of the columns passed to the model.

    Raises:
        ValueError: If ``method`` is not a supported library name.
    """
    # Fail fast on a typo instead of hitting a NameError on `model` inside the loop.
    supported_methods = ('lightgbm', 'xgboost', 'catboost')
    if method not in supported_methods:
        raise ValueError(f'Unknown method {method!r}; expected one of {supported_methods}')

    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    ## 1. CV Strategy
    # shuffle=False keeps every validation fold a contiguous block of rows.
    kfold = KFold(n_splits=CFG.N_SPLIT, shuffle=False) # , shuffle=True, random_state=CFG.SEED)
    # KFold only needs the number of rows, so the frame is passed as-is
    # rather than copying the feature columns just to count them.
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df)):
        print('-'*50)
        print(f'{method} training fold {fold+1}')

        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]

        # Common prefix of every artifact written for this fold.
        artifact_stem = f'{method}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.VER}'
        if method == 'lightgbm':
            model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid)
            ## 2. feature importance
            importance_df = pd.DataFrame(model.feature_importance(), index=features, columns=['importance']).reset_index()
            importance_df.to_csv(CFG.MODEL_DATA_PATH / f'{artifact_stem}_importance.csv', index=False)
        elif method == 'xgboost':
            model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid)
        else:  # 'catboost' (validated above)
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid)

        # Save best model; the context manager closes the file handle.
        with open(CFG.MODEL_DATA_PATH / f'{artifact_stem}.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)
        # Add to out of folds array
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        # Release the fold's copies before the next fold allocates its own.
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    # Compute out of folds metric
    score = r2_score(train_df[CFG.target_col], oof_predictions, sample_weight=train_df['weight'])
    print(f'{method} our out of folds CV {CFG.metric} is {score}')
    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    oof_df.to_csv(CFG.OOF_DATA_PATH / f'oof_{method}_seed{CFG.SEED}_ver{CFG.VER}.csv', index = False)

In [6]:
def read_train_data():
    """Load the configured columns of every training parquet file.

    Searches each directory in ``CFG.data_paths`` for files matching
    ``*/*parquet``, reads only ``CFG.all_cols`` from each one and
    concatenates them in sorted path order.

    Returns:
        A polars DataFrame with the concatenated rows.
    """
    all_files = []
    for path in CFG.data_paths:
        # glob gives no ordering guarantee, while the unshuffled KFold used
        # downstream depends on row order; sorting makes the load order
        # deterministic.
        # NOTE(review): assumes lexicographic path order matches chronological
        # order of the partitions - confirm against the dataset layout.
        all_files.extend(sorted(glob.glob(os.path.join(path, "*/*parquet"))))

    pl_train = pl.concat([pl.read_parquet(_f, columns=CFG.all_cols) for _f in all_files])
    return pl_train


In [7]:
#original_features = ['feature_' + str(x).zfill(2) for x in range(78+1)]

In [8]:
# Load the selected training columns from all parquet files into one polars DataFrame.
train_pl_df = read_train_data()

In [9]:
# Convert to pandas once; the training function only reads the frame, so every
# method can share the same copy instead of re-converting per iteration.
train_df = train_pl_df.to_pandas()
for method in CFG.METHOD_LIST:
    gradient_boosting_model_cv_training(method, train_df, CFG.feature_cols)

--------------------------------------------------
lightgbm training fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.882583 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22028
[LightGBM] [Info] Number of data points in the train set: 16817644, number of used features: 90
[LightGBM] [Info] Start training from score -0.001492
Training until validation scores don't improve for 10 rounds
[50]	training's rmse: 0.86144	valid_1's rmse: 0.848561
[100]	training's rmse: 0.857331	valid_1's rmse: 0.847805
[150]	training's rmse: 0.85471	valid_1's rmse: 0.847543
Early stopping, best iteration is:
[158]	training's rmse: 0.854405	valid_1's rmse: 0.847501
--------------------------------------------------
lightgbm training fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.789484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] 