# Overview
This notebook runs inference with the gradient boosting models trained in [[JSR-TMDF] Gradient Boosting Models (Training)](https://www.kaggle.com/code/takaito/jsr-tmdf-gradient-boosting-models-training). Please refer to [[JSR-TMDF] Gradient Boosting Models (Training)](https://www.kaggle.com/code/takaito/jsr-tmdf-gradient-boosting-models-training) for details of the training procedure.



To be updated!! (I plan to add more hints if the number of votes increases.)

In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import polars as pl
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm

import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier

In [2]:
import kaggle_evaluation.jane_street_inference_server

In [3]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Notebook-wide settings, read everywhere as ``CFG.<NAME>`` class attributes."""

    # --- Identification -------------------------------------------------
    AUTHOR = 'szukiyu'
    COMPETITION = 'jane-street-real-time-market-data-forecasting'
    # VER and SEED are both embedded in the pickled model file names.
    VER = 1
    SEED = 42

    # --- Locations ------------------------------------------------------
    DATA_PATH = Path('/kaggle/input/jane-street-real-time-market-data-forecasting')
    # NOTE(review): the two paths below are not read by the cells visible in
    # this notebook; presumably kept for parity with the training notebook.
    MODEL_DATA_PATH = Path('./models')
    OOF_DATA_PATH = Path('./oof')

    # --- Ensemble layout ------------------------------------------------
    # One list of models is loaded per method, with N_SPLIT models in each.
    METHOD_LIST = ['lightgbm', 'xgboost', 'catboost']
    N_SPLIT = 5
    # Evaluated once, when the class body runs.
    USE_GPU = torch.cuda.is_available()

    # --- Target / metric ------------------------------------------------
    # NOTE(review): lowercase names kept as-is so existing references still work.
    target_col = 'responder_6'
    metric = 'r2_score'
    metric_maximize_flag = True
    

In [4]:
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed every random number generator this notebook has imported.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            legacy global RNG and PyTorch, and exported as ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    np.random.seed(seed)
    # torch is imported by this notebook, so seed it as well.
    torch.manual_seed(seed)
    # NOTE: hash randomisation is fixed at interpreter start-up, so this only
    # takes effect in processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
# Apply the notebook-wide seed from CFG once, before any other cell runs.
seed_everything(CFG.SEED)

In [5]:
# Zero-padded input column names: feature_00, feature_01, ..., feature_78 (79 total).
original_features = [f'feature_{idx:02d}' for idx in range(79)]

In [6]:
# Load the pickled per-fold models, grouped by method name:
#   model_list_dict[method] -> list of CFG.N_SPLIT models, in fold order.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only point this at model artifacts you trust.
model_list_dict = {}
for method in CFG.METHOD_LIST:
    model_list_dict[method] = []
    for fold in range(CFG.N_SPLIT):
        model_path = f'/kaggle/input/jsr-tmdf-gradient-boosting-models-training/models/{method}_fold{fold+1}_seed{CFG.SEED}_ver{CFG.VER}.pkl'
        # Context manager closes the file handle promptly instead of leaking it.
        with open(model_path, 'rb') as model_file:
            model_list_dict[method].append(pickle.load(model_file))

In [7]:
def lightgbm_inference(model, x_test: pd.DataFrame):
    """Return ``model.predict(x_test)`` for one LightGBM fold model.

    Args:
        model: Object exposing ``predict``; presumably a fitted LightGBM model
            loaded from the training notebook's pickles (verify against caller).
        x_test: Feature frame passed to the model unchanged.

    Returns:
        Whatever ``model.predict`` returns for ``x_test``.
    """
    fold_prediction = model.predict(x_test)
    return fold_prediction

def xgboost_inference(model, x_test: pd.DataFrame):
    """Return predictions of one XGBoost fold model for ``x_test``.

    Args:
        model: Object whose ``predict`` accepts an ``xgb.DMatrix``; presumably
            a fitted booster loaded from pickle (verify against caller).
        x_test: Feature frame; wrapped in an ``xgb.DMatrix`` before prediction.

    Returns:
        Whatever ``model.predict`` returns for the wrapped features.
    """
    # The features are wrapped in a DMatrix before being handed to predict.
    dmatrix = xgb.DMatrix(x_test)
    return model.predict(dmatrix)

def catboost_inference(model, x_test: pd.DataFrame):
    """Return ``model.predict(x_test)`` for one CatBoost fold model.

    Args:
        model: Object exposing ``predict``; presumably a fitted CatBoost model
            loaded from the training notebook's pickles (verify against caller).
        x_test: Feature frame passed to the model unchanged.

    Returns:
        Whatever ``model.predict`` returns for ``x_test``.
    """
    fold_prediction = model.predict(x_test)
    return fold_prediction

In [8]:
lags_ : pl.DataFrame | None = None

# Dispatch table: entry of CFG.METHOD_LIST -> function that scores one fitted model.
_INFERENCE_FUNCTIONS = {
    'lightgbm': lightgbm_inference,
    'xgboost': xgboost_inference,
    'catboost': catboost_inference,
}


# Replace this function with your inference code.
# You can return either a Pandas or Polars dataframe, though Polars is recommended.
# Each batch of predictions (except the very first) must be returned within 10 minutes of the batch features being provided.
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict ``responder_6`` for one batch as an equal-weight model average.

    Every model in ``model_list_dict`` (all methods in ``CFG.METHOD_LIST``,
    all folds) scores the batch on the ``original_features`` columns, and the
    per-model outputs are averaged element-wise.

    Args:
        test: Batch of rows to score; must contain ``row_id`` and every
            column named in ``original_features``.
        lags: Previous-day responders, or ``None``. When provided, it is
            stored in the module-level ``lags_``; it is not used as a model
            input here.

    Returns:
        A Polars DataFrame with columns ``['row_id', 'responder_6']`` and one
        row per row of ``test``.

    Raises:
        ValueError: If ``CFG.METHOD_LIST`` names a method with no registered
            inference function, or if no models are available to average.
    """
    # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
    # Use them as extra features, if you like.
    global lags_
    if lags is not None:
        lags_ = lags

    x_test = test[original_features].to_pandas()

    fold_predictions = []
    for method in CFG.METHOD_LIST:
        # Fail loudly on an unknown method instead of silently skipping it.
        if method not in _INFERENCE_FUNCTIONS:
            raise ValueError(
                f'Unsupported method {method!r}; expected one of {sorted(_INFERENCE_FUNCTIONS)}'
            )
        infer = _INFERENCE_FUNCTIONS[method]
        for model in model_list_dict[method]:
            fold_predictions.append(infer(model, x_test))

    if not fold_predictions:
        raise ValueError('No models available for inference; check CFG.METHOD_LIST and model_list_dict.')

    # Equal-weight ensemble: element-wise mean across all methods and folds.
    pred = np.mean(fold_predictions, axis=0)

    predictions = test.select('row_id').with_columns(
        pl.Series('responder_6', pred.ravel())
    )

    # The predict function must return a DataFrame
    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    # with columns 'row_id', 'responder_6'
    assert list(predictions.columns) == ['row_id', 'responder_6']
    # and as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions

In [9]:
# Hand the `predict` callback to the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # KAGGLE_IS_COMPETITION_RERUN is set: serve predictions to the real gateway.
    inference_server.serve()
else:
    # Otherwise run the local gateway against the parquet files under
    # CFG.DATA_PATH, so the dataset location is defined in one place only.
    inference_server.run_local_gateway(
        (
            str(CFG.DATA_PATH / 'test.parquet'),
            str(CFG.DATA_PATH / 'lags.parquet'),
        )
    )