In [1]:
import os
import gc
import optuna
import pickle
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    mean_squared_log_error,
    root_mean_squared_error,
)

from src.core.loaders import CsvDataLoader, PickleSaver
from src.core.pipelines import XGBModelTrainingPipeline
from src.core.model import XGBCustomModel
from src.core.utils import load_params

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir("../")
config_path = "config.yaml"

config = load_params(config_path)

training_pipeline = XGBModelTrainingPipeline(
    data_loader=CsvDataLoader(
        interactions_path=config.basic.interactions_path,
        articles_path=config.basic.articles_path,
    ),
    saver=PickleSaver(path=config.basic.artifact_dir),
    model=XGBCustomModel(
        model=XGBRegressor(**config.xgb_params),
        mf_model=pickle.load(
            open(os.path.join(config.basic.artifact_dir, "mf_model.pkl"), "rb")
        ),
        person_le=pickle.load(
            open(
                os.path.join(config.basic.artifact_dir, "personId_label_encoder.pkl"),
                "rb",
            )
        ),
        content_le=pickle.load(
            open(
                os.path.join(config.basic.artifact_dir, "contentId_label_encoder.pkl"),
                "rb",
            )
        ),
    ),
    top_n=config.basic.top_n,
    split_date=config.basic.split_date,
    event_type_strength=config.event_type_strength,
)

In [3]:
interactions_df = training_pipeline.data_loader.get_interactions()
articles_df = training_pipeline.data_loader.get_articles()

interactions_df = training_pipeline.training_preprocess(
    interactions_df.copy(), training_pipeline.event_type_strength
)

interactions_train, interactions_test = training_pipeline.preprocess_articles(
    articles_df.copy(), interactions_df.copy()
)

articles_fs = pd.read_parquet("feature_stores/articles.parquet")

In [4]:
results = []


def xgboost_reg_hyperparam_opt(trial):
    params = {
        "verbosity": 0,
        "silent": True,
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "tree_method": "hist",
        "n_estimators": 200,
        "feature_importances_": "gain",
        "n_jobs": 6,
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "lambda": trial.suggest_float("lambda", 0, 2),
        "alpha": trial.suggest_float("alpha", 0, 2),
        "min_child_weight": trial.suggest_float("min_child_weight", 0.5, 3),
        "subsample": trial.suggest_float("subsample", 0.2, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1),
        "grow_policy": trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        ),
        "max_bin": trial.suggest_int("max_bin", 12, 512),
    }

    training_pipeline.model.model = XGBRegressor(**params)

    training_pipeline.model.train(interactions_train.copy())

    predictions = training_pipeline.model.predict(interactions_test.copy())
    predictions = np.where(predictions < 0, 0, predictions)

    rmse = root_mean_squared_error(interactions_test["eventStrength"], predictions)

    metrics = {
        "rmse": root_mean_squared_error(
            interactions_test["eventStrength"], predictions
        ),
        "mse": mean_squared_error(interactions_test["eventStrength"], predictions),
        "mae": mean_absolute_error(interactions_test["eventStrength"], predictions),
        "r2": r2_score(interactions_test["eventStrength"], predictions),
        "RMSLE": mean_squared_log_error(
            interactions_test["eventStrength"], predictions, squared=False
        ),
    }

    results.append({"Trial": trial.number, **metrics, **params})

    pickle.dump(
        training_pipeline.model,
        open(f"experiments/xgb/xgb_model{trial.number}.pkl", "wb"),
    )
    pickle.dump(results, open("experiments/xgb/results.pkl", "wb"))

    gc.collect()

    return rmse


study = optuna.create_study(study_name="OptunaXgb", direction="minimize")
study.optimize(xgboost_reg_hyperparam_opt, n_trials=100)

[I 2024-11-05 13:28:27,016] A new study created in memory with name: OptunaXgb
[I 2024-11-05 13:29:17,823] Trial 0 finished with value: 0.6220500089927007 and parameters: {'learning_rate': 0.038695846961485245, 'max_depth': 11, 'lambda': 1.3185730328146736, 'alpha': 0.044536685809901, 'min_child_weight': 2.3745200308806615, 'subsample': 0.9831943318211458, 'colsample_bytree': 0.745603985582934, 'grow_policy': 'depthwise', 'max_bin': 495}. Best is trial 0 with value: 0.6220500089927007.
[I 2024-11-05 13:29:45,487] Trial 1 finished with value: 0.631481915142564 and parameters: {'learning_rate': 0.00016663102479128592, 'max_depth': 20, 'lambda': 0.039636970177124864, 'alpha': 1.088996043782186, 'min_child_weight': 1.8136915838646117, 'subsample': 0.22146385539690377, 'colsample_bytree': 0.40574162740911507, 'grow_policy': 'lossguide', 'max_bin': 168}. Best is trial 0 with value: 0.6220500089927007.
[I 2024-11-05 13:31:30,754] Trial 2 finished with value: 0.6170134494671186 and parameters:

In [6]:
results = pickle.load(open("experiments/xgb/results.pkl", "rb"))
pd.DataFrame(results).sort_values("rmse", ascending=True)

Unnamed: 0,Trial,rmse,mse,mae,r2,RMSLE,verbosity,silent,objective,eval_metric,...,n_jobs,learning_rate,max_depth,lambda,alpha,min_child_weight,subsample,colsample_bytree,grow_policy,max_bin
62,62,0.604227,0.365090,0.335522,0.086744,0.206934,0,True,reg:squarederror,rmse,...,6,0.024925,8,0.323354,0.411645,1.666298,0.673786,0.796449,lossguide,376
68,68,0.604953,0.365968,0.348054,0.084548,0.207346,0,True,reg:squarederror,rmse,...,6,0.013883,9,0.336033,0.085113,1.782421,0.776692,0.827502,lossguide,280
29,29,0.605191,0.366257,0.357794,0.083826,0.208638,0,True,reg:squarederror,rmse,...,6,0.024286,6,1.225888,1.448518,2.161619,0.624798,0.779434,depthwise,464
69,69,0.605410,0.366521,0.342191,0.083165,0.207540,0,True,reg:squarederror,rmse,...,6,0.014667,9,0.190096,0.096447,1.389819,0.772133,0.892792,lossguide,272
51,51,0.605419,0.366533,0.337987,0.083136,0.207912,0,True,reg:squarederror,rmse,...,6,0.050522,6,0.237609,1.564604,2.255426,0.739375,0.970978,lossguide,288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,6,0.629137,0.395813,0.404235,0.009892,0.216153,0,True,reg:squarederror,rmse,...,6,0.000385,12,1.262065,1.842383,1.928209,0.238623,0.676735,depthwise,402
7,7,0.629860,0.396724,0.406656,0.007615,0.216600,0,True,reg:squarederror,rmse,...,6,0.000257,11,0.618786,1.339131,0.547264,0.997632,0.768114,lossguide,390
48,48,0.631203,0.398417,0.408844,0.003378,0.217281,0,True,reg:squarederror,rmse,...,6,0.000114,6,0.442990,0.028340,2.041418,0.974062,0.845493,lossguide,491
1,1,0.631482,0.398769,0.406567,0.002497,0.217208,0,True,reg:squarederror,rmse,...,6,0.000167,20,0.039637,1.088996,1.813692,0.221464,0.405742,lossguide,168
