In [None]:
!pip install -U /kaggle/input/scikit-learn-1-4-2/scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [None]:
import os
import gc
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from scipy.optimize import minimize
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool

import kaggle_evaluation.mcts_inference_server
import mcts_inference_server

In [None]:
from mcts_data_1 import ColumnDropper, AgentFeatureCreator, DowncastTransformer

In [None]:
class CFG:
    """Notebook-wide constants: artifact locations, model names and column names."""

    n_splits = 5

    # Competition files.
    sample_path = '/kaggle/input/um-game-playing-strength-of-mcts-variants/sample_submission.csv'

    # One directory per base model; each is read for `oof_model.pkl` and
    # `oof_pred.pkl`. `model_names` is zipped with `model_pathes`, so the two
    # lists must stay in the same order.
    model_names = ['lgbm', 'catboost']
    model_pathes = [
        Path('/kaggle/input/mcts-training-lgbm-ds'),
        Path('/kaggle/input/mcts-training-catboost-ds'),
    ]

    # Folder holding `labels_df.parquet` and `data_pipeline_1.pkl`.
    train_ds_folder = Path('/kaggle/usr/lib/mcts-data-1')

    # Column names.
    game_cols = ['EnglishRules', 'LudRules', 'GameRulesetName']
    fold_col = 'GameRulesetName'  # dropped from the transformed features in `predict`
    output_cols = ['num_wins_agent1', 'num_draws_agent1', 'num_losses_agent1']
    target_col = 'utility_agent1'

# Load models and data

In [None]:
# X accumulates one out-of-fold prediction column per model (filled in the loading loop below);
# y is the frame read from labels_df.parquet and is used as the ground truth for the RMSE scores.
X = pd.DataFrame()
y = pd.read_parquet(CFG.train_ds_folder / "labels_df.parquet")

In [None]:
# For every base model: load the pickled model and its out-of-fold (OOF)
# predictions, store the predictions as a column of X, and record the OOF RMSE.
# NOTE(review): joblib.load unpickles arbitrary objects - the datasets must be trusted.
pred_cols = []
oof_models_dict = {}
models_score_df = pd.DataFrame()

for model_name, model_dir in zip(CFG.model_names, CFG.model_pathes):
    pred_col = f"pred_{model_name}"
    oof_models_dict[model_name] = joblib.load(model_dir / "oof_model.pkl")
    pred_cols.append(pred_col)

    oof_pred = joblib.load(model_dir / "oof_pred.pkl")
    X[pred_col] = oof_pred
    models_score_df.loc[model_name, ["rmse"]] = root_mean_squared_error(y, oof_pred)

In [None]:
# Show the per-model out-of-fold RMSE table built in the loading loop.
display(models_score_df)

In [None]:
# Peek at the first rows of the per-model out-of-fold prediction columns.
display(X[pred_cols].head())

In [None]:
# Pairwise correlation between the models' out-of-fold predictions.
# `sns` (seaborn) and `plt` (matplotlib.pyplot) are imported in the notebook's top import cell.
# NOTE(review): `model_names` is not referenced anywhere else in the visible notebook - confirm before removing.
model_names = models_score_df.index.to_list()
oof_df_corr = X[pred_cols].corr()

# Mask the upper triangle including the diagonal, so each model pair is drawn once.
mask = np.triu(np.ones_like(oof_df_corr, dtype=bool))
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    oof_df_corr, mask=mask, annot=True,
    linewidths=0.5, fmt=".2f", ax=ax, cmap="Reds", square=True
)
# Title the explicit axes rather than relying on pyplot's "current axes" state.
ax.set_title("OOF models correlation")
plt.show()

# Weighted Ensemble

In [None]:
class WeightsSearcher:
    """Search for blending weights that optimise a metric over model predictions.

    Parameters
    ----------
    loss_fn : callable
        Called as ``loss_fn(true_targets, blended_predictions)``; must return a
        scalar score.
    bounds : sequence of (low, high) pairs, optional
        One pair per model, forwarded to ``scipy.optimize.minimize``. When
        omitted or empty, every weight is bounded to ``(0, 1)``.
    mode : str
        ``"min"`` (default) minimises ``loss_fn``; ``"max"`` maximises it by
        minimising its negation. Any other value is treated like ``"min"``.
    method : str
        Solver name forwarded to ``scipy.optimize.minimize``.
    """

    def __init__(self, loss_fn, bounds=None, mode="min", method='SLSQP'):
        self.loss_fn = loss_fn
        # A fresh list per instance avoids sharing one mutable default between instances.
        self.bounds = [] if bounds is None else bounds
        self.mode = mode
        self.method = method  # e.g. 'Nelder-Mead' for non-smooth objectives

    def _objective_function_wrapper(self, pred_values, true_targets, obj_fn):
        """Return ``f(weights)`` that scores the weighted row-sum of ``pred_values``.

        The score is negated when ``self.mode == "max"`` so that the minimiser
        effectively maximises ``obj_fn``.
        """
        sign = -1.0 if self.mode == "max" else 1.0

        def objective_function(weights):
            pred_weighted = (pred_values * weights).sum(axis=1)
            return sign * obj_fn(true_targets, pred_weighted)

        return objective_function

    def find_weights(self, val_preds, true_targets):
        """Optimise one weight per column of ``val_preds`` and normalise them to sum to 1.

        Parameters
        ----------
        val_preds : 2-D array-like, one column per model
            Predictions to blend.
        true_targets : array-like
            Passed unchanged as the first argument of ``loss_fn``.

        Returns
        -------
        numpy.ndarray
            The optimiser's solution divided by its sum.

        Raises
        ------
        ValueError
            If ``val_preds`` is not 2-D, if the number of bounds does not match
            the number of columns, or if the solution cannot be normalised.

        Notes
        -----
        The sum-to-one rescaling happens after the optimisation; the optimiser
        itself is only constrained by ``bounds``.
        """
        if np.ndim(val_preds) != 2:
            raise ValueError("val_preds must be 2-D with one column per model")
        # The number of weights comes from the data, not from the (possibly empty) bounds.
        n_models = np.shape(val_preds)[1]
        bounds = self.bounds if len(self.bounds) > 0 else [(0, 1)] * n_models
        if len(bounds) != n_models:
            raise ValueError(
                f"got {len(bounds)} bounds for {n_models} prediction columns"
            )

        initial_weights = np.ones(n_models) / n_models
        objective_function = self._objective_function_wrapper(
            val_preds, true_targets, self.loss_fn
        )
        result = minimize(
            objective_function,
            initial_weights,
            bounds=bounds,
            method=self.method,
        )

        total = np.sum(result.x)
        if not np.isfinite(total) or total == 0:
            # Dividing would silently yield NaN/inf weights and corrupt every prediction.
            raise ValueError("optimised weights sum to zero or are not finite")
        return result.x / total

In [None]:
# Fit one blending weight per base model on the out-of-fold predictions,
# each weight bounded to (0, 1), using RMSE as the objective.
bounds = [(0, 1) for _ in pred_cols]
w_searcher = WeightsSearcher(loss_fn=root_mean_squared_error, bounds=bounds)
optimized_weights = w_searcher.find_weights(val_preds=X[pred_cols].to_numpy(), true_targets=y)

# Tabulate the weight assigned to each model and check the normalisation.
optimized_weights_df = pd.DataFrame(
    list(zip(CFG.model_names, optimized_weights)),
    columns=['model', 'weight'],
)
display(optimized_weights_df)
print("sum: ", optimized_weights.sum())

In [None]:
# Blend the OOF columns with the fitted weights, keep the blend as a new
# column of X, and report its RMSE against y.
weighted_oof = X[pred_cols].mul(optimized_weights)
oof_pred_optimized = weighted_oof.sum(axis=1).to_numpy()
X["pred_optimized"] = oof_pred_optimized

rmse_oof = root_mean_squared_error(y, oof_pred_optimized)
print("CV rmse: ", rmse_oof)

# Prediction

In [None]:
# Load the pickled data pipeline whose `transform` is applied to each test batch in `predict`.
# NOTE(review): joblib.load unpickles arbitrary objects - only load artifacts from a trusted dataset.
# Presumably the transformer classes imported from mcts_data_1 must be importable for this to unpickle - confirm.
data_pipeline = joblib.load(CFG.train_ds_folder / "data_pipeline_1.pkl")

In [None]:
def predict(test_df: pl.DataFrame, sample_sub: pl.DataFrame):
    """Blend the base models' predictions for one batch of test rows.

    Reads the module-level ``data_pipeline``, ``oof_models_dict`` and
    ``optimized_weights``.

    Steps: transform ``test_df`` with ``data_pipeline``, drop ``CFG.fold_col``,
    predict with every model in ``oof_models_dict``, combine the per-model
    columns with ``optimized_weights`` and clip the result to ``[-1, 1]``.

    Returns ``sample_sub`` with a ``utility_agent1`` column holding the
    blended predictions. Also prints the head of the per-model predictions
    and the first ten blended values.
    """
    features = data_pipeline.transform(test_df).drop([CFG.fold_col], axis=1)

    # One column per model, inserted in the iteration order of `oof_models_dict`
    # so that the column order lines up with `optimized_weights`.
    per_model_preds = pd.DataFrame()
    for model_name, oof_model in oof_models_dict.items():
        per_model_preds[f"pred_{model_name}"] = oof_model.predict(features)

    blended = (per_model_preds * optimized_weights).sum(axis=1).to_numpy()
    blended = np.clip(blended, -1.0, 1.0)

    print(per_model_preds.head())
    print(blended[:10])
    return sample_sub.with_columns(pl.Series('utility_agent1', blended))

In [None]:
# Wrap `predict` in the competition's inference server.
inference_server = kaggle_evaluation.mcts_inference_server.MCTSInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # KAGGLE_IS_COMPETITION_RERUN unset/empty: run the local gateway on the bundled files.
    inference_server.run_local_gateway(
        (
            '/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv',
            '/kaggle/input/um-game-playing-strength-of-mcts-variants/sample_submission.csv'
        )
    )
else:
    # KAGGLE_IS_COMPETITION_RERUN set: serve requests instead.
    inference_server.serve()