### Imports

In [83]:
import numpy as np
import joblib
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

### Main

In [71]:
df = pd.read_csv("/all_plays_merged_data.csv")
X = df.drop(columns=["event_type"])
y = df["event_type"]

In [72]:
event_counts = df["event_type"].value_counts()
eventos_validos = event_counts[event_counts >= 2000].index
df = df[df["event_type"].isin(eventos_validos)]

In [73]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [74]:
best_cat_params = {
    'objective': 'MultiClass',
    'early_stopping_rounds': 50,
    'reg_lambda': 2.32847283242342,
    'learning_rate': 0.05614850749229461,
    'min_data_in_leaf': 12,
    'iterations': 700,
    'depth': 7,
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bayesian',  # Puedes probar 'Bernoulli' o 'MVS'
    'bagging_temperature': 0.0381988978242882,
    'eval_metric': 'MultiClass'
}
catboost_model = CatBoostClassifier(random_seed=42, random_strength=0, task_type="GPU", verbose=0, **best_cat_params)

In [75]:
def cross_val_train(model, X, y, kf, model_name):
    scores = []
    for fold, (train_idx, val_idx) in enumerate(tqdm(kf.split(X, y), total=kf.get_n_splits(), desc=f"{model_name} Training")):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        acc = accuracy_score(y_val, preds)
        scores.append(acc)
        print(f"Fold {fold+1}: Accuracy = {acc:.4f}")

    print(f"{model_name} - Accuracy Avg: {np.mean(scores):.4f}")
    return scores

Poor prediction, can be improved with different data but is enough to test a concept

In [76]:
catboost_scores = cross_val_train(catboost_model, X, y, kf, "CatBoost")

CatBoost Training:  20%|██        | 1/5 [01:01<04:07, 61.96s/it]

Fold 1: Accuracy = 0.4193


CatBoost Training:  40%|████      | 2/5 [02:02<03:03, 61.07s/it]

Fold 2: Accuracy = 0.4184


CatBoost Training:  60%|██████    | 3/5 [03:08<02:07, 63.57s/it]

Fold 3: Accuracy = 0.4167


CatBoost Training:  80%|████████  | 4/5 [04:09<01:02, 62.53s/it]

Fold 4: Accuracy = 0.4195


CatBoost Training: 100%|██████████| 5/5 [05:08<00:00, 61.71s/it]

Fold 5: Accuracy = 0.4175
CatBoost - Accuracy Avg: 0.4183





### Final model

In [80]:
catboost_model = CatBoostClassifier(random_seed=42, random_strength=0, task_type="GPU", verbose=0, **best_cat_params)
catboost_model.fit(X, y)

<catboost.core.CatBoostClassifier at 0x78a344f51190>

In [84]:
joblib.dump(catboost_model, "catboost_model.pkl")

['catboost_model.pkl']