### Imports

In [83]:
import numpy as np
import joblib
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

### Main

In [71]:
df = pd.read_csv("/all_plays_merged_data.csv")
X = df.drop(columns=["event_type"])
y = df["event_type"]

In [72]:
event_counts = df["event_type"].value_counts()
eventos_validos = event_counts[event_counts >= 2000].index
df = df[df["event_type"].isin(eventos_validos)]

In [73]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [90]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    max_samples=0.8,
    n_jobs=-1,
    random_state=42
)

In [75]:
def cross_val_train(model, X, y, kf, model_name):
    scores = []
    for fold, (train_idx, val_idx) in enumerate(tqdm(kf.split(X, y), total=kf.get_n_splits(), desc=f"{model_name} Training")):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        acc = accuracy_score(y_val, preds)
        scores.append(acc)
        print(f"Fold {fold+1}: Accuracy = {acc:.4f}")

    print(f"{model_name} - Accuracy Avg: {np.mean(scores):.4f}")
    return scores

Poor prediction, can be improved with different data but is enough to test a concept

In [91]:
rf_scores = cross_val_train(rf_model, X, y, kf, "Random Forest")

Random Forest Training:  20%|██        | 1/5 [00:53<03:32, 53.10s/it]

Fold 1: Accuracy = 0.4108


Random Forest Training:  40%|████      | 2/5 [01:46<02:40, 53.48s/it]

Fold 2: Accuracy = 0.4117


Random Forest Training:  60%|██████    | 3/5 [02:40<01:47, 53.52s/it]

Fold 3: Accuracy = 0.4111


Random Forest Training:  80%|████████  | 4/5 [03:34<00:53, 53.73s/it]

Fold 4: Accuracy = 0.4107


Random Forest Training: 100%|██████████| 5/5 [04:26<00:00, 53.33s/it]

Fold 5: Accuracy = 0.4116
Random Forest - Accuracy Avg: 0.4112





### Final model

In [92]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    max_samples=0.8,
    n_jobs=-1,
    random_state=42
)

rf_model.fit(X, y)

In [95]:
joblib.dump(rf_model, 'model.joblib')

['model.joblib']

In [94]:
X.columns

Index(['vs_RHB', 'vs_LHB', 'vs_SHB', 'vs_RHP', 'vs_LHP', 'b_airOuts',
       'b_atBats', 'b_baseOnBalls', 'b_catchersInterference',
       'b_caughtStealing',
       ...
       'p_runsScoredPer9', 'p_slg', 'p_stolenBasePercentage',
       'p_strikePercentage', 'p_strikeoutWalkRatio', 'p_strikeoutsPer9Inn',
       'p_walksPer9Inn', 'p_whip', 'p_winPercentage', 'p_ops'],
      dtype='object', length=102)