In [32]:
# 0.  SET-UP
import pandas as pd, numpy as np, joblib
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score, classification_report
from xgboost import XGBClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [18]:
PATH = "../datasets/cleaned_fifa_dataset_v3_with_features.csv"
df = pd.read_csv(PATH).sort_values(["p_id2", "start_year"])

In [19]:
# 2.  BUILD A FUTURE-LOOKING TARGET
df["injury_next_season"] = (
    df.groupby("p_id2")["significant_injury_prev_season"]
      .shift(-1)              # look one season ahead
      .fillna(0)
      .astype(int)
)

# optionally drop each player's final season (unknown future)
df = df[df.groupby("p_id2")["start_year"]
          .rank(method="first").lt(df.groupby("p_id2")["start_year"].transform("count"))]

In [27]:
# 3.  FEATURE SELECTION ― ***NO SAME-SEASON INJURY INFO***
leak_cols = [
    "season_days_injured",            # current-season outcome
    "season_days_injured_prev_season",
    "total_days_injured"              # cumulative *to date* leaks outcome
]
numeric = [
    "age", "bmi", "pace", "physic", "fifa_rating",
    "season_minutes_played", "season_games_played",
    "cumulative_minutes_played", "cumulative_games_played",
    "minutes_per_game_prev_seasons", "avg_days_injured_prev_seasons",
    "avg_games_per_season_prev_seasons", "cumulative_days_injured",
    "injury_rate", "minutes_per_game", "experience_years"
]
categorical = ["bmi_class", "nationality"]

X = df.drop(columns=leak_cols + ["injury_next_season"])
X = X[numeric + categorical]              # keep only allowed columns
y = df["injury_next_season"]

In [28]:
# 4.  PREPROCESSING + MODEL
pre = ColumnTransformer([
        ("num", StandardScaler(), numeric),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
    ], remainder="drop")

clf = XGBClassifier(
        objective="binary:logistic",
        eval_metric="auc",
        n_estimators=400,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=(y==0).sum()/(y==1).sum(),
        n_jobs=-1,
        random_state=42
    )

pipe = Pipeline([("prep", pre), ("model", clf)])


In [29]:
# 5.  TIME-SERIES CV (gap=1 season)
tscv = TimeSeriesSplit(n_splits=5, gap=1)
scores = []

for fold, (train_idx, test_idx) in enumerate(tscv.split(X, y)):
    pipe.fit(X.iloc[train_idx], y.iloc[train_idx])
    y_pred = pipe.predict_proba(X.iloc[test_idx])[:, 1]
    auc = roc_auc_score(y.iloc[test_idx], y_pred)
    scores.append(auc)
    print(f"Fold {fold+1}: AUC = {auc:.3f}")

print(f"\nCV AUC = {np.mean(scores):.3f} ± {np.std(scores):.3f}")

Fold 1: AUC = 0.754
Fold 2: AUC = 0.880
Fold 3: AUC = 0.782
Fold 3: AUC = 0.782
Fold 4: AUC = 0.703
Fold 4: AUC = 0.703
Fold 5: AUC = 0.803

CV AUC = 0.784 ± 0.059
Fold 5: AUC = 0.803

CV AUC = 0.784 ± 0.059


In [31]:
# 6.  FINAL TRAIN & SAVE
pipe.fit(X, y)
joblib.dump(pipe, "../models/injury_xgb_leakfree_v2.joblib")
print("Saved ➜ ../models/injury_xgb_leakfree.joblib")

Saved ➜ ../models/injury_xgb_leakfree.joblib
