## Load and inspect data

In [1]:
import pandas as pd

db = pd.read_csv("final_matchlog_for_model.csv")

db.head()

Unnamed: 0,Number,Player,Matchweek,Matchday,Injured,Minutes_Last_Game,Minutes_Prev_3_Games,Minutes_Last_30d,Days_Since_Last_Injury_Related,Days_Since_Last_Injury_Any,...,Pos,Days_Rest,Short_Rest_Flag_le3d,Matches_Last_10d,High_Minute_Flag_10d_3x85,Starts_Last_5,Consecutive_Starts,Rolling_Minutes_7d,Rolling_Minutes_28d,Minutes
0,3,Radu Dragusin,1,2024-08-17,0,0.0,0.0,0.0,,,...,,,,0,,,,,,0.0
1,4,Kevin Danso,1,2024-08-17,0,0.0,0.0,0.0,,,...,DF,,,0,,,,,,0.0
2,6,Jo√£o Palhinha,1,2024-08-17,0,0.0,0.0,14.0,,,...,MF,,,1,,,,,,0.0
3,7,Xavi Simons,1,2024-08-17,0,0.0,0.0,88.0,,,...,"FW,MF",,,1,,,,,,0.0
4,8,Yves Bissouma,1,2024-08-17,0,0.0,0.0,0.0,,,...,,,,0,,,,,,0.0


In [2]:
print("Shape:", db.shape)
print("\nColumns:\n", db.columns.tolist())
db.head(10)

Shape: (836, 21)

Columns:
 ['Number', 'Player', 'Matchweek', 'Matchday', 'Injured', 'Minutes_Last_Game', 'Minutes_Prev_3_Games', 'Minutes_Last_30d', 'Days_Since_Last_Injury_Related', 'Days_Since_Last_Injury_Any', 'age', 'Pos', 'Days_Rest', 'Short_Rest_Flag_le3d', 'Matches_Last_10d', 'High_Minute_Flag_10d_3x85', 'Starts_Last_5', 'Consecutive_Starts', 'Rolling_Minutes_7d', 'Rolling_Minutes_28d', 'Minutes']


Unnamed: 0,Number,Player,Matchweek,Matchday,Injured,Minutes_Last_Game,Minutes_Prev_3_Games,Minutes_Last_30d,Days_Since_Last_Injury_Related,Days_Since_Last_Injury_Any,...,Pos,Days_Rest,Short_Rest_Flag_le3d,Matches_Last_10d,High_Minute_Flag_10d_3x85,Starts_Last_5,Consecutive_Starts,Rolling_Minutes_7d,Rolling_Minutes_28d,Minutes
0,3,Radu Dragusin,1,2024-08-17,0,0.0,0.0,0.0,,,...,,,,0,,,,,,0.0
1,4,Kevin Danso,1,2024-08-17,0,0.0,0.0,0.0,,,...,DF,,,0,,,,,,0.0
2,6,Jo√£o Palhinha,1,2024-08-17,0,0.0,0.0,14.0,,,...,MF,,,1,,,,,,0.0
3,7,Xavi Simons,1,2024-08-17,0,0.0,0.0,88.0,,,...,"FW,MF",,,1,,,,,,0.0
4,8,Yves Bissouma,1,2024-08-17,0,0.0,0.0,0.0,,,...,,,,0,,,,,,0.0
5,9,Richarlison,1,2024-08-17,0,0.0,0.0,0.0,,,...,FW,,,0,,,,,,0.0
6,10,James Maddison,1,2024-08-17,0,0.0,0.0,0.0,,,...,,,,0,,,,,,0.0
7,11,,1,2024-08-17,0,0.0,0.0,62.0,,,...,,,,1,,,,,,0.0
8,13,Destiny Udogie,1,2024-08-17,0,0.0,0.0,0.0,,,...,DF,,,0,,,,,,0.0
9,14,,1,2024-08-17,0,0.0,0.0,0.0,,,...,,,,0,,,,,,0.0


## Basic dtype cleanup + quick QA (label balance & missing values)

In [3]:
import pandas as pd
import numpy as np

# Make a copy to work on
df = db.copy()

# Parse dates
df["Matchday"] = pd.to_datetime(df["Matchday"], errors="coerce")

# Ensure numeric types for all numeric feature columns
num_cols = [
    "Number","Matchweek","Injured",
    "Minutes_Last_Game","Minutes_Prev_3_Games","Minutes_Last_30d",
    "Days_Since_Last_Injury_Related","Days_Since_Last_Injury_Any",
    "age","Days_Rest","Short_Rest_Flag_le3d","Matches_Last_10d",
    "High_Minute_Flag_10d_3x85","Starts_Last_5","Consecutive_Starts",
    "Rolling_Minutes_7d","Rolling_Minutes_28d","Minutes"
]
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Quick label check and missing summary
print("Rows x Cols:", df.shape)
print("\nLabel counts (Injured):")
print(df["Injured"].value_counts(dropna=False))
print("\nLabel rate (% Injured):", round(100*df["Injured"].mean(), 2))

print("\nTop missing values:")
print(df.isna().sum().sort_values(ascending=False).head(12))

# Peek after cleanup
df.head(5)

Rows x Cols: (836, 21)

Label counts (Injured):
Injured
0    665
1    171
Name: count, dtype: int64

Label rate (% Injured): 20.45

Top missing values:
Days_Rest                         532
Short_Rest_Flag_le3d              530
Rolling_Minutes_28d               530
Rolling_Minutes_7d                530
Consecutive_Starts                530
Starts_Last_5                     530
High_Minute_Flag_10d_3x85         530
Days_Since_Last_Injury_Related    433
Days_Since_Last_Injury_Any        358
age                               342
Pos                               342
Player                            152
dtype: int64


Unnamed: 0,Number,Player,Matchweek,Matchday,Injured,Minutes_Last_Game,Minutes_Prev_3_Games,Minutes_Last_30d,Days_Since_Last_Injury_Related,Days_Since_Last_Injury_Any,...,Pos,Days_Rest,Short_Rest_Flag_le3d,Matches_Last_10d,High_Minute_Flag_10d_3x85,Starts_Last_5,Consecutive_Starts,Rolling_Minutes_7d,Rolling_Minutes_28d,Minutes
0,3,Radu Dragusin,1,2024-08-17,0,0.0,0.0,0.0,,,...,,,,0,,,,,,0.0
1,4,Kevin Danso,1,2024-08-17,0,0.0,0.0,0.0,,,...,DF,,,0,,,,,,0.0
2,6,Jo√£o Palhinha,1,2024-08-17,0,0.0,0.0,14.0,,,...,MF,,,1,,,,,,0.0
3,7,Xavi Simons,1,2024-08-17,0,0.0,0.0,88.0,,,...,"FW,MF",,,1,,,,,,0.0
4,8,Yves Bissouma,1,2024-08-17,0,0.0,0.0,0.0,,,...,,,,0,,,,,,0.0


## Create modeling table (impute missing + light encoding)

In [4]:
import numpy as np
import pandas as pd

work = df.copy()

# --- Simple imputations (transparent, LR-friendly) ---
# If no prior injury history, treat "time since" as a large number (no recent injuries)
NO_HISTORY_DAYS = 400  # ~ >1 season

work["Days_Since_Last_Injury_Any"] = work["Days_Since_Last_Injury_Any"].fillna(NO_HISTORY_DAYS)
work["Days_Since_Last_Injury_Related"] = work["Days_Since_Last_Injury_Related"].fillna(NO_HISTORY_DAYS)

# Rest features: assume weekly cadence if missing; short-rest flag -> 0 if unknown
work["Days_Rest"] = work["Days_Rest"].fillna(4)
work["Short_Rest_Flag_le3d"] = work["Short_Rest_Flag_le3d"].fillna(0)

# Age: median fill; minutes/rollups already 0 when absent but ensure no NaNs
work["age"] = work["age"].fillna(work["age"].median())
for c in ["Minutes_Last_Game","Minutes_Prev_3_Games","Minutes_Last_30d",
          "Matches_Last_10d","Rolling_Minutes_7d","Rolling_Minutes_28d",
          "High_Minute_Flag_10d_3x85","Starts_Last_5","Consecutive_Starts","Minutes"]:
    if c in work.columns:
        work[c] = work[c].fillna(0)

# --- Position: keep a simple primary position token and one-hot it ---
pos = work["Pos"].fillna("UNK").astype(str)
work["Pos_primary"] = pos.str.split(",", expand=True)[0].str.strip()
work = pd.get_dummies(work, columns=["Pos_primary"], drop_first=True)

# --- Choose a small, interpretable feature set for the first LR pass ---
base_features = [
    "Minutes_Last_Game","Minutes_Prev_3_Games","Minutes_Last_30d",
    "Days_Since_Last_Injury_Any","Days_Since_Last_Injury_Related",
    "Matches_Last_10d","Days_Rest","Short_Rest_Flag_le3d","age"
]
pos_features = [c for c in work.columns if c.startswith("Pos_primary_")]  # one-hots

features = base_features + pos_features
target = "Injured"

X = work[features].copy()
y = work[target].astype(int).copy()

print("X shape:", X.shape, "| y mean (injury rate):", round(y.mean(), 3))
print("Any NaNs in X?", X.isna().any().any())
print("Feature list:", features)
X.head(5)

X shape: (836, 12) | y mean (injury rate): 0.205
Any NaNs in X? False
Feature list: ['Minutes_Last_Game', 'Minutes_Prev_3_Games', 'Minutes_Last_30d', 'Days_Since_Last_Injury_Any', 'Days_Since_Last_Injury_Related', 'Matches_Last_10d', 'Days_Rest', 'Short_Rest_Flag_le3d', 'age', 'Pos_primary_FW', 'Pos_primary_MF', 'Pos_primary_UNK']


Unnamed: 0,Minutes_Last_Game,Minutes_Prev_3_Games,Minutes_Last_30d,Days_Since_Last_Injury_Any,Days_Since_Last_Injury_Related,Matches_Last_10d,Days_Rest,Short_Rest_Flag_le3d,age,Pos_primary_FW,Pos_primary_MF,Pos_primary_UNK
0,0.0,0.0,0.0,400.0,400.0,0,4.0,0.0,24.0,False,False,True
1,0.0,0.0,0.0,400.0,400.0,0,4.0,0.0,27.0,False,False,False
2,0.0,0.0,14.0,400.0,400.0,1,4.0,0.0,30.0,False,True,False
3,0.0,0.0,88.0,400.0,400.0,1,4.0,0.0,22.0,True,False,False
4,0.0,0.0,0.0,400.0,400.0,0,4.0,0.0,24.0,False,False,True


## Time-based split ➜ scale ➜ train baseline Logistic Regression

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix
)

# Use the same 'work', X, y, and features from Step 3
data = work.copy()

# Time-aware split by matchweek (avoid leakage)
train_weeks = list(range(1, 29))     # 1–28
val_weeks   = list(range(29, 35))    # 29–34
test_weeks  = list(range(35, 39))    # 35–38

def mask_weeks(weeks):
    return data["Matchweek"].isin(weeks)

X_train, y_train = X[mask_weeks(train_weeks)], y[mask_weeks(train_weeks)]
X_val,   y_val   = X[mask_weeks(val_weeks)],   y[mask_weeks(val_weeks)]
X_test,  y_test  = X[mask_weeks(test_weeks)],  y[mask_weeks(test_weeks)]

print("Split sizes:", X_train.shape, X_val.shape, X_test.shape)

# Pipeline: scale numeric features then LR (balanced for class imbalance)
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs"))
])

pipe.fit(X_train, y_train)

def eval_split(name, Xs, ys):
    proba = pipe.predict_proba(Xs)[:,1]
    pred  = (proba >= 0.5).astype(int)
    out = {
        "split": name,
        "roc_auc": roc_auc_score(ys, proba),
        "pr_auc": average_precision_score(ys, proba),
        "accuracy": accuracy_score(ys, pred),
        "precision": precision_score(ys, pred, zero_division=0),
        "recall": recall_score(ys, pred, zero_division=0),
        "f1": f1_score(ys, pred, zero_division=0),
        "confusion_matrix": confusion_matrix(ys, pred).tolist()
    }
    return out

report = []
report.append(eval_split("train", X_train, y_train))
report.append(eval_split("val",   X_val,   y_val))
report.append(eval_split("test",  X_test,  y_test))

print("\nMetrics (ROC-AUC / PR-AUC / Acc / Prec / Rec / F1):")
for r in report:
    print(
        f"{r['split']:>5}: "
        f"{r['roc_auc']:.3f} | {r['pr_auc']:.3f} | {r['accuracy']:.3f} | "
        f"{r['precision']:.3f} | {r['recall']:.3f} | {r['f1']:.3f} "
        f"| CM={r['confusion_matrix']}"
    )

# Show sorted coefficients for interpretability (after scaling)
lr = pipe.named_steps["lr"]
scaler = pipe.named_steps["scaler"]

coef = lr.coef_.ravel()
coef_df = pd.DataFrame({"feature": X.columns, "coef": coef})
coef_df["abs_coef"] = coef_df["coef"].abs()
coef_df = coef_df.sort_values("abs_coef", ascending=False).drop(columns="abs_coef")

print("\nTop coefficients (signed, larger magnitude = stronger effect):")
print(coef_df.head(15).to_string(index=False))

Split sizes: (616, 12) (132, 12) (88, 12)

Metrics (ROC-AUC / PR-AUC / Acc / Prec / Rec / F1):
train: 0.978 | 0.938 | 0.924 | 0.782 | 0.930 | 0.850 | CM=[[436, 37], [10, 133]]
  val: 0.954 | 0.845 | 0.720 | 0.288 | 1.000 | 0.448 | CM=[[80, 37], [0, 15]]
 test: 0.861 | 0.723 | 0.750 | 0.333 | 0.692 | 0.450 | CM=[[57, 18], [4, 9]]

Top coefficients (signed, larger magnitude = stronger effect):
                       feature      coef
    Days_Since_Last_Injury_Any -3.587515
Days_Since_Last_Injury_Related  1.621190
              Matches_Last_10d -1.333619
          Minutes_Prev_3_Games  1.143892
              Minutes_Last_30d -0.841264
                Pos_primary_MF -0.698609
             Minutes_Last_Game -0.415846
               Pos_primary_UNK -0.386876
                Pos_primary_FW  0.368526
                     Days_Rest -0.277583
          Short_Rest_Flag_le3d -0.207801
                           age  0.169176


## Tune a decision threshold on the validation set, then lock and test

In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix
)

# Recompute split masks (in case your kernel was reset later)
train_weeks = list(range(1, 29))
val_weeks   = list(range(29, 35))
test_weeks  = list(range(35, 39))
mask_train = work["Matchweek"].isin(train_weeks)
mask_val   = work["Matchweek"].isin(val_weeks)
mask_test  = work["Matchweek"].isin(test_weeks)

# Get probabilities from the already-fitted pipeline 'pipe'
proba_val  = pipe.predict_proba(X[mask_val])[:,1]
y_val_true = y[mask_val].values

proba_test = pipe.predict_proba(X[mask_test])[:,1]
y_test_true = y[mask_test].values

# Sweep thresholds and select the one with best F1 on validation
thresholds = np.linspace(0.05, 0.95, 19)
scores = []
for t in thresholds:
    yv_pred = (proba_val >= t).astype(int)
    scores.append({
        "threshold": float(t),
        "f1": f1_score(y_val_true, yv_pred, zero_division=0),
        "precision": precision_score(y_val_true, yv_pred, zero_division=0),
        "recall": recall_score(y_val_true, yv_pred, zero_division=0),
        "accuracy": accuracy_score(y_val_true, yv_pred)
    })
scores_df = pd.DataFrame(scores).sort_values("f1", ascending=False)
best_t = float(scores_df.iloc[0]["threshold"])
best_row = scores_df.iloc[0].to_dict()

print("Best validation threshold by F1:", round(best_t, 3))
print("Validation @ best_t -> F1={f1:.3f}, P={precision:.3f}, R={recall:.3f}, Acc={accuracy:.3f}".format(**best_row))

# Lock threshold and evaluate on test
yt_pred = (proba_test >= best_t).astype(int)
test_metrics = {
    "roc_auc": roc_auc_score(y_test_true, proba_test),
    "pr_auc": average_precision_score(y_test_true, proba_test),
    "accuracy": accuracy_score(y_test_true, yt_pred),
    "precision": precision_score(y_test_true, yt_pred, zero_division=0),
    "recall": recall_score(y_test_true, yt_pred, zero_division=0),
    "f1": f1_score(y_test_true, yt_pred, zero_division=0),
    "confusion_matrix": confusion_matrix(y_test_true, yt_pred).tolist(),
    "threshold": best_t
}
print("\nTest @ tuned threshold:")
for k, v in test_metrics.items():
    print(f"{k}: {v}")

# Optional: create a test-set predictions table you can inspect/export later
out_test = work.loc[mask_test, ["Number","Player","Matchweek","Matchday","Injured"]].copy()
out_test["injury_proba"] = proba_test
out_test["pred_injured"] = yt_pred
out_test = out_test.sort_values(["Matchweek","injury_proba"], ascending=[True, False]).reset_index(drop=True)

print("\nPreview of test predictions (top 15 by probability):")
display(out_test.head(15))

Best validation threshold by F1: 0.95
Validation @ best_t -> F1=0.759, P=0.786, R=0.733, Acc=0.947

Test @ tuned threshold:
roc_auc: 0.8605128205128205
pr_auc: 0.7234139383075553
accuracy: 0.9318181818181818
precision: 1.0
recall: 0.5384615384615384
f1: 0.7
confusion_matrix: [[75, 0], [6, 7]]
threshold: 0.95

Preview of test predictions (top 15 by probability):


Unnamed: 0,Number,Player,Matchweek,Matchday,Injured,injury_proba,pred_injured
0,3,Radu Dragusin,35,2025-05-04,1,0.995442,1
1,15,Lucas Bergvall,35,2025-05-04,1,0.883681,0
2,17,Cristian Romero,35,2025-05-04,0,0.801316,0
3,4,Kevin Danso,35,2025-05-04,0,0.719651,0
4,9,Richarlison,35,2025-05-04,0,0.616021,0
5,37,Micky van de Ven,35,2025-05-04,0,0.451869,0
6,19,Dominic Solanke,35,2025-05-04,1,0.418866,0
7,24,Djed Spence,35,2025-05-04,0,0.402905,0
8,22,Brennan Johnson,35,2025-05-04,0,0.392779,0
9,8,Yves Bissouma,35,2025-05-04,0,0.268876,0


## Interpret the model (standardized coefficients → odds ratios per 1σ)

In [7]:
import pandas as pd
import numpy as np

# Pull fitted pieces
lr = pipe.named_steps["lr"]
scaler = pipe.named_steps["scaler"]

coef = lr.coef_.ravel()
features_series = pd.Index(X.columns)

# Odds ratios correspond to a +1 standard deviation change because we standardized
or_per_1sd = np.exp(coef)

# Gather basic feature stats for context
stats = pd.DataFrame({
    "feature": features_series,
    "mean": X.mean().values,
    "std": X.std(ddof=0).values,  # population std for clarity
    "coef_stdscale": coef,
    "odds_ratio_per_1sd": or_per_1sd
}).sort_values("odds_ratio_per_1sd", ascending=False)

# Add an interpretation helper
def direction(r):
    return "↑ risk" if r["odds_ratio_per_1sd"] > 1.0 else "↓ risk"

stats["direction"] = stats.apply(direction, axis=1)

print("Odds ratio is the multiplicative change in odds of injury for a +1 SD increase in the feature.\n")
print(stats.to_string(index=False))

Odds ratio is the multiplicative change in odds of injury for a +1 SD increase in the feature.

                       feature       mean        std  coef_stdscale  odds_ratio_per_1sd direction
Days_Since_Last_Injury_Related 251.722488 159.822051       1.621190            5.059106    ↑ risk
          Minutes_Prev_3_Games  68.259569  77.682388       1.143892            3.138962    ↑ risk
                Pos_primary_FW   0.181818   0.385695       0.368526            1.445602    ↑ risk
                           age  24.318182   2.475082       0.169176            1.184329    ↑ risk
          Short_Rest_Flag_le3d   0.137560   0.344437      -0.207801            0.812368    ↓ risk
                     Days_Rest   4.930622   5.295341      -0.277583            0.757612    ↓ risk
               Pos_primary_UNK   0.409091   0.491666      -0.386876            0.679175    ↓ risk
             Minutes_Last_Game  23.523923  36.565052      -0.415846            0.659782    ↓ risk
                Pos_pr

## Export predictions and metrics for reuse in notebook or Excel

In [8]:
# Save the final test predictions + probabilities and the metrics summary

preds_out_path = "test_predictions.csv"
metrics_out_path = "model_metrics.csv"

# Save predictions (from Step 5: out_test DataFrame)
out_test.to_csv(preds_out_path, index=False)

# Collect metrics from Step 5 (tuned threshold test performance)
metrics_df = pd.DataFrame([test_metrics])
metrics_df.to_csv(metrics_out_path, index=False)

print(f"Saved predictions to {preds_out_path}")
print(f"Saved metrics to {metrics_out_path}")

# Show a quick preview of the predictions file
out_test.head(10)

Saved predictions to test_predictions.csv
Saved metrics to model_metrics.csv


Unnamed: 0,Number,Player,Matchweek,Matchday,Injured,injury_proba,pred_injured
0,3,Radu Dragusin,35,2025-05-04,1,0.995442,1
1,15,Lucas Bergvall,35,2025-05-04,1,0.883681,0
2,17,Cristian Romero,35,2025-05-04,0,0.801316,0
3,4,Kevin Danso,35,2025-05-04,0,0.719651,0
4,9,Richarlison,35,2025-05-04,0,0.616021,0
5,37,Micky van de Ven,35,2025-05-04,0,0.451869,0
6,19,Dominic Solanke,35,2025-05-04,1,0.418866,0
7,24,Djed Spence,35,2025-05-04,0,0.402905,0
8,22,Brennan Johnson,35,2025-05-04,0,0.392779,0
9,8,Yves Bissouma,35,2025-05-04,0,0.268876,0


## Retrain on Train+Val, lock threshold, evaluate on Test, and save the model

In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix
)
import joblib

# --- Rebuild masks and datasets (reuse X, y, work, features from earlier steps) ---
train_weeks = list(range(1, 29))
val_weeks   = list(range(29, 35))
test_weeks  = list(range(35, 39))

mask_train = work["Matchweek"].isin(train_weeks)
mask_val   = work["Matchweek"].isin(val_weeks)
mask_test  = work["Matchweek"].isin(test_weeks)

X_trainval = X[mask_train | mask_val]
y_trainval = y[mask_train | mask_val]
X_test     = X[mask_test]
y_test     = y[mask_test]

print("Train+Val size:", X_trainval.shape, "| Test size:", X_test.shape)

# --- Refit pipeline on Train+Val ---
pipe_tv = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs"))
])
pipe_tv.fit(X_trainval, y_trainval)

# --- Use the previously tuned threshold (best_t from Step 5) ---
proba_test_tv = pipe_tv.predict_proba(X_test)[:,1]
yt_pred_tv = (proba_test_tv >= best_t).astype(int)

# --- Evaluate ---
metrics_retrain = {
    "roc_auc": roc_auc_score(y_test, proba_test_tv),
    "pr_auc": average_precision_score(y_test, proba_test_tv),
    "accuracy": accuracy_score(y_test, yt_pred_tv),
    "precision": precision_score(y_test, yt_pred_tv, zero_division=0),
    "recall": recall_score(y_test, yt_pred_tv, zero_division=0),
    "f1": f1_score(y_test, yt_pred_tv, zero_division=0),
    "confusion_matrix": confusion_matrix(y_test, yt_pred_tv).tolist(),
    "threshold": float(best_t)
}
print("Test (retrained on Train+Val) @ tuned threshold:")
for k, v in metrics_retrain.items():
    print(f"{k}: {v}")

# --- Save the retrained pipeline and threshold ---
joblib.dump({"pipeline": pipe_tv, "features": features, "threshold": float(best_t)}, "injury_lr_pipeline.joblib")
print("\nSaved model to injury_lr_pipeline.joblib")

# --- Attach predictions to the earlier out_test table for convenience ---
out_test_retrain = work.loc[mask_test, ["Number","Player","Matchweek","Matchday","Injured"]].copy()
out_test_retrain["injury_proba"] = proba_test_tv
out_test_retrain["pred_injured"] = yt_pred_tv
out_test_retrain = out_test_retrain.sort_values(["Matchweek","injury_proba"], ascending=[True, False]).reset_index(drop=True)

out_test_retrain.head(15)

Train+Val size: (748, 12) | Test size: (88, 12)
Test (retrained on Train+Val) @ tuned threshold:
roc_auc: 0.8707692307692307
pr_auc: 0.742314187360627
accuracy: 0.9090909090909091
precision: 1.0
recall: 0.38461538461538464
f1: 0.5555555555555556
confusion_matrix: [[75, 0], [8, 5]]
threshold: 0.95

Saved model to injury_lr_pipeline.joblib


Unnamed: 0,Number,Player,Matchweek,Matchday,Injured,injury_proba,pred_injured
0,3,Radu Dragusin,35,2025-05-04,1,0.991925,1
1,15,Lucas Bergvall,35,2025-05-04,1,0.803711,0
2,17,Cristian Romero,35,2025-05-04,0,0.736128,0
3,4,Kevin Danso,35,2025-05-04,0,0.627391,0
4,9,Richarlison,35,2025-05-04,0,0.500661,0
5,37,Micky van de Ven,35,2025-05-04,0,0.411117,0
6,24,Djed Spence,35,2025-05-04,0,0.30766,0
7,19,Dominic Solanke,35,2025-05-04,1,0.274244,0
8,8,Yves Bissouma,35,2025-05-04,0,0.259193,0
9,6,Jo√£o Palhinha,35,2025-05-04,0,0.233436,0


In [12]:
# Get predicted probabilities for ALL weeks
proba_all = pipe_tv.predict_proba(X)[:,1]
pred_all = (proba_all >= best_t).astype(int)

# Attach to the full dataset
predictions_all = work[["Number","Player","Matchweek","Matchday","Injured"]].copy()
predictions_all["injury_proba"] = proba_all
predictions_all["pred_injured"] = pred_all

# Save to CSV for Power BI
predictions_all.to_csv("predictions_all_matchweeks.csv", index=False)

print("Saved predictions for all 38 matchweeks.")

Saved predictions for all 38 matchweeks.
