In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.stats import spearmanr, kendalltau  

# ----------------------------
# 1. Load data
# ----------------------------
# The project root is resolved as two directory levels above the current
# working directory; the dataset is read from its "outputs" folder.
root_dir = Path.cwd().parent.parent
dataset_path = root_dir.joinpath("outputs", "college_stats.csv")

df = pd.read_csv(filepath_or_buffer=dataset_path)

In [2]:
# Season 2025 is held out as the final test set; every other season is
# available for training and cross-validation.
train_df = df.loc[df["SEASON"].ne(2025)]
test_df = df.loc[df["SEASON"].eq(2025)]


In [3]:
# ----------------------------
# 2. Simple ranking model
# ----------------------------
class RankMLP(nn.Module):
    """One-hidden-layer MLP that maps each item's feature vector to a scalar score."""

    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()
        # Layer order is kept (Linear -> ReLU -> Linear) so parameter names and
        # seeded initialisation are unchanged.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Score a batch of items: x is [N, D], the result is [N]."""
        raw = self.net(x)                 # [N, 1]
        return torch.squeeze(raw, dim=-1)  # [N]

# ----------------------------
# 3. Listwise losses
# ----------------------------

def listnet_loss(scores, labels):
    """
    ListNet top-1 cross entropy for a single list.

    Args:
        scores: [N] model scores (higher means better).
        labels: [N] OVERALL_PICK values (lower is better in reality).
            They are converted to relevance by rel = -labels.

    Returns:
        Scalar tensor: cross entropy between the top-1 distribution implied by
        the labels and the top-1 distribution implied by the scores.
    """
    rel = -labels  # larger rel = better
    target_top1 = F.softmax(rel, dim=0)

    # log_softmax computes log-probabilities directly (log-sum-exp), so the
    # loss and its gradient stay exact even when a softmax probability would
    # underflow to zero. The previous log(softmax + eps) form capped the
    # penalty at -log(eps) and produced zero gradient in that regime.
    log_pred_top1 = F.log_softmax(scores, dim=0)

    return -torch.sum(target_top1 * log_pred_top1)

def listmle_loss(scores, labels):
    """
    ListMLE loss: negative Plackett-Luce log-likelihood of the true ordering.

    scores: [N] model scores
    labels: [N] OVERALL_PICK (lower = better)
    Items are arranged by ascending OVERALL_PICK before the likelihood is
    evaluated, so position 0 is the best item.
    """
    # Permutation that puts the smallest pick first.
    order = torch.argsort(labels, descending=False)
    ranked = scores[order]

    # For position i the denominator is sum_{j>=i} exp(s_j). A running
    # log-sum-exp over the reversed sequence, reversed back, gives exactly
    # those suffix terms.
    reversed_lse = torch.logcumsumexp(torch.flip(ranked, dims=[0]), dim=0)
    suffix_lse = torch.flip(reversed_lse, dims=[0])

    # Loss is the negated sum_i [s_i - log(sum_{j>=i} exp(s_j))].
    per_position = ranked - suffix_lse
    return -per_position.sum()

# ----------------------------
# 4. Evaluation: pairwise ranking accuracy
# ----------------------------
def pairwise_accuracy(scores, labels):
    """
    Pairwise ranking accuracy within one list.

    True order: lower OVERALL_PICK is better; predicted order: higher score is
    better. A pair (i, j) counts as correct when the predicted relation
    (i ahead / tied / behind) matches the true relation.

    Tie handling is symmetric, so the result does not depend on the order in
    which items appear: a pair tied in exactly one of labels/scores is
    incorrect, a pair tied in both is correct. For lists without ties this is
    identical to the plain "same strict order" definition.

    Args:
        scores: [N] tensor of model scores.
        labels: [N] tensor of OVERALL_PICK values.

    Returns:
        float in [0, 1]; 0.0 when the list has fewer than two items.
    """
    scores_np = scores.detach().cpu().numpy()
    labels_np = labels.detach().cpu().numpy()
    n = len(labels_np)
    if n < 2:
        return 0.0

    # All unordered pairs i < j, evaluated at once instead of a Python double loop.
    first, second = np.triu_indices(n, k=1)

    # +1: item `first` should / does rank ahead; -1: behind; 0: tied.
    true_relation = np.sign(labels_np[second] - labels_np[first])
    pred_relation = np.sign(scores_np[first] - scores_np[second])

    return float(np.mean(true_relation == pred_relation))

def evaluate_model(model, groups):
    """
    Score every season group with `model` and summarise ranking quality.

    `groups` is an iterable of (season, X, y) triples. Seasons with fewer than
    two items are ignored.

    Returns a dict with:
      - pairwise_accuracy: pooled over all seasons (pair-count weighted)
      - spearman: mean Spearman rho across seasons
      - kendall:  mean Kendall tau across seasons
    Each entry is 0.0 when nothing could be computed for it.
    """
    model.eval()

    correct_sum = 0.0
    pair_count = 0
    rho_values, tau_values = [], []

    with torch.no_grad():
        for _season, feats, picks in groups:
            preds = model(feats)  # scores for this season
            size = len(picks)
            if size < 2:
                continue

            # Pairwise accuracy is a fraction; weight it by the number of
            # pairs so the pooled value is a true pair-level accuracy.
            pairs_here = size * (size - 1) // 2
            correct_sum += pairwise_accuracy(preds, picks) * pairs_here
            pair_count += pairs_here

            # Rank correlations are taken against the negated picks so that
            # "larger is better" holds on both sides.
            preds_np = preds.detach().cpu().numpy()
            picks_np = picks.detach().cpu().numpy()
            rho, _ = spearmanr(preds_np, -picks_np)
            tau, _ = kendalltau(preds_np, -picks_np)

            # Undefined correlations (NaN) are left out of the averages.
            if not np.isnan(rho):
                rho_values.append(rho)
            if not np.isnan(tau):
                kendall_ok = True
                tau_values.append(tau)

    result = {"pairwise_accuracy": 0.0, "spearman": 0.0, "kendall": 0.0}
    if pair_count > 0:
        result["pairwise_accuracy"] = correct_sum / pair_count
    if rho_values:
        result["spearman"] = float(np.mean(rho_values))
    if tau_values:
        result["kendall"] = float(np.mean(tau_values))
    return result

# ----------------------------
# 5. Training loop helper
# ----------------------------
def train_listwise(model, groups, loss_fn, n_epochs=200, lr=1e-3, name="model"):
    """
    Fit `model` with Adam, taking one optimizer step per season group.

    Args:
        model: module whose forward pass maps X to a score vector.
        groups: sequence of (season, X, y) triples; each triple is one list.
        loss_fn: callable(scores, y) returning a scalar loss tensor.
        n_epochs: number of passes over `groups`.
        lr: Adam learning rate.
        name: label used in the progress lines.

    Returns:
        The same `model` object, trained in place.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    denominator = max(len(groups), 1)  # avoids dividing by zero for empty input

    for epoch_index in range(n_epochs):
        epoch = epoch_index + 1
        model.train()

        running = 0.0
        for _season, feats, targets in groups:
            optimizer.zero_grad()
            batch_loss = loss_fn(model(feats), targets)
            batch_loss.backward()
            optimizer.step()
            running += batch_loss.item()

        # Report on the first epoch and then every 20th.
        if epoch == 1 or epoch % 20 == 0:
            avg_loss = running / denominator
            print(f"[{name}] Epoch {epoch:3d} | train loss = {avg_loss:.4f}")

    return model


In [4]:
# ----------------------------
# 6. K-fold cross-validation
# ----------------------------

from sklearn.model_selection import KFold
import numpy as np
import torch

def prepare_kfold_folds(df, feature_cols, k_folds=5, random_state=42):
    """
    Prepare K-fold season-wise data for listwise ranking.

    Whole seasons (not rows) are assigned to folds, so a season is never split
    between train and test.

    Args:
        df: frame with a "SEASON" column, an "OVERALL_PICK" column and the
            columns named in `feature_cols`.
        feature_cols: names of the feature columns.
        k_folds: number of folds passed to KFold.
        random_state: seed for the shuffled KFold split.

    Returns a list of folds, where each fold is a dict:
      {
        "fold_id": int,                       # 1-based
        "train_seasons": [...],
        "test_seasons":  [...],
        "train_groups":  [(season, X, y), ...],
        "test_groups":   [(season, X, y), ...],
      }

    Each fold has its own scaling (mean/std) computed from that fold's TRAIN seasons only.
    """
    # .tolist() yields native Python scalars, so season lists print as
    # [2001, 2002, ...] rather than [np.int64(2001), ...].
    all_seasons = sorted(df["SEASON"].unique().tolist())
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=random_state)

    def make_groups(df_subset, seasons_subset, feat_mean, feat_std):
        """Build one (season, X, y) triple per non-empty season, scaled with the given stats."""
        groups = []
        for season in seasons_subset:
            g = df_subset[df_subset["SEASON"] == season]
            if g.empty:
                continue

            g = g.sort_values("OVERALL_PICK")  # lower pick = better
            g_scaled = (g[feature_cols] - feat_mean) / feat_std

            X = torch.tensor(g_scaled.values, dtype=torch.float32)
            y = torch.tensor(g["OVERALL_PICK"].values, dtype=torch.float32)
            groups.append((season, X, y))
        return groups

    folds = []

    for fold_id, (train_idx, test_idx) in enumerate(kf.split(all_seasons), start=1):
        train_seasons = [all_seasons[i] for i in train_idx]
        test_seasons  = [all_seasons[i] for i in test_idx]

        df_train = df[df["SEASON"].isin(train_seasons)]
        df_test  = df[df["SEASON"].isin(test_seasons)]

        # ---- scaling: fit ONLY on this fold's training data ----
        train_feats = df_train[feature_cols]
        feat_mean = train_feats.mean()
        # Constant columns have std 0; use 1.0 so the division is a no-op.
        feat_std  = train_feats.std().replace(0, 1.0)

        folds.append({
            "fold_id": fold_id,
            "train_seasons": train_seasons,
            "test_seasons":  test_seasons,
            "train_groups":  make_groups(df_train, train_seasons, feat_mean, feat_std),
            "test_groups":   make_groups(df_test,  test_seasons,  feat_mean, feat_std),
        })

    return folds


In [5]:
# Identifier / target columns are excluded; every remaining column is a feature.
drop_cols = ["player_name", "OVERALL_PICK", "SEASON"]
feature_cols = df.columns[~df.columns.isin(drop_cols)].tolist()

# Cross-validation folds are built from the training seasons only.
folds = prepare_kfold_folds(
    df=train_df, feature_cols=feature_cols, k_folds=5, random_state=42,
)


In [6]:
def run_kfold_for_loss(folds, loss_fn, model_name, n_epochs=200, hidden_dim=64, lr=1e-3):
    """
    Train and evaluate a fresh RankMLP on every fold with the given listwise loss.

    Args:
        folds: list of fold dicts with keys "fold_id", "train_seasons",
            "test_seasons", "train_groups" and "test_groups".
        loss_fn: callable(scores, labels) returning a scalar loss tensor.
        model_name: label used in the printed report.
        n_epochs, hidden_dim, lr: training hyperparameters.

    Returns:
        (train_metrics, test_metrics): two lists of metric dicts (keys
        "pairwise_accuracy", "spearman", "kendall"), one entry per fold that
        was actually trained.
    """
    fold_ids      = []
    train_metrics = []
    test_metrics  = []

    for fold in folds:
        fold_id = fold["fold_id"]
        train_groups = fold["train_groups"]
        test_groups  = fold["test_groups"]

        print(f"\n===== {model_name} | Fold {fold_id} =====")
        print("Train seasons:", fold["train_seasons"])
        print("Test  seasons:", fold["test_seasons"])

        if not train_groups:
            # Nothing to fit on; skip rather than report metrics of an untrained model.
            print(f"Fold {fold_id}: empty train groups, skipping.")
            continue

        # Take the input width from the fold's own tensors instead of a
        # notebook-level variable, so the function works with any fold data.
        input_dim = train_groups[0][1].shape[1]

        # Same seed for every fold: identical initial weights across folds.
        torch.manual_seed(42)
        model = RankMLP(input_dim=input_dim, hidden_dim=hidden_dim)

        model = train_listwise(
            model,
            train_groups,
            loss_fn=loss_fn,
            n_epochs=n_epochs,
            lr=lr,
            name=f"{model_name} Fold {fold_id}"
        )

        train_res = evaluate_model(model, train_groups)
        test_res  = evaluate_model(model, test_groups)

        fold_ids.append(fold_id)
        train_metrics.append(train_res)
        test_metrics.append(test_res)

        print(f"[{model_name} Fold {fold_id}]")
        print(f"  Train: pairwise={train_res['pairwise_accuracy']:.3f} | "
              f"Spearman={train_res['spearman']:.3f} | "
              f"Kendall={train_res['kendall']:.3f}")
        print(f"  Test : pairwise={test_res['pairwise_accuracy']:.3f} | "
              f"Spearman={test_res['spearman']:.3f} | "
              f"Kendall={test_res['kendall']:.3f}")

    # ==== Summary over all folds ====
    print(f"\n=== {model_name} {len(folds)}-fold CV (season-wise) ===")

    # Label each line with the fold's own id so numbering stays right even if
    # a fold was skipped.
    for fid, tr, te in zip(fold_ids, train_metrics, test_metrics):
        print(f"Fold {fid}:")
        print(f"  Train: pair={tr['pairwise_accuracy']:.3f},  "
              f"Spearman={tr['spearman']:.3f},  Kendall={tr['kendall']:.3f}")
        print(f"  Test : pair={te['pairwise_accuracy']:.3f},  "
              f"Spearman={te['spearman']:.3f},  Kendall={te['kendall']:.3f}")

    def _mean_of(metrics, key):
        """Mean of one metric over folds; NaN (without a warning) if no fold ran."""
        values = [m[key] for m in metrics]
        return float(np.mean(values)) if values else float("nan")

    mean_train_pair = _mean_of(train_metrics, "pairwise_accuracy")
    mean_train_rho  = _mean_of(train_metrics, "spearman")
    mean_train_tau  = _mean_of(train_metrics, "kendall")

    mean_test_pair  = _mean_of(test_metrics, "pairwise_accuracy")
    mean_test_rho   = _mean_of(test_metrics, "spearman")
    mean_test_tau   = _mean_of(test_metrics, "kendall")

    print("\n=== Mean Metrics Across Folds ===")
    print(f"Train: pair={mean_train_pair:.3f}, Spearman={mean_train_rho:.3f}, Kendall={mean_train_tau:.3f}")
    print(f"Test : pair={mean_test_pair:.3f}, Spearman={mean_test_rho:.3f}, Kendall={mean_test_tau:.3f}")

    return train_metrics, test_metrics

In [7]:
# ----------------------------
# 7. Train and evaluate ListNet
# ----------------------------
# Season-wise K-fold CV with the ListNet loss. Despite the "_accs" suffix,
# both results are lists of per-fold metric dicts.
listnet_train_accs, listnet_test_accs = run_kfold_for_loss(
    folds=folds, model_name="ListNet", loss_fn=listnet_loss,
    hidden_dim=64, n_epochs=200, lr=1e-3,
)



===== ListNet | Fold 1 =====
Train seasons: [np.int64(2001), np.int64(2002), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2010), np.int64(2011), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2018), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]
Test  seasons: [np.int64(2000), np.int64(2009), np.int64(2012), np.int64(2017), np.int64(2019)]
[ListNet Fold 1] Epoch   1 | train loss = 3.7761
[ListNet Fold 1] Epoch  20 | train loss = 2.6211
[ListNet Fold 1] Epoch  40 | train loss = 2.3643
[ListNet Fold 1] Epoch  60 | train loss = 2.1447
[ListNet Fold 1] Epoch  80 | train loss = 1.9611
[ListNet Fold 1] Epoch 100 | train loss = 1.8212
[ListNet Fold 1] Epoch 120 | train loss = 1.7123
[ListNet Fold 1] Epoch 140 | train loss = 1.6177
[ListNet Fold 1] Epoch 160 | train loss = 1.5405
[ListNet Fold 1] Epoch 180 | train loss = 1.4699
[ListNet Fold 1] Epoch 200 | train loss = 1.4045
[ListNet Fold 1

In [8]:
# =========================================
# 8. Final training on all train seasons
#    + evaluation on real test season 2025
# =========================================

def prepare_holdout_groups(train_df, test_df, feature_cols):
    """
    Turn the train and holdout frames into per-season (season, X, y) groups.

    Standardisation statistics come from `train_df` only and are reused
    unchanged for `test_df`. Within each season rows are ordered by
    OVERALL_PICK (ascending).

    Returns:
        (train_groups, test_groups), each a list of (season, X, y) with X and
        y as float32 tensors.
    """
    # ---- standardisation statistics from the training rows ----
    mu = train_df[feature_cols].mean()
    sigma = train_df[feature_cols].std().replace(0, 1.0)  # constant columns: divide by 1

    def _to_groups(frame):
        collected = []
        for season_key, season_rows in frame.groupby("SEASON"):
            ordered = season_rows.sort_values("OVERALL_PICK")  # best pick first
            standardized = (ordered[feature_cols] - mu) / sigma
            collected.append((
                season_key,
                torch.tensor(standardized.values, dtype=torch.float32),
                torch.tensor(ordered["OVERALL_PICK"].values, dtype=torch.float32),
            ))
        return collected

    return _to_groups(train_df), _to_groups(test_df)


# Per-season groups: all seasons other than 2025 for fitting, 2025 as the holdout.
final_train_groups, final_test_groups = prepare_holdout_groups(
    train_df=train_df, test_df=test_df, feature_cols=feature_cols,
)

# Fresh network with the hyperparameters used during CV, trained with the
# ListNet loss (swap in listmle_loss to train with ListMLE instead).
torch.manual_seed(42)
final_model = train_listwise(
    RankMLP(input_dim=len(feature_cols), hidden_dim=64),
    final_train_groups,
    loss_fn=listnet_loss,
    n_epochs=200,
    lr=1e-3,
    name="ListNet-Final-AllTrain",
)

# ---- Metrics on the training seasons and on the 2025 holdout ----
final_train_res = evaluate_model(final_model, final_train_groups)
final_test_res  = evaluate_model(final_model, final_test_groups)

print("\n=== Final Model (trained on all seasons except 2025) ===")
print(
    "Train (2000–2024): "
    f"pair={final_train_res['pairwise_accuracy']:.3f}, "
    f"Spearman={final_train_res['spearman']:.3f}, "
    f"Kendall={final_train_res['kendall']:.3f}"
)
print(
    "Test  (2025 only): "
    f"pair={final_test_res['pairwise_accuracy']:.3f}, "
    f"Spearman={final_test_res['spearman']:.3f}, "
    f"Kendall={final_test_res['kendall']:.3f}"
)


[ListNet-Final-AllTrain] Epoch   1 | train loss = 3.7412
[ListNet-Final-AllTrain] Epoch  20 | train loss = 2.4562
[ListNet-Final-AllTrain] Epoch  40 | train loss = 2.2106
[ListNet-Final-AllTrain] Epoch  60 | train loss = 2.0159
[ListNet-Final-AllTrain] Epoch  80 | train loss = 1.8539
[ListNet-Final-AllTrain] Epoch 100 | train loss = 1.7219
[ListNet-Final-AllTrain] Epoch 120 | train loss = 1.6194
[ListNet-Final-AllTrain] Epoch 140 | train loss = 1.5370
[ListNet-Final-AllTrain] Epoch 160 | train loss = 1.4691
[ListNet-Final-AllTrain] Epoch 180 | train loss = 1.4067
[ListNet-Final-AllTrain] Epoch 200 | train loss = 1.3516

=== Final Model (trained on all seasons except 2025) ===
Train (2000–2024): pair=0.692, Spearman=0.534, Kendall=0.386
Test  (2025 only): pair=0.715, Spearman=0.631, Kendall=0.433


In [9]:
# ----------------------------
# 8. Train and evaluate ListMLE
# ----------------------------
# Season-wise K-fold CV with the ListMLE loss, on the same folds and
# hyperparameters as the ListNet run. Both results are lists of per-fold
# metric dicts.
listmle_train_accs, listmle_test_accs = run_kfold_for_loss(
    folds=folds, model_name="ListMLE", loss_fn=listmle_loss,
    hidden_dim=64, n_epochs=200, lr=1e-3,
)



===== ListMLE | Fold 1 =====
Train seasons: [np.int64(2001), np.int64(2002), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2010), np.int64(2011), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2018), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]
Test  seasons: [np.int64(2000), np.int64(2009), np.int64(2012), np.int64(2017), np.int64(2019)]
[ListMLE Fold 1] Epoch   1 | train loss = 138.6040
[ListMLE Fold 1] Epoch  20 | train loss = 128.6210
[ListMLE Fold 1] Epoch  40 | train loss = 127.1695
[ListMLE Fold 1] Epoch  60 | train loss = 126.0747
[ListMLE Fold 1] Epoch  80 | train loss = 125.1556
[ListMLE Fold 1] Epoch 100 | train loss = 124.3046
[ListMLE Fold 1] Epoch 120 | train loss = 123.5974
[ListMLE Fold 1] Epoch 140 | train loss = 122.9483
[ListMLE Fold 1] Epoch 160 | train loss = 122.2828
[ListMLE Fold 1] Epoch 180 | train loss = 121.6443
[ListMLE Fold 1] Epoch 200 | train loss = 12

In [10]:
# =========================================
# 9. Final ListMLE: train on all train_df
#    and evaluate on real 2025 test_df
# =========================================

def prepare_holdout_groups(train_df, test_df, feature_cols):
    """
    Build per-season train/test groups for the final holdout evaluation.

    Feature mean and std are estimated on `train_df` alone and applied to both
    frames. Each group is (season, X, y) with rows ordered by OVERALL_PICK.

    NOTE(review): this re-declares `prepare_holdout_groups` from the earlier
    ListNet holdout cell and replaces it in the notebook namespace.
    """
    # ---- scaler statistics from the training rows only ----
    train_feature_frame = train_df[feature_cols]
    center = train_feature_frame.mean()
    scale = train_feature_frame.std().replace(0, 1.0)  # guard against zero variance

    def as_float_tensor(array):
        return torch.tensor(array, dtype=torch.float32)

    def make_groups(df_subset):
        groups = []
        for season, season_df in df_subset.groupby("SEASON"):
            if not season_df.empty:
                ranked = season_df.sort_values("OVERALL_PICK")  # lower pick = better
                features = (ranked[feature_cols] - center) / scale
                groups.append((
                    season,
                    as_float_tensor(features.values),
                    as_float_tensor(ranked["OVERALL_PICK"].values),
                ))
        return groups

    train_groups = make_groups(train_df)
    test_groups = make_groups(test_df)
    return train_groups, test_groups


# Per-season groups: all seasons other than 2025 for fitting, 2025 as the holdout.
final_train_groups_mle, final_test_groups_mle = prepare_holdout_groups(
    train_df=train_df, test_df=test_df, feature_cols=feature_cols,
)

# Fresh network, same seed and architecture as the ListNet final model; the
# only difference is the ListMLE training loss.
torch.manual_seed(42)
final_model_mle = train_listwise(
    RankMLP(input_dim=len(feature_cols), hidden_dim=64),
    final_train_groups_mle,
    loss_fn=listmle_loss,
    n_epochs=200,
    lr=1e-3,
    name="ListMLE-Final-AllTrain",
)

# Metrics on the training seasons and on the 2025 holdout
final_train_res_mle = evaluate_model(final_model_mle, final_train_groups_mle)
final_test_res_mle  = evaluate_model(final_model_mle, final_test_groups_mle)

print("\n=== Final ListMLE Model (trained on all seasons except 2025) ===")
print(
    "Train (2000–2024): "
    f"pair={final_train_res_mle['pairwise_accuracy']:.3f}, "
    f"Spearman={final_train_res_mle['spearman']:.3f}, "
    f"Kendall={final_train_res_mle['kendall']:.3f}"
)
print(
    "Test  (2025 only): "
    f"pair={final_test_res_mle['pairwise_accuracy']:.3f}, "
    f"Spearman={final_test_res_mle['spearman']:.3f}, "
    f"Kendall={final_test_res_mle['kendall']:.3f}"
)


[ListMLE-Final-AllTrain] Epoch   1 | train loss = 139.9994
[ListMLE-Final-AllTrain] Epoch  20 | train loss = 129.7068
[ListMLE-Final-AllTrain] Epoch  40 | train loss = 128.3372
[ListMLE-Final-AllTrain] Epoch  60 | train loss = 127.3779
[ListMLE-Final-AllTrain] Epoch  80 | train loss = 126.5796
[ListMLE-Final-AllTrain] Epoch 100 | train loss = 125.8577
[ListMLE-Final-AllTrain] Epoch 120 | train loss = 125.2207
[ListMLE-Final-AllTrain] Epoch 140 | train loss = 124.6199
[ListMLE-Final-AllTrain] Epoch 160 | train loss = 124.0712
[ListMLE-Final-AllTrain] Epoch 180 | train loss = 123.5496
[ListMLE-Final-AllTrain] Epoch 200 | train loss = 123.0214

=== Final ListMLE Model (trained on all seasons except 2025) ===
Train (2000–2024): pair=0.757, Spearman=0.699, Kendall=0.519
Test  (2025 only): pair=0.735, Spearman=0.656, Kendall=0.476


In [16]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from scipy.stats import spearmanr, kendalltau


def build_lgb_data_for_fold(df, feature_cols, train_seasons, test_seasons):
    """
    Build LightGBM ranking data (X, y, group) for a given fold.

    Scaling is fit on TRAIN seasons only. Rows are concatenated season by
    season (each season sorted by OVERALL_PICK), and `group` holds the
    per-season row counts in that same order.

    Relevance labels are `max_train_pick - pick`, so pick 1 gets the largest
    label. Test labels are clipped at 0: a test season with a later pick than
    any training season would otherwise get a negative label, and ranking
    labels must be non-negative integers.

    Returns:
      X_train, y_train_rel, group_train,
      X_test,  y_test_rel,  group_test,
      feat_mean, feat_std
    When a side has no rows its X has shape (0, n_features), its label array is
    empty and its group list is [], so callers can detect and skip the fold.
    """
    df_train = df[df["SEASON"].isin(train_seasons)]
    df_test  = df[df["SEASON"].isin(test_seasons)]

    # ---- scaling (TRAIN only) ----
    train_feats = df_train[feature_cols]
    feat_mean = train_feats.mean()
    feat_std  = train_feats.std().replace(0, 1.0)  # constant columns: divide by 1

    def build_X_y_group(df_subset, seasons_subset):
        dfs = []
        ys = []
        group = []
        for season in seasons_subset:
            g = df_subset[df_subset["SEASON"] == season]
            if g.empty:
                continue
            g = g.sort_values("OVERALL_PICK")  # lower pick = better
            dfs.append(g)
            ys.append(g["OVERALL_PICK"].values.astype(float))
            group.append(len(g))
        if not dfs:
            return np.empty((0, len(feature_cols))), np.array([]), []
        df_cat = pd.concat(dfs, axis=0)
        X = ((df_cat[feature_cols] - feat_mean) / feat_std).values
        y = np.concatenate(ys, axis=0)
        return X, y, group

    X_train, y_train, group_train = build_X_y_group(df_train, train_seasons)
    X_test,  y_test,  group_test  = build_X_y_group(df_test,  test_seasons)

    # LightGBM expects "higher is better".
    # Guard the empty-train case: .max() on an empty array raises.
    max_y = y_train.max() if y_train.size else 0.0
    y_train_rel = (max_y - y_train).astype(int)   # pick 1 → big number, last pick → 0
    y_test_rel  = np.clip(max_y - y_test, 0, None).astype(int)

    return (
        X_train, y_train_rel, group_train,
        X_test,  y_test_rel,  group_test,
        feat_mean, feat_std,
    )


In [17]:
def run_lambdamart_cv(train_df, feature_cols, k_folds=5, random_state=42, num_boost_round=300):
    """
    LambdaMART (LightGBM lambdarank) with season-wise K-fold CV.

    Folds come from `prepare_kfold_folds` applied to `train_df` only, so the
    holdout season never enters cross-validation.

    Call like:
        lgb_cv = run_lambdamart_cv(train_df, feature_cols)

    Args:
        train_df: training frame (SEASON, OVERALL_PICK and feature columns).
        feature_cols: names of the feature columns.
        k_folds: number of season-wise folds.
        random_state: seed for the fold split.
        num_boost_round: boosting rounds per fold.

    Returns:
        dict of per-fold metric lists with keys "train_pair", "test_pair",
        "train_spear", "test_spear", "train_kend", "test_kend". Skipped folds
        contribute no entry.
    """
    # folds is built ONLY from train_df (no 2025)
    folds = prepare_kfold_folds(train_df, feature_cols, k_folds=k_folds, random_state=random_state)

    base_params = {
        "objective": "lambdarank",
        "metric": "ndcg",
        "ndcg_at": [5, 10, 20],
        "learning_rate": 0.05,
        "num_leaves": 31,
        "min_data_in_leaf": 20,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "max_depth": -1,
        "verbose": -1,
    }

    evaluated_fold_ids = []  # ids of folds that were actually trained
    train_pair_list  = []
    test_pair_list   = []
    train_spear_list = []
    test_spear_list  = []
    train_kend_list  = []
    test_kend_list   = []

    for fold in folds:
        fold_id       = fold["fold_id"]
        train_seasons = fold["train_seasons"]
        test_seasons  = fold["test_seasons"]

        print(f"\n===== LambdaMART | Fold {fold_id} =====")
        print("Train seasons:", train_seasons)
        print("Test  seasons:", test_seasons)

        # ---- Build data for this fold (scaling on TRAIN only) ----
        (
            X_train, y_train_rel, group_train,
            X_test,  y_test_rel,  group_test,
            feat_mean, feat_std,
        ) = build_lgb_data_for_fold(train_df, feature_cols, train_seasons, test_seasons)

        if X_train.shape[0] == 0 or X_test.shape[0] == 0:
            print(f"Fold {fold_id}: empty train or test, skipping.")
            continue

        # One gain entry per label value 0..max_label; gain grows linearly with relevance.
        max_label = int(max(y_train_rel.max(), y_test_rel.max()))
        params = dict(base_params)
        params["label_gain"] = list(range(max_label + 1))

        train_set = lgb.Dataset(X_train, label=y_train_rel, group=group_train)
        valid_set = lgb.Dataset(X_test,  label=y_test_rel,  group=group_test, reference=train_set)

        # The validation set is passed for NDCG monitoring only.
        model = lgb.train(
            params,
            train_set,
            num_boost_round=num_boost_round,
            valid_sets=[valid_set],
            valid_names=["valid"],
        )

        # ---- Evaluate on TRAIN + TEST seasons for this fold ----
        train_pair, train_spear, train_kend = evaluate_lambdamart_fold(
            model, train_df, feature_cols, feat_mean, feat_std, train_seasons
        )
        test_pair, test_spear, test_kend = evaluate_lambdamart_fold(
            model, train_df, feature_cols, feat_mean, feat_std, test_seasons
        )

        print(f"[Fold {fold_id}]")
        print(f"  Train: Pairwise = {train_pair:.3f}, Spearman = {train_spear:.3f}, Kendall = {train_kend:.3f}")
        print(f"  Test : Pairwise = {test_pair:.3f}, Spearman = {test_spear:.3f}, Kendall = {test_kend:.3f}")

        evaluated_fold_ids.append(fold_id)
        train_pair_list.append(train_pair)
        test_pair_list.append(test_pair)
        train_spear_list.append(train_spear)
        test_spear_list.append(test_spear)
        train_kend_list.append(train_kend)
        test_kend_list.append(test_kend)

    print("\n=== LambdaMART K-fold CV (season-wise) ===")
    # Use the fold's own id so the summary stays aligned when a fold was skipped.
    for fid, tr_p, te_p, tr_s, te_s, tr_k, te_k in zip(
        evaluated_fold_ids,
        train_pair_list, test_pair_list,
        train_spear_list, test_spear_list,
        train_kend_list, test_kend_list,
    ):
        print(f"Fold {fid}: "
              f"TrainPair = {tr_p:.3f}, TestPair = {te_p:.3f} | "
              f"TrainSpearman = {tr_s:.3f}, TestSpearman = {te_s:.3f} | "
              f"TrainKendall = {tr_k:.3f}, TestKendall = {te_k:.3f}")

    def _mean(values):
        """Mean over folds; NaN (without a numpy warning) when no fold was evaluated."""
        return np.mean(values) if values else float("nan")

    print("\nMean Train pairwise:", _mean(train_pair_list))
    print("Mean Test  pairwise:", _mean(test_pair_list))
    print("Mean Train Spearman:", _mean(train_spear_list))
    print("Mean Test  Spearman:", _mean(test_spear_list))
    print("Mean Train Kendall :", _mean(train_kend_list))
    print("Mean Test  Kendall :", _mean(test_kend_list))

    return {
        "train_pair":  train_pair_list,
        "test_pair":   test_pair_list,
        "train_spear": train_spear_list,
        "test_spear":  test_spear_list,
        "train_kend":  train_kend_list,
        "test_kend":   test_kend_list,
    }


In [18]:
def evaluate_lambdamart_fold(model, df, feature_cols, feat_mean, feat_std, seasons):
    """
    Evaluate a ranking model on the given seasons.

    Args:
        model: object with a `predict(X)` method returning one score per row
            (higher score = predicted better).
        df: frame containing the seasons to evaluate (can be a subset, e.g.
            train_df or test_df).
        feature_cols: names of the feature columns.
        feat_mean, feat_std: scaling statistics applied to the features.
        seasons: iterable of season values; seasons absent from `df` are skipped.

    Returns:
        (pairwise_accuracy, mean_spearman, mean_kendall)
          - pairwise accuracy is pooled over all pairs of all seasons;
          - the correlations are averaged over seasons where they are defined.
        Each value is 0.0 when nothing could be computed for it, matching the
        convention of `evaluate_model`.
    """
    total_correct = 0
    total_pairs = 0
    spear_list = []
    kend_list = []

    for s in seasons:
        g = df[df["SEASON"] == s]
        if g.empty:
            continue

        g = g.sort_values("OVERALL_PICK")
        true_pick = g["OVERALL_PICK"].values.astype(float)
        n = len(true_pick)
        if n < 2:
            # A single item has no pairs and no defined rank correlation.
            continue

        X = ((g[feature_cols] - feat_mean) / feat_std).values
        scores = np.asarray(model.predict(X))

        # Pairwise accuracy over all i < j, vectorised:
        # correct when "i truly better" (smaller pick) agrees with
        # "i predicted better" (higher score).
        first, second = np.triu_indices(n, k=1)
        true_better = true_pick[first] < true_pick[second]
        pred_better = scores[first] > scores[second]
        total_correct += int(np.count_nonzero(true_better == pred_better))
        total_pairs += len(first)

        # Rank correlations (negate picks so higher is better on both sides).
        spear, _ = spearmanr(-true_pick, scores)
        kend, _ = kendalltau(-true_pick, scores)
        # Correlations are undefined (NaN) for constant scores; leave those out
        # instead of letting them turn the fold average into NaN.
        if not np.isnan(spear):
            spear_list.append(spear)
        if not np.isnan(kend):
            kend_list.append(kend)

    pair_acc = total_correct / total_pairs if total_pairs > 0 else 0.0
    mean_spear = float(np.mean(spear_list)) if spear_list else 0.0
    mean_kend  = float(np.mean(kend_list))  if kend_list else 0.0

    return pair_acc, mean_spear, mean_kend


In [19]:
def train_lambdamart_holdout(train_df, test_df, feature_cols, num_boost_round=300):
    """
    Train a final LambdaMART model on ALL training seasons (train_df)
    and evaluate on:
      - all train seasons
      - hold-out test seasons in test_df (e.g., 2025)

    Feature scaling is fit on `train_df` only. Relevance labels are
    `max_train_pick - pick`; hold-out labels are clipped at 0 so a hold-out
    season with a later pick than any training season cannot produce a
    negative label.

    Args:
        train_df: training frame (SEASON, OVERALL_PICK and feature columns).
        test_df: hold-out frame with the same columns.
        feature_cols: names of the feature columns.
        num_boost_round: number of boosting rounds.

    Returns:
        dict with the trained "model", the scaling stats ("feat_mean",
        "feat_std"), the season lists and the train/test pairwise, Spearman
        and Kendall metrics.

    Raises:
        ValueError: if either frame yields no rows.
    """
    base_params = {
        "objective": "lambdarank",
        "metric": "ndcg",
        "ndcg_at": [5, 10, 20],
        "learning_rate": 0.05,
        "num_leaves": 31,
        "min_data_in_leaf": 20,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "max_depth": -1,
        "verbose": -1,
    }

    # ---- scaling on ALL train data ----
    feat_mean = train_df[feature_cols].mean()
    feat_std  = train_df[feature_cols].std().replace(0, 1.0)

    def build_group_from_df(df_slice):
        """Concatenate seasons (each sorted by pick) into X, y, per-season sizes and season keys."""
        dfs = []
        ys = []
        groups = []
        seasons_list = []
        for s, g in df_slice.groupby("SEASON"):
            if g.empty:
                continue
            g = g.sort_values("OVERALL_PICK")
            dfs.append(g)
            ys.append(g["OVERALL_PICK"].values.astype(float))
            groups.append(len(g))
            seasons_list.append(s)
        if not dfs:
            return (
                np.empty((0, len(feature_cols))),
                np.array([]),
                [],
                []
            )
        df_cat = pd.concat(dfs, axis=0)
        X = ((df_cat[feature_cols] - feat_mean) / feat_std).values
        y = np.concatenate(ys, axis=0)
        return X, y, groups, seasons_list

    X_train, y_train, group_train, train_seasons = build_group_from_df(train_df)
    X_test,  y_test,  group_test,  test_seasons  = build_group_from_df(test_df)

    if X_train.shape[0] == 0 or X_test.shape[0] == 0:
        raise ValueError("Empty train or test data in hold-out training.")

    # Higher relevance = better pick. Clip the hold-out side: ranking labels
    # must be non-negative integers.
    max_y = y_train.max()
    y_train_rel = (max_y - y_train).astype(int)
    y_test_rel  = np.clip(max_y - y_test, 0, None).astype(int)

    # One gain entry per label value 0..max_label.
    max_label = int(max(y_train_rel.max(), y_test_rel.max()))
    params = dict(base_params)
    params["label_gain"] = list(range(max_label + 1))

    train_set = lgb.Dataset(X_train, label=y_train_rel, group=group_train)
    valid_set = lgb.Dataset(X_test,  label=y_test_rel,  group=group_test, reference=train_set)

    model = lgb.train(
        params,
        train_set,
        num_boost_round=num_boost_round,
        valid_sets=[valid_set],
        valid_names=["valid"],
    )

    # ---- final evaluation ----
    train_pair, train_spear, train_kend = evaluate_lambdamart_fold(
        model, train_df, feature_cols, feat_mean, feat_std, train_seasons
    )
    test_pair, test_spear, test_kend = evaluate_lambdamart_fold(
        model, test_df, feature_cols, feat_mean, feat_std, test_seasons
    )

    print("\n=== Final LambdaMART Model (trained on all seasons except 2025) ===")
    print(f"Train (2000–2024): "
          f"Pairwise = {train_pair:.3f}, Spearman = {train_spear:.3f}, Kendall = {train_kend:.3f}")
    print(f"Test  (2025): "
          f"Pairwise = {test_pair:.3f}, Spearman = {test_spear:.3f}, Kendall = {test_kend:.3f}")

    return {
        "model": model,
        "feat_mean": feat_mean,
        "feat_std": feat_std,
        "train_seasons": train_seasons,
        "test_seasons": test_seasons,
        "train_pair": train_pair,
        "test_pair": test_pair,
        "train_spear": train_spear,
        "test_spear": test_spear,
        "train_kend": train_kend,
        "test_kend": test_kend,
    }


In [20]:
# Step 1: season-wise cross-validation restricted to the training seasons,
# so the 2025 holdout never influences model selection.
lgb_cv = run_lambdamart_cv(
    train_df=train_df, feature_cols=feature_cols, k_folds=5, random_state=42,
)

# Step 2: fit on every training season and score the 2025 holdout.
lgb_holdout = train_lambdamart_holdout(
    train_df=train_df, test_df=test_df, feature_cols=feature_cols,
)



===== LambdaMART | Fold 1 =====
Train seasons: [np.int64(2001), np.int64(2002), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2010), np.int64(2011), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2018), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]
Test  seasons: [np.int64(2000), np.int64(2009), np.int64(2012), np.int64(2017), np.int64(2019)]
[Fold 1]
  Train: Pairwise = 0.910, Spearman = 0.942, Kendall = 0.826
  Test : Pairwise = 0.690, Spearman = 0.550, Kendall = 0.385

===== LambdaMART | Fold 2 =====
Train seasons: [np.int64(2000), np.int64(2002), np.int64(2004), np.int64(2005), np.int64(2007), np.int64(2008), np.int64(2009), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2023), np.int64(2024)]
Test  seasons: [np.int64(2001), np.int64(2006), np.int64(2010), np.