In [1]:
# =========================
# Section 0: Imports
# =========================
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone

from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge


In [2]:
# ============================================
# Section 1: Metrics (Spearman + Pairwise Acc)
# ============================================
def _rankdata_average_ties(x: np.ndarray) -> np.ndarray:
    x = np.asarray(x)
    order = np.argsort(x, kind="mergesort")
    ranks = np.empty_like(order, dtype=float)
    ranks[order] = np.arange(len(x), dtype=float)

    sorted_x = x[order]
    i = 0
    while i < len(x):
        j = i
        while j + 1 < len(x) and sorted_x[j + 1] == sorted_x[i]:
            j += 1
        if j > i:
            avg = (ranks[order[i]] + ranks[order[j]]) / 2.0
            ranks[order[i:j+1]] = avg
        i = j + 1
    return ranks


def spearman_corr(pred_scores: np.ndarray, true_picks: np.ndarray) -> float:
    """Spearman rank correlation between predicted scores and draft order.

    Pick numbers are negated first so that, like the scores, a larger value
    means a better player. Both vectors are then converted to tie-averaged
    ranks and the Pearson correlation of those ranks is returned.

    Parameters
    ----------
    pred_scores : array-like
        Model scores, higher meaning better.
    true_picks : array-like
        Pick numbers, lower meaning better.

    Returns
    -------
    float
        The rank correlation, or ``np.nan`` when the normalising denominator
        is zero (at least one of the two rank vectors has no spread).
    """
    quality = -np.asarray(true_picks).astype(float)  # higher is better

    pred_ranks = _rankdata_average_ties(np.asarray(pred_scores))
    true_ranks = _rankdata_average_ties(quality)

    pred_dev = pred_ranks - pred_ranks.mean()
    true_dev = true_ranks - true_ranks.mean()

    norm = np.sqrt((pred_dev ** 2).sum() * (true_dev ** 2).sum())
    if norm == 0:
        return np.nan
    return float((pred_dev * true_dev).sum() / norm)


def pairwise_accuracy(pred_scores: np.ndarray, true_picks: np.ndarray) -> float:
    """Fraction of item pairs whose predicted order matches the true order.

    A lower pick number is better and a higher predicted score is better.
    Every unordered pair ``(i, j)`` with ``i < j`` is compared once.

    Tie handling (makes the metric independent of row order):

    * pairs whose true picks are equal carry no ordering information and are
      left out of both numerator and denominator;
    * pairs whose predicted scores are equal (or not comparable, e.g. NaN)
      earn half credit instead of being treated as "j ranks higher".

    Parameters
    ----------
    pred_scores : array-like
        Model scores, higher meaning better.
    true_picks : array-like
        Pick numbers, lower meaning better. Must have the same shape as
        ``pred_scores``.

    Returns
    -------
    float
        Accuracy in ``[0, 1]``, or ``np.nan`` when there are fewer than two
        items or no pair has distinct true picks.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    pred_scores = np.asarray(pred_scores)
    true_picks = np.asarray(true_picks)
    if pred_scores.shape != true_picks.shape:
        raise ValueError(
            f"pred_scores and true_picks must have the same shape, "
            f"got {pred_scores.shape} and {true_picks.shape}."
        )

    n = len(pred_scores)
    if n < 2:
        return np.nan

    # All index pairs with i < j, evaluated in one vectorised pass.
    i_idx, j_idx = np.triu_indices(n, k=1)

    true_i_better = true_picks[i_idx] < true_picks[j_idx]
    true_j_better = true_picks[i_idx] > true_picks[j_idx]
    comparable = true_i_better | true_j_better

    pred_i_better = pred_scores[i_idx] > pred_scores[j_idx]
    pred_j_better = pred_scores[i_idx] < pred_scores[j_idx]

    concordant = (true_i_better & pred_i_better) | (true_j_better & pred_j_better)
    pred_tied = comparable & ~(pred_i_better | pred_j_better)

    total = int(comparable.sum())
    if total == 0:
        return np.nan
    return float((concordant.sum() + 0.5 * pred_tied.sum()) / total)


def evaluate_season_df(season_df: pd.DataFrame, score_col="score") -> dict:
    """Compute the ranking metrics for one group of rows.

    Parameters
    ----------
    season_df : pd.DataFrame
        Rows to evaluate; must contain ``score_col`` and ``"OVERALL_PICK"``.
    score_col : str
        Name of the column holding the predicted scores.

    Returns
    -------
    dict
        ``"Spearman_rho"``, ``"PairwiseAcc"`` and the row count ``"N"``.
    """
    scores = season_df[score_col].to_numpy()
    picks = season_df["OVERALL_PICK"].to_numpy()

    metrics = {}
    metrics["Spearman_rho"] = spearman_corr(scores, picks)
    metrics["PairwiseAcc"] = pairwise_accuracy(scores, picks)
    metrics["N"] = len(season_df)
    return metrics


In [4]:
# ============================================
# Section 2: Load CSV + Train/Test split
# ============================================
import pandas as pd
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.stats import spearmanr, kendalltau  

# ----------------------------
# 1. Load data
# ----------------------------
# The dataset is addressed relative to the directory two levels above the
# current working directory.
root_dir = Path.cwd().parent.parent
dataset_path = root_dir.joinpath(
    "data", "cleaned", "college_drafted", "college_drafted_selected_features.csv"
)

# Rows without a season or a pick cannot be split or scored, so drop them
# before casting both columns to integers.
df = (
    pd.read_csv(dataset_path)
    .dropna(subset=["SEASON", "OVERALL_PICK"])
    .astype({"SEASON": int, "OVERALL_PICK": int})
)

# Seasons 2000..2024 (inclusive) are for training; season 2025 is held out.
in_train_window = df["SEASON"].between(2000, 2024)
train_df = df.loc[in_train_window].copy()
test_df = df.loc[df["SEASON"].eq(2025)].copy()

assert len(train_df) > 0, "No training rows found for seasons 2000–2024."
if not len(test_df):
    print("WARNING: No rows found for SEASON == 2025. Test evaluation will be skipped.")

# Features are every numeric column that is neither an identifier nor the target.
exclude = {"player_name", "SEASON", "OVERALL_PICK"}
feature_cols = [
    col
    for col in df.columns
    if pd.api.types.is_numeric_dtype(df[col]) and col not in exclude
]
assert len(feature_cols) > 0, "No numeric feature columns found."

# IMPORTANT: no imputation -> ensure no NaNs in features
train_has_nan = bool(train_df[feature_cols].isna().to_numpy().any())
test_has_nan = len(test_df) > 0 and bool(test_df[feature_cols].isna().to_numpy().any())
if train_has_nan or test_has_nan:
    raise ValueError("Found NaNs in features, but you requested no imputation. Clean/remove missing values first.")

X_train = train_df[feature_cols].to_numpy()
# Regression target is the negated pick, so a higher target means an earlier pick.
y_train = -train_df["OVERALL_PICK"].to_numpy().astype(float)
groups = train_df["SEASON"].to_numpy()


In [5]:
# ============================================
# Section 3: Define MULTIPLE pointwise rankers
# ============================================
# All are pointwise: regress score -> sort within season.
# The regressors are declared once; each is then wrapped in the same
# two-step pipeline (its own StandardScaler followed by the regressor).
regressors = {
    "Ridge": Ridge(alpha=1.0, random_state=42),
    "RandomForest": RandomForestRegressor(
        n_estimators=500,
        max_depth=None,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1,
    ),
    "ExtraTrees": ExtraTreesRegressor(
        n_estimators=800,
        max_depth=None,
        min_samples_leaf=3,
        random_state=42,
        n_jobs=-1,
    ),
    "HistGB": HistGradientBoostingRegressor(
        loss="squared_error",
        learning_rate=0.06,
        max_depth=6,
        max_leaf_nodes=31,
        min_samples_leaf=30,
        l2_regularization=1e-2,
        random_state=42,
    ),
}

models = {
    name: Pipeline([("scaler", StandardScaler()), ("reg", regressor)])
    for name, regressor in regressors.items()
}


In [6]:
# ==================================================
# Section 4: Season-wise CV (GroupKFold) for all models
# ==================================================
# Folds are built with SEASON as the group label (see `groups`).
gkf = GroupKFold(n_splits=5)

cv_results = []

for name, base_model in models.items():
    fold_rows = []
    fold_splits = gkf.split(X_train, y_train, groups=groups)

    for fold, (tr_idx, va_idx) in enumerate(fold_splits, start=1):
        # Work on an unfitted copy so folds never share fitted state.
        model = clone(base_model)
        model.fit(X_train[tr_idx], y_train[tr_idx])

        va_df = train_df.iloc[va_idx].assign(score=model.predict(X_train[va_idx]))

        # per-season metrics, then macro avg across seasons
        season_metrics = pd.DataFrame([
            evaluate_season_df(season_rows, score_col="score")
            for _, season_rows in va_df.groupby("SEASON")
        ])

        macro_rho = season_metrics["Spearman_rho"].mean()
        macro_acc = season_metrics["PairwiseAcc"].mean()
        fold_rows.append({
            "model": name,
            "fold": fold,
            "macro_Spearman_rho": macro_rho,
            "macro_PairwiseAcc": macro_acc,
            "n_val_rows": len(va_df),
            "n_val_seasons": va_df["SEASON"].nunique()
        })

    cv_results.append(pd.DataFrame(fold_rows))

cv_summary = pd.concat(cv_results, ignore_index=True)

print("\n=== CV fold summary (macro over seasons) ===")
print(cv_summary.to_string(index=False))

# Average the fold-level metrics per model and list the best Spearman first.
metric_cols = ["macro_Spearman_rho", "macro_PairwiseAcc"]
model_means = (
    cv_summary.groupby("model")[metric_cols]
    .mean()
    .sort_values("macro_Spearman_rho", ascending=False)
)

print("\n=== CV mean across folds (per model) ===")
print(model_means.to_string())


  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_



=== CV fold summary (macro over seasons) ===
       model  fold  macro_Spearman_rho  macro_PairwiseAcc  n_val_rows  n_val_seasons
       Ridge     1            0.557709           0.702465         241              5
       Ridge     2            0.572391           0.700092         199              4
       Ridge     3            0.580916           0.709515         238              5
       Ridge     4            0.621252           0.718594         236              5
       Ridge     5            0.561905           0.696475         233              5
RandomForest     1            0.545813           0.692475         241              5
RandomForest     2            0.552788           0.692790         199              4
RandomForest     3            0.520963           0.685777         238              5
RandomForest     4            0.609560           0.719342         236              5
RandomForest     5            0.532450           0.685287         233              5
  ExtraTrees     1 

In [7]:
# ============================================
# Section 5: Train on 2000–2024, Test on 2025
# ============================================
if len(test_df) > 0:
    X_test = test_df[feature_cols].to_numpy()

    test_rows = []
    for name, base_model in models.items():
        # Refit an unfitted copy of each pipeline on the full training window.
        model = clone(base_model)
        model.fit(X_train, y_train)

        scored_test = test_df.assign(score=model.predict(X_test))

        # 2025 is a single season -> compute single-season metrics
        season_eval = evaluate_season_df(scored_test, score_col="score")
        test_rows.append({
            "model": name,
            "Spearman_rho_2025": season_eval["Spearman_rho"],
            "PairwiseAcc_2025": season_eval["PairwiseAcc"],
            "N_2025": season_eval["N"]
        })

    # Best Spearman first.
    test_summary = pd.DataFrame(test_rows)
    test_summary = test_summary.sort_values("Spearman_rho_2025", ascending=False)

    print("\n=== Test results on SEASON == 2025 ===")
    print(test_summary.to_string(index=False))


  ret = a @ b
  ret = a @ b
  ret = a @ b



=== Test results on SEASON == 2025 ===
       model  Spearman_rho_2025  PairwiseAcc_2025  N_2025
      HistGB           0.663417          0.736303      63
RandomForest           0.641017          0.734255      63
       Ridge           0.639817          0.727599      63
  ExtraTrees           0.635927          0.735791      63


### Train an MLP pointwise ranker

In [10]:
# ============================================
# Section 2: Load CSV + Train/Test split
# ============================================
import pandas as pd
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.stats import spearmanr, kendalltau  

# ----------------------------
# 1. Load data
# ----------------------------
root_dir = Path.cwd().parent.parent
dataset_path = root_dir.joinpath(
    "data", "cleaned", "college_drafted", "college_drafted_selected_features.csv"
)

df = pd.read_csv(dataset_path)
df = df.dropna(subset=["SEASON", "OVERALL_PICK"]).copy()
for int_col in ("SEASON", "OVERALL_PICK"):
    df[int_col] = df[int_col].astype(int)

# Seasons 2000..2024 (inclusive) train the model; season 2025 is held out.
season = df["SEASON"]
train_df = df[(season >= 2000) & (season <= 2024)].copy()
test_df = df[season == 2025].copy()
has_test = len(test_df) > 0

# Features are every numeric column that is neither an identifier nor the target.
exclude = {"player_name", "SEASON", "OVERALL_PICK"}
feature_cols = [
    col
    for col in df.columns
    if col not in exclude and pd.api.types.is_numeric_dtype(df[col])
]

# No imputation requested -> enforce no NaNs
train_has_nan = bool(train_df[feature_cols].isna().any().any())
test_has_nan = has_test and bool(test_df[feature_cols].isna().any().any())
if train_has_nan or test_has_nan:
    raise ValueError("Found NaNs in features. Clean/remove missing values since no imputation is used.")

# float32 arrays, matching what the PyTorch model consumes.
X_train_raw = train_df[feature_cols].to_numpy().astype(np.float32)
# Target is the negated pick, so a higher target means an earlier pick.
y_train = np.negative(train_df["OVERALL_PICK"].to_numpy().astype(float)).astype(np.float32)
groups = train_df["SEASON"].to_numpy()

if has_test:
    X_test_raw = test_df[feature_cols].to_numpy().astype(np.float32)
    y_test_picks = test_df["OVERALL_PICK"].to_numpy()
else:
    X_test_raw = None
    y_test_picks = None

print("Train rows:", len(train_df), " Train seasons:", train_df["SEASON"].nunique())
print("Test rows:", len(test_df), " (SEASON==2025)")
print("Features:", feature_cols)


Train rows: 1147  Train seasons: 24
Test rows: 63  (SEASON==2025)
Features: ['Totals_FG', 'Totals_FT', 'Totals_TRB', 'Totals_BLK', 'Totals_STL', 'Totals_TOV', 'Totals_PF', 'Shooting_FG%', 'MP', 'Age']


In [11]:
# ============================================
# Section 3: Define MLP pointwise ranker
# ============================================
class MLPPointwiseRanker(nn.Module):
    """Feed-forward network that maps a feature vector to one scalar score.

    Each entry of ``hidden`` adds a ``Linear -> ReLU -> Dropout`` stage; a
    final ``Linear`` layer with a single output unit produces the score.

    Parameters
    ----------
    d_in : int
        Number of input features.
    hidden : iterable of int
        Width of each hidden layer, in order.
    dropout : float
        Dropout probability applied after every hidden activation.
    """

    def __init__(self, d_in: int, hidden=(128, 64), dropout=0.15):
        super().__init__()
        widths = [d_in, *hidden]

        stages = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stages += [nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(dropout)]
        stages.append(nn.Linear(widths[-1], 1))  # scalar score

        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Return the scores for ``x`` with the trailing size-1 axis removed."""
        scores = self.net(x)
        return scores.squeeze(-1)


def train_mlp(
    X_tr: np.ndarray, y_tr: np.ndarray,
    X_va: np.ndarray, y_va: np.ndarray,
    *,
    hidden=(128, 64),
    dropout=0.15,
    lr=1e-3,
    weight_decay=1e-4,
    batch_size=64,
    max_epochs=300,
    patience=25,
    device=None,
    seed=42
):
    """Fit an ``MLPPointwiseRanker`` with Adam and early stopping.

    The model minimises ``nn.SmoothL1Loss`` on shuffled mini-batches. After
    every epoch the loss on the validation set is measured; the weights with
    the lowest validation loss are kept, and training stops once ``patience``
    consecutive epochs bring no improvement of more than ``1e-6``.

    Parameters
    ----------
    X_tr, y_tr : array-like
        Training features (2-D) and targets. Converted to float32; targets
        are flattened to 1-D.
    X_va, y_va : array-like
        Validation features and targets, converted the same way.
    hidden, dropout
        Forwarded to ``MLPPointwiseRanker``.
    lr, weight_decay
        Adam hyper-parameters.
    batch_size : int
        Training mini-batch size (validation uses batches of 256).
    max_epochs : int
        Upper bound on the number of epochs.
    patience : int
        Non-improving epochs tolerated before stopping.
    device : str or None
        Torch device; when ``None``, ``"cuda"`` if available, else ``"cpu"``.
    seed : int
        Seeds the *global* torch and numpy random generators.

    Returns
    -------
    MLPPointwiseRanker
        The model, loaded with the best validation-loss weights when at
        least one epoch improved on the initial ``inf``.

    Raises
    ------
    ValueError
        If the validation set is empty.
    """
    # Imported locally so the function does not depend on another notebook
    # cell having put these names into the global namespace first.
    from torch.utils.data import DataLoader, TensorDataset

    # Coerce to float32 so the inputs match the dtype of the model parameters.
    # Targets are flattened to line up element-wise with the model's output,
    # whose last axis is squeezed away.
    X_tr = np.ascontiguousarray(X_tr, dtype=np.float32)
    X_va = np.ascontiguousarray(X_va, dtype=np.float32)
    y_tr = np.ascontiguousarray(y_tr, dtype=np.float32).reshape(-1)
    y_va = np.ascontiguousarray(y_va, dtype=np.float32).reshape(-1)

    if len(X_va) == 0:
        raise ValueError("Validation set is empty; early stopping needs at least one validation row.")

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    torch.manual_seed(seed)
    np.random.seed(seed)

    model = MLPPointwiseRanker(d_in=X_tr.shape[1], hidden=hidden, dropout=dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.SmoothL1Loss()  # Huber-like, robust to outliers

    tr_ds = TensorDataset(torch.from_numpy(X_tr), torch.from_numpy(y_tr))
    va_ds = TensorDataset(torch.from_numpy(X_va), torch.from_numpy(y_va))

    tr_loader = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, drop_last=False)
    va_loader = DataLoader(va_ds, batch_size=256, shuffle=False, drop_last=False)

    best_state = None
    best_val = float("inf")
    bad = 0  # consecutive epochs without a validation improvement

    for epoch in range(1, max_epochs + 1):
        model.train()
        for xb, yb in tr_loader:
            xb = xb.to(device)
            yb = yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            optimizer.step()

        model.eval()
        va_loss_sum = 0.0
        va_count = 0
        with torch.no_grad():
            for xb, yb in va_loader:
                xb = xb.to(device)
                yb = yb.to(device)
                pred = model(xb)
                loss = loss_fn(pred, yb)
                # Weight each batch mean by its size so that a smaller final
                # batch does not skew the validation loss.
                va_loss_sum += loss.item() * len(yb)
                va_count += len(yb)

        val_loss = va_loss_sum / va_count
        if val_loss < best_val - 1e-6:
            best_val = val_loss
            # Snapshot on CPU so later optimizer steps cannot overwrite it.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    return model


In [13]:
# ==================================================
# Section 4: Season-wise CV (GroupKFold) for MLP
# ==================================================
from torch.utils.data import TensorDataset, DataLoader

# Folds are built with SEASON as the group label (see `groups`).
gkf = GroupKFold(n_splits=5)

cv_rows = []
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hyper-parameters shared by every fold; only the seed changes per fold.
mlp_settings = {
    "hidden": (128, 64),
    "dropout": 0.15,
    "lr": 1e-3,
    "weight_decay": 1e-4,
    "batch_size": 64,
    "max_epochs": 300,
    "patience": 25,
    "device": device,
}

fold_splits = gkf.split(X_train_raw, y_train, groups=groups)
for fold, (tr_idx, va_idx) in enumerate(fold_splits, start=1):
    X_tr_raw = X_train_raw[tr_idx]
    X_va_raw = X_train_raw[va_idx]
    y_tr = y_train[tr_idx]
    y_va = y_train[va_idx]

    # Train-only standardization (no leakage)
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr_raw).astype(np.float32)
    X_va = scaler.transform(X_va_raw).astype(np.float32)

    model = train_mlp(
        X_tr, y_tr.astype(np.float32),
        X_va, y_va.astype(np.float32),
        seed=42 + fold,
        **mlp_settings,
    )

    # Predict on validation fold
    model.eval()
    with torch.no_grad():
        va_tensor = torch.from_numpy(X_va).to(device)
        va_scores = model(va_tensor).detach().cpu().numpy()

    va_df = train_df.iloc[va_idx].assign(score=va_scores)

    # Per-season metrics first, then a macro average over the fold's seasons.
    season_metrics = pd.DataFrame([
        evaluate_season_df(season_rows, score_col="score")
        for _, season_rows in va_df.groupby("SEASON")
    ])

    cv_rows.append({
        "fold": fold,
        "macro_Spearman_rho": season_metrics["Spearman_rho"].mean(),
        "macro_PairwiseAcc": season_metrics["PairwiseAcc"].mean(),
        "n_val_rows": len(va_df),
        "n_val_seasons": va_df["SEASON"].nunique()
    })

cv_summary = pd.DataFrame(cv_rows)
fold_means = cv_summary[["macro_Spearman_rho", "macro_PairwiseAcc"]].mean()

print("\n=== MLP CV fold summary (macro over seasons) ===")
print(cv_summary.to_string(index=False))
print("\n=== MLP CV mean across folds ===")
print(fold_means.to_string())



=== MLP CV fold summary (macro over seasons) ===
 fold  macro_Spearman_rho  macro_PairwiseAcc  n_val_rows  n_val_seasons
    1            0.599673           0.716940         241              5
    2            0.582296           0.711057         199              4
    3            0.575365           0.710362         238              5
    4            0.621091           0.717636         236              5
    5            0.582319           0.705308         233              5

=== MLP CV mean across folds ===
macro_Spearman_rho    0.592149
macro_PairwiseAcc     0.712261


In [14]:
# ============================================
# Section 5: Train on 2000–2024, Test on 2025
# ============================================
if len(test_df) == 0:
    print("No 2025 rows found; skipping test evaluation.")
else:
    # Fit scaler on full train, transform train/test
    scaler = StandardScaler()
    X_tr_full = scaler.fit_transform(X_train_raw).astype(np.float32)
    X_te = scaler.transform(X_test_raw).astype(np.float32)

    # Early stopping uses an in-time validation split: the last two seasons
    # of the training window are held out from the gradient updates.
    train_years = np.sort(train_df["SEASON"].unique())
    val_years = set(train_years[-2:])
    is_val = train_df["SEASON"].isin(val_years).to_numpy()
    is_fit = ~is_val

    X_tr, y_tr = X_tr_full[is_fit], y_train[is_fit]
    X_va, y_va = X_tr_full[is_val], y_train[is_val]

    final_settings = {
        "hidden": (128, 64),
        "dropout": 0.15,
        "lr": 1e-3,
        "weight_decay": 1e-4,
        "batch_size": 64,
        "max_epochs": 400,
        "patience": 30,
        "device": device,
        "seed": 2025,
    }
    model = train_mlp(
        X_tr, y_tr.astype(np.float32),
        X_va, y_va.astype(np.float32),
        **final_settings,
    )

    model.eval()
    with torch.no_grad():
        test_tensor = torch.from_numpy(X_te).to(device)
        test_scores = model(test_tensor).detach().cpu().numpy()

    scored_test = test_df.assign(score=test_scores)
    season_eval = evaluate_season_df(scored_test, score_col="score")

    report = {
        "Spearman_rho_2025": season_eval["Spearman_rho"],
        "PairwiseAcc_2025": season_eval["PairwiseAcc"],
        "N_2025": season_eval["N"]
    }
    print("\n=== MLP Test results on SEASON == 2025 ===")
    print(report)



=== MLP Test results on SEASON == 2025 ===
{'Spearman_rho_2025': 0.6464430940788876, 'PairwiseAcc_2025': 0.7260624679979518, 'N_2025': 63}
