In [1]:
# ============================================================
# PAIRWISE RANKERS ONLY (train/val on 2000â€“2024, test on 2025)
# Season-wise GroupKFold CV + final train on all 2000â€“2024
# Metrics on 2025: Pairwise Accuracy + Spearman rho
#
# Pairwise rankers implemented:
#   1) RankSVM (LinearSVC on pairwise differences)
#   2) Pairwise Logistic (LogisticRegression on pairwise differences)
#   3) RankNet (PyTorch pairwise logistic loss)
#
# No missing-value imputation (drops NA if any appear).
# ============================================================

import numpy as np
import pandas as pd
import random
from scipy.stats import spearmanr
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


# -----------------------------
# Metrics (within a season)
# -----------------------------
def pairwise_accuracy(scores: np.ndarray, picks: np.ndarray) -> float:
    scores = np.asarray(scores).reshape(-1)
    picks = np.asarray(picks).reshape(-1)
    n = len(scores)
    if n <= 1:
        return np.nan
    correct = 0
    total = 0
    for i in range(n):
        for j in range(i + 1, n):
            true = picks[i] < picks[j]      # earlier pick should be ahead
            pred = scores[i] > scores[j]    # higher score = earlier
            correct += int(pred == true)
            total += 1
    return correct / total if total else np.nan


def eval_season_df(df_s: pd.DataFrame, score_col="score") -> dict:
    g = df_s.sort_values("OVERALL_PICK")
    scores = g[score_col].to_numpy()
    picks  = g["OVERALL_PICK"].to_numpy()
    rho = spearmanr(scores, -picks).correlation
    pacc = pairwise_accuracy(scores, picks)
    return {"Spearman": float(rho), "Pairwise": float(pacc), "N": int(len(g))}


def eval_2025(test_df: pd.DataFrame, score_col="score") -> dict:
    m = eval_season_df(test_df, score_col=score_col)
    return {"PairwiseAcc_2025": m["Pairwise"], "Spearman_rho_2025": m["Spearman"], "N_2025": m["N"]}


# -----------------------------
# Pair generation
# -----------------------------
def make_pairs_for_season(X: np.ndarray, picks: np.ndarray, max_pairs: int, rng: np.random.RandomState):
    """
    Build training pairs (x_i - x_j, label) for ONE season.
    Label y=1 means i should be ranked ahead of j (i picked earlier).
    """
    n = len(picks)
    if n < 2:
        return None, None

    # all possible pairs count = n*(n-1)/2
    all_pairs = []
    for i in range(n):
        for j in range(i + 1, n):
            all_pairs.append((i, j))

    if max_pairs is not None and len(all_pairs) > max_pairs:
        idx = rng.choice(len(all_pairs), size=max_pairs, replace=False)
        pairs = [all_pairs[k] for k in idx]
    else:
        pairs = all_pairs

    Xd_list = []
    y_list = []
    for i, j in pairs:
        if picks[i] < picks[j]:
            # i ahead of j => (xi-xj) positive example
            Xd_list.append(X[i] - X[j])
            y_list.append(1)
        else:
            Xd_list.append(X[j] - X[i])
            y_list.append(1)

        # Also add the opposite direction as negative (optional; helps balance)
        # If you want fewer samples, comment this block out.
        if picks[i] < picks[j]:
            Xd_list.append(X[j] - X[i])
            y_list.append(0)
        else:
            Xd_list.append(X[i] - X[j])
            y_list.append(0)

    Xd = np.vstack(Xd_list).astype(np.float32)
    y  = np.array(y_list).astype(np.int64)
    return Xd, y


def build_pairwise_dataset(df_part: pd.DataFrame, feature_cols: list, max_pairs_per_season=30000, seed=42):
    rng = np.random.RandomState(seed)
    X_all = df_part[feature_cols].to_numpy().astype(np.float32)
    picks_all = df_part["OVERALL_PICK"].to_numpy().astype(np.int32)
    seasons = df_part["SEASON"].to_numpy().astype(np.int32)

    X_pairs = []
    y_pairs = []

    for season in np.unique(seasons):
        mask = seasons == season
        Xs = X_all[mask]
        ps = picks_all[mask]
        Xd, y = make_pairs_for_season(Xs, ps, max_pairs=max_pairs_per_season, rng=rng)
        if Xd is None:
            continue
        X_pairs.append(Xd)
        y_pairs.append(y)

    if len(X_pairs) == 0:
        raise ValueError("No pairwise data built. Check seasons / input data.")
    return np.vstack(X_pairs), np.concatenate(y_pairs)


# -----------------------------
# Rank score from pairwise linear model
# -----------------------------
def score_linear_model(model, X: np.ndarray) -> np.ndarray:
    """
    Convert pairwise classifier to per-item score.
    For linear models trained on differences, score = w^T x (+b ignored for ranking).
    """
    if hasattr(model, "coef_"):
        w = model.coef_.reshape(-1)
        return X @ w
    raise ValueError("Model has no coef_ to produce linear scores.")


# ============================================================
# 1) Load CSV + split (2000â€“2024 vs 2025)
# ============================================================

import pandas as pd
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.stats import spearmanr, kendalltau  

# ----------------------------
# 1. Load data
# ----------------------------
root_dir = Path.cwd().parent.parent
dataset_path = root_dir / "data" / "cleaned" / "college_drafted" / "college_drafted_selected_features.csv"

df = pd.read_csv(dataset_path)

df["SEASON"] = df["SEASON"].astype(int)
df["OVERALL_PICK"] = df["OVERALL_PICK"].astype(int)

exclude = {"SEASON", "OVERALL_PICK"}
feature_cols = [c for c in df.columns if c not in exclude and pd.api.types.is_numeric_dtype(df[c])]
if not feature_cols:
    raise ValueError("No numeric feature columns found. Set feature_cols manually.")

# No imputation; drop NA rows if any.
df = df.dropna(subset=feature_cols + ["SEASON", "OVERALL_PICK"]).copy()

train_df = df[df["SEASON"].between(2000, 2024)].copy()
test_df  = df[df["SEASON"] == 2025].copy()
if len(test_df) == 0:
    raise ValueError("No SEASON==2025 rows in CSV.")

X_train_raw = train_df[feature_cols].to_numpy().astype(np.float32)
groups = train_df["SEASON"].to_numpy().astype(int)

print("Features:", feature_cols)
print(f"Train rows: {len(train_df)} | seasons: {train_df['SEASON'].nunique()}")
print(f"Test rows (2025): {len(test_df)}")


# ============================================================
# 2) Season-wise CV for Pairwise RankSVM / Pairwise Logistic / RankNet
# ============================================================

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


# ---------- RankNet (PyTorch) ----------
class RankNet(nn.Module):
    def __init__(self, d_in: int, hidden=(128, 64), dropout=0.1):
        super().__init__()
        layers = []
        prev = d_in
        for h in hidden:
            layers += [nn.Linear(prev, h), nn.ReLU(), nn.Dropout(dropout)]
            prev = h
        layers += [nn.Linear(prev, 1)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x).squeeze(-1)


def train_ranknet_on_pairs(Xp, yp, d_in, device="cpu",
                           hidden=(128, 64), dropout=0.1,
                           lr=1e-3, weight_decay=1e-4,
                           batch_size=1024, max_epochs=30):
    """
    Train RankNet as a pairwise classifier on (xi-xj) -> label in {0,1}
    """
    model = RankNet(d_in, hidden=hidden, dropout=dropout).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.BCEWithLogitsLoss()

    ds = TensorDataset(torch.from_numpy(Xp), torch.from_numpy(yp.astype(np.float32)))
    loader = DataLoader(ds, batch_size=batch_size, shuffle=True, drop_last=False)

    model.train()
    for _ in range(max_epochs):
        for xb, yb in loader:
            xb = xb.to(device)
            yb = yb.to(device)
            opt.zero_grad(set_to_none=True)
            logits = model(xb)
            loss = loss_fn(logits, yb)
            loss.backward()
            opt.step()
    return model


def score_ranknet(model: nn.Module, X: np.ndarray, device="cpu") -> np.ndarray:
    """
    Score items by forward pass on X (not differences).
    """
    model.eval()
    with torch.no_grad():
        xb = torch.from_numpy(X.astype(np.float32)).to(device)
        s = model(xb).cpu().numpy()
    return s


# ---------- CV driver ----------
def seasonwise_cv_pairwise(train_df: pd.DataFrame, feature_cols: list, n_splits=5,
                           max_pairs_per_season=30000, seed=42):
    gkf = GroupKFold(n_splits=n_splits)
    X_raw = train_df[feature_cols].to_numpy().astype(np.float32)
    groups = train_df["SEASON"].to_numpy().astype(int)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    rows = []

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_raw, groups=groups), start=1):
        tr_part = train_df.iloc[tr_idx].copy()
        va_part = train_df.iloc[va_idx].copy()

        # Train-only scaling (important for SVM/Logistic/NN)
        scaler = StandardScaler()
        scaler.fit(tr_part[feature_cols].to_numpy())

        Xtr = scaler.transform(tr_part[feature_cols].to_numpy()).astype(np.float32)
        Xva = scaler.transform(va_part[feature_cols].to_numpy()).astype(np.float32)

        # Build pairwise dataset from TRAIN seasons only
        tr_scaled = tr_part.copy()
        tr_scaled.loc[:, feature_cols] = Xtr

        Xp, yp = build_pairwise_dataset(tr_scaled, feature_cols, max_pairs_per_season=max_pairs_per_season, seed=seed + fold)

        # ----- (1) RankSVM (LinearSVC) -----
        svm = LinearSVC(C=1.0, max_iter=5000, random_state=seed + fold)
        svm.fit(Xp, yp)

        va_scores = score_linear_model(svm, Xva)
        va_scored = va_part.copy()
        va_scored["score"] = va_scores

        season_metrics = []
        for _, g in va_scored.groupby("SEASON"):
            season_metrics.append(eval_season_df(g, score_col="score"))
        season_metrics = pd.DataFrame(season_metrics)

        rows.append({
            "model": "RankSVM",
            "fold": fold,
            "macro_PairwiseAcc": season_metrics["Pairwise"].mean(),
            "macro_Spearman_rho": season_metrics["Spearman"].mean(),
        })

        # ----- (2) Pairwise Logistic Regression -----
        logreg = LogisticRegression(
            C=1.0, max_iter=3000, solver="lbfgs", n_jobs=-1, random_state=seed + fold
        )
        logreg.fit(Xp, yp)

        va_scores = score_linear_model(logreg, Xva)
        va_scored = va_part.copy()
        va_scored["score"] = va_scores

        season_metrics = []
        for _, g in va_scored.groupby("SEASON"):
            season_metrics.append(eval_season_df(g, score_col="score"))
        season_metrics = pd.DataFrame(season_metrics)

        rows.append({
            "model": "PairwiseLogistic",
            "fold": fold,
            "macro_PairwiseAcc": season_metrics["Pairwise"].mean(),
            "macro_Spearman_rho": season_metrics["Spearman"].mean(),
        })

        # ----- (3) RankNet (Neural pairwise) -----
        set_seed(seed + 100 + fold)
        ranknet = train_ranknet_on_pairs(
            Xp, yp, d_in=len(feature_cols), device=device,
            hidden=(128, 64), dropout=0.10,
            lr=1e-3, weight_decay=1e-4,
            batch_size=1024, max_epochs=25
        )

        va_scores = score_ranknet(ranknet, Xva, device=device)
        va_scored = va_part.copy()
        va_scored["score"] = va_scores

        season_metrics = []
        for _, g in va_scored.groupby("SEASON"):
            season_metrics.append(eval_season_df(g, score_col="score"))
        season_metrics = pd.DataFrame(season_metrics)

        rows.append({
            "model": "RankNet",
            "fold": fold,
            "macro_PairwiseAcc": season_metrics["Pairwise"].mean(),
            "macro_Spearman_rho": season_metrics["Spearman"].mean(),
        })

    return pd.DataFrame(rows)


print("\n=== Season-wise CV (2000â€“2024) for Pairwise rankers ===")
cv_pairwise = seasonwise_cv_pairwise(
    train_df, feature_cols,
    n_splits=5,
    max_pairs_per_season=30000,
    seed=42
)
print("\nFold summaries:")
print(cv_pairwise.to_string(index=False))

print("\nCV mean (macro over seasons, averaged across folds):")
print(
    cv_pairwise.groupby("model")[["macro_PairwiseAcc", "macro_Spearman_rho"]]
    .mean()
    .sort_values("macro_Spearman_rho", ascending=False)
)


# ============================================================
# 3) Final train on ALL 2000â€“2024, test on 2025
# ============================================================

device = "cuda" if torch.cuda.is_available() else "cpu"

# Train-only scaler
scaler = StandardScaler()
scaler.fit(train_df[feature_cols].to_numpy())

Xtr = scaler.transform(train_df[feature_cols].to_numpy()).astype(np.float32)
Xte = scaler.transform(test_df[feature_cols].to_numpy()).astype(np.float32)

# Build pairwise dataset from ALL train seasons
train_scaled = train_df.copy()
train_scaled.loc[:, feature_cols] = Xtr
Xp_all, yp_all = build_pairwise_dataset(train_scaled, feature_cols, max_pairs_per_season=30000, seed=777)

final_rows = []

# (1) RankSVM
svm = LinearSVC(C=1.0, max_iter=5000, random_state=777)
svm.fit(Xp_all, yp_all)
scores_2025 = score_linear_model(svm, Xte)
tmp = test_df.copy()
tmp["score"] = scores_2025
m = eval_2025(tmp, score_col="score")
final_rows.append({"Model": "RankSVM", **m})

# (2) Pairwise Logistic
logreg = LogisticRegression(C=1.0, max_iter=3000, solver="lbfgs", n_jobs=-1, random_state=777)
logreg.fit(Xp_all, yp_all)
scores_2025 = score_linear_model(logreg, Xte)
tmp = test_df.copy()
tmp["score"] = scores_2025
m = eval_2025(tmp, score_col="score")
final_rows.append({"Model": "PairwiseLogistic", **m})

# (3) RankNet
set_seed(888)
ranknet = train_ranknet_on_pairs(
    Xp_all, yp_all, d_in=len(feature_cols), device=device,
    hidden=(128, 64), dropout=0.10,
    lr=1e-3, weight_decay=1e-4,
    batch_size=1024, max_epochs=30
)
scores_2025 = score_ranknet(ranknet, Xte, device=device)
tmp = test_df.copy()
tmp["score"] = scores_2025
m = eval_2025(tmp, score_col="score")
final_rows.append({"Model": "RankNet", **m})

final_2025 = pd.DataFrame(final_rows).sort_values(["PairwiseAcc_2025", "Spearman_rho_2025"], ascending=False)

print("\n=== FINAL PAIRWISE TEST RESULTS on SEASON==2025 ===")
print(final_2025[["Model", "PairwiseAcc_2025", "Spearman_rho_2025", "N_2025"]].to_string(index=False))


# ============================================================
# 4) LaTeX table helper (Pairwise first, Spearman second)
#    Remove N_2025 if you want: just omit it here.
# ============================================================

def latex_table_pairwise_only(df_res: pd.DataFrame, caption: str, label: str) -> str:
    lines = []
    lines.append(r"\begin{table}[H]")
    lines.append(r"\centering")
    lines.append(rf"\caption{{{caption}}}")
    lines.append(rf"\label{{{label}}}")
    lines.append(r"\begin{tabular}{lcc}")
    lines.append(r"\hline")
    lines.append(r"\textbf{Model} & \textbf{Pairwise Acc.} & \textbf{Spearman's $\rho$} \\")
    lines.append(r"\hline")
    for _, r in df_res.iterrows():
        lines.append(f"{r['Model']} & {float(r['PairwiseAcc_2025']):.3f} & {float(r['Spearman_rho_2025']):.3f} \\\\")
    lines.append(r"\hline")
    lines.append(r"\end{tabular}")
    lines.append(r"\end{table}")
    return "\n".join(lines)

print("\n=== LaTeX table (pairwise only; no Kendall; no N column) ===\n")
print(
    latex_table_pairwise_only(
        final_2025,
        caption="Pairwise learning-to-rank results on the held-out 2025 draft class (trained/validated on 2000--2024).",
        label="tab:pairwise_results_2025"
    )
)


Features: ['Totals_FG', 'Totals_FT', 'Totals_TRB', 'Totals_BLK', 'Totals_STL', 'Totals_TOV', 'Totals_PF', 'Shooting_FG%', 'MP', 'Age']
Train rows: 1147 | seasons: 24
Test rows (2025): 63

=== Season-wise CV (2000â€“2024) for Pairwise rankers ===


 -1.1936127  -0.8976547  -0.49724096 -0.44501308 -0.5494688   1.3655535
 -1.1239755  -2.394854    0.07726575  0.65177244  0.49508882 -0.44501308
  1.3829628   0.77363753 -0.5146502  -0.25351083 -1.1413848   0.8432747
  0.5299074  -1.1239755   0.37322375  0.2339494   1.5570557  -0.04459931
  0.5473167   0.40804234 -0.6365153  -0.30573872  0.11208434  0.14690293
  1.3481442   0.564726    0.2513587  -0.20128295  1.6092837  -0.46242237
  0.860684   -0.619106    1.2088698  -0.28832942  1.4003721  -0.09682719
 -0.619106   -0.323148    1.3481442  -0.0620086   1.156642   -0.82801753
 -1.2806592  -0.6016967   1.2610978   0.5995446  -0.56687814  0.16431223
 -0.27092013 -0.00978072 -1.7681195   0.7214096   0.49508882 -0.02719001
 -1.332887   -0.42760378 -1.089157    1.330735    0.61695385 -1.924803
 -0.7061525  -0.49724096 -1.1936127  -1.3677057  -0.00978072 -0.3405573
 -0.18387365 -0.27092013 -2.9693606  -0.6887432   0.05985646  1.3133256
  0.77363753  1.9400603   1.5918744  -0.00978072  1.38296


Fold summaries:
           model  fold  macro_PairwiseAcc  macro_Spearman_rho
         RankSVM     1           0.703417            0.554715
PairwiseLogistic     1           0.703049            0.553536
         RankNet     1           0.691997            0.531203
         RankSVM     2           0.700446            0.574177
PairwiseLogistic     2           0.700872            0.573839
         RankNet     2           0.687043            0.540632
         RankSVM     3           0.708749            0.582524
PairwiseLogistic     3           0.709113            0.583237
         RankNet     3           0.707227            0.568750
         RankSVM     4           0.718178            0.618734
PairwiseLogistic     4           0.718393            0.618624
         RankNet     4           0.711365            0.603136
         RankSVM     5           0.696923            0.560632
PairwiseLogistic     5           0.697571            0.562268
         RankNet     5           0.691289            

 -0.23790398]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train_scaled.loc[:, feature_cols] = Xtr
  0.30271408]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train_scaled.loc[:, feature_cols] = Xtr
 -0.49800906]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train_scaled.loc[:, feature_cols] = Xtr
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights



=== FINAL PAIRWISE TEST RESULTS on SEASON==2025 ===
           Model  PairwiseAcc_2025  Spearman_rho_2025  N_2025
         RankSVM          0.725550           0.642386      63
PairwiseLogistic          0.725038           0.641953      63
         RankNet          0.717870           0.629901      63

=== LaTeX table (pairwise only; no Kendall; no N column) ===

\begin{table}[H]
\centering
\caption{Pairwise learning-to-rank results on the held-out 2025 draft class (trained/validated on 2000--2024).}
\label{tab:pairwise_results_2025}
\begin{tabular}{lcc}
\hline
\textbf{Model} & \textbf{Pairwise Acc.} & \textbf{Spearman's $\rho$} \\
\hline
RankSVM & 0.726 & 0.642 \\
PairwiseLogistic & 0.725 & 0.642 \\
RankNet & 0.718 & 0.630 \\
\hline
\end{tabular}
\end{table}
