In [20]:
import pandas as pd
from pathlib import Path


# The feature table lives in <two levels above the working directory>/outputs.
root_dir = Path.cwd().parent.parent
dataset_path = root_dir.joinpath("outputs", "nba_college_selected_features.csv")

# Read the table, then show its column index and (rows, columns) shape,
# one per line.
df = pd.read_csv(dataset_path)
print(df.columns, df.shape, sep="\n")

Index(['player_name', 'OVERALL_PICK', 'SEASON', 'Totals_FG', 'Totals_FT',
       'Totals_TRB', 'Totals_BLK', 'Totals_STL', 'Totals_TOV', 'Totals_PF',
       'Shooting_FG%', 'MP', 'Age'],
      dtype='object')
(888, 13)


In [28]:
import pandas as pd
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# ----------------------------
# 1. Load data
# ----------------------------
# The CSV is expected under <two levels above the working directory>/outputs.
root_dir = Path.cwd().parent.parent
dataset_path = root_dir.joinpath("outputs", "nba_college_selected_features.csv")
df = pd.read_csv(dataset_path)

# ----------------------------
# 2. Train / Test split by SEASON
#    - Last 2 seasons (in sorted order) -> test
#    - All earlier seasons -> train
# ----------------------------
all_seasons = sorted(df["SEASON"].unique())
train_seasons, test_seasons = all_seasons[:-2], all_seasons[-2:]

# Independent copies so later edits to one split never touch `df`.
in_train = df["SEASON"].isin(train_seasons)
in_test = df["SEASON"].isin(test_seasons)
df_train = df[in_train].copy()
df_test = df[in_test].copy()

# ----------------------------
# 3. Features, labels, and SCALING
# ----------------------------
# Everything except the identifier, the target and the grouping key is a feature.
drop_cols = ["player_name", "OVERALL_PICK", "SEASON"]
feature_cols = df.columns[~df.columns.isin(drop_cols)].tolist()

print("Feature columns:", feature_cols)

# Standardisation statistics come from the TRAIN split only, so nothing
# about the test seasons leaks into the scaling.
train_feats = df_train[feature_cols]
feat_mean = train_feats.mean()
# A zero standard deviation is swapped for 1.0 to avoid dividing by zero.
feat_std = train_feats.std().replace(0, 1.0)


def build_season_groups(df_slice):
    """
    Turn a slice of the dataset into one (season, X, y) tuple per season.

    Rows of each season are ordered by ascending OVERALL_PICK before the
    tensors are built, and the feature columns are standardised with the
    module-level train statistics (`feat_mean`, `feat_std`).

    Returns a list of tuples:
        season: the SEASON group key
        X: float32 tensor [n_players, n_features], SCALED
        y: float32 tensor [n_players] (OVERALL_PICK)
    """

    def _season_tensors(season, season_rows):
        # Order players by draft position so X and y rows line up.
        ordered = season_rows.sort_values("OVERALL_PICK")

        # Standardise with the TRAIN means/stds (never refit per slice).
        standardized = (ordered[feature_cols] - feat_mean) / feat_std

        features = torch.tensor(standardized.values, dtype=torch.float32)
        picks = torch.tensor(ordered["OVERALL_PICK"].values, dtype=torch.float32)
        return season, features, picks

    return [
        _season_tensors(season, season_rows)
        for season, season_rows in df_slice.groupby("SEASON")
    ]

# Per-season (season, X, y) tuples for each split; train is built first.
train_groups, test_groups = (
    build_season_groups(split) for split in (df_train, df_test)
)

print(f"#train seasons: {len(train_groups)}, #test seasons: {len(test_groups)}")

# The network's input width equals the number of feature columns.
input_dim = len(feature_cols)

# ----------------------------
# 4. Simple ranking model
# ----------------------------
class RankMLP(nn.Module):
    """
    One-hidden-layer perceptron producing a single ranking score per row.

    Architecture: Linear(input_dim -> hidden_dim) -> ReLU -> Linear(hidden_dim -> 1).
    """

    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()
        # Layers are created in forward order and wrapped in a Sequential
        # stored as `net`.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # x: [N, D] -> raw: [N, 1]
        raw = self.net(x)
        # Drop the trailing singleton dimension -> [N]
        return raw.squeeze(dim=-1)

# ----------------------------
# 5. Listwise losses
# ----------------------------

def listnet_loss(scores, labels):
    """
    ListNet top-1 cross entropy for a single list.

    Args:
        scores: [N] model scores (higher means better).
        labels: [N] OVERALL_PICK (lower is better in reality).

    Returns:
        Scalar tensor: the cross entropy between the target top-1
        distribution softmax(-labels) and the model top-1 distribution
        softmax(scores).

    Labels are converted to relevance with rel = -labels, so the smallest
    pick receives the largest share of the target distribution.
    """
    rel = -labels  # larger rel = better
    target_probs = F.softmax(rel, dim=0)

    # Work in log space: log(softmax(scores) + eps) saturates at log(eps)
    # once a score sits far below the list maximum, which caps the loss and
    # removes the gradient for exactly the items that are ranked worst.
    # log_softmax has no such floor and needs no epsilon.
    log_pred_probs = F.log_softmax(scores, dim=0)

    return -torch.sum(target_probs * log_pred_probs)

def listmle_loss(scores, labels):
    """
    ListMLE loss for a single list.

    scores: [N] model scores
    labels: [N] OVERALL_PICK (lower = better)

    The scores are arranged in the true ranking order (ascending
    OVERALL_PICK). The loss is the negative log-likelihood
    sum_i [log(sum_{j>=i} exp(s_j)) - s_i] over that arrangement.
    """
    # Permutation that puts the best (smallest) pick first.
    true_order = torch.argsort(labels, descending=False)
    ranked = scores[true_order]

    # Suffix log-sum-exp: run a prefix logcumsumexp over the reversed
    # sequence, then reverse back, so entry i covers positions j >= i.
    reversed_ranked = torch.flip(ranked, dims=[0])
    suffix_lse = torch.flip(torch.logcumsumexp(reversed_ranked, dim=0), dims=[0])

    # Negative log-likelihood, accumulated directly with the sign flipped.
    return (suffix_lse - ranked).sum()

# ----------------------------
# 6. Evaluation: pairwise ranking accuracy
# ----------------------------
def pairwise_accuracy(scores, labels):
    """
    Pairwise ranking accuracy within one list.

    True order: lower OVERALL_PICK is better. A pair of items is counted as
    correct only when the model strictly agrees with the true order, i.e.
    the item with the lower label has the strictly higher score.

    Tie handling (independent of the order in which items are passed in):
      - pairs with equal labels have no true order and are left out of
        the denominator;
      - pairs with equal scores but different labels count as incorrect,
        so a constant-score model gets 0.0 rather than order-dependent
        credit.

    Args:
        scores: [N] tensor of model scores (higher means better).
        labels: [N] tensor of OVERALL_PICK values.

    Returns:
        Python float in [0, 1]; 0.0 when there is no comparable pair
        (fewer than two items, or all labels equal).
    """
    scores = scores.detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()
    n = len(labels)
    if n < 2:
        return 0.0

    # Every unordered pair (i, j) with i < j, enumerated once.
    first, second = np.triu_indices(n, k=1)

    # Direct comparisons (not differences) so infinities cannot yield NaN.
    first_truly_better = labels[first] < labels[second]
    second_truly_better = labels[first] > labels[second]
    first_pred_better = scores[first] > scores[second]
    second_pred_better = scores[first] < scores[second]

    comparable = first_truly_better | second_truly_better
    total = int(np.count_nonzero(comparable))
    if total == 0:
        return 0.0

    agree = (first_truly_better & first_pred_better) | (
        second_truly_better & second_pred_better
    )
    correct = int(np.count_nonzero(agree))
    return correct / total

def evaluate_model(model, groups):
    """
    Pair-weighted pairwise accuracy of `model` over several lists.

    Args:
        model: callable module mapping a feature tensor to per-item scores.
        groups: iterable of (season, X, y) tuples.

    Returns:
        Float: each list's pairwise accuracy weighted by its number of item
        pairs n*(n-1)/2, divided by the total pair count; 0.0 when no list
        has at least two items. The model is put in eval mode and is left
        in that mode.
    """
    model.eval()
    weighted_hits = 0.0
    pair_count = 0
    with torch.no_grad():
        for _, features, picks in groups:
            predicted = model(features)
            list_len = len(picks)
            # Lists with fewer than two items contribute no pairs.
            if list_len >= 2:
                pairs_here = list_len * (list_len - 1) // 2
                weighted_hits += pairwise_accuracy(predicted, picks) * pairs_here
                pair_count += pairs_here

    if pair_count > 0:
        return weighted_hits / pair_count
    return 0.0

# ----------------------------
# 7. Training loop helper
# ----------------------------
def train_listwise(model, groups, loss_fn, n_epochs=200, lr=1e-3, name="model"):
    """
    Fit `model` with Adam, taking one optimizer step per list per epoch.

    Args:
        model: module mapping a feature tensor to per-item scores.
        groups: sequence of (season, X, y) tuples; each tuple is one list.
        loss_fn: callable (scores, y) -> scalar loss tensor.
        n_epochs: number of passes over `groups`.
        lr: Adam learning rate.
        name: tag used as the prefix of the progress lines.

    Returns:
        The same `model` object, trained in place.

    The mean per-list loss is printed after the first epoch and after every
    20th epoch.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Guard the average against an empty `groups`.
    n_lists = max(len(groups), 1)

    def _fit_one_list(features, targets):
        # One full gradient step on a single list; returns the loss value.
        optimizer.zero_grad()
        loss = loss_fn(model(features), targets)
        loss.backward()
        optimizer.step()
        return loss.item()

    epoch = 0
    while epoch < n_epochs:
        epoch += 1
        model.train()

        running = 0.0
        for _, features, targets in groups:
            running += _fit_one_list(features, targets)
        avg_loss = running / n_lists

        if epoch == 1 or epoch % 20 == 0:
            print(f"[{name}] Epoch {epoch:3d} | train loss = {avg_loss:.4f}")

    return model

# ----------------------------
# 8. Train and evaluate ListNet
# ----------------------------
def _fit_and_report(loss_fn, name):
    """Seed, train a fresh RankMLP with `loss_fn`, print and return its accuracies.

    Returns (model, train_accuracy, test_accuracy).
    """
    # Re-seed before building the network so every run starts from the
    # same seeded initialisation.
    torch.manual_seed(42)

    model = train_listwise(
        RankMLP(input_dim=input_dim, hidden_dim=64),
        train_groups,
        loss_fn=loss_fn,
        n_epochs=200,
        lr=1e-3,
        name=name,
    )

    train_acc = evaluate_model(model, train_groups)
    test_acc = evaluate_model(model, test_groups)

    print(f"\n=== {name} Results ===")
    print(f"Train pairwise accuracy: {train_acc:.3f}")
    print(f"Test  pairwise accuracy: {test_acc:.3f}")
    return model, train_acc, test_acc


listnet_model, train_acc_listnet, test_acc_listnet = _fit_and_report(
    listnet_loss, "ListNet"
)

# ----------------------------
# 9. Train and evaluate ListMLE
# ----------------------------
listmle_model, train_acc_listmle, test_acc_listmle = _fit_and_report(
    listmle_loss, "ListMLE"
)


Feature columns: ['Totals_FG', 'Totals_FT', 'Totals_TRB', 'Totals_BLK', 'Totals_STL', 'Totals_TOV', 'Totals_PF', 'Shooting_FG%', 'MP', 'Age']
#train seasons: 17, #test seasons: 2
[ListNet] Epoch   1 | train loss = 3.7726
[ListNet] Epoch  20 | train loss = 2.4898
[ListNet] Epoch  40 | train loss = 2.2237
[ListNet] Epoch  60 | train loss = 2.0067
[ListNet] Epoch  80 | train loss = 1.8472
[ListNet] Epoch 100 | train loss = 1.7235
[ListNet] Epoch 120 | train loss = 1.6256
[ListNet] Epoch 140 | train loss = 1.5489
[ListNet] Epoch 160 | train loss = 1.4854
[ListNet] Epoch 180 | train loss = 1.4280
[ListNet] Epoch 200 | train loss = 1.3765

=== ListNet Results ===
Train pairwise accuracy: 0.716
Test  pairwise accuracy: 0.700
[ListMLE] Epoch   1 | train loss = 135.1516
[ListMLE] Epoch  20 | train loss = 124.3245
[ListMLE] Epoch  40 | train loss = 122.9757
[ListMLE] Epoch  60 | train loss = 121.9305
[ListMLE] Epoch  80 | train loss = 121.0653
[ListMLE] Epoch 100 | train loss = 120.2505
[ListMLE