In [10]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

def _find_repo_root() -> Path:
    """Locate the repository root by walking up from the working directory.

    The resolved current working directory is checked first, then each of its
    ancestors in turn; the first directory that contains a ``ranker``
    sub-directory is returned.

    Returns:
        The closest matching directory, or the resolved working directory
        itself when no candidate contains ``ranker``.
    """
    start = Path.cwd().resolve()
    candidates = (start, *start.parents)
    return next((d for d in candidates if (d / 'ranker').is_dir()), start)

# Put the detected repo root at the front of sys.path (once) so the `ranker`
# package imported below can be resolved.
ROOT = _find_repo_root()
_root_str = str(ROOT)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)

# Pairwise / LTR helpers
from ranker.pairwise import (
    load_and_clean, pick_feature_columns, select_top_features,
    ltr_fit_predict, pairwise_fit_predict, random_search_ltr, grid_search_ltr,
    write_rankings_csv, evaluate_spearman_by_season, RANDOM_STATE, DEFAULT_DATA_PATHS,
)


In [11]:
# ----------------------------
# 1. Load data
# ----------------------------
# Take the first configured data path, run it through the project's
# load-and-clean helper, then report the path and the resulting frame shape.
data_path = DEFAULT_DATA_PATHS[0]
df = load_and_clean(data_path)
print(f'{data_path} {df.shape}')
df.head()


[Clean] Deduped (season, overall_pick): removed 113 rows (had 113 duplicates).
/Users/young/code/pa_project/nba-draft-ranker/outputs/college_stats.csv (1097, 22)


Unnamed: 0,player_name,overall_pick,season,totals_fg,totals_ft,totals_trb,totals_blk,totals_stl,totals_tov,totals_pf,...,age,position_group_UNK,position_group_nan,totals_fg_per_min,totals_ft_per_min,totals_trb_per_min,totals_blk_per_min,totals_stl_per_min,totals_tov_per_min,totals_pf_per_min
0,Kemba Walker,9,2011,316,258,223,7.0,77.0,93.0,56.0,...,20.0,True,False,0.204796,0.167207,0.144524,0.004537,0.049903,0.060272,0.036293
1,Devonte' Graham,34,2018,199,167,157,2.0,62.0,109.0,56.0,...,22.0,True,False,0.135007,0.113297,0.106513,0.001357,0.042062,0.073948,0.037992
2,Jonny Flynn,6,2009,219,180,104,6.0,54.0,129.0,55.0,...,19.0,True,False,0.154443,0.126939,0.073343,0.004231,0.038082,0.090973,0.038787
3,D.J. Augustin,9,2008,242,173,112,0.0,47.0,105.0,61.0,...,20.0,True,False,0.170783,0.122089,0.07904,0.0,0.033169,0.0741,0.043049
4,Michael Carter-Williams,11,2013,155,129,199,20.0,109.0,138.0,92.0,...,21.0,True,False,0.110007,0.091554,0.141235,0.014194,0.07736,0.097942,0.065295


In [12]:
# ----------------------------
# 2. Feature selection (Spearman + mutual info)
# ----------------------------
# NOTE(review): selection is run on the full `df`, i.e. before the season split
# in section 3. If select_top_features scores features against the target, the
# holdout seasons influence which features are kept — confirm this is intended.
feature_cols, dropped_cols = pick_feature_columns(df)
print(f'Base features ({len(feature_cols)}):', feature_cols)

# Keep at most 12 features, or all of them when fewer are available.
top_k = len(feature_cols) if len(feature_cols) < 12 else 12
feature_cols, scores = select_top_features(df, feature_cols, top_k)
print(f'Selected top {len(feature_cols)} features: {feature_cols}')

if scores:
    print('Top scores:')
    # Each entry unpacks to (column, spearman, mutual info, combined score).
    for entry in scores[:top_k]:
        col, sp, mi, comb = entry
        print(f"  {col:20s} spearman={sp:+.3f} mi={mi:.4f} combined={comb:.3f}")


Base features (19): ['shooting_fg%', 'mp', 'age', 'totals_fg_per_min', 'totals_ft_per_min', 'totals_trb_per_min', 'totals_blk_per_min', 'totals_stl_per_min', 'totals_tov_per_min', 'totals_pf_per_min', 'totals_fg', 'totals_ft', 'totals_trb', 'totals_blk', 'totals_stl', 'totals_tov', 'totals_pf', 'position_group_UNK', 'position_group_nan']
Selected top 12 features: ['age', 'shooting_fg%', 'totals_blk_per_min', 'totals_trb', 'totals_blk', 'totals_ft_per_min', 'totals_trb_per_min', 'totals_fg_per_min', 'totals_fg', 'totals_ft', 'totals_stl', 'totals_stl_per_min']
Top scores:
  age                  spearman=-0.473 mi=0.1398 combined=0.973
  shooting_fg%         spearman=+0.140 mi=0.0514 combined=0.324
  totals_blk_per_min   spearman=+0.153 mi=0.0436 combined=0.309
  totals_trb           spearman=+0.115 mi=0.0499 combined=0.293
  totals_blk           spearman=+0.174 mi=0.0116 combined=0.216
  totals_ft_per_min    spearman=+0.133 mi=0.0193 combined=0.202
  totals_trb_per_min   spearman=+0.088

  sp = float(spearmanr(y[mask], s[mask]).correlation)
  sp = float(spearmanr(y[mask], s[mask]).correlation)


In [13]:
# ----------------------------
# 3. Train/test split by season (match CLI defaults)
# ----------------------------
holdout_seasons = 4  # same as CLI default
seasons = np.sort(df['season'].unique())

# The last `holdout_seasons` seasons form the test set. When there are not
# enough seasons to hold any out, every season goes to training and the test
# frame ends up empty.
has_holdout = len(seasons) > holdout_seasons
train_last = int(seasons[-holdout_seasons - 1]) if has_holdout else int(seasons.max())

train_df = df.loc[df['season'] <= train_last].reset_index(drop=True)
test_df = df.loc[df['season'] > train_last].reset_index(drop=True)
print(f'Train seasons <= {train_last} (holdout={holdout_seasons}):', train_df.shape, '| Test:', test_df.shape)


Train seasons <= 2021 (holdout=4): (925, 22) | Test: (172, 22)


In [14]:
# ----------------------------
# 3b. Season-level K-fold (optional)
# ----------------------------
do_kfold_cv = False  # set True to run K-fold across seasons
k_folds = min(5, len(np.unique(df['season'])))  # at most 5 folds, never more folds than seasons

if not do_kfold_cv:
    print('Skip K-fold; set do_kfold_cv=True to run.')
elif k_folds < 2:
    # KFold requires n_splits >= 2, so report the real reason instead of
    # telling the user to flip a flag that is already on.
    print(f'Skip K-fold; need at least 2 distinct seasons, found {k_folds}.')
else:
    from sklearn.model_selection import KFold

    # Ensemble members are identical for every fold, so define them once.
    ensemble_cfgs_cv = [
        {'seed': RANDOM_STATE, 'num_leaves':31, 'learning_rate':0.05, 'num_boost_round':3000, 'min_child_samples':20},
        {'seed': RANDOM_STATE+101, 'num_leaves':47, 'learning_rate':0.04, 'num_boost_round':3500, 'min_child_samples':25},
        {'seed': RANDOM_STATE+202, 'num_leaves':63, 'learning_rate':0.035, 'num_boost_round':4000, 'min_child_samples':30},
    ]

    # Folds are built over the sorted season values (not over rows), so every
    # season lands entirely in either the train or the test side of a fold.
    seasons_all = np.sort(df['season'].unique())
    kf = KFold(n_splits=k_folds, shuffle=False)
    cv_scores = []
    for fold, (tr_idx, te_idx) in enumerate(kf.split(seasons_all), 1):
        train_seasons = seasons_all[tr_idx]
        test_seasons = seasons_all[te_idx]
        tr_df = df[df['season'].isin(train_seasons)].reset_index(drop=True)
        te_df = df[df['season'].isin(test_seasons)].reset_index(drop=True)

        # Hand each fold fresh copies of the configs so that nothing done to
        # them inside ltr_fit_predict can carry over to the next fold.
        scores_cv, sorted_cv = ltr_fit_predict(
            tr_df, te_df, feature_cols, season_zscore=True, val_last_k=2,
            ensemble_cfgs=[dict(cfg) for cfg in ensemble_cfgs_cv],
        )
        corrs_cv = evaluate_spearman_by_season(sorted_cv, scores_cv)
        vals_cv = [c for _, c in corrs_cv if not np.isnan(c)]
        mean_cv = float(np.mean(vals_cv)) if vals_cv else float('nan')
        cv_scores.append(mean_cv)
        print(f"Fold {fold}: train seasons {train_seasons} | test seasons {test_seasons} | LTR avg Spearman={mean_cv:.3f}")

    if cv_scores:
        # Average only the folds that produced a score; avoids the
        # "mean of empty slice" warning when every fold is NaN.
        valid_cv_scores = [s for s in cv_scores if not np.isnan(s)]
        cv_mean = float(np.mean(valid_cv_scores)) if valid_cv_scores else float('nan')
        print('K-fold mean LTR Spearman:', cv_mean)


Skip K-fold; set do_kfold_cv=True to run.


In [15]:
# ----------------------------
# 4. LTR (LightGBM LambdaRank)
# ----------------------------
use_random_search = False  # set True to try random search
use_grid_search = False    # set True to try grid search
season_zscore = True       # recommended for this data
val_last_k = 2             # same as CLI default

# Default three-member ensemble; replaced below when a search is enabled.
ensemble_cfgs = [
    {
        'seed': RANDOM_STATE, 'num_leaves':31, 'learning_rate':0.05,
        'num_boost_round':3000, 'min_child_samples':20,
    },
    {
        'seed': RANDOM_STATE+101, 'num_leaves':47, 'learning_rate':0.04,
        'num_boost_round':3500, 'min_child_samples':25,
    },
    {
        'seed': RANDOM_STATE+202, 'num_leaves':63, 'learning_rate':0.035,
        'num_boost_round':4000, 'min_child_samples':30,
    },
]

# Run at most one hyper-parameter search; random search wins if both flags are set.
search_outcome = None
if use_random_search:
    search_outcome = (
        '[Random search] best val spearman:',
        random_search_ltr(
            train_df, feature_cols, season_zscore=season_zscore, val_last_k=val_last_k, n_trials=12
        ),
    )
elif use_grid_search:
    search_outcome = (
        '[Grid search] best val spearman:',
        grid_search_ltr(
            train_df, feature_cols, season_zscore=season_zscore, val_last_k=val_last_k
        ),
    )

if search_outcome is not None:
    # Reuse the best configuration for every ensemble member, varying only the seed.
    search_banner, (best_cfg, best_score, _) = search_outcome
    seeds = [RANDOM_STATE, RANDOM_STATE+101, RANDOM_STATE+202]
    ensemble_cfgs = [{**best_cfg, 'seed': s} for s in seeds]
    print(search_banner, best_score)

# Fit on the training seasons and score the held-out seasons.
ltr_scores, ltr_sorted = ltr_fit_predict(
    train_df,
    test_df,
    feature_cols,
    season_zscore=season_zscore,
    val_last_k=val_last_k,
    ensemble_cfgs=ensemble_cfgs,
)
ltr_corrs = evaluate_spearman_by_season(ltr_sorted, ltr_scores)
ltr_vals = [corr for _season, corr in ltr_corrs if not np.isnan(corr)]
print('LTR Spearman by season:', ltr_corrs)
if ltr_vals:
    print('LTR avg Spearman:', float(np.mean(ltr_vals)))


[250]	valid_0's mean_spearman: 0.409119
[500]	valid_0's mean_spearman: 0.393156
[250]	valid_0's mean_spearman: 0.395294
[250]	valid_0's mean_spearman: 0.425559
LTR Spearman by season: [(2022, 0.5326148610323312), (2023, 0.4803689064558629), (2024, 0.583015963049996), (2025, 0.7946239806704923)]
LTR avg Spearman: 0.5976559278021706


In [None]:
# ----------------------------
# 5. Pairwise ensemble baseline
# ----------------------------
# Fit the pairwise models on the training seasons and score the held-out seasons.
pw_scores, pw_sorted = pairwise_fit_predict(
    train_df,
    test_df,
    feature_cols,
    within_position_pairs=False,
    max_pairs_per_season=30000,
    n_splits=3,
)
pw_corrs = evaluate_spearman_by_season(pw_sorted, pw_scores)
# Seasons whose correlation is NaN are left out of the average.
pw_vals = [corr for _season, corr in pw_corrs if not np.isnan(corr)]
print('Pairwise Spearman by season:', pw_corrs)
if pw_vals:
    print('Pairwise avg Spearman:', float(np.mean(pw_vals)))


[CV]      log_reg  logloss=0.4363
[CV]      sgd_log  logloss=0.4388
[CV]          hgb  logloss=0.4451


In [29]:
# ----------------------------
# 5b. Additional ML baselines (regression on pick)
# ----------------------------
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor, HistGradientBoostingRegressor

ml_models = {}

# Order both splits by (season, pick) so predictions line up with the frame
# that is later passed to the per-season evaluation.
sort_keys = ['season','overall_pick']
train_sorted_ml = train_df.sort_values(sort_keys).reset_index(drop=True)
test_sorted_ml = test_df.sort_values(sort_keys).reset_index(drop=True)
X_train_ml = train_sorted_ml[feature_cols]
X_test_ml = test_sorted_ml[feature_cols]
y_train_ml = -train_sorted_ml['overall_pick'].to_numpy()  # higher is better

# Impute then scale; both transformers are fitted on the training split only.
imp = SimpleImputer()
scaler = StandardScaler()
Xtr_imputed = imp.fit_transform(X_train_ml)
Xtr = scaler.fit_transform(Xtr_imputed)
Xte_imputed = imp.transform(X_test_ml)
Xte = scaler.transform(Xte_imputed)

baseline_regressors = {
    'extra_trees_reg': ExtraTreesRegressor(
        n_estimators=800,
        max_depth=None,
        min_samples_leaf=4,
        max_features='sqrt',
        n_jobs=-1,
        random_state=RANDOM_STATE,
    ),
    'hgb_reg': HistGradientBoostingRegressor(
        learning_rate=0.05,
        max_depth=6,
        max_iter=600,
        l2_regularization=1e-2,
        random_state=RANDOM_STATE,
    ),
}

for name, model in baseline_regressors.items():
    model.fit(Xtr, y_train_ml)
    preds = model.predict(Xte)
    corrs = evaluate_spearman_by_season(test_sorted_ml, preds)
    # Keep predictions and per-season correlations for the comparison table.
    ml_models[name] = (preds, corrs)
    vals = [corr for _season, corr in corrs if not np.isnan(corr)]
    if vals:
        mean_val = float(np.nanmean(vals))
    else:
        mean_val = float('nan')
    print(f"{name}: avg Spearman={mean_val:.3f} | per-season={corrs}")


extra_trees_reg: avg Spearman=0.640 | per-season=[(2022, 0.5758852605137347), (2023, 0.5544137022397891), (2024, 0.6207762742079249), (2025, 0.8074599818785864)]
hgb_reg: avg Spearman=0.576 | per-season=[(2022, 0.5190017016449234), (2023, 0.5272727272727272), (2024, 0.4977716554574184), (2025, 0.7585321655089098)]


In [30]:
# ----------------------------
# 6. Model comparison table
# ----------------------------
results = []

def _avg_corr(corrs):
    vals = [c for _, c in corrs if not np.isnan(c)]
    return float(np.nanmean(vals)) if vals else float('nan')

if 'ltr_scores' in locals():
    ltr_mean = _avg_corr(ltr_corrs)
    results.append({'method': 'ltr', 'avg_spearman': ltr_mean})

if 'pw_scores' in locals():
    pw_mean = _avg_corr(pw_corrs)
    results.append({'method': 'pairwise', 'avg_spearman': pw_mean})

if 'ml_models' in locals():
    for name, (preds, corrs) in ml_models.items():
        results.append({'method': name, 'avg_spearman': _avg_corr(corrs)})

hybrid_combo = None
best_alpha = None
best_mean = float('-inf')
if 'ltr_scores' in locals() and 'pw_scores' in locals():
    alphas = np.linspace(0.0, 1.0, 11)
    for a in alphas:
        hybrid_scores = a * pw_scores + (1.0 - a) * ltr_scores
        hybrid_corrs = evaluate_spearman_by_season(pw_sorted, hybrid_scores)
        m = _avg_corr(hybrid_corrs)
        results.append({'method': f'hybrid_{a:.2f}', 'avg_spearman': m})
        if not np.isnan(m) and m > best_mean:
            best_mean = m
            best_alpha = a
            hybrid_combo = (hybrid_scores, hybrid_corrs)

results_df = pd.DataFrame(results) if results else pd.DataFrame(columns=['method','avg_spearman'])
print(results_df.sort_values('avg_spearman', ascending=False))
if not results_df.empty:
    best_method = results_df.sort_values('avg_spearman', ascending=False).iloc[0]
    print('Best method:', dict(best_method))
    if best_method['method'].startswith('hybrid') and best_alpha is not None:
        print(f"Best hybrid alpha: {best_alpha:.2f}, avg Spearman={best_mean:.3f}")


             method  avg_spearman
13      hybrid_0.90      0.642998
2   extra_trees_reg      0.639634
12      hybrid_0.80      0.632280
1          pairwise      0.630456
14      hybrid_1.00      0.630456
10      hybrid_0.60      0.622723
11      hybrid_0.70      0.619138
7       hybrid_0.30      0.610204
9       hybrid_0.50      0.609911
8       hybrid_0.40      0.609342
6       hybrid_0.20      0.602204
5       hybrid_0.10      0.599491
0               ltr      0.597656
4       hybrid_0.00      0.597656
3           hgb_reg      0.575645
Best method: {'method': 'hybrid_0.90', 'avg_spearman': np.float64(0.6429975480469944)}
Best hybrid alpha: 0.90, avg Spearman=0.643


In [None]:
# ----------------------------
# 7. Choose final scores & write CSV
# ----------------------------
# Use the scores of the winning method from the comparison table. Anything that
# cannot be resolved (including a missing `best_method`) falls back to the
# pairwise scores.
method_name = best_method['method'] if 'best_method' in locals() else None

if method_name is None:
    final_scores, final_df = pw_scores, pw_sorted  # fallback
elif method_name == 'ltr' and 'ltr_scores' in locals():
    final_scores, final_df = ltr_scores, ltr_sorted
elif method_name.startswith('hybrid') and 'hybrid_combo' in locals() and hybrid_combo is not None:
    # hybrid_combo holds (scores, per-season correlations); hybrids were scored against pw_sorted.
    final_scores, final_df = hybrid_combo[0], pw_sorted
elif 'ml_models' in locals() and method_name in ml_models:
    # ml_models maps name -> (predictions, per-season correlations).
    final_scores, final_df = ml_models[method_name][0], test_sorted_ml
else:
    final_scores, final_df = pw_scores, pw_sorted

write_rankings_csv(Path('../outputs/pairwise_rankings.csv'), final_df, final_scores)
final_df.assign(pred_score=final_scores).head()
