In [6]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

def _find_repo_root() -> Path:
    here = Path.cwd().resolve()
    for base in [here, *here.parents]:
        if (base / 'ranker').is_dir():
            return base
    return here

ROOT = _find_repo_root()
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# Pairwise / LTR helpers
from ranker.pairwise import (
    load_and_clean, pick_feature_columns, select_top_features,
    ltr_fit_predict, pairwise_fit_predict, random_search_ltr, grid_search_ltr,
    write_rankings_csv, evaluate_spearman_by_season, RANDOM_STATE, DEFAULT_DATA_PATHS,
)


In [7]:
# ----------------------------
# 1. Load data
# ----------------------------
data_path = DEFAULT_DATA_PATHS[0]
df = load_and_clean(data_path)
print(data_path, df.shape)
df.head()


FileNotFoundError: [Errno 2] No such file or directory: '/Users/young/code/pa_project/nba-draft-ranker/outputs/nba_college_selected_features.csv'

In [None]:
# ----------------------------
# 2. Feature selection (Spearman + mutual info)
# ----------------------------
feature_cols, dropped_cols = pick_feature_columns(df)
print(f'Base features ({len(feature_cols)}):', feature_cols)
top_k = min(12, len(feature_cols))
feature_cols, scores = select_top_features(df, feature_cols, top_k)
print(f'Selected top {len(feature_cols)} features: {feature_cols}')
if scores:
    print('Top scores:')
    for col, sp, mi, comb in scores[:top_k]:
        print(f"  {col:20s} spearman={sp:+.3f} mi={mi:.4f} combined={comb:.3f}")


In [None]:
# ----------------------------
# 3. Train/test split by season
# ----------------------------
seasons = np.sort(df['season'].unique())
train_last = int(seasons[-2]) if len(seasons) > 1 else int(seasons[-1])
train_df = df[df['season'] <= train_last].reset_index(drop=True)
test_df = df[df['season'] > train_last].reset_index(drop=True)
print(f'Train seasons <= {train_last}:', train_df.shape, '| Test:', test_df.shape)


In [None]:
# ----------------------------
# 4. LTR (LightGBM LambdaRank)
# ----------------------------
use_random_search = False  # set True to try random search
use_grid_search = False    # set True to try grid search

ensemble_cfgs = [
    {'seed': RANDOM_STATE, 'num_leaves':31, 'learning_rate':0.05, 'num_boost_round':3000, 'min_child_samples':20},
    {'seed': RANDOM_STATE+101, 'num_leaves':47, 'learning_rate':0.04, 'num_boost_round':3500, 'min_child_samples':25},
    {'seed': RANDOM_STATE+202, 'num_leaves':63, 'learning_rate':0.035, 'num_boost_round':4000, 'min_child_samples':30},
]

if use_random_search:
    best_cfg, best_score, _ = random_search_ltr(
        train_df, feature_cols, season_zscore=True, val_last_k=2, n_trials=12
    )
    seeds = [RANDOM_STATE, RANDOM_STATE+101, RANDOM_STATE+202]
    ensemble_cfgs = [{**best_cfg, 'seed': s} for s in seeds]
    print('[Random search] best val spearman:', best_score)
elif use_grid_search:
    best_cfg, best_score, _ = grid_search_ltr(
        train_df, feature_cols, season_zscore=True, val_last_k=2
    )
    seeds = [RANDOM_STATE, RANDOM_STATE+101, RANDOM_STATE+202]
    ensemble_cfgs = [{**best_cfg, 'seed': s} for s in seeds]
    print('[Grid search] best val spearman:', best_score)

ltr_scores, ltr_sorted = ltr_fit_predict(
    train_df, test_df, feature_cols, season_zscore=True, val_last_k=2, ensemble_cfgs=ensemble_cfgs
)
ltr_corrs = evaluate_spearman_by_season(ltr_sorted, ltr_scores)
print('LTR Spearman by season:', ltr_corrs)


In [None]:
# ----------------------------
# 5. Pairwise ensemble baseline
# ----------------------------
pw_scores, pw_sorted = pairwise_fit_predict(
    train_df, test_df, feature_cols, within_position_pairs=False, max_pairs_per_season=30000, n_splits=3
)
pw_corrs = evaluate_spearman_by_season(pw_sorted, pw_scores)
print('Pairwise Spearman by season:', pw_corrs)


In [None]:
# ----------------------------
# 6. Choose final scores & write CSV
# ----------------------------
final_scores = pw_scores  # or ltr_scores, or 0.5*pw_scores + 0.5*ltr_scores
final_df = pw_sorted  # align with chosen scores
write_rankings_csv(Path('../outputs/pairwise_rankings.csv'), final_df, final_scores)
final_df.assign(pred_score=final_scores).head()
