# Pointwise Rank

In [3]:
import sys
print(sys.executable)

/opt/anaconda3/envs/nba-ranker/bin/python3.11


In [6]:
import sys
from pathlib import Path

cwd = Path.cwd()
print("Current working dir:", cwd)

# From ranker/pointwise → go up two levels to NBA-DRAFT-RANKER
PROJECT_ROOT = cwd.parents[1]

if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("PROJECT_ROOT:", PROJECT_ROOT)
print("Contents:", [p.name for p in PROJECT_ROOT.iterdir()])


Current working dir: /Users/aryaman/Documents/Sem3/PA/nba-draft-ranker/ranker/pointwise
PROJECT_ROOT: /Users/aryaman/Documents/Sem3/PA/nba-draft-ranker
Contents: ['scrapper', '.DS_Store', 'requirements.txt', 'preprocess', 'README.md', 'ranker', '.gitignore', 'extract', '.git', 'data', 'outputs']


In [7]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
from scipy.stats import spearmanr

PROJECT_ROOT = Path(PROJECT_ROOT)

DATA_PATH = PROJECT_ROOT / "outputs" / "nba_draft_final.csv"  # or college_stats.csv
OUT_PATH = PROJECT_ROOT / "outputs" / "pointwise_rankings.csv"

FEATURE_MIN_COVERAGE = 0.5   # similar spirit to your MIN_FEATURE_COVERAGE
HOLDOUT_SEASONS = 4
RANDOM_STATE = 42

In [8]:
df = pd.read_csv(DATA_PATH)

# Normalize column names
df.columns = [c.strip().lower() for c in df.columns]

# Make sure we have season and overall_pick
if "season" not in df.columns and "draft_year" in df.columns:
    df["season"] = df["draft_year"]

if "overall_pick" not in df.columns and "overall_pick" in df.columns:
    # Just a placeholder reminder; usually it already exists
    pass

required = ["season", "overall_pick"]
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns in {DATA_PATH.name}: {missing}")

# Drop rows without key info
df = df.dropna(subset=["season", "overall_pick"]).copy()
df["season"] = df["season"].astype(int)
df["overall_pick"] = df["overall_pick"].astype(int)

print(df.shape)
df.head()


(888, 51)


Unnamed: 0,person_id,player_name,season,round_number,overall_pick,team_name,organization,organization_type,player_profile_flag,position,...,totals_tov,totals_pf,totals_pts,shooting_fg%,shooting_3p%,shooting_ft%,per game_mp,per game_pts,per game_trb,per game_ast
0,2030,Kenyon Martin,2000,1,1,Nets,Cincinnati,College/University,1,,...,56.0,71.0,585,0.568,0.286,0.684,29.3,18.9,9.7,1.4
1,2031,Stromile Swift,2000,1,2,Grizzlies,Louisiana State,College/University,1,,...,80.0,88.0,550,0.608,0.28,0.617,29.8,16.2,8.2,0.9
2,2033,Marcus Fizer,2000,1,4,Bulls,Iowa State,College/University,1,,...,77.0,103.0,844,0.582,0.357,0.732,33.6,22.8,7.7,1.1
3,2034,Mike Miller,2000,1,5,Magic,Florida,College/University,1,,...,71.0,70.0,521,0.476,0.338,0.729,28.6,14.1,6.6,2.5
4,2035,DerMarr Johnson,2000,1,6,Hawks,Cincinnati,College/University,1,,...,46.0,64.0,402,0.478,0.371,0.737,27.5,12.6,3.8,1.4


In [9]:
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Don't use the target or season as features
exclude = {"overall_pick", "season"}
candidate_features = [c for c in numeric_cols if c not in exclude]

coverage = df[candidate_features].notna().mean()
feature_cols = [c for c in candidate_features if coverage[c] >= FEATURE_MIN_COVERAGE]

print(f"Total numeric candidate features: {len(candidate_features)}")
print(f"Using {len(feature_cols)} features with coverage >= {FEATURE_MIN_COVERAGE:.2f}")
print("First 10 features:", feature_cols[:10])


Total numeric candidate features: 42
Using 39 features with coverage >= 0.50
First 10 features: ['person_id', 'round_number', 'player_profile_flag', 'height', 'weight', 'wingspan', 'body_fat_pct', 'standing_vertical_leap', 'max_vertical_leap', 'lane_agility_time']


In [10]:
def fit_preprocessors(train_df: pd.DataFrame, feature_cols):
    imputer = SimpleImputer(strategy="mean")
    scaler = StandardScaler()
    X_raw = train_df[feature_cols]
    imputer.fit(X_raw)
    scaler.fit(imputer.transform(X_raw))
    return imputer, scaler

def transform_features(df: pd.DataFrame, feature_cols, imputer, scaler):
    X_raw = df[feature_cols]
    return scaler.transform(imputer.transform(X_raw))


def evaluate_spearman_by_season(df_with_scores: pd.DataFrame, score_col: str):
    """
    Returns list of (season, spearman_corr), comparing score_col vs -overall_pick.
    """
    results = []
    for season, grp in df_with_scores.groupby("season"):
        if grp["overall_pick"].nunique() < 2:
            continue
        corr, _ = spearmanr(grp[score_col], -grp["overall_pick"])
        results.append((int(season), float(corr)))
    return sorted(results)


def write_rankings_csv(out_path: Path, df_all: pd.DataFrame, scores: np.ndarray):
    tmp = df_all.copy()
    tmp["pred_score"] = scores
    tmp["pred_rank"] = (
        tmp.groupby("season")["pred_score"]
        .rank(ascending=False, method="first")
        .astype(int)
    )
    tmp = tmp.sort_values(["season", "pred_rank"])
    out_path.parent.mkdir(parents=True, exist_ok=True)
    tmp.to_csv(out_path, index=False)
    return tmp


In [11]:
seasons = np.sort(df["season"].unique())
print("Seasons:", seasons)

if len(seasons) <= HOLDOUT_SEASONS:
    train_last = int(seasons.max())
else:
    train_last = int(seasons[-(HOLDOUT_SEASONS + 1)])

train_df = df[df["season"] <= train_last].copy()
test_df = df[df["season"] > train_last].copy()

print(f"Train seasons ≤ {train_last}")
print(f"Train rows: {len(train_df)}, Test rows: {len(test_df)}")


Seasons: [2000 2001 2002 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
 2015 2016 2017 2018 2019]
Train seasons ≤ 2015
Train rows: 696, Test rows: 192


In [12]:
train_sorted = train_df.sort_values(["season", "overall_pick"]).reset_index(drop=True)
test_sorted = test_df.sort_values(["season", "overall_pick"]).reset_index(drop=True)

imputer, scaler = fit_preprocessors(train_sorted, feature_cols)

X_train = transform_features(train_sorted, feature_cols, imputer, scaler)
X_test = transform_features(test_sorted, feature_cols, imputer, scaler)

# Target: better pick => higher label
y_train = -train_sorted["overall_pick"].to_numpy(dtype=float)

print("X_train:", X_train.shape, "X_test:", X_test.shape)

reg = HistGradientBoostingRegressor(
    loss="squared_error",
    learning_rate=0.06,
    max_depth=6,
    max_leaf_nodes=31,
    min_samples_leaf=30,
    l2_regularization=1e-2,
    early_stopping=True,
    random_state=RANDOM_STATE,
)

reg.fit(X_train, y_train)
print("Pointwise model trained.")


X_train: (696, 39) X_test: (192, 39)
Pointwise model trained.


In [13]:
test_scores = reg.predict(X_test)

test_with_scores = test_sorted.copy()
test_with_scores["pointwise_score"] = test_scores

per_season = evaluate_spearman_by_season(test_with_scores, "pointwise_score")

for season, corr in per_season:
    print(f"Season {season}: Spearman = {corr:+.3f}")

if per_season:
    mean_corr = float(np.mean([c for _, c in per_season]))
    print(f"\nMean Spearman over held-out seasons: {mean_corr:+.3f}")


Season 2016: Spearman = +0.799
Season 2017: Spearman = +0.810
Season 2018: Spearman = +0.781
Season 2019: Spearman = +0.822

Mean Spearman over held-out seasons: +0.803


In [14]:
df_sorted_all = df.sort_values(["season", "overall_pick"]).reset_index(drop=True)
X_all = transform_features(df_sorted_all, feature_cols, imputer, scaler)
scores_all = reg.predict(X_all)

rankings = write_rankings_csv(OUT_PATH, df_sorted_all, scores_all)
print(f"Wrote pointwise rankings to {OUT_PATH}")
rankings.head()


Wrote pointwise rankings to /Users/aryaman/Documents/Sem3/PA/nba-draft-ranker/outputs/pointwise_rankings.csv


Unnamed: 0,person_id,player_name,season,round_number,overall_pick,team_name,organization,organization_type,player_profile_flag,position,...,totals_pts,shooting_fg%,shooting_3p%,shooting_ft%,per game_mp,per game_pts,per game_trb,per game_ast,pred_score,pred_rank
1,2031,Stromile Swift,2000,1,2,Grizzlies,Louisiana State,College/University,1,,...,550,0.608,0.28,0.617,29.8,16.2,8.2,0.9,-8.784265,1
3,2034,Mike Miller,2000,1,5,Magic,Florida,College/University,1,,...,521,0.476,0.338,0.729,28.6,14.1,6.6,2.5,-9.729428,2
5,2036,Chris Mihm,2000,1,7,Bulls,Texas,College/University,1,,...,583,0.523,0.467,0.707,30.7,17.7,10.5,0.7,-10.481791,3
7,2038,Joel Przybilla,2000,1,9,Rockets,Minnesota,College/University,1,,...,299,0.613,,0.495,30.4,14.2,8.4,2.4,-11.405798,4
4,2035,DerMarr Johnson,2000,1,6,Hawks,Cincinnati,College/University,1,,...,402,0.478,0.371,0.737,27.5,12.6,3.8,1.4,-11.562941,5
