### 1.Create Label

In [1]:
import pandas as pd
from pathlib import Path

current_dir = Path.cwd()
df = pd.read_csv(current_dir.parent/ "data" /"cleaned" / "college_undrafted.csv")
df.head()

Unnamed: 0,SEASON,OVERALL_PICK,Totals_FG,Totals_FT,Totals_TRB,Totals_BLK,Totals_STL,Totals_TOV,Totals_PF,Shooting_FG%,MP
0,2000,1,221.0,141.0,300.0,107.0,43.0,56.0,71.0,0.568,909.0
1,2000,2,208.0,127.0,279.0,95.0,50.0,80.0,88.0,0.608,1013.0
2,2000,4,327.0,175.0,285.0,39.0,29.0,77.0,103.0,0.582,1243.0
3,2000,5,175.0,124.0,243.0,15.0,46.0,71.0,70.0,0.476,1058.0
4,2000,6,140.0,70.0,123.0,30.0,31.0,46.0,64.0,0.478,879.0


In [2]:
import numpy as np
import pandas as pd

# df is your merged dataset
# drafted = 1 if picked 1â€“60, undrafted (100) = 0
df["drafted"] = (df["OVERALL_PICK"] <= 60).astype(int)
print(df["drafted"].value_counts())


drafted
1    1210
0     559
Name: count, dtype: int64


### 2. Train Test Split

In [3]:
!pip install imbalanced-learn

You should consider upgrading via the '/Users/tenzin/Desktop/mycodes/nba-draft-ranker/.venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [4]:
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# ============================
# 1. Train/Test Split by SEASON
# ============================
# Sort all seasons
all_seasons = sorted(df["SEASON"].unique())

# Number of seasons to use as test (20% of 25 = 5)
test_size = 5

# Select last 5 seasons
test_seasons = all_seasons[-test_size:]

# Remaining seasons are the training data
train_seasons = all_seasons[:-test_size]

print("Train seasons:", train_seasons)
print("Test seasons:", test_seasons)

# Create train_df and test_df
train_df = df[df["SEASON"].isin(train_seasons)].copy()
test_df  = df[df["SEASON"].isin(test_seasons)].copy()

print("Train seasons:", sorted(train_df["SEASON"].unique()))
print("Train size:", len(train_df), "Test size:", len(test_df))

# ============================
# 2. Features / Target
# ============================
target_col = "drafted"   # <-- 1 = drafted, 0 = undrafted

feature_cols = [
    "Totals_FG",
    "Totals_FT",
    "Totals_TRB",
    "Totals_STL",
    "Totals_BLK",
    "Totals_TOV",
    "Totals_PF",
    "Shooting_FG%",
    "MP",
]

X_train = train_df[feature_cols]
y_train = train_df[target_col]

X_test  = test_df[feature_cols]
y_test  = test_df[target_col]

print("Train Class Balance:--->")
print(y_train.value_counts())

print("Test Class Balance:--->")
print(y_test.value_counts())

Train seasons: [np.int64(1952), np.int64(1977), np.int64(1981), np.int64(1986), np.int64(1987), np.int64(1995), np.int64(1999), np.int64(2000), np.int64(2001), np.int64(2002), np.int64(2003), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2009), np.int64(2010), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020)]
Test seasons: [np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
Train seasons: [np.int64(1952), np.int64(1977), np.int64(1981), np.int64(1986), np.int64(1987), np.int64(1995), np.int64(1999), np.int64(2000), np.int64(2001), np.int64(2002), np.int64(2003), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2009), np.int64(2010), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(

### 3. Train

In [5]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
)
from sklearn.utils.class_weight import compute_class_weight
from sklearn.base import clone
import numpy as np

# ============================
# 1. Compute class weights from TRAIN
# ============================
classes = np.array([0, 1])
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train
)
class_weight_dict = {cls: w for cls, w in zip(classes, class_weights)}
print("Class weights (train):", class_weight_dict)

# sample_weight vector aligned with y_train index
sample_weight_train = np.array([class_weight_dict[c] for c in y_train])

# ============================
# 2. Base Gradient Boosting model
# ============================
base_model = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    random_state=42,
)

# ============================
# 3. Manual Stratified K-Fold CV on TRAIN
# ============================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

train_accs, train_f1s, train_aucs = [], [], []
val_accs,   val_f1s,   val_aucs   = [], [], []

for fold, (tr_idx, val_idx) in enumerate(cv.split(X_train, y_train), start=1):
    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    sw_tr = sample_weight_train[tr_idx]

    model = clone(base_model)
    model.fit(X_tr, y_tr, sample_weight=sw_tr)

    # ---- TRAIN metrics ----
    y_tr_pred  = model.predict(X_tr)
    y_tr_proba = model.predict_proba(X_tr)[:, 1]

    train_accs.append(accuracy_score(y_tr, y_tr_pred))
    train_f1s.append(f1_score(y_tr, y_tr_pred))
    train_aucs.append(roc_auc_score(y_tr, y_tr_proba))

    # ---- VALIDATION metrics ----
    y_val_pred  = model.predict(X_val)
    y_val_proba = model.predict_proba(X_val)[:, 1]

    val_accs.append(accuracy_score(y_val, y_val_pred))
    val_f1s.append(f1_score(y_val, y_val_pred))
    val_aucs.append(roc_auc_score(y_val, y_val_proba))

    print(f"Fold {fold}: Train AUC={train_aucs[-1]:.3f}, Val AUC={val_aucs[-1]:.3f}")

print("\n===== MEAN TRAIN METRICS (from CV folds) =====")
print(f"Train Accuracy: {np.mean(train_accs):.3f}")
print(f"Train F1-score: {np.mean(train_f1s):.3f}")
print(f"Train ROC-AUC : {np.mean(train_aucs):.3f}")

print("\n===== MEAN VALIDATION METRICS (from CV folds) =====")
print(f"Validation Accuracy: {np.mean(val_accs):.3f}")
print(f"Validation F1-score: {np.mean(val_f1s):.3f}")
print(f"Validation ROC-AUC : {np.mean(val_aucs):.3f}")

Class weights (train): {np.int64(0): np.float64(1.475051975051975), np.int64(1): np.float64(0.7563965884861408)}
Fold 1: Train AUC=0.975, Val AUC=0.696
Fold 2: Train AUC=0.973, Val AUC=0.670
Fold 3: Train AUC=0.970, Val AUC=0.649
Fold 4: Train AUC=0.973, Val AUC=0.689
Fold 5: Train AUC=0.972, Val AUC=0.661

===== MEAN TRAIN METRICS (from CV folds) =====
Train Accuracy: 0.899
Train F1-score: 0.920
Train ROC-AUC : 0.973

===== MEAN VALIDATION METRICS (from CV folds) =====
Validation Accuracy: 0.648
Validation F1-score: 0.723
Validation ROC-AUC : 0.673


### 4.Evaluate

In [6]:

# ============================
# 4. Train on FULL TRAIN and evaluate on TEST
# ============================
final_model = clone(base_model)
final_model.fit(X_train, y_train, sample_weight=sample_weight_train)

y_test_pred  = final_model.predict(X_test)               # threshold = 0.5
y_test_proba = final_model.predict_proba(X_test)[:, 1]   # probabilities for class 1

print("\n===== TEST SET EVALUATION (Last 5 Seasons) =====")
cm = confusion_matrix(y_test, y_test_pred)
print("Confusion matrix on TEST:")
print(cm)

print("\nClassification report on TEST:")
print(classification_report(y_test, y_test_pred, digits=3))

test_auc = roc_auc_score(y_test, y_test_proba)
test_f1  = f1_score(y_test, y_test_pred)
print(f"ROC-AUC on TEST: {test_auc:.3f}")
print(f"F1-score on TEST: {test_f1:.3f}")



===== TEST SET EVALUATION (Last 5 Seasons) =====
Confusion matrix on TEST:
[[ 39  39]
 [104 168]]

Classification report on TEST:
              precision    recall  f1-score   support

           0      0.273     0.500     0.353        78
           1      0.812     0.618     0.701       272

    accuracy                          0.591       350
   macro avg      0.542     0.559     0.527       350
weighted avg      0.692     0.591     0.624       350

ROC-AUC on TEST: 0.565
F1-score on TEST: 0.701
