### 1. Create Label

In [1]:
import pandas as pd
from pathlib import Path

# Load the merged college-stats table from "outputs/classification" under the
# parent of the current working directory, then preview the first rows.
current_dir = Path.cwd()
stats_csv_path = current_dir.parent.joinpath(
    "outputs", "classification", "merged_college_stats.csv"
)
df = pd.read_csv(stats_csv_path)
df.head()

Unnamed: 0,SEASON,OVERALL_PICK,Totals_FG,Totals_FT,Totals_TRB,Totals_BLK,Totals_STL,Totals_TOV,Totals_PF,Shooting_FG%,MP
0,2000,1,221.0,141.0,300.0,107.0,43.0,56.0,71.0,0.568,909.0
1,2000,2,208.0,127.0,279.0,95.0,50.0,80.0,88.0,0.608,1013.0
2,2000,4,327.0,175.0,285.0,39.0,29.0,77.0,103.0,0.582,1243.0
3,2000,5,175.0,124.0,243.0,15.0,46.0,71.0,70.0,0.476,1058.0
4,2000,6,140.0,70.0,123.0,30.0,31.0,46.0,64.0,0.478,879.0


In [2]:
import numpy as np
import pandas as pd

# Build the binary target on `df` (the merged dataset):
#   drafted = 1 when OVERALL_PICK is 60 or lower (picks 1-60),
#   drafted = 0 otherwise (per the original note, undrafted rows carry pick 100).
is_drafted = df["OVERALL_PICK"].le(60)
df["drafted"] = is_drafted.astype(int)
print(df["drafted"].value_counts())


drafted
1    1210
0     552
Name: count, dtype: int64


### 2. Train Test Split

In [3]:
!pip install imbalanced-learn

You should consider upgrading via the '/Users/tenzin/Desktop/mycodes/nba-draft-ranker/.venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [6]:
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# ============================
# 1. Train/Test Split by SEASON
# ============================
# Chronological hold-out: the most recent seasons become the test set and
# every earlier season is used for training, so no season appears in both.

# Number of most-recent seasons held out for testing
# (5 of the 26 seasons seen in the recorded run, i.e. roughly 20%).
test_size = 5

# Distinct seasons in ascending order
all_seasons = sorted(df["SEASON"].unique())

# Earlier seasons -> train, last `test_size` seasons -> test
train_seasons = all_seasons[:-test_size]
test_seasons = all_seasons[-test_size:]

print("Train seasons:", train_seasons)
print("Test seasons:", test_seasons)

# Row masks for each partition; copy so later edits never touch `df`
in_train_seasons = df["SEASON"].isin(train_seasons)
in_test_seasons = df["SEASON"].isin(test_seasons)
train_df = df.loc[in_train_seasons].copy()
test_df = df.loc[in_test_seasons].copy()

# Sanity check: seasons actually present in the training frame, plus row counts
print("Train seasons:", sorted(train_df["SEASON"].unique()))
print("Train size:", len(train_df), "Test size:", len(test_df))

# ============================
# 2. Features / Target
# ============================
# Label column: 1 = drafted, 0 = undrafted
target_col = "drafted"

# Model inputs; the column order here is the column order of X_train / X_test.
feature_cols = [
    "Totals_FG", "Totals_FT", "Totals_TRB",
    "Totals_STL", "Totals_BLK", "Totals_TOV", "Totals_PF",
    "Shooting_FG%",
    "MP",
]

# Same feature/target selection applied to both partitions
X_train, y_train = train_df[feature_cols], train_df[target_col]
X_test, y_test = test_df[feature_cols], test_df[target_col]

# Report how many drafted / undrafted rows each partition contains
for _banner, _labels in (
    ("Train Class Balance:--->", y_train),
    ("Test Class Balance:--->", y_test),
):
    print(_banner)
    print(_labels.value_counts())

Train seasons: [np.int64(2000), np.int64(2001), np.int64(2002), np.int64(2003), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2009), np.int64(2010), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020)]
Test seasons: [np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
Train seasons: [np.int64(2000), np.int64(2001), np.int64(2002), np.int64(2003), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2009), np.int64(2010), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020)]
Train size: 1430 Test size: 332
Train Class Balance:--->
drafted
1    938
0    492
Name: count, dtype: int64
Test Class Balance:--->
drafted
1    272
0     60
Name: count, dtype: int64


### 3. Train

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# ============================
# 3. Compute class weights from TRAIN
# ============================
# "balanced" weighting is computed from the training labels only, so the
# less frequent class receives the larger weight.
classes = np.array([0, 1])
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

# Map each class label to its weight
class_weight_dict = dict(zip(classes, class_weights))
print("Class weights (train):", class_weight_dict)

# ============================
# 4. Define weighted classifier
# ============================
# Random forest hyper-parameters, gathered in one place for readability.
rf_params = {
    "n_estimators": 300,
    "max_depth": None,                  # no explicit depth limit
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "class_weight": class_weight_dict,  # per-class weights computed from TRAIN
    "random_state": 42,                 # fixed seed for repeatable runs
    "n_jobs": -1,                       # use all available cores
}
model_clf = RandomForestClassifier(**rf_params)

# ============================
# 5. Cross-validation on TRAIN
# ============================
# Five stratified folds, shuffled with a fixed seed. Train-fold scores are
# kept as well so fit quality can be compared against validation quality.
# NOTE(review): the shuffled folds mix seasons, while the hold-out set is split
# chronologically by SEASON - confirm this is the intended validation scheme.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring_metrics = ["accuracy", "f1", "roc_auc"]
cv_results = cross_validate(
    estimator=model_clf,
    X=X_train,
    y=y_train,
    cv=cv,
    scoring=scoring_metrics,
    return_train_score=True,
    n_jobs=-1,
)

def _fold_mean(key):
    """Return the average of the per-fold scores stored under `key` in `cv_results`."""
    return cv_results[key].mean()


# ----- Mean TRAIN metrics (scores on the folds the model was fitted on) -----
mean_train_acc = _fold_mean("train_accuracy")
mean_train_f1 = _fold_mean("train_f1")
mean_train_auc = _fold_mean("train_roc_auc")

# ----- Mean VALIDATION metrics (scores on the held-out fold of each split) -----
mean_val_acc = _fold_mean("test_accuracy")
mean_val_f1 = _fold_mean("test_f1")
mean_val_auc = _fold_mean("test_roc_auc")

print("\n===== MEAN TRAIN METRICS (from CV folds) =====")
print(f"Train Accuracy: {mean_train_acc:.3f}")
print(f"Train F1-score: {mean_train_f1:.3f}")
print(f"Train ROC-AUC : {mean_train_auc:.3f}")

print("\n===== MEAN VALIDATION METRICS (from CV folds) =====")
print(f"Validation Accuracy: {mean_val_acc:.3f}")
print(f"Validation F1-score: {mean_val_f1:.3f}")
print(f"Validation ROC-AUC : {mean_val_auc:.3f}")

Class weights (train): {np.int64(0): np.float64(1.4532520325203253), np.int64(1): np.float64(0.7622601279317697)}

===== MEAN TRAIN METRICS (from CV folds) =====
Train Accuracy: 0.986
Train F1-score: 0.989
Train ROC-AUC : 0.999

===== MEAN VALIDATION METRICS (from CV folds) =====
Validation Accuracy: 0.658
Validation F1-score: 0.761
Validation ROC-AUC : 0.639


### 4. Evaluate

In [9]:
# ============================
# 6. Train on FULL TRAIN and evaluate on TEST (last 5 seasons)
# ============================
# Refit on every training row, then score the held-out seasons once.
model_clf.fit(X_train, y_train)

# Hard class predictions, plus the second predict_proba column used as the ROC-AUC score
y_pred = model_clf.predict(X_test)
y_proba = model_clf.predict_proba(X_test)[:, 1]

# Compute all test metrics first, then print them together
cm = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred, digits=3)
test_auc = roc_auc_score(y_test, y_proba)

print("\n===== TEST SET EVALUATION (Last 5 Seasons) =====")
print("Confusion matrix on TEST:")
print(cm)

print("\nClassification report on TEST:")
print(test_report)

print("ROC-AUC on TEST:", test_auc)



===== TEST SET EVALUATION (Last 5 Seasons) =====
Confusion matrix on TEST:
[[ 16  44]
 [ 52 220]]

Classification report on TEST:
              precision    recall  f1-score   support

           0      0.235     0.267     0.250        60
           1      0.833     0.809     0.821       272

    accuracy                          0.711       332
   macro avg      0.534     0.538     0.535       332
weighted avg      0.725     0.711     0.718       332

ROC-AUC on TEST: 0.5420649509803922
