In [None]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import OrdinalEncoder


# Load the dataset from its CSV path.
df = pd.read_csv("/content/player_seasons_with_breakouts.csv")

# Keep only rows where E_NET_RATING_PREV is present. The copy gives an
# independent frame so the column assignment below does not write into a
# slice of the raw data.
has_prev_rating = df["E_NET_RATING_PREV"].notna()
df = df.loc[has_prev_rating].copy()

# The first four characters of SEASON are parsed as an integer and stored as
# SEASON_START (presumably the season's starting year -- confirm the SEASON
# string format against the data).
season_prefix = df["SEASON"].str[:4]
df["SEASON_START"] = season_prefix.astype(int)

# Columns of `df` used as model inputs.
feature_cols = [
    'E_NET_RATING_PREV',
    'E_NET_RATING_2YRS_AGO',
    'E_OFF_RATING_PREV',
    'E_DEF_RATING_PREV',
    'TS_PCT_PREV',
    'TS_PCT_2YRS_AGO',
    'FG_PCT_PREV',
    'FG3_PCT_PREV',
    'AGE',
    'EXPERIENCE',
]

# Fail fast when the input data lacks any expected feature column. Selecting
# `feature_cols` from the frame would raise a KeyError anyway; raising here
# names the missing columns up front.
missing = [c for c in feature_cols if c not in df.columns]
if missing:
    raise KeyError(f"Feature columns missing from input data: {missing}")


# Split on SEASON_START: values <= 2023 go to the training frame and
# values >= 2024 go to the test frame.
is_train = df["SEASON_START"] <= 2023
is_test = df["SEASON_START"] >= 2024
train_df = df.loc[is_train].copy()
test_df = df.loc[is_test].copy()

# print(f"Train    rows: {len(train_df):,}")
# print(f"Test rows:  {len(test_df):,}")

# Feature matrix and integer-cast BREAKOUT target for each split.
X_train, y_train = train_df[feature_cols].copy(), train_df["BREAKOUT"].astype(int)
X_test, y_test = test_df[feature_cols].copy(), test_df["BREAKOUT"].astype(int)

# Object-dtype feature columns are treated as categorical.
cat_cols = list(X_train.select_dtypes(include=["object"]).columns)
print("catgorical columns:", cat_cols)

if cat_cols:
    # Fit the encoder on the training split only; categories that appear
    # only in the test split are encoded as -1.
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    enc.fit(X_train[cat_cols])
    for frame in (X_train, X_test):
        frame[cat_cols] = enc.transform(frame[cat_cols])

# Cast every feature column to float in both splits.
X_train, X_test = X_train.astype(float), X_test.astype(float)
# Class-imbalance weight for the positive class: the ratio of negative to
# positive training labels, expressed through the positive rate.
pos_rate = y_train.mean()
if not pos_rate > 0:
    # Covers both a zero positive rate (division by zero) and NaN from an
    # empty training set; either would hand XGBoost an inf/NaN weight.
    raise ValueError(
        "Cannot compute scale_pos_weight: the training set has no positive "
        f"BREAKOUT labels (positive rate = {pos_rate})."
    )
scale_pos_weight = (1 - pos_rate) / pos_rate

# Hyperparameters for the gradient-boosted classifier, kept in one mapping so
# they can be read at a glance.
xgb_params = {
    "n_estimators": 600,
    "learning_rate": 0.03,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "min_child_weight": 5,
    "scale_pos_weight": scale_pos_weight,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "hist",
    "random_state": 42,
    "n_jobs": -1,
}
model = XGBClassifier(**xgb_params)

# The evaluation set is the training data itself, so the logloss tracked
# during fitting is training loss, not a held-out estimate.
model.fit(X_train, y_train, eval_set=[(X_train, y_train)])


# Pair each feature name with the fitted model's importance score and order
# the table from most to least important.
importances = model.feature_importances_
fi_df = pd.DataFrame({"feature": feature_cols, "importance": importances})
fi_df = fi_df.sort_values(by="importance", ascending=False)

