Trying XGBoost Model

In [1]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Connect and load data
conn = sqlite3.connect("../data/sqlite/nba_mvp.db")
df = pd.read_sql("SELECT * FROM final_player_data", conn)
conn.close()

print("Shape:", df.shape)
# print("Columns:", df.columns.tolist())

Shape: (1150, 29)


In [3]:
features = [
    'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM',
    '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'REB', 'AST', 'STL', 'BLK', 'TO', 
    'DD2', 'TD3'
]

target = "is_mvp"

In [4]:
df_model = df[features + [target] + ["season", "player_id", "Name"]].dropna()

# Split full DataFrame so we still have season info
train_df, test_df = train_test_split(df_model, test_size=0.2, random_state=42, stratify=df_model[target])

# Extract X and y
X_train = train_df[features]
y_train = train_df[target]

X_test = test_df[features]
y_test = test_df[target]


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Count class occurrences
neg_count = sum(y_train == 0)
pos_count = sum(y_train == 1)

# Calculate imbalance ratio
imbalance_ratio = neg_count / pos_count




ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:GP: object, MIN: object, PTS: object, FGM: object, FGA: object, FG%: object, 3PM: object, 3PA: object, 3P%: object, FTM: object, FTA: object, FT%: object, REB: object, AST: object, STL: object, BLK: object, TO: object, DD2: object, TD3: object

In [10]:
# print(X_train.dtypes[X_train.dtypes == 'object'])

# Convert all columns in X_train and X_test to numeric
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Drop any rows with NaNs (caused by conversion issues)
X_train = X_train.dropna()
y_train = y_train[X_train.index]  # Align target with filtered rows

print(X_train.dtypes[X_train.dtypes == 'object'])


Series([], dtype: object)


In [11]:
from xgboost import XGBClassifier

# Train the XGBoost model
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=imbalance_ratio  # Only if you calculated this
)
xgb.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


Generating MVP probabilities

In [12]:
# Predict probabilities for class 1 (MVP)
y_pred_proba = xgb.predict_proba(X_test)[:, 1]

# Add predictions to a copy of X_test
season_rankings = X_test.copy()
season_rankings['season'] = df.loc[X_test.index, 'season']
season_rankings['player_id'] = df.loc[X_test.index, 'player_id']
season_rankings['is_mvp'] = y_test
season_rankings['mvp_prob'] = y_pred_proba


In [13]:
# Pick player with highest predicted MVP probability per season
predicted_mvp = season_rankings.sort_values('mvp_prob', ascending=False).groupby('season').head(1)


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# True MVPs per season
true_mvp = season_rankings[season_rankings['is_mvp'] == 1].groupby('season').head(1)

# Match predicted MVPs to actual MVPs
merged = pd.merge(predicted_mvp, true_mvp, on='season', suffixes=('_pred', '_true'))

# Correct prediction = same player_id
merged['correct'] = merged['player_id_pred'] == merged['player_id_true']

# Metrics
accuracy = merged['correct'].mean()
precision = precision_score(merged['correct'], [1]*len(merged))  # All predicted = 1
recall = recall_score(merged['correct'], [1]*len(merged))        # True MVPs = 1
f1 = f1_score(merged['correct'], [1]*len(merged))

print(f"🎯 Accuracy: {accuracy:.2f}")
print(f"🔍 Precision: {precision:.2f}")
print(f"📈 Recall: {recall:.2f}")
print(f"🏅 F1 Score: {f1:.2f}")


🎯 Accuracy: 0.50
🔍 Precision: 0.50
📈 Recall: 1.00
🏅 F1 Score: 0.67


In [15]:
season_rankings.head(10)  # Display top 10 rows of season rankings with MVP probabilities

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,...,AST,STL,BLK,TO,DD2,TD3,season,player_id,is_mvp,mvp_prob
349,69,32.5,17.6,7.0,15.3,45.9,0.3,1.2,27.5,3.2,...,2.0,0.9,0.2,2.7,40,0,2008,284,0,5.1e-05
999,70,31.4,16.4,6.3,12.6,49.9,1.5,3.7,39.5,2.4,...,8.9,1.4,0.3,2.2,25,1,2021,52,0,0.001106
244,80,35.0,17.1,6.5,13.7,47.0,2.1,5.2,41.1,2.1,...,3.8,1.3,0.3,1.7,2,0,2006,137,0,5.5e-05
759,79,33.4,25.3,8.5,18.3,46.8,4.1,10.0,41.1,4.1,...,6.6,1.8,0.2,3.0,9,0,2017,257,0,0.004698
874,70,33.5,21.1,7.0,16.0,43.8,2.2,6.1,36.4,4.9,...,6.4,1.3,0.3,1.9,10,0,2019,211,0,9e-05
365,79,39.5,21.4,7.8,18.0,43.7,1.9,5.2,36.0,3.8,...,5.8,1.1,0.2,2.5,4,1,2009,147,0,0.000106
14,74,42.1,21.7,8.0,18.4,43.8,2.0,5.1,39.5,3.6,...,2.5,1.5,0.5,2.4,1,0,2002,60,0,0.000101
82,80,37.4,18.7,6.4,15.6,41.4,1.6,4.6,34.1,4.2,...,8.9,2.2,0.3,3.7,31,4,2003,135,0,0.00199
821,58,34.0,21.4,7.5,17.2,43.8,1.9,5.6,34.5,4.5,...,5.8,0.7,0.3,2.8,16,3,2018,28,0,4.3e-05
1066,74,36.2,24.5,8.9,17.6,50.4,0.6,1.9,32.4,6.2,...,5.1,1.1,0.5,2.1,6,0,2023,71,0,0.000148
