# XGBoost model to predict MVP

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

### Load data, select appropriate variables

In [2]:
df = pd.read_csv("mvp_data.csv", index_col=0).reset_index(drop=True)
df = df.fillna(0)
df.columns

Index(['Year', 'Player', 'Tm', 'Share', 'G', 'MP', 'PTS', 'TRB', 'AST', 'STL',
       'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48', 'MVP',
       'player_efficiency_rating', 'true_shooting_percentage',
       'three_point_attempt_rate', 'free_throw_attempt_rate',
       'offensive_rebound_percentage', 'defensive_rebound_percentage',
       'total_rebound_percentage', 'assist_percentage', 'steal_percentage',
       'block_percentage', 'turnover_percentage', 'usage_percentage',
       'offensive_win_shares', 'defensive_win_shares',
       'offensive_box_plus_minus', 'defensive_box_plus_minus',
       'box_plus_minus', 'value_over_replacement_player', 'wl_pct', 'seed',
       'highest_teammate_vorp', 'highest_teammate_ws'],
      dtype='object')

In [3]:
# select variables used for regression (mutual_info from lin_reg model)
cols = ['PTS', 'WS/48', 'player_efficiency_rating', 'free_throw_attempt_rate',
       'defensive_rebound_percentage', 'box_plus_minus',
       'value_over_replacement_player', 'wl_pct', 'seed']
X = df[cols]
y = df['Share']


### XGBoost model with entire data

In [4]:
xg = xgb.XGBRegressor(n_estimators=250, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
xg.fit(X, y)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, early_stopping_rounds=None,
             enable_categorical=False, eta=0.1, eval_metric=None,
             feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=7,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, n_estimators=250, n_jobs=None,
             num_parallel_tree=None, predictor=None, ...)

In [5]:
# Feature importance
for score, name in zip(xg.feature_importances_, X.columns):
    print(round(score, 2), name)

0.06 PTS
0.07 WS/48
0.06 player_efficiency_rating
0.05 free_throw_attempt_rate
0.06 defensive_rebound_percentage
0.13 box_plus_minus
0.3 value_over_replacement_player
0.09 wl_pct
0.19 seed


### Holdout single year as test set

In [6]:
df_train = df[df['Year'] != 2023]
df_test = df[df['Year'] == 2023]
X_train = df_train[cols]
y_train = df_train['Share']
X_test =  df_test[cols]
y_test = df_test['Share']


In [7]:
xg = xgb.XGBRegressor(n_estimators=250, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
xg.fit(X_train, y_train)
pred = xg.predict(X_test)

### Evaluate with R2, MSE

In [8]:
r2_score(y_test, pred)

0.5324449078942486

In [9]:
mean_squared_error(y_test, pred)

0.04989254204775316

In [10]:
results = pd.DataFrame(y_test)
results['prediction'] = pred
results.index = df_test['Player']
results

Unnamed: 0_level_0,Share,prediction
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
Joel Embiid,0.915,0.633281
Nikola Jokić,0.674,0.993687
Giannis Antetokounmpo,0.606,0.730525
Jayson Tatum,0.28,0.454374
Shai Gilgeous-Alexander,0.046,0.237084
Donovan Mitchell,0.03,0.117645
Domantas Sabonis,0.027,0.21718
Luka Dončić,0.01,0.407599
Stephen Curry,0.005,0.117451
Jimmy Butler,0.003,0.145668


In [18]:
# random search to select best hyperparameters
xg = xgb.XGBRegressor()

params = {
    'n_estimators': list(range(50,250,50)),
    'max_depth': list(range(5, 30, 5)),
    'eta': [0.01, 0.1, 0.3],
    'subsample': np.arange(0.5,1.1,0.1),
    'colsample_bytree': np.arange(0.5,1.1,0.1)
}

random_search = RandomizedSearchCV(xg, param_distributions=params, cv=5, scoring='r2')
random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          callbacks=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          early_stopping_rounds=None,
                                          enable_categorical=False,
                                          eval_metric=None, feature_types=None,
                                          gamma=None, gpu_id=None,
                                          grow_policy=None,
                                          importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=...
                                          min_child_weight=None, missing=nan,
                             

In [19]:
#xg = xgb.XGBRegressor(n_estimators=250, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
print(random_search.best_params_)

{'subsample': 0.8999999999999999, 'n_estimators': 100, 'max_depth': 20, 'eta': 0.3, 'colsample_bytree': 0.6}


In [20]:
best_xg = random_search.best_estimator_
preds = best_xg.predict(X_test)

In [21]:
print('R2:', r2_score(y_test, preds))
print('MSE:', mean_squared_error(y_test, preds))

R2: 0.5173864167388249
MSE: 0.05149942520619338


In [22]:
results = pd.DataFrame(y_test)
results['prediction'] = preds
results.index = df_test['Player']
results

Unnamed: 0_level_0,Share,prediction
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
Joel Embiid,0.915,0.597984
Nikola Jokić,0.674,0.916014
Giannis Antetokounmpo,0.606,0.414797
Jayson Tatum,0.28,0.292931
Shai Gilgeous-Alexander,0.046,0.152146
Donovan Mitchell,0.03,0.139058
Domantas Sabonis,0.027,0.23429
Luka Dončić,0.01,0.351301
Stephen Curry,0.005,0.233346
Jimmy Butler,0.003,0.293587


### Create function to determine if MVP winner is correct

In [23]:
actual = results['Share'].idxmax()
prediction = results['prediction'].idxmax()

In [24]:
def mvp_prediction(actual, prediction):
    if actual == prediction:
        return 'Correct'
    else:
        return 'Wrong'

In [31]:
dat = pd.DataFrame()
dat['R2'] = [r2_score(y_test, preds)]
dat['MSE'] = [mean_squared_error(y_test, preds)]
dat['MVP'] = [mvp_prediction(actual, prediction)]
dat.index = [2021]
dat

Unnamed: 0,R2,MSE,MVP
2021,0.545516,0.051343,Wrong


### Create function for hyperparameter tuning for each variable

In [None]:
# random search to select best hyperparameters
rf = RandomForestRegressor()

params = {
    'max_depth': list(range(5, 35, 5)),
    'max_features': list(range(3,8,1)),
    'min_samples_split': list(range(2,5,1)),
    'min_samples_leaf': [1,2,3]
}

random_search = RandomizedSearchCV(rf, param_distributions=params, cv=5, scoring='r2')
random_search.fit(X_train, y_train)

best_rf = random_search.best_estimator_
preds = best_rf.predict(X_test)

### Create function for all evaluations

In [25]:
def evaluate(y_test, pred):
    evals = []
    results = pd.DataFrame(y_test)
    results['prediction'] = pred
    results.index = df_test['Player']
    actual = results['Share'].idxmax()
    prediction = results['prediction'].idxmax()
    evals.append(r2_score(y_test, pred))
    evals.append(mean_squared_error(y_test, pred))
    evals.append(mvp_prediction(actual, prediction))
    return evals

### For loop to use each year as a test set

In [26]:
# empty dict to store results
evals_by_year = {}

best_params_year={}

for year in range(1980, 2024):
    df_train = df[df['Year'] != year]
    df_test = df[df['Year'] == year]
    X_train =  df_train[cols]
    y_train = df_train['Share']
    X_test =  df_test[cols]
    y_test = df_test['Share']
    xg = xgb.XGBRegressor()
    random_search = RandomizedSearchCV(xg, param_distributions=params, cv=5, scoring='r2')
    random_search.fit(X_train, y_train)
    best_xg = random_search.best_estimator_
    pred = best_xg.predict(X_test)
    eval = evaluate(y_test, pred)
    evals_by_year[year] = eval
    best_params_year[year] = random_search.best_params_

In [27]:
xg_res = pd.DataFrame(list(evals_by_year.values()), index=evals_by_year.keys())
xg_res.columns = ['R2', 'MSE', 'MVP']
xg_res.index.name='Idx'
xg_res.sort_values(by = ['MVP', 'Idx'], ascending = [False, True])

Unnamed: 0_level_0,R2,MSE,MVP
Idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1980,0.105262,0.036155,Wrong
1982,0.277727,0.035184,Wrong
1983,0.217528,0.063562,Wrong
1988,0.760125,0.021607,Wrong
1989,0.739162,0.019339,Wrong
1990,0.719531,0.020492,Wrong
1993,0.740436,0.024913,Wrong
1994,0.580382,0.03861,Wrong
1999,0.101599,0.06739,Wrong
2001,0.194114,0.071491,Wrong


In [28]:
xg_res['MVP'].value_counts()

Correct    28
Wrong      16
Name: MVP, dtype: int64

In [29]:
xg_res.mean(axis=0)

R2     0.522344
MSE    0.039615
dtype: float64

In [30]:
best_params = pd.DataFrame(list(best_params_year.values()), index=best_params_year.keys())
best_params


Unnamed: 0,subsample,n_estimators,max_depth,eta,colsample_bytree
1980,0.8,200,20,0.1,0.9
1981,0.5,100,25,0.1,0.9
1982,0.5,50,20,0.1,0.5
1983,0.8,150,10,0.1,0.8
1984,0.7,100,25,0.1,0.5
1985,1.0,50,25,0.1,0.7
1986,0.5,200,20,0.01,0.7
1987,1.0,200,5,0.01,1.0
1988,0.8,100,25,0.1,0.6
1989,0.7,200,20,0.01,0.6


In [31]:
for c in best_params.columns:
    print(best_params[c].value_counts())

0.8    10
0.9     9
0.5     8
0.6     8
1.0     5
0.7     4
Name: subsample, dtype: int64
200    13
100    12
50     10
150     9
Name: n_estimators, dtype: int64
5     11
10    10
20     9
25     8
15     6
Name: max_depth, dtype: int64
0.10    32
0.01     8
0.30     4
Name: eta, dtype: int64
0.5    13
0.7     9
0.8     8
0.9     7
0.6     6
1.0     1
Name: colsample_bytree, dtype: int64


In [32]:
xgb1 = pd.concat([xg_res, best_params], axis=1)

Unnamed: 0,R2,MSE,MVP,subsample,n_estimators,max_depth,eta,colsample_bytree
1980,0.105262,0.036155,Wrong,0.8,200,20,0.1,0.9
1981,0.250706,0.038553,Correct,0.5,100,25,0.1,0.9
1982,0.277727,0.035184,Wrong,0.5,50,20,0.1,0.5
1983,0.217528,0.063562,Wrong,0.8,150,10,0.1,0.8
1984,0.396132,0.037456,Correct,0.7,100,25,0.1,0.5
1985,0.907664,0.006272,Correct,1.0,50,25,0.1,0.7
1986,0.581648,0.033521,Correct,0.5,200,20,0.01,0.7
1987,0.614396,0.03248,Correct,1.0,200,5,0.01,1.0
1988,0.760125,0.021607,Wrong,0.8,100,25,0.1,0.6
1989,0.739162,0.019339,Wrong,0.7,200,20,0.01,0.6


In [34]:
xgb_correct = xgb1[xgb1['MVP'] == 'Correct']
xgb_wrong = xgb1[xgb1['MVP'] == 'Wrong']


In [35]:
for c in best_params.columns:
    print(xgb_correct[c].value_counts())

0.5    7
1.0    5
0.8    5
0.9    5
0.6    4
0.7    2
Name: subsample, dtype: int64
50     8
200    7
150    7
100    6
Name: n_estimators, dtype: int64
5     8
15    6
25    5
20    5
10    4
Name: max_depth, dtype: int64
0.10    18
0.01     6
0.30     4
Name: eta, dtype: int64
0.5    8
0.9    6
0.7    6
0.8    4
0.6    3
1.0    1
Name: colsample_bytree, dtype: int64


In [36]:
for c in best_params.columns:
    print(xgb_wrong[c].value_counts())

0.8    5
0.6    4
0.9    4
0.7    2
0.5    1
Name: subsample, dtype: int64
200    6
100    6
50     2
150    2
Name: n_estimators, dtype: int64
10    6
20    4
25    3
5     3
Name: max_depth, dtype: int64
0.10    14
0.01     2
Name: eta, dtype: int64
0.5    5
0.8    4
0.6    3
0.7    3
0.9    1
Name: colsample_bytree, dtype: int64


### test with best hyperparameters

In [39]:
# empty dict to store results
evals_by_year = {}

for year in range(1980, 2024):
    df_train = df[df['Year'] != year]
    df_test = df[df['Year'] == year]
    X_train =  df_train[cols]
    y_train = df_train['Share']
    X_test =  df_test[cols]
    y_test = df_test['Share']
    xg = xgb.XGBRegressor(subsample=0.5, n_estimators=50, max_depth=5, eta=0.1, colsample_bytree=0.5)
    xg.fit(X_train, y_train)
    pred = xg.predict(X_test)
    eval = evaluate(y_test, pred)
    evals_by_year[year] = eval

In [40]:
xg_res = pd.DataFrame(list(evals_by_year.values()), index=evals_by_year.keys())
xg_res.columns = ['R2', 'MSE', 'MVP']
xg_res.index.name='Idx'
xg_res.sort_values(by = ['MVP', 'Idx'], ascending = [False, True])

Unnamed: 0_level_0,R2,MSE,MVP
Idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1982,0.5043,0.024147,Wrong
1983,0.236916,0.061987,Wrong
1987,0.59707,0.033939,Wrong
1988,0.577546,0.038052,Wrong
1989,0.795072,0.015194,Wrong
1990,0.551919,0.032738,Wrong
1993,0.629888,0.035524,Wrong
1994,0.52476,0.043727,Wrong
1997,0.89653,0.009771,Wrong
1998,0.420755,0.058415,Wrong
