# Random Forest model to predict MVP

In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
import os
import joblib
import pickle

### Load data, select appropriate variables

In [2]:
df = pd.read_csv("mvp_data.csv", index_col=0).reset_index(drop=True)
df = df.fillna(0)
df.columns

Index(['Year', 'Player', 'Tm', 'Share', 'G', 'MP', 'PTS', 'TRB', 'AST', 'STL',
       'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48', 'MVP',
       'player_efficiency_rating', 'true_shooting_percentage',
       'three_point_attempt_rate', 'free_throw_attempt_rate',
       'offensive_rebound_percentage', 'defensive_rebound_percentage',
       'total_rebound_percentage', 'assist_percentage', 'steal_percentage',
       'block_percentage', 'turnover_percentage', 'usage_percentage',
       'offensive_win_shares', 'defensive_win_shares',
       'offensive_box_plus_minus', 'defensive_box_plus_minus',
       'box_plus_minus', 'value_over_replacement_player', 'wl_pct', 'seed',
       'highest_teammate_vorp', 'highest_teammate_ws'],
      dtype='object')

In [3]:
# define points+rebounds+assists variable
df['PRA'] = df['PTS'] + df['TRB'] + df['AST']

In [4]:
# select variables used for regression (mutual_info from lin_reg model)
cols = ['PRA', 'WS/48', 'player_efficiency_rating', 'offensive_box_plus_minus',
       'value_over_replacement_player', 'wl_pct', 'seed']
X = df[cols]
y = df['Share']


### Random Forest model with entire data

In [5]:
rf = RandomForestRegressor(max_depth=20, max_features=5)
rf.fit(X, y)

RandomForestRegressor(max_depth=20, max_features=5)

In [6]:
# Feature importance
for score, name in zip(rf.feature_importances_, X.columns):
    print(round(score, 2), name)

0.14 PRA
0.16 WS/48
0.12 player_efficiency_rating
0.05 offensive_box_plus_minus
0.32 value_over_replacement_player
0.12 wl_pct
0.08 seed


### Holdout single year as test set

In [7]:
df_train = df[df['Year'] != 2023]
df_test = df[df['Year'] == 2023]
X_train = df_train[cols]
y_train = df_train['Share']
X_test =  df_test[cols]
y_test = df_test['Share']


In [8]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
pred = rf.predict(X_test)

### Evaluate with R2, MSE

In [9]:
r2_score(y_test, pred)

0.6452739475532814

In [10]:
mean_squared_error(y_test, pred)

0.03785261840999999

In [11]:
results = pd.DataFrame(y_test)
results['prediction'] = pred
results.index = df_test['Player']
results

Unnamed: 0_level_0,Share,prediction
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
Joel Embiid,0.915,0.54993
Nikola Jokić,0.674,0.83154
Giannis Antetokounmpo,0.606,0.53792
Jayson Tatum,0.28,0.26145
Shai Gilgeous-Alexander,0.046,0.11328
Donovan Mitchell,0.03,0.09478
Domantas Sabonis,0.027,0.20066
Luka Dončić,0.01,0.38181
Stephen Curry,0.005,0.09591
Jimmy Butler,0.003,0.17639


In [12]:
# random search to select best hyperparameters
rf = RandomForestRegressor()

params = {
    'max_depth': list(range(5, 35, 5)),
    'max_features': list(range(3,8,1)),
    'min_samples_split': list(range(2,5,1)),
    'min_samples_leaf': [1,2,3]
}

random_search = RandomizedSearchCV(rf, param_distributions=params, cv=5, scoring='r2')
random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': [3, 4, 5, 6, 7],
                                        'min_samples_leaf': [1, 2, 3],
                                        'min_samples_split': [2, 3, 4]},
                   scoring='r2')

In [13]:
print(random_search.best_params_)

{'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 3, 'max_depth': 20}


In [14]:
best_rf = random_search.best_estimator_
preds = best_rf.predict(X_test)

In [15]:
print('R2:', r2_score(y_test, preds))
print('MSE:', mean_squared_error(y_test, preds))

R2: 0.6353207779930248
MSE: 0.03891471556


In [16]:
results = pd.DataFrame(y_test)
results['prediction'] = preds
results.index = df_test['Player']
results

Unnamed: 0_level_0,Share,prediction
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
Joel Embiid,0.915,0.5284
Nikola Jokić,0.674,0.89915
Giannis Antetokounmpo,0.606,0.50523
Jayson Tatum,0.28,0.29767
Shai Gilgeous-Alexander,0.046,0.10897
Donovan Mitchell,0.03,0.10897
Domantas Sabonis,0.027,0.22095
Luka Dončić,0.01,0.30194
Stephen Curry,0.005,0.11153
Jimmy Butler,0.003,0.18775


### Create function to determine if MVP winner is correct

In [30]:
actual = results['Share'].idxmax()
prediction = results['prediction'].idxmax()

In [17]:
def mvp_prediction(actual, prediction):
    if actual == prediction:
        return 'Correct'
    else:
        return 'Wrong'

In [32]:
dat = pd.DataFrame()
dat['R2'] = [r2_score(y_test, preds)]
dat['MSE'] = [mean_squared_error(y_test, preds)]
dat['MVP'] = [mvp_prediction(actual, prediction)]
dat.index = [2021]
dat

Unnamed: 0,R2,MSE,MVP
2021,0.595837,0.045659,Wrong


### Create function for all evaluations

In [28]:
def evaluate(y_test, pred):
    evals = []
    results = pd.DataFrame(y_test)
    results['prediction'] = pred
    results.index = df_test['Player']
    actual = results['Share'].idxmax()
    prediction = results['prediction'].idxmax()
    evals.append(r2_score(y_test, pred))
    evals.append(mean_squared_error(y_test, pred))
    evals.append(mvp_prediction(actual, prediction))
    return evals, results

### For loop to use each year as a test set

In [19]:
# empty dict to store results
evals_by_year = {}

best_params_year={}

results_by_year = {}

for year in range(1980, 2024):
    df_train = df[df['Year'] != year]
    df_test = df[df['Year'] == year]
    X_train =  df_train[cols]
    y_train = df_train['Share']
    X_test =  df_test[cols]
    y_test = df_test['Share']
    rf = RandomForestRegressor()
    random_search = RandomizedSearchCV(rf, param_distributions=params, cv=5, scoring='r2')
    random_search.fit(X_train, y_train)
    best_rf = random_search.best_estimator_
    pred = best_rf.predict(X_test)
    eval, results = evaluate(y_test, pred)
    evals_by_year[year] = eval
    results_by_year[year] = results
    best_params_year[year] = random_search.best_params_

In [20]:
rf_res = pd.DataFrame(list(evals_by_year.values()), index=evals_by_year.keys())
rf_res.columns = ['R2', 'MSE', 'MVP']
rf_res.index.name='Idx'
rf_res.sort_values(by = ['MVP', 'Idx'], ascending = [False, True])

Unnamed: 0_level_0,R2,MSE,MVP
Idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1982,0.259989,0.036048,Wrong
1993,0.74038,0.024919,Wrong
1994,0.508675,0.045207,Wrong
1998,0.572612,0.0431,Wrong
1999,-0.113008,0.083488,Wrong
2001,0.337644,0.058758,Wrong
2005,-0.265816,0.112268,Wrong
2006,0.129022,0.05552,Wrong
2008,0.621712,0.037988,Wrong
2011,0.377458,0.056669,Wrong


In [21]:
rf_res['MVP'].value_counts()

Correct    32
Wrong      12
Name: MVP, dtype: int64

In [22]:
rf_res.mean(axis=0)

  rf_res.mean(axis=0)


R2     0.577542
MSE    0.035695
dtype: float64

In [25]:
best_params = pd.DataFrame(list(best_params_year.values()), index=best_params_year.keys())
best_params[rf_res['MVP']=='Correct']


Unnamed: 0,min_samples_split,min_samples_leaf,max_features,max_depth
1980,3,3,3,15
1981,2,3,6,5
1983,3,3,7,15
1984,4,2,4,5
1985,4,2,3,15
1986,3,3,6,25
1987,4,3,5,5
1988,4,3,4,5
1989,3,3,4,5
1990,4,3,5,25


In [26]:
for c in best_params.columns:
    print(best_params[rf_res['MVP']=='Correct'][c].value_counts())


3    13
4    11
2     8
Name: min_samples_split, dtype: int64
3    16
2     9
1     7
Name: min_samples_leaf, dtype: int64
3    10
6     7
4     7
5     5
7     3
Name: max_features, dtype: int64
5     13
15     6
20     4
25     3
10     3
30     3
Name: max_depth, dtype: int64


### Test all years with most frequently used hyperparameters

In [29]:
# empty dict to store results
evals_by_year = {}

results_by_year = {}

for year in range(1980, 2024):
    df_train = df[df['Year'] != year]
    df_test = df[df['Year'] == year]
    X_train =  df_train[cols]
    y_train = df_train['Share']
    X_test =  df_test[cols]
    y_test = df_test['Share']
    rf = RandomForestRegressor(min_samples_split=3, min_samples_leaf=3, max_features=3, max_depth=5)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    eval, results = evaluate(y_test, pred)
    evals_by_year[year] = eval
    results_by_year[year] = results

In [30]:
rf_res = pd.DataFrame(list(evals_by_year.values()), index=evals_by_year.keys())
rf_res.columns = ['R2', 'MSE', 'MVP']
rf_res.index.name='Idx'
rf_res.sort_values(by = ['MVP', 'Idx'], ascending = [False, True])

Unnamed: 0_level_0,R2,MSE,MVP
Idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1982,0.21824,0.038082,Wrong
1989,0.743293,0.019033,Wrong
1993,0.723333,0.026555,Wrong
1994,0.540062,0.042319,Wrong
1997,0.9249,0.007092,Wrong
1998,0.562427,0.044128,Wrong
1999,-0.070407,0.080293,Wrong
2001,0.160037,0.074514,Wrong
2005,-0.187169,0.105292,Wrong
2006,0.148095,0.054304,Wrong


In [31]:
rf_res['MVP'].value_counts()

Correct    30
Wrong      14
Name: MVP, dtype: int64

In [32]:
rf_res.mean(axis=0)

  rf_res.mean(axis=0)


R2     0.571983
MSE    0.036068
dtype: float64

### Test without years 1982, 1999, 2005, 2006

In [34]:
drop_yrs = [1982, 1999, 2005, 2006]
mask = ~df['Year'].isin(drop_yrs)
df1 = df[mask]
yrs = range(1980, 2024)
yrs = [y for y in yrs if y not in drop_yrs]
#yrs

In [45]:
# empty dict to store results
evals_by_year = {}

best_params_year={}

results_by_year={}

for year in yrs:
    df_train = df1[df1['Year'] != year]
    df_test = df1[df1['Year'] == year]
    X_train =  df_train[cols]
    y_train = df_train['Share']
    X_test =  df_test[cols]
    y_test = df_test['Share']
    rf = RandomForestRegressor()
    random_search = RandomizedSearchCV(rf, param_distributions=params, cv=5, scoring='r2')
    random_search.fit(X_train, y_train)
    best_rf = random_search.best_estimator_
    pred = best_rf.predict(X_test)
    eval, results = evaluate(y_test, pred)
    evals_by_year[year] = eval
    results_by_year[year] = results
    best_params_year[year] = random_search.best_params_

    

In [46]:
rf_res = pd.DataFrame(list(evals_by_year.values()), index=evals_by_year.keys())
rf_res.columns = ['R2', 'MSE', 'MVP']
rf_res.index.name='Idx'
rf_res.sort_values(by = ['MVP', 'Idx'], ascending = [False, True])

Unnamed: 0_level_0,R2,MSE,MVP
Idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1983,0.535506,0.037732,Wrong
1989,0.823336,0.013098,Wrong
1994,0.564102,0.040107,Wrong
1997,0.953533,0.004388,Wrong
1998,0.650828,0.035213,Wrong
2001,0.437666,0.049885,Wrong
2008,0.57198,0.042982,Wrong
2011,0.307392,0.063048,Wrong
2023,0.655954,0.036713,Wrong
1980,0.715153,0.01151,Correct


In [47]:
rf_res['MVP'].value_counts()

Correct    31
Wrong       9
Name: MVP, dtype: int64

In [48]:
rf_res.mean(axis=0)

  rf_res.mean(axis=0)


R2     0.652527
MSE    0.030648
dtype: float64

In [51]:
best_params = pd.DataFrame(list(best_params_year.values()), index=best_params_year.keys())
for c in best_params.columns:
    print(best_params[rf_res['MVP']=='Correct'][c].value_counts())

2    12
3    12
4     7
Name: min_samples_split, dtype: int64
2    16
3     8
1     7
Name: min_samples_leaf, dtype: int64
3    19
4     8
6     2
5     1
7     1
Name: max_features, dtype: int64
25    7
30    7
15    6
10    5
20    4
5     2
Name: max_depth, dtype: int64


In [52]:
best_params 

Unnamed: 0,min_samples_split,min_samples_leaf,max_features,max_depth
1980,2,1,4,20
1981,2,1,3,10
1983,3,2,3,25
1984,2,1,3,15
1985,4,2,4,25
1986,2,2,4,10
1987,2,3,3,15
1988,4,2,3,30
1989,4,2,4,20
1990,3,3,4,30


In [54]:
results_by_year[2022]

Unnamed: 0_level_0,Share,prediction
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
Nikola Jokić,0.875,0.728056
Joel Embiid,0.706,0.496549
Giannis Antetokounmpo,0.595,0.670634
Devin Booker,0.216,0.188041
Luka Dončić,0.146,0.26883
Jayson Tatum,0.043,0.160157
Ja Morant,0.01,0.235764
Stephen Curry,0.004,0.088716
Chris Paul,0.002,0.058742
DeMar DeRozan,0.001,0.047779


### Test all years with best hyperparameters

In [58]:
# empty dict to store results
evals_by_year = {}

results_by_year={}

for year in yrs:
    df_train = df1[df1['Year'] != year]
    df_test = df1[df1['Year'] == year]
    X_train =  df_train[cols]
    y_train = df_train['Share']
    X_test =  df_test[cols]
    y_test = df_test['Share']
    rf = RandomForestRegressor(min_samples_split=2, min_samples_leaf=2, max_features=3, max_depth=30, random_state=0)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    eval, results = evaluate(y_test, pred)
    evals_by_year[year] = eval
    results_by_year[year] = results

In [59]:
rf_res = pd.DataFrame(list(evals_by_year.values()), index=evals_by_year.keys())
rf_res.columns = ['R2', 'MSE', 'MVP']
rf_res.index.name='Idx'
rf_res.sort_values(by = ['MVP', 'Idx'], ascending = [False, True])

Unnamed: 0_level_0,R2,MSE,MVP
Idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1989,0.779716,0.016333,Wrong
1994,0.580091,0.038636,Wrong
1998,0.638401,0.036466,Wrong
2001,0.392399,0.053901,Wrong
2008,0.65819,0.034325,Wrong
2011,0.326811,0.06128,Wrong
2017,0.086956,0.09283,Wrong
2023,0.61233,0.041368,Wrong
1980,0.808957,0.00772,Correct
1981,0.532202,0.024069,Correct


In [60]:
rf_res['MVP'].value_counts()

Correct    32
Wrong       8
Name: MVP, dtype: int64

In [61]:
rf_res.mean(axis=0)

  rf_res.mean(axis=0)


R2     0.650897
MSE    0.030743
dtype: float64

In [70]:
results_by_year[2011]

Unnamed: 0_level_0,Share,prediction
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
Derrick Rose,0.977,0.387933
Dwight Howard,0.531,0.195147
LeBron James,0.431,0.622994
Kobe Bryant,0.354,0.089448
Kevin Durant,0.157,0.120868
Dirk Nowitzki,0.093,0.139885
Dwyane Wade,0.02,0.205628
Manu Ginóbili,0.017,0.081133
Amar'e Stoudemire,0.007,0.043025
Blake Griffin,0.004,0.056712


In [78]:
# save dict of best results
filehandler = open('rf_results_year.pkl', 'wb')
pickle.dump(results_by_year, filehandler)

### Train model with best results and entire data, save model

In [72]:
X =  df1[cols]
y = df1['Share']

In [74]:
rf_best = RandomForestRegressor(min_samples_split=2, min_samples_leaf=2, max_features=3, max_depth=30, random_state=0)
rf_best.fit(X,y)

RandomForestRegressor(max_depth=30, max_features=3, min_samples_leaf=2,
                      random_state=0)

In [75]:
joblib.dump(rf_best, "./rf_best.joblib")

['./rf_best.joblib']

In [76]:
cols

['PRA',
 'WS/48',
 'player_efficiency_rating',
 'offensive_box_plus_minus',
 'value_over_replacement_player',
 'wl_pct',
 'seed']