In [1]:
import pandas as pd

In [2]:
#read data in
stats = pd.read_csv("player_mvp_stats.csv")

In [3]:
stats

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14692,14692,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14693,14693,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14694,14694,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14695,14695,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


In [4]:
#Clean a little bit more data up
del stats["Unnamed: 0"]

In [6]:
#Get rid of some missing data if we have any 
pd.isnull(stats).sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          59
3P            0
3PA           0
3P%        2086
2P            0
2PA           0
2P%         100
eFG%         59
FT            0
FTA           0
FT%         521
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [7]:
#look at the 3pt % that can be null
stats[pd.isnull(stats["3P%"])][["Player", "3PA"]]

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
18,Jack Haley,0.0
20,Keith Owens,0.0
30,Benoit Benjamin,0.0
...,...,...
14666,Evan Eschmeyer,0.0
14667,Gheorghe Mureșan,0.0
14669,Jim McIlvaine,0.0
14675,Mark Hendrickson,0.0


In [8]:
#Check for ft 
stats[pd.isnull(stats["FT%"])][["Player", "FTA"]]

Unnamed: 0,Player,FTA
77,John Coker,0.0
92,Jason Sasser,0.0
103,Adrian Caldwell,0.0
119,Bruno Šundov,0.0
158,Jamal Robinson,0.0
...,...,...
14556,Mark McNamara,0.0
14584,Luke Zeller,0.0
14637,Myron Brown,0.0
14659,Malcolm Lee,0.0


In [10]:
#replace the missing percentages with 0
stats = stats.fillna(0)

In [11]:
#Training a machine learning model
stats.columns 

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [22]:
#Going to use all the numeric columns to make predictions
predictors = ["Age", "G", "GS", "MP", "FG", "FGA", 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'W', 'L', 'W/L%',
       'GB', 'PS/G', 'PA/G', 'SRS']

In [23]:
train = stats[stats["Year"] < 2022]

In [24]:
test = stats[stats["Year"] == 2022]

In [25]:
#We dont have finished data from 2023-2024 so we need to be careful not to use
#data from after the test set.
#We may have over fitting issues on the algo

In [26]:
#use a straight forward model
from sklearn.linear_model import Ridge
#use ridge regression similar to linear regression. It will prevent overfitting

#create model
reg = Ridge(alpha = .1)


In [27]:
#fit the model
#take all predictors and make predictions off of them
reg.fit(train[predictors], train["Share"])

In [28]:
predictions = reg.predict(test[predictors])
predictions = pd.DataFrame(predictions, columns = ["predictions"], index = test.index)

In [29]:
predictions

Unnamed: 0,predictions
648,0.013638
649,-0.027154
650,-0.005272
651,0.016935
652,-0.003576
...,...
12508,-0.017870
12509,-0.009067
12510,0.005286
12511,0.001350


In [30]:
#compare actual values to predictions
combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)

In [31]:
combination

Unnamed: 0,Player,Share,predictions
648,Aaron Gordon,0.0,0.013638
649,Austin Rivers,0.0,-0.027154
650,Bol Bol,0.0,-0.005272
651,Bones Hyland,0.0,0.016935
652,Bryn Forbes,0.0,-0.003576
...,...,...,...
12508,Micah Potter,0.0,-0.017870
12509,Rodney McGruder,0.0,-0.009067
12510,Saben Lee,0.0,0.005286
12511,Saddiq Bey,0.0,0.001350


In [32]:
#see people who won mvp
combination.sort_values("Share", ascending = False).head(10)

Unnamed: 0,Player,Share,predictions
663,Nikola Jokić,0.875,0.189411
837,Joel Embiid,0.706,0.191151
11678,Giannis Antetokounmpo,0.595,0.219107
907,Devin Booker,0.216,0.092999
11469,Luka Dončić,0.146,0.156889
1179,Jayson Tatum,0.043,0.095388
12226,Ja Morant,0.01,0.12208
6398,Stephen Curry,0.004,0.093766
905,Chris Paul,0.002,0.080389
8241,LeBron James,0.001,0.15786


In [33]:
#Identifying an error metric 
from sklearn.metrics import mean_squared_error

mean_squared_error(combination["Share"], combination["predictions"])

0.002247264085632915

In [35]:
combination["Share"].value_counts()
#most players don't get any mvp votes

0.000    593
0.001      3
0.875      1
0.706      1
0.002      1
0.216      1
0.043      1
0.004      1
0.146      1
0.595      1
0.010      1
Name: Share, dtype: int64

In [40]:
combination = combination.sort_values("Share", ascending = False)

In [41]:
combination["Rk"] = list(range(1, combination.shape[0] + 1))

In [42]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
663,Nikola Jokić,0.875,0.189411,1
837,Joel Embiid,0.706,0.191151,2
11678,Giannis Antetokounmpo,0.595,0.219107,3
907,Devin Booker,0.216,0.092999,4
11469,Luka Dončić,0.146,0.156889,5
1179,Jayson Tatum,0.043,0.095388,6
12226,Ja Morant,0.01,0.12208,7
6398,Stephen Curry,0.004,0.093766,8
905,Chris Paul,0.002,0.080389,9
8241,LeBron James,0.001,0.15786,10


In [43]:
combination = combination.sort_values("predictions", ascending = False)
combination["Predicted_Rk"] = list(range(1, combination.shape[0] + 1))

In [44]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
11678,Giannis Antetokounmpo,0.595,0.219107,3,1
837,Joel Embiid,0.706,0.191151,2,2
663,Nikola Jokić,0.875,0.189411,1,3
8241,LeBron James,0.001,0.15786,10,4
11469,Luka Dončić,0.146,0.156889,5,5
6185,Kevin Durant,0.001,0.140659,12,6
12226,Ja Morant,0.01,0.12208,7,7
11820,Trae Young,0.0,0.110532,289,8
8231,Anthony Davis,0.0,0.108508,112,9
836,James Harden,0.0,0.103964,393,10


In [45]:
combination.sort_values("Share", ascending = False).head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
663,Nikola Jokić,0.875,0.189411,1,3
837,Joel Embiid,0.706,0.191151,2,2
11678,Giannis Antetokounmpo,0.595,0.219107,3,1
907,Devin Booker,0.216,0.092999,4,16
11469,Luka Dončić,0.146,0.156889,5,5
1179,Jayson Tatum,0.043,0.095388,6,14
12226,Ja Morant,0.01,0.12208,7,7
6398,Stephen Curry,0.004,0.093766,8,15
905,Chris Paul,0.002,0.080389,9,21
3938,DeMar DeRozan,0.001,0.101564,11,11


In [51]:
def find_ap(combination):
    actual = combination.sort_values("Share", ascending= False).head(5)
    predicted = combination.sort_values("predictions", ascending = False)
    ps = []
    found = 0
    seen = 1
    for index, row in predicted.iterrows():
        if row["Player"] in actual ["Player"].values:
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps)/len(ps) #returns error metric

In [52]:
find_ap(combination)
#the lower the number the longer it takes to find the mvp

0.8225

In [54]:
years = list(range(1991,2023))

In [56]:
aps = []
all_predictions = []
for year in years[5:]:
    #training and testing sets
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    #fitting the model
    reg.fit(train[predictors], train["Share"])
    
    #create predictions
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns = ["predictions"], index = test.index)
    combination = pd.concat([test[["Player", "Share"]], predictions], axis =1)
    all_predictions.append(combination)
    #append 
    aps.append(find_ap(combination))
    

In [57]:
sum(aps)/len(aps) #generate mean average precision, pretty accurate

0.715194005962712

In [61]:
#Create a function to add the ranks in
def add_ranks(combination):
    #we want good diagonistic info
    combination = combination.sort_values("Share", ascending = False)
    combination["Rk"] = list(range(1, combination.shape[0] + 1))
    combination = combination.sort_values("predictions", ascending = False)
    combination["Predicted_Rk"] = list(range(1, combination.shape[0] + 1))
    combination["Diff"] = combination["Rk"] - combination["Predicted_Rk"]
    return combination
    

In [65]:
ranking = add_ranks(all_predictions[1])
ranking[ranking["Rk"] < 6].sort_values("Diff", ascending = False)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1710,Karl Malone,0.857,0.19236,1,2,-1
10976,Michael Jordan,0.832,0.167672,2,3,-1
970,Grant Hill,0.327,0.128664,3,6,-3
4912,Tim Hardaway,0.207,0.059992,4,20,-16
8642,Glen Rice,0.117,0.033122,5,53,-48


In [69]:
#back testing function to continously run 
def backtest(stats, model, years, predictors):
    aps = []
    all_predictions = []
    for year in years:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]
        model.fit(train[predictors],train["Share"])
        predictions = model.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps) / len(aps), aps, pd.concat(all_predictions)

In [70]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [71]:
mean_ap #got the same result as before but can call upon function

0.715194005962712

In [72]:
all_predictions[all_predictions["Rk"] < 5].sort_values("Diff").head(10)
#we can dig deeper and look at stats and see where people differ compared to other candidates

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1334,Jason Kidd,0.712,0.027957,2,52,-50
5420,Steve Nash,0.839,0.032686,1,47,-46
8910,Peja Stojaković,0.228,0.035581,4,39,-35
5438,Steve Nash,0.739,0.051338,1,35,-34
13331,Joakim Noah,0.258,0.04739,4,37,-33
5453,Steve Nash,0.785,0.070791,2,23,-21
4912,Tim Hardaway,0.207,0.059992,4,20,-16
907,Devin Booker,0.216,0.092999,4,16,-12
7041,Kobe Bryant,0.291,0.077829,4,14,-10
1267,Gary Payton,0.372,0.076145,3,13,-10


In [75]:
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis = 1).sort_values(0, ascending =False)
#variables that are most important to the regression
#Effective fg the most important in this predictor

Unnamed: 0,0,1
13,0.083233,eFG%
18,0.033221,DRB
28,0.024016,W/L%
17,0.021163,ORB
10,0.01543,2P
21,0.012472,STL
22,0.010828,BLK
15,0.010539,FTA
25,0.007105,PTS
20,0.007091,AST


In [99]:
#adding more predictors 
stats_ratios = stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year", group_keys=False).apply(lambda x: x/x.mean())

In [100]:
stats_ratios

Unnamed: 0,PTS,AST,STL,BLK,3P,Year
0,1.013334,0.420714,0.961127,0.673469,0.508587,1.0
1,1.614653,1.028412,1.647646,0.673469,4.577279,1.0
2,0.311795,0.093492,0.274608,1.571429,0.000000,1.0
3,0.200440,0.186984,0.274608,0.000000,0.000000,1.0
4,2.383005,1.636110,1.784950,0.897959,1.525760,1.0
...,...,...,...,...,...,...
14692,0.735752,0.819562,0.479763,1.528302,0.650951,1.0
14693,0.071202,0.000000,0.000000,0.000000,0.130190,1.0
14694,1.281633,0.601012,1.119447,2.547170,0.520761,1.0
14695,0.474679,0.218550,0.319842,1.273585,0.650951,1.0


In [105]:
stats[["PTS_T", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stats_ratios[["PTS", "AST", "STL", "BLK", "3P"]]

In [102]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3P_R
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.707,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.707,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.707,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,0.707,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.707,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576


In [106]:
stats[["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stat_ratios[["PTS", "AST", "STL", "BLK", "3P"]]

In [107]:
predictors += ["PTS_R", "AST_R", "STL_R", "BLK_R", "3P_R"]

In [108]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [109]:
mean_ap
#higher up in predictive ranking

0.7236714851898696

In [110]:
stats["NPos"] = stats["Pos"].astype("category").cat.codes

In [111]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3P_R,PTS_R,NPos
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587,1.013334,2
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279,1.614653,12
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0,0.311795,2
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0,0.20044,2
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576,2.383005,8


In [112]:
stats["NTm"] = stats["Tm"].astype("category").cat.codes
#see if team effects whether you win mvp

In [114]:
stats["NTm"].value_counts()

7     522
27    514
8     514
17    512
14    510
11    508
0     506
12    505
9     501
1     501
31    497
26    496
15    496
13    494
18    494
5     493
24    489
34    486
10    486
30    486
28    485
29    484
19    482
33    441
36    410
16    353
20    343
32    263
25    240
2     182
4     177
23    163
3     157
21    143
6     130
37    114
35     88
22     32
Name: NTm, dtype: int64

In [None]:
#use random forest model
#create a series of decision trees and average the values
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

mean_ap, aps, all_predictions = backtest(stats, rf, years[28:], predictors + ["NPos", "NTm"])

In [None]:
mean_ap

In [None]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[28:], predictors)

In [None]:
mean_ap

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

In [None]:
def backtest(stats, model, years, predictors):
    aps = []
    all_predictions = []
    for year in years:
        train = stats[stats["Year"] < year].copy()
        test = stats[stats["Year"] == year].copy()
        sc.fit(train[predictors])
        train[predictors] = sc.transform(train[predictors])
        test[predictors] = sc.transform(test[predictors])
        model.fit(train[predictors],train["Share"])
        predictions = model.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps) / len(aps), aps, pd.concat(all_predictions)

In [None]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[28:], predictors)

In [None]:
mean_ap

In [None]:
sc.transform(stats[predictors])

In [None]:
#we can invesigate and find how to make better predictions
#We can play around with a neural network