In [1]:
import pandas as pd

In [2]:
stats=pd.read_csv("player_mvp_stats.csv")

In [3]:
del stats["Unnamed: 0"]

In [6]:
stats=stats.fillna(0)

In [7]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [8]:
predictors=['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [9]:
train= stats[stats["Year"]<2024]

In [10]:
test= stats[stats["Year"]==2024]

In [12]:
from sklearn.linear_model import Ridge

In [13]:
reg=Ridge(alpha=.1)

In [14]:
reg.fit(train[predictors],train["Share"])

In [15]:
predictions=reg.predict(test[predictors])

In [16]:
predictions=pd.DataFrame(predictions,columns=["predictions"],index=test.index)

In [17]:
combination=pd.concat([test[["Player","Share"]],predictions],axis=1)

In [18]:
combination.sort_values("Share",ascending=False).head(10)

Unnamed: 0,Player,Share,predictions
1449,Shai Gilgeous-Alexander,0.646,0.158917
236,Giannis Antetokounmpo,0.194,0.197882
4086,Jalen Brunson,0.143,0.092926
3078,Jayson Tatum,0.087,0.106823
8964,Anthony Edwards,0.018,0.084015
4513,Domantas Sabonis,0.003,0.082432
12754,Kevin Durant,0.001,0.09579
229,A.J. Green,0.0,0.002784
10511,Nic Claxton,0.0,0.034264
10507,Keita Bates-Diop,0.0,-0.001677


In [19]:
from sklearn.metrics import mean_squared_error
mean_squared_error(combination["Share"],combination["predictions"])

0.0012032687247542245

In [20]:
combination=combination.sort_values("Share",ascending=False)
combination["Rk"]=list(range(1,combination.shape[0]+1))

In [25]:
combination.head()

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
236,Giannis Antetokounmpo,0.194,0.197882,2,1
14107,Joel Embiid,0.0,0.192867,206,2
346,Luka Dončić,0.0,0.176341,374,3
1449,Shai Gilgeous-Alexander,0.646,0.158917,1,4
811,Nikola Jokić,0.0,0.149822,412,5


In [22]:
combination=combination.sort_values("predictions",ascending=False)
combination["Predicted_Rk"]=list(range(1,combination.shape[0]+1))

In [24]:
combination.head()

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
236,Giannis Antetokounmpo,0.194,0.197882,2,1
14107,Joel Embiid,0.0,0.192867,206,2
346,Luka Dončić,0.0,0.176341,374,3
1449,Shai Gilgeous-Alexander,0.646,0.158917,1,4
811,Nikola Jokić,0.0,0.149822,412,5


In [32]:
def find_ap(combination):
    actual=combination.sort_values("Share",ascending=False).head(5)
    predicted=combination.sort_values("predictions",ascending=False)
    ps=[]
    found=0
    seen=1
    for index,row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found+=1
            ps.append(found/seen)
        seen+=1
    return sum(ps)/len(ps)

In [33]:
find_ap(combination)

0.5065508021390375

In [34]:
years=list(range(1991,2025))

In [36]:
aps=[]
all_predictions=[]
for year in years[5:]:
    train=stats[stats["Year"]<year]
    test=stats[stats["Year"]==year]
    reg.fit(train[predictors],train["Share"])
    predictions=reg.predict(test[predictors])
    predictions=pd.DataFrame(predictions,columns=["predictions"],index=test.index)
    combination=pd.concat([test[["Player","Share"]],predictions],axis=1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [37]:
sum(aps)/len(aps)

0.6952816288310577

In [38]:
def add_ranks(combination):
    combination=combination.sort_values("Share",ascending=False)
    combination["Rk"]=list(range(1,combination.shape[0]+1))
    combination=combination.sort_values("predictions",ascending=False)
    combination["Predicted_Rk"]=list(range(1,combination.shape[0]+1))
    combination["Diff"]=combination["Rk"]-combination["Predicted_Rk"]
    return combination

In [40]:
ranking=add_ranks(all_predictions[1])
ranking[ranking["Rk"]<6].sort_values("Diff",ascending=False)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1943,Karl Malone,0.857,0.19416,1,2,-1
11771,Michael Jordan,0.832,0.170544,2,3,-1
1133,Grant Hill,0.327,0.132544,3,5,-2
5411,Tim Hardaway,0.207,0.061164,4,20,-16
9378,Glen Rice,0.117,0.034557,5,51,-46


In [53]:
def backtest(stats,model,year,predictors):
    aps=[]
    all_predictions=[]
    for year in years[5:]:
        train=stats[stats["Year"]<year]
        test=stats[stats["Year"]==year]
        model.fit(train[predictors],train["Share"])
        predictions=reg.predict(test[predictors])
        predictions=pd.DataFrame(predictions,columns=["predictions"],index=test.index)
        combination=pd.concat([test[["Player","Share"]],predictions],axis=1)
        combination=add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps)/len(aps),aps,pd.concat(all_predictions)

In [54]:
mean_ap,aps,all_predictions=backtest(stats,reg,years[5:],predictors)

In [55]:
all_predictions.head()

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
8244,Shaquille O'Neal,0.056,0.214224,9,1,8
11039,David Robinson,0.508,0.211592,2,2,0
5711,Hakeem Olajuwon,0.211,0.205126,4,3,1
1933,Karl Malone,0.075,0.183691,7,4,3
11757,Michael Jordan,0.986,0.177949,1,5,-4


In [56]:
all_predictions[all_predictions["Rk"]<6].sort_values("Diff").head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1567,Jason Kidd,0.712,0.028998,2,55,-53
9378,Glen Rice,0.117,0.034557,5,51,-46
5934,Steve Nash,0.839,0.039664,1,42,-41
14372,Joakim Noah,0.258,0.047022,4,40,-36
5952,Steve Nash,0.739,0.056164,1,33,-32
4230,Chauncey Billups,0.344,0.053216,5,35,-30
1732,Chris Paul,0.138,0.0744,4,33,-29
14236,Donovan Mitchell,0.03,0.079203,5,26,-21
5758,Jason Kidd,0.135,0.021076,5,25,-20
5967,Steve Nash,0.785,0.076723,2,21,-19


In [57]:
reg.coef_

array([ 2.80348960e-04,  8.30039216e-05,  6.63836428e-06, -3.50509625e-03,
       -1.28246818e-03,  6.38341791e-03, -1.80119936e-01,  3.42631527e-03,
       -1.08812152e-02, -9.43419092e-03,  1.60103856e-02, -1.51156570e-02,
        1.17095445e-02,  1.17578461e-01, -6.75038593e-03,  1.21608671e-02,
       -5.31665642e-03,  1.59105247e-02,  2.93751803e-02, -2.28242209e-02,
        5.60833905e-03,  1.22919003e-02,  1.11941944e-02, -8.36648497e-03,
       -3.00681316e-03,  5.76970954e-03, -2.13196450e-04, -3.12053086e-04,
        2.03847330e-04,  1.14910021e-01,  2.45906178e-04, -7.02556412e-04,
        1.75170202e-04, -6.13459454e-04])

In [59]:
pd.concat([pd.Series(reg.coef_),pd.Series(predictors)],axis=1).sort_values(0,ascending=False)

Unnamed: 0,0,1
13,0.117578,eFG%
29,0.11491,W/L%
18,0.029375,DRB
10,0.01601,2P
17,0.015911,ORB
21,0.012292,STL
15,0.012161,FTA
12,0.01171,2P%
22,0.011194,BLK
5,0.006383,FGA


In [60]:
stat_ratios=stats[["PTS","AST","STL","BLK","3P","Year"]].groupby("Year").apply(lambda x:x/x.mean())

In [66]:
stat_ratios

Unnamed: 0,PTS,AST,STL,BLK,3P,Year
0,1.013334,0.420714,0.961127,0.673469,0.508587,1.0
1,1.614653,1.028412,1.647646,0.673469,4.577279,1.0
2,0.311795,0.093492,0.274608,1.571429,0.000000,1.0
3,0.200440,0.186984,0.274608,0.000000,0.000000,1.0
4,2.383005,1.636110,1.784950,0.897959,1.525760,1.0
...,...,...,...,...,...,...
14261,1.448372,1.998602,1.522176,0.993056,2.550725,1.0
14262,0.047488,0.000000,0.169131,0.000000,0.106280,1.0
14263,0.949752,0.899371,0.507392,0.248264,2.444444,1.0
14264,0.391773,0.499651,0.338261,0.744792,0.000000,1.0


In [67]:
stats[["PTS_T","AST_R","STL_R","BLK_R","3P_R"]]=stat_ratios[["PTS","AST","STL","BLK","3P"]]

In [68]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3P_R
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.707,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.707,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.707,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,0.707,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.707,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576


In [69]:
predictors+=["PTS_T","AST_R","STL_R","BLK_R","3P_R"]

In [70]:
mean_ap,aps,all_predictions=backtest(stats,reg,years[5:],predictors)

In [71]:
mean_ap

0.6934471822746294

In [72]:
stats["NPos"]=stats["Pos"].astype("category").cat.codes

In [73]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,GB,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3P_R,NPos
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587,2
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279,12
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0,2
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0,2
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576,8


In [74]:
stats["NTm"]=stats["Tm"].astype("category").cat.codes

In [79]:
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor(n_estimators=50,random_state=1,min_samples_split=5)

mean_ap,aps,all_predictions=backtest(stats,rf,years[28:],predictors)

In [80]:
mean_ap

0.7315156202496711

In [81]:
mean_ap,aps,all_predictions=backtest(stats,reg,years[28:],predictors)

In [82]:
mean_ap

0.6934471822746294