### Now that we have a fully cleaned dataframe, we can start building out our model

In [97]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [3]:
"""Imported functions from ml_metrics library"""

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [4]:
mvp_df = pd.read_csv("player_mvp_stats.csv")

In [5]:
# Dropping previous index as well as Tm since redundant with Team and
# Hornets have two different abbreviations
mvp_df = mvp_df.drop(["Unnamed: 0", "Tm"], axis=1)

In [6]:
# setting up everything for our variables
# Removed qualitative variables like Team as well as Year
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP', 'W', 'L', 'W/L%', 'GB',
       'PS/G', 'PA/G', 'SRS']

In [7]:
train = mvp_df[mvp_df["Year"] < 2023]
test = mvp_df[mvp_df["Year"] == 2023]

In [8]:
X_train = train[predictors]
y_train = train["Share"]

X_test = test[predictors]
y_test = test["Share"]

In [9]:
# Setting up first model (Ridge Regression)

reg = Ridge(alpha=.1)
reg.fit(X_train, y_train)
ridge_predictions = reg.predict(X_test)

In [10]:
# Converting to df to be cleaner
ridge_df = pd.DataFrame(ridge_predictions,
                                 columns=["MVP Share Predictions"],
                                 index=X_test.index)
ridge_df

Unnamed: 0,MVP Share Predictions
48,0.017688
49,0.022474
50,0.029433
51,0.229242
52,-0.004037
...,...
11431,-0.012121
11432,0.007565
11433,-0.007648
11434,-0.018394


In [11]:
ridge_combination = pd.concat([test[["Player", "Share"]], ridge_df], axis = 1)
ridge_combination.sort_values("MVP Share Predictions", ascending=False).head(20)

Unnamed: 0,Player,Share,MVP Share Predictions
51,Giannis Antetokounmpo,0.606,0.229242
559,Nikola Jokić,0.674,0.227922
143,Luka Dončić,0.01,0.196454
11253,Joel Embiid,0.915,0.161215
3426,Domantas Sabonis,0.027,0.13535
2271,Jayson Tatum,0.28,0.119459
1072,Shai Gilgeous-Alexander,0.046,0.112464
11429,Julius Randle,0.0,0.112139
9504,Ja Morant,0.001,0.109441
6727,LeBron James,0.0,0.108317


In [69]:
# generalize this later for all types of models
# outputs two new columns: Rank and Predicted Rank

def convert_to_ranks(df, actual_share_col, predicted_share_col):
    ranked_df = df.copy(deep=True)
    ranked_df["Rank"] = df[actual_share_col].rank(method="min", ascending=False)
    ranked_df["Predicted Rank"] = df[predicted_share_col].rank(method="min", ascending=False)
    return ranked_df.sort_values("Rank")

In [53]:
# create an error metric function that is based on ranking since
# most players' MVP share is 0
# assumption is table looks like ridge_combination
# with cols: Player, Share, MVP Share Predictions
# outdated not using the convert_to_ranks function but not useful
def find_ap(df, actual_share_col, predicted_share_col, num):
    actual_ranking = df.sort_values(actual_share_col, ascending=False).head(num)
    actual_ranking["Rank"] = list(range(1, actual_ranking.shape[0] + 1))
    predicted_ranking = df.sort_values(predicted_share_col, ascending=False)
    predicted_ranking["Pred_Rank"] = list(range(1, predicted_ranking.shape[0] + 1))
    abs_diff = 0
    for index, row in actual_ranking.iterrows():
        player_name = row["Player"]
        predicted_player_row = predicted_ranking[
            predicted_ranking["Player"] == player_name]
        abs_diff += abs(row["Rank"] - predicted_player_row["Pred_Rank"].values[0])
    return abs_diff

In [54]:
"""
Finding mean square error

This will give you a measure of the overall discrepancy or difference in ranks between the two for the top 10 MVP
candidates.

Input is a ranked dataframe after applying the convert_to_ranks function.
"""

def get_mse_top_10(ranked_df, num=10):
    top_n_df = ranked_df.head(num)
    return mean_squared_error(top_n_df["Rank"].to_numpy(), top_n_df["Predicted Rank"].to_numpy(), squared=True)

In [55]:
"""
Determine the AP@K (Average Precision at K) for your predicted ranking. 

Interested in top n since those are the MVP finalists. AP@K will provide an assessment
of the precision and correctness of your predicted ranking for the top candidates.

Input is a ranked dataframe after applying the convert_to_ranks function.
"""

def get_apk_n(ranked_df, k):
    top_k_actual = ranked_df["Player"][:k]
    top_k_predicted = ranked_df.sort_values("Predicted Rank")["Player"][:k]
    return apk(top_k_actual.tolist(), top_k_predicted.tolist(), 5)

In [70]:
""" 
Combines both of our metrics together to provide a clean look at how our model does 
in these two important criteria. """

def evaluate_model(df, actual_share_col, predicted_share_col, k):
    combined_df = convert_to_ranks(df, actual_share_col, predicted_share_col)
    
    #return combined_df
    mse = get_mse_top_10(combined_df)
    apk = get_apk_n(combined_df, k)
    
    return [mse, apk]
    

In [72]:
def print_model_results(mse, apk, k):
    print("Mean Squared Error:", mse)
    print("Average Precision at", str(k) + ":", apk)

In [73]:
# Final output for evaluating how our model did on the two key metrics
naive_results = evaluate_model(ridge_combination, "Share", "MVP Share Predictions", 5)
print_model_results(naive_results[0], naive_results[1], 5)

Mean Squared Error: 13.4
Average Precision at 5: 0.55


## Backtesting

In [74]:
years = list(range(2000, 2024))
years[5]

2005

In [78]:
# Following along, we make a function to find the biggest differences each year
# inside code we run function of convert_to_ranks
def difference_in_rank(ranked_df):
    ranked_df["Difference"] = ranked_df["Rank"] - ranked_df["Predicted Rank"]
    return ranked_df

In [82]:
"""
Creating a backtesting function to run all the code we did and put it in a loop
"""
def backtest(df, model_bt, timeframe, predictors):
    """
    Looping through years (starting with 2005 because we need at least 5 years of data to come up with a model)
    Then making predictions for every year based off previous traine data
    The more years we can train on, the more data our algorithm can build off to make accurate predictions
    """

    all_predictions = [] # list of dataframes holding our predictions for every year
    mse_ar = []
    apk_ar = []

    for year in timeframe[5:]:
        train_bt = df[df["Year"] < year]
        test_bt = df[df["Year"] == year]

        X_train_bt = train_bt[predictors]
        y_train_bt = train_bt["Share"]
        X_test_bt = test_bt[predictors]
        y_test_bt = test_bt["Share"]

        model_bt.fit(X_train_bt, y_train_bt)
        model_bt_predictions = model_bt.predict(X_test_bt) # predict for testing set (the current year starting w 2005)

        model_bt_df = pd.DataFrame(model_bt_predictions,
                                     columns=["MVP Share Predictions"],
                                     index=X_test_bt.index) # putting predictions into a dataframe

        model_bt_combination = pd.concat([test_bt[["Player", "Share"]],
                                          model_bt_df], axis = 1) # concatenating predictions with the players and their MVP share

        combined_bt_df = convert_to_ranks(model_bt_combination, "Share", "MVP Share Predictions") # adding actual and predicted rank into list
        with_differences = difference_in_rank(combined_bt_df) # adding difference between actual and predicted rank
        with_differences["Year"] = year # adding year to add more information to overall dataframe
        
        all_predictions.append(with_differences) # appending predictions to outer list

        mse_ar.append(get_mse_top_10(combined_bt_df))
        apk_ar.append(get_apk_n(combined_bt_df, 5))
        
    return ((sum(mse_ar) / len(mse_ar)), mse_ar, (sum(apk_ar) / len(apk_ar)), apk_ar, pd.concat(all_predictions))

In [83]:
avg_mse, mse_ar, avg_apk, apk_ar, all_predictions = backtest(mvp_df, reg, years, predictors)

In [87]:
# Made it to 35:22 on Part 3
all_predictions[all_predictions["Rank"] <=10].sort_values("Difference", ascending=False).head(15)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
2500,Blake Griffin,0.004,0.138157,10.0,4.0,6.0,2011
10757,Dwight Howard,0.011,0.154385,7.0,2.0,5.0,2012
143,Luka Dončić,0.01,0.196454,8.0,3.0,5.0,2023
6709,LeBron James,0.001,0.136468,10.0,5.0,5.0,2022
7275,Tim Duncan,0.002,0.137486,10.0,5.0,5.0,2009
8513,Tim Duncan,0.026,0.194453,8.0,3.0,5.0,2006
1253,Russell Westbrook,0.008,0.144176,10.0,5.0,5.0,2019
11164,Russell Westbrook,0.007,0.098555,9.0,5.0,4.0,2013
4729,LeBron James,0.142,0.204179,5.0,1.0,4.0,2007
9194,Nikola Jokić,0.018,0.129722,9.0,5.0,4.0,2020


In [88]:
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.142618,eFG%
48,0.080804,W/L%
18,0.034384,DRB
45,0.027272,VORP
15,0.024229,FTA
16,0.020869,FT%
17,0.019185,ORB
41,0.018951,WS/48
23,0.018599,TOV
4,0.0082,FG


In [89]:
mvp_df.columns

Index(['Player', 'Pos', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP', 'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB',
       'PS/G', 'PA/G', 'SRS'],
      dtype='object')

In [90]:
# Trying to use ratios to normalize our data by year
mvp_normalized_ratios = mvp_df[["FG", "FGA", "FG%", '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year']].groupby(
    "Year", group_keys=False).apply(lambda x: x/x.mean())
mvp_normalized_ratios

Unnamed: 0,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,0.695196,0.686188,1.043082,0.000000,0.000000,1.175054,0.797491,0.823245,1.008452,0.987215,...,1.864515,1.554808,1.618509,0.540640,0.894701,0.467519,0.475967,0.733786,0.627089,1.0
1,0.562778,0.642389,0.891404,0.512850,0.702681,1.457066,0.531661,0.630571,0.893906,0.903478,...,0.559354,0.894015,0.795538,1.459729,0.745584,0.467519,0.793278,0.684867,0.514213,1.0
2,0.695196,0.905185,0.807397,1.794977,1.844538,1.471167,0.569637,0.718150,0.815296,0.881442,...,0.279677,0.621923,0.493782,1.513793,1.491168,0.000000,0.793278,0.929463,0.790132,1.0
3,0.364151,0.423393,0.907738,0.769276,0.878351,1.598073,0.303806,0.350317,0.925350,0.978401,...,0.559354,0.349832,0.411485,0.108128,0.298234,0.233759,0.317311,0.538110,0.401337,1.0
4,1.754543,1.795770,1.003413,2.820678,2.547219,1.724979,1.594983,1.646491,1.008452,1.040102,...,0.652580,1.321587,1.124727,1.189409,0.894701,0.467519,1.110589,1.076220,1.994143,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11511,0.801240,0.737635,1.099075,0.650951,0.686635,1.114811,0.850022,0.761391,1.156583,1.074210,...,0.822136,1.031222,0.981705,0.819562,0.479763,1.528302,0.819562,0.833211,0.735752,1.0
11512,0.096149,0.130171,0.649453,0.130190,0.366205,0.602503,0.042501,0.021150,2.080186,0.724940,...,0.000000,0.147317,0.112195,0.000000,0.000000,0.000000,0.000000,0.059515,0.071202,1.0
11513,1.378132,1.316172,1.067283,0.520761,0.640859,0.912774,1.657543,1.628530,1.056734,0.992985,...,1.409377,1.657320,1.598776,0.601012,1.119447,2.547170,0.819562,0.714181,1.281633,1.0
11514,0.480744,0.462830,1.042304,0.650951,0.595083,1.363750,0.425011,0.401845,1.069215,1.088425,...,0.822136,0.478781,0.560974,0.218550,0.319842,1.273585,0.273187,0.892726,0.474679,1.0


In [91]:
# trying it out on a copied dataframe adding all normalized stats
ratio_mvp_df = mvp_df.copy(deep=True)
ratio_mvp_df[["FG_n", "FGA_n", "FG%_n", '3P_n',
       '3PA_n', '3P%_n', '2P_n', '2PA_n', '2P%_n', 'eFG%_n', 'FT_n', 'FTA_n', 'FT%_n', 'ORB_n',
       'DRB_n', 'TRB_n', 'AST_n', 'STL_n', 'BLK_n', 'TOV_n', 'PF_n', 'PTS_n']] = mvp_normalized_ratios[["FG", "FGA", "FG%", '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']]
ratio_mvp_df

Unnamed: 0,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,...,FT%_n,ORB_n,DRB_n,TRB_n,AST_n,STL_n,BLK_n,TOV_n,PF_n,PTS_n
0,A.C. Green,PF,36,82,82,23.5,2.1,4.7,0.447,0.0,...,0.991873,1.864515,1.554808,1.618509,0.540640,0.894701,0.467519,0.475967,0.733786,0.627089
1,Brian Shaw,SG,33,74,2,16.9,1.7,4.4,0.382,0.2,...,1.083211,0.559354,0.894015,0.795538,1.459729,0.745584,0.467519,0.793278,0.684867,0.514213
2,Derek Fisher,PG,25,78,22,23.1,2.1,6.2,0.346,0.7,...,1.033260,0.279677,0.621923,0.493782,1.513793,1.491168,0.000000,0.793278,0.929463,0.790132
3,Devean George,SF,22,49,1,7.0,1.1,2.9,0.389,0.3,...,0.940495,0.559354,0.349832,0.411485,0.108128,0.298234,0.233759,0.317311,0.538110,0.401337
4,Glen Rice,SF,32,80,80,31.6,5.3,12.3,0.430,1.1,...,1.247333,0.652580,1.321587,1.124727,1.189409,0.894701,0.467519,1.110589,1.076220,1.994143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11511,Spencer Hawes,PF,28,54,1,14.8,2.5,5.1,0.484,0.5,...,1.177814,0.822136,1.031222,0.981705,0.819562,0.479763,1.528302,0.819562,0.833211,0.735752
11512,Steve Novak,PF,33,8,0,2.8,0.3,0.9,0.286,0.1,...,0.000000,0.000000,0.147317,0.112195,0.000000,0.000000,0.000000,0.000000,0.059515,0.071202
11513,Terrence Jones,PF,25,54,12,23.5,4.3,9.1,0.470,0.4,...,0.843682,1.409377,1.657320,1.598776,0.601012,1.119447,2.547170,0.819562,0.714181,1.281633
11514,Thon Maker,C,19,57,34,9.9,1.5,3.2,0.459,0.5,...,0.909116,0.822136,0.478781,0.560974,0.218550,0.319842,1.273585,0.273187,0.892726,0.474679


In [92]:
normalized_predictors = predictors.copy()
normalized_predictors += ["FG_n", "FGA_n", "FG%_n", '3P_n',
       '3PA_n', '3P%_n', '2P_n', '2PA_n', '2P%_n', 'eFG%_n', 'FT_n', 'FTA_n', 'FT%_n', 'ORB_n',
       'DRB_n', 'TRB_n', 'AST_n', 'STL_n', 'BLK_n', 'TOV_n', 'PF_n', 'PTS_n']

In [93]:
avg_mse_n, mse_ar_n, avg_apk_n, apk_ar_n, all_predictions_n = backtest(ratio_mvp_df, reg, years, normalized_predictors)


In [95]:
avg_mse_n, avg_mse, avg_apk_n, avg_apk

(999.821052631579, 198.50000000000006, 0.6033333333333333, 0.5857894736842105)

In [96]:
all_predictions_n[all_predictions_n["Predicted Rank"] <= 3].sort_values("Difference", ascending=False).head(15)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
8706,Blake Griffin,0.0,0.157521,16.0,3.0,13.0,2012
4178,Shaquille O'Neal,0.0,0.201785,12.0,2.0,10.0,2006
6631,Kevin Garnett,0.012,0.284721,11.0,2.0,9.0,2005
4194,Shaquille O'Neal,0.002,0.129286,12.0,3.0,9.0,2007
216,Kevin Love,0.02,0.190003,11.0,3.0,8.0,2014
143,Luka Dončić,0.01,0.189076,8.0,2.0,6.0,2023
6085,James Harden,0.007,0.208679,9.0,3.0,6.0,2016
10757,Dwight Howard,0.011,0.204515,7.0,1.0,6.0,2012
8513,Tim Duncan,0.026,0.171681,8.0,3.0,5.0,2006
327,James Harden,0.027,0.161624,8.0,3.0,5.0,2013


## Trying Random Forest

In [98]:
rf = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5)
avg_mse_rf, mse_ar_rf, avg_apk_rf, apk_ar_rf, all_predictions_rf = backtest(
    mvp_df, rf, years, predictors)

In [99]:
avg_mse_rf_n, mse_ar_rf_n, avg_apk_rf_n, apk_ar_rf_n, all_predictions_rf_n = backtest(
    ratio_mvp_df, rf, years, normalized_predictors)

In [100]:
avg_mse_rf, avg_mse_rf_n, avg_mse, avg_mse_n

(68.08947368421055, 67.60000000000001, 198.50000000000006, 999.821052631579)

In [101]:
avg_apk_rf, avg_apk_rf_n, avg_apk, avg_apk_n

(0.672982456140351, 0.6835087719298246, 0.5857894736842105, 0.6033333333333333)

In [104]:
all_predictions_rf_n[(all_predictions_rf_n["Rank"] <= 5) & (all_predictions_rf_n["Year"] == 2023)]

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
11253,Joel Embiid,0.915,0.550736,1.0,2.0,-1.0,2023
559,Nikola Jokić,0.674,0.721843,2.0,1.0,1.0,2023
51,Giannis Antetokounmpo,0.606,0.424492,3.0,3.0,0.0,2023
2271,Jayson Tatum,0.28,0.201485,4.0,6.0,-2.0,2023
1072,Shai Gilgeous-Alexander,0.046,0.238713,5.0,5.0,0.0,2023


In [136]:
all_predictions_rf_n[all_predictions_rf_n["Year"] == 2005].sort_values("Predicted Rank").head(40)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
6631,Kevin Garnett,0.012,0.498383,11.0,1.0,10.0,2005
4712,LeBron James,0.073,0.328997,6.0,2.0,4.0,2005
4363,Amar'e Stoudemire,0.032,0.231234,9.0,3.0,6.0,2005
4160,Shaquille O'Neal,0.813,0.228586,2.0,4.0,-2.0,2005
2813,Dirk Nowitzki,0.275,0.201972,3.0,5.0,-2.0,2005
916,Allen Iverson,0.189,0.147705,5.0,6.0,-1.0,2005
4373,Steve Nash,0.839,0.140996,1.0,7.0,-6.0,2005
8498,Tim Duncan,0.258,0.120847,4.0,8.0,-4.0,2005
6919,Jermaine O'Neal,0.0,0.0983,17.0,9.0,8.0,2005
8814,Tracy McGrady,0.035,0.081036,7.0,10.0,-3.0,2005
