### Now that we have a fully cleaned dataframe, we can start building out our model

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [47]:
"""Imported functions from ml_metrics library"""

def apk(actual, predicted, k=10):
    """
    Average precision at k for one ranked list of predictions.

    Only the first ``k`` entries of ``predicted`` are scored. An entry counts
    as a hit when it is in ``actual`` and has not already appeared earlier in
    the truncated list; each hit adds (hits so far) / (1-based position) to
    the running total, which is finally divided by ``min(len(actual), k)``.

    Parameters
    ----------
    actual : list
             Items that should be predicted (order is ignored)
    predicted : list
                Ranked predictions (order matters)
    k : int, optional
        Maximum number of predictions that are scored

    Returns
    -------
    score : double
            Average precision at k; 0.0 when ``actual`` is empty

    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    total = 0.0

    for position, item in enumerate(top_k, start=1):
        # A repeated prediction is only credited the first time it appears.
        if item in actual and item not in top_k[:position - 1]:
            hits += 1.0
            total += hits / position

    if not actual:
        return 0.0

    return total / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean of the average precision at k over several queries.

    ``actual`` and ``predicted`` are walked in parallel; each pair is scored
    with ``apk`` and the scores are averaged with ``np.mean``.

    Parameters
    ----------
    actual : list
             One list of relevant items per query
             (order inside each list is ignored)
    predicted : list
                One ranked list of predictions per query
                (order inside each list matters)
    k : int, optional
        Maximum number of predictions scored per query

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    per_query_scores = [
        apk(truth, ranked, k) for truth, ranked in zip(actual, predicted)
    ]
    return np.mean(per_query_scores)

In [48]:
# Load the MVP stats CSV; the relative path means the file is expected in the
# kernel's current working directory.
mvp_df = pd.read_csv("all_player_mvp_stats.csv")

In [49]:
# Dropping previous index as well as Tm since redundant with Team and
# Hornets have two different abbreviations.
# errors="ignore" keeps this cell re-runnable: mvp_df is reassigned here, so a
# second run would otherwise raise KeyError on the already-dropped columns.
mvp_df = mvp_df.drop(columns=["Unnamed: 0", "Tm"], errors="ignore")

In [50]:
# Seasons 1980 through 2023 inclusive (range's upper bound is exclusive).
years = list(range(1980, 2024))

In [None]:
# Disabled draft: the code below is wrapped in a string literal so it never
# runs. The enabled version of this loop is the cell that builds
# mvp_with_previous_df in the "Trying Random Forest" section.
"""
Function in progress

mvp_df["Previous MVP's Won"] = 0
for year in years:
    year_winner = mvp_df[mvp_df["Year"] == year].sort_values("Share", ascending=False).iloc[:1]["Player"].item()
    years_after_winning_for_player = mvp_df[(mvp_df["Player"] == year_winner) & (mvp_df["Year"] > year)]
    years_after_index = years_after_winning_for_player.index.values.tolist()
    mvp_df.loc[years_after_index, ["Previous MVP's Won"]] += 1
    
"""

In [None]:
# This is in progress as well: kept inside a string so it does not run, because
# the only code that adds "Previous MVP's Won" to mvp_df is the disabled draft
# in the previous cell.
"""jordan = mvp_df[mvp_df["Player"] == "Michael Jordan"]
jordan[["Player", "Year", "Previous MVP's Won"]]"""

In [51]:
# setting up everything for our variables
# Removed qualitative variables like Team as well as Year
# "Share" is intentionally absent: it is the target the models are fit on.
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP', 'W', 'L', 'W/L%', 'GB',
       'PS/G', 'PA/G', 'SRS']

In [52]:
# Hold out the 2023 season as the test set; every earlier season is training data.
train = mvp_df[mvp_df["Year"] < 2023]
test = mvp_df[mvp_df["Year"] == 2023]

In [53]:
# Feature matrices use the predictor columns; the target is the "Share" column.
X_train = train[predictors]
y_train = train["Share"]

X_test = test[predictors]
y_test = test["Share"]

In [54]:
# Setting up first model (Ridge Regression)
# Fit on the pre-2023 rows, then predict Share for the held-out 2023 rows.

reg = Ridge(alpha=.1)
reg.fit(X_train, y_train)
ridge_predictions = reg.predict(X_test)

In [55]:
# Converting to df to be cleaner
# Reusing X_test's index lets the predictions be aligned with the test rows
# when they are concatenated in the next cell.
ridge_df = pd.DataFrame(ridge_predictions,
                                 columns=["MVP Share Predictions"],
                                 index=X_test.index)
ridge_df

Unnamed: 0,MVP Share Predictions
7312,0.018240
7313,0.029694
7314,0.029982
7315,0.223518
7316,-0.010886
...,...
18695,-0.017917
18696,0.005571
18697,-0.017361
18698,-0.021624


In [56]:
# Put player name and actual Share side by side with the prediction (joined on
# the shared row index), then show the 20 highest predicted shares.
ridge_combination = pd.concat([test[["Player", "Share"]], ridge_df], axis = 1)
ridge_combination.sort_values("MVP Share Predictions", ascending=False).head(20)

Unnamed: 0,Player,Share,MVP Share Predictions
7315,Giannis Antetokounmpo,0.606,0.223518
7823,Nikola Jokić,0.674,0.223
7407,Luka Dončić,0.01,0.193656
18517,Joel Embiid,0.915,0.17444
10690,Domantas Sabonis,0.027,0.130161
9535,Jayson Tatum,0.28,0.129465
8336,Shai Gilgeous-Alexander,0.046,0.120248
18693,Julius Randle,0.0,0.114498
13991,LeBron James,0.0,0.110243
16768,Ja Morant,0.001,0.107225


In [57]:
# generalize this later for all types of models
# outputs two new columns: Rank and Predicted Rank

def convert_to_ranks(df, actual_share_col, predicted_share_col):
    """
    Return a copy of `df` with "Rank" and "Predicted Rank" columns, sorted by "Rank".

    Both ranks are computed in descending order (largest share gets rank 1)
    with method="min", so tied values all receive the lowest rank of their
    group. The input frame is not modified.
    """
    rank_columns = {
        "Rank": df[actual_share_col].rank(method="min", ascending=False),
        "Predicted Rank": df[predicted_share_col].rank(method="min", ascending=False),
    }
    return df.assign(**rank_columns).sort_values("Rank")

In [58]:
# create an error metric function that is based on ranking since
# most players' MVP share is 0
# assumption is table looks like ridge_combination
# with cols: Player, Share, MVP Share Predictions
# Outdated: does not use the convert_to_ranks function; kept for reference but not useful
def find_ap(df, actual_share_col, predicted_share_col, num):
    """
    Sum of absolute rank errors for the top `num` players by actual share.

    Players are ordered once by `actual_share_col` and once by
    `predicted_share_col` (both descending) and numbered 1, 2, 3, ... by
    position. For each of the first `num` players in the actual ordering the
    absolute difference between the two positions is added up.

    Parameters
    ----------
    df : DataFrame
        Must contain a "Player" column plus the two share columns.
        It is not modified.
    actual_share_col : str
        Column holding the true share.
    predicted_share_col : str
        Column holding the predicted share.
    num : int
        How many of the top actual players to score.

    Returns
    -------
    int
        Total absolute difference in position; 0 when no rows are scored.
    """
    # 1-based position of every player when ordered by predicted share.
    players_by_prediction = (
        df.sort_values(predicted_share_col, ascending=False)["Player"].to_numpy()
    )
    predicted_rank = pd.Series(
        np.arange(1, len(players_by_prediction) + 1), index=players_by_prediction
    )
    # If a name appears more than once, use its first (best) predicted position.
    predicted_rank = predicted_rank[~predicted_rank.index.duplicated(keep="first")]

    top_actual_players = (
        df.sort_values(actual_share_col, ascending=False)["Player"].head(num).to_numpy()
    )
    actual_rank = np.arange(1, len(top_actual_players) + 1)

    # One vectorised lookup instead of filtering the whole frame per player.
    rank_gap = np.abs(actual_rank - predicted_rank.reindex(top_actual_players).to_numpy())
    return int(rank_gap.sum())

In [59]:
"""
Finding mean square error

This will give you a measure of the overall discrepancy or difference in ranks between the two for the top 10 MVP
candidates.

Input is a ranked dataframe after applying the convert_to_ranks function.
"""

def get_mse_top_10(ranked_df, num=10):
    """
    Mean squared error between "Rank" and "Predicted Rank" over the first `num` rows.

    Parameters
    ----------
    ranked_df : DataFrame
        Output of convert_to_ranks, which is sorted by "Rank", so the first
        `num` rows are the top `num` players by actual share.
    num : int, optional
        Number of leading rows to score (default 10).

    Returns
    -------
    float
        The mean squared difference between the two rank columns.
    """
    top_n_df = ranked_df.head(num)
    # `squared=True` was only restating the default; the keyword is deprecated
    # in newer scikit-learn releases, so it is no longer passed.
    return mean_squared_error(top_n_df["Rank"].to_numpy(), top_n_df["Predicted Rank"].to_numpy())

In [60]:
"""
Determine the AP@K (Average Precision at K) for your predicted ranking. 

Interested in top n since those are the MVP finalists. AP@K will provide an assessment
of the precision and correctness of your predicted ranking for the top candidates.

Input is a ranked dataframe after applying the convert_to_ranks function.
"""

def get_apk_n(ranked_df, k):
    """
    Average precision at k of the predicted top-k players against the actual top-k.

    Parameters
    ----------
    ranked_df : DataFrame
        Output of convert_to_ranks: has "Player" and "Predicted Rank" columns
        and is sorted by "Rank", so its first k rows are the actual top k.
    k : int
        Number of players taken from each ordering and the cutoff given to apk.

    Returns
    -------
    float
        The AP@k score returned by apk.
    """
    top_k_actual = ranked_df["Player"][:k]
    top_k_predicted = ranked_df.sort_values("Predicted Rank")["Player"][:k]
    # Pass k through as the cutoff; it was previously fixed at 5 regardless of
    # the argument, which gave a wrong denominator for any other k.
    return apk(top_k_actual.tolist(), top_k_predicted.tolist(), k)

In [61]:
""" 
Combines both of our metrics together to provide a clean look at how our model does 
in these two important criteria. """

def evaluate_model(df, actual_share_col, predicted_share_col, k):
    """
    Rank the rows of `df` and score the ranking with both metrics.

    Returns a two-element list ``[mse, ap_at_k]``: the rank MSE from
    get_mse_top_10 (called with its default number of rows) and the average
    precision from get_apk_n for the given `k`.
    """
    ranked = convert_to_ranks(df, actual_share_col, predicted_share_col)
    rank_mse = get_mse_top_10(ranked)
    ap_at_k = get_apk_n(ranked, k)
    return [rank_mse, ap_at_k]
    

In [62]:
def print_model_results(mse, apk, k):
    """Print the rank MSE and the average precision at `k` on two labelled lines."""
    print(f"Mean Squared Error: {mse}")
    print(f"Average Precision at {k}: {apk}")

In [63]:
# Final output for evaluating how our model did on the two key metrics
# evaluate_model returns [mse, apk]; the same k (5) is passed to the printer so
# the label matches the cutoff that was evaluated.
naive_results = evaluate_model(ridge_combination, "Share", "MVP Share Predictions", 5)
print_model_results(naive_results[0], naive_results[1], 5)

Mean Squared Error: 16.5
Average Precision at 5: 0.55


## Backtesting

In [64]:
# Following along, we make a function to find the biggest differences each year
# inside code we run function of convert_to_ranks
def difference_in_rank(ranked_df):
    """
    Add a "Difference" column equal to "Rank" minus "Predicted Rank".

    The column is written onto `ranked_df` itself (the argument is modified in
    place) and that same object is returned.
    """
    actual_rank, predicted_rank = ranked_df["Rank"], ranked_df["Predicted Rank"]
    ranked_df["Difference"] = actual_rank.sub(predicted_rank)
    return ranked_df

In [65]:
"""
Creating a backtesting function to run all the code we did and put it in a loop
"""
def backtest(orig_df, model_bt, timeframe, predictors, normalizing=False, min_train_years=5, k=5):
    """
    Walk-forward backtest of `model_bt` over the years in `timeframe`.

    The first `min_train_years` entries of `timeframe` are never predicted;
    they only serve as training history. For every later year the model is fit
    on all rows with an earlier "Year" and used to predict "Share" for that
    year, so the training window grows by one season per iteration.

    Parameters
    ----------
    orig_df : DataFrame
        Must contain "Year", "Player", "Share" and every column in `predictors`.
        It is deep-copied, so the caller's frame is not modified.
    model_bt : estimator
        Object with `fit(X, y)` and `predict(X)`. The same object is refit on
        every iteration, so after the call it holds the last fit performed.
    timeframe : list
        Years to walk through, in ascending order.
    predictors : list of str
        Feature column names.
    normalizing : bool, optional
        If True, each predictor is z-scored within its own year before fitting.
    min_train_years : int, optional
        Number of leading entries of `timeframe` reserved for training
        (default 5, the value that was previously hard-coded).
    k : int, optional
        Cutoff handed to get_apk_n (default 5, the value previously hard-coded).

    Returns
    -------
    tuple
        (mean MSE, list of per-year MSE, mean AP@k, list of per-year AP@k,
        DataFrame of all yearly predictions with "Rank", "Predicted Rank",
        "Difference" and "Year" columns).

    Raises
    ------
    ValueError
        If `timeframe` has no entries left after the training warm-up.
    """
    eval_years = timeframe[min_train_years:]
    if len(eval_years) == 0:
        # Fail early with a clear message instead of a ZeroDivisionError at the end.
        raise ValueError(
            "timeframe must contain more than min_train_years entries to backtest"
        )

    all_predictions = [] # list of dataframes holding our predictions for every year
    mse_ar = []
    apk_ar = []

    df = orig_df.copy(deep=True)

    if normalizing: # if our data needs to be normalized (for certain models where scaling is important)
        predictors_plus_year = list(predictors) + ["Year"]
        # NOTE(review): a predictor that is constant within a year has std 0 and
        # would become NaN here; confirm the data never hits that case.
        df[predictors] = df[predictors_plus_year].groupby("Year").transform(lambda x: (x-np.mean(x, axis=0))/np.std(x, ddof=1))

    for year in eval_years:
        train_bt = df[df["Year"] < year]
        test_bt = df[df["Year"] == year]

        X_train_bt = train_bt[predictors]
        y_train_bt = train_bt["Share"]
        X_test_bt = test_bt[predictors]

        model_bt.fit(X_train_bt, y_train_bt)
        model_bt_predictions = model_bt.predict(X_test_bt) # predict for the held-out year

        model_bt_df = pd.DataFrame(model_bt_predictions,
                                   columns=["MVP Share Predictions"],
                                   index=X_test_bt.index) # putting predictions into a dataframe

        model_bt_combination = pd.concat([test_bt[["Player", "Share"]],
                                          model_bt_df], axis = 1) # concatenating predictions with the players and their MVP share

        combined_bt_df = convert_to_ranks(model_bt_combination, "Share", "MVP Share Predictions") # adding actual and predicted rank
        with_differences = difference_in_rank(combined_bt_df) # adding difference between actual and predicted rank
        with_differences["Year"] = year # adding year to add more information to overall dataframe

        all_predictions.append(with_differences) # appending predictions to outer list

        mse_ar.append(get_mse_top_10(combined_bt_df))
        apk_ar.append(get_apk_n(combined_bt_df, k))

    return ((sum(mse_ar) / len(mse_ar)), mse_ar, (sum(apk_ar) / len(apk_ar)), apk_ar, pd.concat(all_predictions))

In [67]:
# Ridge backtest with per-year normalization of the predictors.
# Note: backtest refits the shared `reg` object on every iteration.
normalized_avg_mse, normalized_mse_ar, normalized_avg_apk, normalized_apk_ar, normalized_all_predictions = backtest(mvp_df, reg, years, predictors, normalizing=True)

In [68]:
# Ridge backtest on the raw (un-normalized) predictors; this leaves `reg`
# holding the fit from this call's final iteration.
avg_mse, mse_ar, avg_apk, apk_ar, all_predictions = backtest(mvp_df, reg, years, predictors)

In [73]:
# Made it to 35:22 on Part 3
# Actual winners only (Rank <= 1), most negative Difference first: since
# Difference = Rank - Predicted Rank, these are the winners the normalized
# ridge model placed furthest down its predicted ordering.
normalized_all_predictions[normalized_all_predictions["Rank"] <=1].sort_values("Difference", ascending=True).head(15)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
11655,Steve Nash,0.739,0.06387,1.0,28.0,-27.0,2006
11637,Steve Nash,0.839,0.063994,1.0,26.0,-25.0,2005
13213,Kobe Bryant,0.873,0.134348,1.0,8.0,-7.0,2008
8117,Allen Iverson,0.904,0.158878,1.0,7.0,-6.0,2001
12546,Stephen Curry,0.922,0.155176,1.0,6.0,-5.0,2015
18517,Joel Embiid,0.915,0.191927,1.0,4.0,-3.0,2023
6518,Charles Barkley,0.852,0.209451,1.0,4.0,-3.0,1993
12059,Stephen Curry,1.0,0.187479,1.0,3.0,-2.0,2016
17262,Derrick Rose,0.977,0.172064,1.0,3.0,-2.0,2011
21,Magic Johnson,0.94,0.226794,1.0,3.0,-2.0,1987


In [74]:
# Pair each coefficient of `reg` (column 0) with its predictor name (column 1),
# largest coefficient first. `reg` holds whichever fit ran most recently.
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.17585,eFG%
48,0.063229,W/L%
18,0.026699,DRB
45,0.026132,VORP
16,0.020692,FT%
15,0.01909,FTA
41,0.018811,WS/48
10,0.015288,2P
5,0.014575,FGA
23,0.011459,TOV


## Trying Random Forest

In [75]:
# random_state is fixed so repeated runs build the same forest.
rf = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5)

In [82]:
# Build a copy of mvp_df with a "Previous MVP's Won" counter per row.
mvp_with_previous_df = mvp_df.copy(deep=True)
mvp_with_previous_df["Previous MVP's Won"] = 0
for year in years:
    # Winner = the player on the row with the highest Share that year.
    # .item() raises if a year in `years` has no rows in the frame.
    year_winner = mvp_with_previous_df[mvp_with_previous_df["Year"] == year].sort_values("Share", ascending=False).iloc[:1]["Player"].item()
    # Only strictly later seasons are credited, so a row's counter reflects
    # awards won before that row's own season.
    # NOTE(review): players are matched by name only - confirm names are unique.
    years_after_winning_for_player = mvp_with_previous_df[(mvp_with_previous_df["Player"] == year_winner) & (mvp_with_previous_df["Year"] > year)]
    years_after_index = years_after_winning_for_player.index.values.tolist()
    mvp_with_previous_df.loc[years_after_index, ["Previous MVP's Won"]] += 1

In [None]:
# Copy first so the base `predictors` list is left unchanged, then add the new feature.
predictors_with_previous = predictors.copy()
predictors_with_previous.append("Previous MVP's Won")
predictors_with_previous

In [83]:
%%time
# Random forest backtest on the raw predictors (refits the shared `rf` object).
avg_mse_rf, mse_ar_rf, avg_apk_rf, apk_ar_rf, all_predictions_rf = backtest(
    mvp_df, rf, years, predictors)

CPU times: user 13min 15s, sys: 6.05 s, total: 13min 21s
Wall time: 13min 45s


In [84]:
%%time
# Random forest backtest with per-year normalized predictors (same `rf` object).
avg_mse_rf_n, mse_ar_rf_n, avg_apk_rf_n, apk_ar_rf_n, all_predictions_rf_n = backtest(
    mvp_df, rf, years, predictors, normalizing=True)

CPU times: user 22min 30s, sys: 7.77 s, total: 22min 37s
Wall time: 23min 1s


In [121]:
%%time
# Random forest backtest on the frame that carries the extra
# "Previous MVP's Won" predictor (same `rf` object, refit again).
avg_mse_rf_votefatigue, mse_ar_rf_votefatigue, avg_apk_rf_votefatigue, apk_ar_rf_votefatigue, all_predictions_rf_votefatigue = backtest(
    mvp_with_previous_df, rf, years, predictors_with_previous)

CPU times: user 13min 33s, sys: 5.34 s, total: 13min 39s
Wall time: 13min 59s


In [30]:
# Average rank MSE: random forest, random forest (normalized), ridge.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours and its values differ from the later comparison cell - the output
# looks like it is from an earlier run; re-run to confirm.
avg_mse_rf, avg_mse_rf_n, avg_mse

(68.08947368421055, 78.3, 198.50000000000006)

In [31]:
# Average AP@5: random forest, random forest (normalized), ridge.
# NOTE(review): execution count is out of sequence here too - the displayed
# values may be stale; re-run to confirm.
avg_apk_rf, avg_apk_rf_n, avg_apk

(0.672982456140351, 0.6380701754385965, 0.5857894736842105)

In [89]:
# Random forest predictions for the actual top five of 2023.
all_predictions_rf[(all_predictions_rf["Rank"] <= 5) & (all_predictions_rf["Year"] == 2023)]

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
18517,Joel Embiid,0.915,0.463297,1.0,2.0,-1.0,2023
7823,Nikola Jokić,0.674,0.633653,2.0,1.0,1.0,2023
7315,Giannis Antetokounmpo,0.606,0.412487,3.0,3.0,0.0,2023
9535,Jayson Tatum,0.28,0.196441,4.0,5.0,-1.0,2023
8336,Shai Gilgeous-Alexander,0.046,0.163479,5.0,6.0,-1.0,2023


In [115]:
# The 15 players the random forest ranked highest for 2011.
all_predictions_rf[all_predictions_rf["Year"] == 2011].sort_values("Predicted Rank").head(15)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
17925,LeBron James,0.431,0.500233,3.0,1.0,2.0,2011
17262,Derrick Rose,0.977,0.376813,1.0,2.0,-1.0,2011
17917,Dwyane Wade,0.02,0.364968,7.0,3.0,4.0,2011
16715,Dwight Howard,0.531,0.240136,2.0,4.0,-2.0,2011
13231,Pau Gasol,0.0,0.199064,14.0,5.0,9.0,2011
13226,Kobe Bryant,0.354,0.158925,4.0,6.0,-2.0,2011
17540,Kevin Durant,0.157,0.134221,5.0,7.0,-2.0,2011
7873,Chris Paul,0.002,0.071215,12.0,8.0,4.0,2011
14483,Kevin Love,0.0,0.0598,14.0,9.0,5.0,2011
17546,Russell Westbrook,0.0,0.052946,14.0,10.0,4.0,2011


In [122]:
# Average rank MSE (lower is better): random forest, random forest normalized,
# random forest with "Previous MVP's Won", ridge.
avg_mse_rf, avg_mse_rf_n, avg_mse_rf_votefatigue, avg_mse

(62.338461538461544, 69.35384615384616, 53.02564102564103, 236.84358974358975)

In [123]:
# Average AP@5 (higher is better), same model order as the MSE comparison:
# random forest, random forest normalized, random forest with
# "Previous MVP's Won", ridge.
avg_apk_rf, avg_apk_rf_n, avg_apk_rf_votefatigue, avg_apk

(0.6741025641025644, 0.6485470085470089, 0.674700854700855, 0.6094871794871798)

# Saving models so we can work on them and look at them after

In [126]:
# Import Joblib Module from Scikit Learn

import joblib

In [128]:
# Save RF Model to file in the current working directory
# `rf` is the single estimator object that every random forest backtest call
# refit, so what is written here is the state left by the most recent fit.

joblib_file = "joblib_RF_Model.pkl"  
joblib.dump(rf, joblib_file)

# This is with voter fatigue as well which has the lowest MSE and the highest APK

['joblib_RF_Model.pkl']

## Installing dill to try to save session

In [129]:
# Use the explicit %pip magic so the package is installed into the environment
# of the running kernel and the cell does not depend on automagic being enabled.
%pip install dill

Note: you may need to restart the kernel to use updated packages.


In [130]:
import dill                            
# Relative path: the session file is written to the current working directory.
filepath = 'rf_session_7-7.pkl'
dill.dump_session(filepath) # Save the session

In [None]:
# Restore the session written by dill.dump_session above. The filename must be
# passed explicitly: with no argument dill reads its own default temp-file
# path rather than 'rf_session_7-7.pkl'.
# Only load session files you created yourself - unpickling can run arbitrary code.
import dill
dill.load_session('rf_session_7-7.pkl')