### Now that we have a fully cleaned dataframe, we can start building out our model

In [21]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn import svm
import plotly.express as px
import joblib
import dill                            
filepath = 'rf_session_7-7.pkl'
dill.load_session(filepath)
import joblib
joblib_file = "joblib_RF_Model.pkl"  
newest_rf = joblib.load(joblib_file) # This is with voter fatigue as well which has the lowest MSE and the highest APK

In [7]:
# Work in progress
"""from lightgbm import LGBMClassifier
pip install lightgbm"""

Collecting lightgbm
  Using cached lightgbm-3.3.5.tar.gz (1.5 MB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: lightgbm
  Building wheel for lightgbm (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[97 lines of output][0m
  [31m   [0m INFO:root:running bdist_wheel
  [31m   [0m !!
  [31m   [0m 
  [31m   [0m         ********************************************************************************
  [31m   [0m         Please avoid running ``setup.py`` directly.
  [31m   [0m         Instead, use pypa/build, pypa/installer or other
  [31m   [0m         standards-based tools.
  [31m   [0m 
  [31m   [0m         See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
  [31m   [0m         **************************************************

Failed to build lightgbm
[31mERROR: Could not build wheels for lightgbm, which is required to install pyproject.toml-based projects[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [22]:
"""Imported functions from ml_metrics library"""

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [23]:
mvp_df = pd.read_csv("all_player_mvp_stats.csv")

In [24]:
# Dropping previous index as well as Tm since redundant with Team and
# Hornets have two different abbreviations
mvp_df = mvp_df.drop(["Unnamed: 0", "Tm"], axis=1)

In [25]:
years = list(range(1980, 2024))

In [26]:
# setting up everything for our variables
# Removed qualitative variables like Team as well as Year
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP', 'W', 'L', 'W/L%', 'GB',
       'PS/G', 'PA/G', 'SRS']

In [27]:
train = mvp_df[mvp_df["Year"] < 2023]
test = mvp_df[mvp_df["Year"] == 2023]

In [28]:
X_train = train[predictors]
y_train = train["Share"]

X_test = test[predictors]
y_test = test["Share"]

In [29]:
# Setting up first model (Ridge Regression)

reg = Ridge(alpha=.1)
reg.fit(X_train, y_train)
ridge_predictions = reg.predict(X_test)

In [30]:
# Converting to df to be cleaner
ridge_df = pd.DataFrame(ridge_predictions,
                                 columns=["MVP Share Predictions"],
                                 index=X_test.index)
ridge_df

Unnamed: 0,MVP Share Predictions
7312,0.018240
7313,0.029694
7314,0.029982
7315,0.223518
7316,-0.010886
...,...
18695,-0.017917
18696,0.005571
18697,-0.017361
18698,-0.021624


In [31]:
ridge_combination = pd.concat([test[["Player", "Share"]], ridge_df], axis = 1)
ridge_combination.sort_values("MVP Share Predictions", ascending=False).head(20)

Unnamed: 0,Player,Share,MVP Share Predictions
7315,Giannis Antetokounmpo,0.606,0.223518
7823,Nikola Jokić,0.674,0.223
7407,Luka Dončić,0.01,0.193656
18517,Joel Embiid,0.915,0.17444
10690,Domantas Sabonis,0.027,0.130161
9535,Jayson Tatum,0.28,0.129465
8336,Shai Gilgeous-Alexander,0.046,0.120248
18693,Julius Randle,0.0,0.114498
13991,LeBron James,0.0,0.110243
16768,Ja Morant,0.001,0.107225


In [32]:
# generalize this later for all types of models
# outputs two new columns: Rank and Predicted Rank

def convert_to_ranks(df, actual_share_col, predicted_share_col):
    ranked_df = df.copy(deep=True)
    ranked_df["Rank"] = df[actual_share_col].rank(method="min", ascending=False)
    ranked_df["Predicted Rank"] = df[predicted_share_col].rank(method="min", ascending=False)
    return ranked_df.sort_values("Rank")

In [33]:
# create an error metric function that is based on ranking since
# most players' MVP share is 0
# assumption is table looks like ridge_combination
# with cols: Player, Share, MVP Share Predictions
# outdated not using the convert_to_ranks function but not useful
def find_ap(df, actual_share_col, predicted_share_col, num):
    actual_ranking = df.sort_values(actual_share_col, ascending=False).head(num)
    actual_ranking["Rank"] = list(range(1, actual_ranking.shape[0] + 1))
    predicted_ranking = df.sort_values(predicted_share_col, ascending=False)
    predicted_ranking["Pred_Rank"] = list(range(1, predicted_ranking.shape[0] + 1))
    abs_diff = 0
    for index, row in actual_ranking.iterrows():
        player_name = row["Player"]
        predicted_player_row = predicted_ranking[
            predicted_ranking["Player"] == player_name]
        abs_diff += abs(row["Rank"] - predicted_player_row["Pred_Rank"].values[0])
    return abs_diff

In [34]:
"""
Finding mean square error

This will give you a measure of the overall discrepancy or difference in ranks between the two for the top 10 MVP
candidates.

Input is a ranked dataframe after applying the convert_to_ranks function.
"""

def get_mse_top_10(ranked_df, num=10):
    top_n_df = ranked_df.head(num)
    return mean_squared_error(top_n_df["Rank"].to_numpy(), top_n_df["Predicted Rank"].to_numpy(), squared=True)

In [35]:
"""
Determine the AP@K (Average Precision at K) for your predicted ranking. 

Interested in top n since those are the MVP finalists. AP@K will provide an assessment
of the precision and correctness of your predicted ranking for the top candidates.

Input is a ranked dataframe after applying the convert_to_ranks function.
"""

def get_apk_n(ranked_df, k):
    top_k_actual = ranked_df["Player"][:k]
    top_k_predicted = ranked_df.sort_values("Predicted Rank")["Player"][:k]
    return apk(top_k_actual.tolist(), top_k_predicted.tolist(), 5)

In [36]:
""" 
Combines both of our metrics together to provide a clean look at how our model does 
in these two important criteria. """

def evaluate_model(df, actual_share_col, predicted_share_col, k):
    combined_df = convert_to_ranks(df, actual_share_col, predicted_share_col)
    
    #return combined_df
    mse = get_mse_top_10(combined_df)
    apk = get_apk_n(combined_df, k)
    
    return [mse, apk]
    

In [37]:
def print_model_results(mse, apk, k):
    print("Mean Squared Error:", mse)
    print("Average Precision at", str(k) + ":", apk)

In [38]:
# Final output for evaluating how our model did on the two key metrics
naive_results = evaluate_model(ridge_combination, "Share", "MVP Share Predictions", 5)
print_model_results(naive_results[0], naive_results[1], 5)

Mean Squared Error: 16.5
Average Precision at 5: 0.55


## Backtesting

In [39]:
# Following along, we make a function to find the biggest differences each year
# inside code we run function of convert_to_ranks
def difference_in_rank(ranked_df):
    ranked_df["Difference"] = ranked_df["Rank"] - ranked_df["Predicted Rank"]
    return ranked_df

In [40]:
def model_prediction(df, year, model, predictors):
    train = df[df["Year"] < year]
    test = df[df["Year"] == year]
    
    X_train = train[predictors]
    y_train = train["Share"]
    X_test = test[predictors]
    y_test = test["Share"]
    
    model.fit(X_train, y_train)
    model_predictions = model.predict(X_test) # predict for testing set (the current year starting w 2005)

    model_df = pd.DataFrame(model_predictions,
                                     columns=["MVP Share Predictions"],
                                     index=X_test.index) # putting predictions into a dataframe

    model_combination = pd.concat([test[["Player", "Share"]],
                                          model_df], axis = 1) # concatenating predictions with the players and their MVP share
    return model_combination

In [41]:
"""
Creating a backtesting function to run all the code we did and put it in a loop
"""
def backtest(orig_df, model_bt, timeframe, predictors, normalizing=False, voteFatigue=True):
    """
    Looping through years (starting with 2005 because we need at least 5 years of data to come up with a model)
    Then making predictions for every year based off previous traine data
    The more years we can train on, the more data our algorithm can build off to make accurate predictions
    """

    all_predictions = [] # list of dataframes holding our predictions for every year
    mse_ar = []
    apk_ar = []
    
    df = orig_df.copy(deep=True)
    
    
    if normalizing: # if our data needs to be normalized (for certain models where scaling is important)
        predictors_plus_year = predictors.copy()
        predictors_plus_year.append("Year")
        df[predictors] = df[predictors_plus_year].groupby("Year").transform(lambda x: (x-np.mean(x, axis=0))/np.std(x, ddof=1))
    
    if voteFatigue:
        df = add_previous_mvps(df, years)
        predictors = add_previous_mvp_as_predictor(predictors)

    for year in timeframe[5:]:
        model_bt_combination = model_prediction(df, year, model_bt, predictors)

        combined_bt_df = convert_to_ranks(model_bt_combination, "Share", "MVP Share Predictions") # adding actual and predicted rank into list
        with_differences = difference_in_rank(combined_bt_df) # adding difference between actual and predicted rank
        with_differences["Year"] = year # adding year to add more information to overall dataframe
        
        all_predictions.append(with_differences) # appending predictions to outer list

        mse_ar.append(get_mse_top_10(combined_bt_df))
        apk_ar.append(get_apk_n(combined_bt_df, 5))
        
    return ((sum(mse_ar) / len(mse_ar)), mse_ar, (sum(apk_ar) / len(apk_ar)), apk_ar, pd.concat(all_predictions))

In [179]:
normalized_avg_mse, normalized_mse_ar, normalized_avg_apk, normalized_apk_ar, normalized_all_predictions = backtest(mvp_df, reg, years, predictors, normalizing=True, voteFatigue=False)

In [180]:
avg_mse, mse_ar, avg_apk, apk_ar, all_predictions = backtest(mvp_df, reg, years, predictors, normalizing=False, voteFatigue=False)

In [181]:
avg_mse_n_vf, mse_ar_n_vf, avg_apk_n_vf, apk_ar_n_vf, all_predictions_n_vf = backtest(mvp_df, reg, years, predictors, normalizing=True, voteFatigue=True)

In [44]:
# Made it to 35:22 on Part 3
normalized_all_predictions[normalized_all_predictions["Rank"] <=1].sort_values("Difference", ascending=True).head(15)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
11637,Steve Nash,0.839,0.051732,1.0,26.0,-25.0,2005
11655,Steve Nash,0.739,0.127229,1.0,11.0,-10.0,2006
13213,Kobe Bryant,0.873,0.114097,1.0,9.0,-8.0,2008
8117,Allen Iverson,0.904,0.149585,1.0,8.0,-7.0,2001
17262,Derrick Rose,0.977,0.137933,1.0,8.0,-7.0,2011
12546,Stephen Curry,0.922,0.125533,1.0,7.0,-6.0,2015
18517,Joel Embiid,0.915,0.148936,1.0,6.0,-5.0,2023
15168,Dirk Nowitzki,0.882,0.169486,1.0,5.0,-4.0,2007
6518,Charles Barkley,0.852,0.185663,1.0,4.0,-3.0,1993
8089,James Harden,0.955,0.168807,1.0,4.0,-3.0,2018


#### Looking at what are important factors according to our Ridge Model

In [45]:
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.170643,eFG%
48,0.067717,W/L%
53,0.062744,
45,0.024224,VORP
18,0.023973,DRB
16,0.019096,FT%
15,0.016825,FTA
10,0.0145,2P
5,0.013638,FGA
23,0.010311,TOV


## Trying Random Forest

In [101]:
rf_naive = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5)
rf_normalized = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5)
rf = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5)

In [47]:
def add_previous_mvps(df, years):
    mvp_with_previous_df = mvp_df.copy(deep=True)
    mvp_with_previous_df["Previous MVP's Won"] = 0
    for year in years:
        year_winner = mvp_with_previous_df[mvp_with_previous_df["Year"] == year].sort_values("Share", ascending=False).iloc[:1]["Player"].item()
        years_after_winning_for_player = mvp_with_previous_df[(mvp_with_previous_df["Player"] == year_winner) & (mvp_with_previous_df["Year"] > year)]
        years_after_index = years_after_winning_for_player.index.values.tolist()
        mvp_with_previous_df.loc[years_after_index, ["Previous MVP's Won"]] += 1
    return mvp_with_previous_df

In [48]:
def add_previous_mvp_as_predictor(predictors):
    predictors_with_previous = predictors.copy()
    predictors_with_previous.append("Previous MVP's Won")
    return predictors_with_previous

In [102]:
%%time
avg_mse_rf, mse_ar_rf, avg_apk_rf, apk_ar_rf, all_predictions_rf = backtest(
    mvp_df, rf_naive, years, predictors, normalizing=False, voteFatigue=False)

CPU times: user 12min 25s, sys: 2.92 s, total: 12min 28s
Wall time: 12min 37s


In [103]:
%%time
avg_mse_rf_n, mse_ar_rf_n, avg_apk_rf_n, apk_ar_rf_n, all_predictions_rf_n = backtest(
    mvp_df, rf_normalized, years, predictors, normalizing=True, voteFatigue=False)

CPU times: user 22min 32s, sys: 3.05 s, total: 22min 35s
Wall time: 22min 55s


In [104]:
%%time
avg_mse_rf_votefatigue, mse_ar_rf_votefatigue, avg_apk_rf_votefatigue, apk_ar_rf_votefatigue, all_predictions_rf_votefatigue = backtest(
    mvp_df, rf, years, predictors, normalizing=False,voteFatigue=True)

CPU times: user 12min 47s, sys: 5.74 s, total: 12min 53s
Wall time: 13min


In [113]:
avg_mse_rf, avg_mse_rf_n, avg_mse, avg_mse_rf_votefatigue

(62.338461538461544, 69.35384615384616, 261.3666666666667, 53.02564102564103)

In [115]:
avg_apk_rf, avg_apk_rf_n, avg_apk, avg_apk_rf_votefatigue

(0.6741025641025644, 0.6485470085470089, 0.6511111111111113, 0.674700854700855)

In [54]:
all_predictions_rf[(all_predictions_rf["Rank"] <= 5) & (all_predictions_rf["Year"] == 2023)]

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
18517,Joel Embiid,0.915,0.451259,1.0,2.0,-1.0,2023
7823,Nikola Jokić,0.674,0.621481,2.0,1.0,1.0,2023
7315,Giannis Antetokounmpo,0.606,0.40819,3.0,3.0,0.0,2023
9535,Jayson Tatum,0.28,0.188764,4.0,5.0,-1.0,2023
8336,Shai Gilgeous-Alexander,0.046,0.164957,5.0,6.0,-1.0,2023


In [55]:
all_predictions_rf[all_predictions_rf["Year"] == 2011].sort_values("Predicted Rank").head(15)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
17925,LeBron James,0.431,0.513816,3.0,1.0,2.0,2011
17262,Derrick Rose,0.977,0.390684,1.0,2.0,-1.0,2011
17917,Dwyane Wade,0.02,0.360189,7.0,3.0,4.0,2011
16715,Dwight Howard,0.531,0.215356,2.0,4.0,-2.0,2011
13231,Pau Gasol,0.0,0.199894,14.0,5.0,9.0,2011
13226,Kobe Bryant,0.354,0.163801,4.0,6.0,-2.0,2011
17540,Kevin Durant,0.157,0.143395,5.0,7.0,-2.0,2011
7873,Chris Paul,0.002,0.074084,12.0,8.0,4.0,2011
17546,Russell Westbrook,0.0,0.056025,14.0,9.0,5.0,2011
14483,Kevin Love,0.0,0.053422,14.0,10.0,4.0,2011


In [56]:
avg_mse_rf, avg_mse_rf_n, avg_mse_rf_votefatigue, avg_mse

(53.02564102564103, 53.02564102564103, 56.00512820512821, 212.60769230769233)

In [57]:
avg_apk_rf, avg_apk_rf_n, avg_apk_rf_votefatigue, avg_apk

(0.674700854700855, 0.674700854700855, 0.6614529914529916, 0.46316239316239316)

In [58]:
all_predictions_rf_votefatigue[(all_predictions_rf_votefatigue["Rank"] < 6) & (all_predictions_rf_votefatigue["Year"] > 2015)]

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
12059,Stephen Curry,1.0,0.855678,1.0,1.0,0.0,2016
12782,Kawhi Leonard,0.484,0.334518,2.0,4.0,-2.0,2016
17853,LeBron James,0.482,0.50793,3.0,2.0,1.0,2016
12869,Russell Westbrook,0.371,0.313306,4.0,5.0,-1.0,2016
12863,Kevin Durant,0.112,0.455406,5.0,3.0,2.0,2016
12881,Russell Westbrook,0.879,0.326622,1.0,6.0,-5.0,2017
16124,James Harden,0.746,0.38453,2.0,3.0,-1.0,2017
17461,Kawhi Leonard,0.495,0.513068,3.0,1.0,2.0,2017
13203,LeBron James,0.33,0.341059,4.0,5.0,-1.0,2017
9468,Isaiah Thomas,0.08,0.239619,5.0,7.0,-2.0,2017


## Linear Regression Model

In [59]:
linreg = LinearRegression()

In [60]:
linreg.fit(X_train_linreg, y_train_linreg)
linreg_predictions = linreg.predict(X_test_linreg) # predict for testing set (the current year starting w 2005)

linreg_df = pd.DataFrame(linreg_predictions,
                                     columns=["MVP Share Predictions"],
                                     index=X_test_linreg.index) # putting predictions into a dataframe

linreg_combination = pd.concat([test_linreg[["Player", "Share"]],
                                          linreg_df], axis = 1) # concatenating predictions with the players and their MVP share

combined_linreg_df = convert_to_ranks(linreg_combination, "Share", "MVP Share Predictions") # adding actual and predicted rank into list
with_differences_linreg = difference_in_rank(combined_linreg_df) # adding difference between actual and predicted rank
with_differences_linreg["Year"] = year # adding year to add more information to overall dataframe
with_differences_linreg

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
18517,Joel Embiid,0.915,0.191591,1.0,4.0,-3.0,2023
7823,Nikola Jokić,0.674,0.245383,2.0,1.0,1.0,2023
7315,Giannis Antetokounmpo,0.606,0.232926,3.0,2.0,1.0,2023
9535,Jayson Tatum,0.280,0.145225,4.0,5.0,-1.0,2023
8336,Shai Gilgeous-Alexander,0.046,0.142940,5.0,6.0,-1.0,2023
...,...,...,...,...,...,...,...
10559,Patrick Williams,0.000,-0.039880,14.0,522.0,-508.0,2023
10558,Patrick Beverley,0.000,-0.024746,14.0,486.0,-472.0,2023
10557,Nikola Vučević,0.000,0.062679,14.0,25.0,-11.0,2023
10694,Keon Ellis,0.000,0.022371,14.0,83.0,-69.0,2023


In [61]:
%%time
avg_mse_lr_votefatigue, mse_ar_lr_votefatigue, avg_apk_lr_votefatigue, apk_ar_lr_votefatigue, all_predictions_lr_votefatigue = backtest(
    mvp_with_previous_df, linreg, years, predictors_with_previous, normalizing=True, voteFatigue=True)

CPU times: user 8.82 s, sys: 7.09 s, total: 15.9 s
Wall time: 3.41 s


In [62]:
avg_mse_rf, avg_mse_rf_n, avg_mse_rf_votefatigue, avg_mse, avg_mse_lr_votefatigue

(53.02564102564103,
 53.02564102564103,
 56.00512820512821,
 212.60769230769233,
 320.9410256410257)

In [63]:
avg_apk_rf, avg_apk_rf_n, avg_apk_rf_votefatigue, avg_apk, avg_apk_lr_votefatigue

(0.674700854700855,
 0.674700854700855,
 0.6614529914529916,
 0.46316239316239316,
 0.4519658119658119)

## Gradient Boosting Regressor

In [64]:
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    tree_method="hist",
    objective='reg:logistic',
)

In [65]:
%%time
avg_mse_xgb_vf, mse_ar_xgb_vf, avg_apk_xgb_vf, apk_ar_xgb_vf, all_predictions_xgb_vf = backtest(
    mvp_df, xgb_model, years, predictors, normalizing=False, voteFatigue=True)

CPU times: user 28.1 s, sys: 14.6 s, total: 42.7 s
Wall time: 8.49 s


In [66]:
%%time
avg_mse_xgb, mse_ar_xgb, avg_apk_xgb, apk_ar_xgb, all_predictions_xgb = backtest(
    mvp_df, xgb_model, years, predictors, normalizing=False, voteFatigue=False)

CPU times: user 27.8 s, sys: 13.3 s, total: 41.1 s
Wall time: 7.11 s


In [67]:
avg_mse, avg_mse_rf, avg_mse_rf_n, avg_mse_rf_votefatigue, avg_mse_lr_votefatigue, avg_mse_xgb_vf, avg_mse_xgb

(212.60769230769233,
 53.02564102564103,
 53.02564102564103,
 56.00512820512821,
 320.9410256410257,
 59.73846153846156,
 56.56153846153846)

In [68]:
avg_apk, avg_apk_rf, avg_apk_rf_n, avg_apk_rf_votefatigue, avg_apk_lr_votefatigue, avg_apk_xgb_vf, avg_apk_xgb

(0.46316239316239316,
 0.674700854700855,
 0.674700854700855,
 0.6614529914529916,
 0.4519658119658119,
 0.6975213675213677,
 0.690940170940171)

In [69]:
all_predictions_xgb_vf.sort_values("MVP Share Predictions", ascending=False).head(10)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
12059,Stephen Curry,1.0,0.961022,1.0,1.0,0.0,2016
12022,LeBron James,0.969,0.936465,1.0,1.0,0.0,2009
2568,Larry Bird,0.347,0.909047,3.0,1.0,2.0,1987
4270,Michael Jordan,0.938,0.90767,1.0,1.0,0.0,1992
12039,LeBron James,0.98,0.892047,1.0,1.0,0.0,2010
17890,LeBron James,0.998,0.877953,1.0,1.0,0.0,2013
21,Magic Johnson,0.94,0.868811,1.0,2.0,-1.0,1987
561,Karl Malone,0.726,0.86739,2.0,1.0,1.0,1998
4283,Michael Jordan,0.577,0.851732,3.0,1.0,2.0,1993
5227,Michael Jordan,0.934,0.841087,1.0,2.0,-1.0,1998


In [70]:
all_predictions_rf_votefatigue.sort_values("MVP Share Predictions", ascending=False).head(10)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
12059,Stephen Curry,1.0,0.855678,1.0,1.0,0.0,2016
4270,Michael Jordan,0.938,0.842623,1.0,1.0,0.0,1992
5213,Michael Jordan,0.832,0.82526,2.0,1.0,1.0,1997
5199,Michael Jordan,0.986,0.824119,1.0,1.0,0.0,1996
12022,LeBron James,0.969,0.806791,1.0,1.0,0.0,2009
8089,James Harden,0.955,0.791238,1.0,1.0,0.0,2018
17890,LeBron James,0.998,0.789521,1.0,1.0,0.0,2013
12039,LeBron James,0.98,0.786356,1.0,1.0,0.0,2010
59,Magic Johnson,0.691,0.776632,1.0,1.0,0.0,1990
7275,Shaquille O'Neal,0.998,0.76787,1.0,1.0,0.0,2000


In [71]:
all_predictions_xgb.sort_values("MVP Share Predictions", ascending=False).head(10)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
12022,LeBron James,0.969,0.935641,1.0,1.0,0.0,2009
12059,Stephen Curry,1.0,0.93433,1.0,1.0,0.0,2016
17890,LeBron James,0.998,0.918661,1.0,1.0,0.0,2013
12039,LeBron James,0.98,0.913523,1.0,1.0,0.0,2010
4270,Michael Jordan,0.938,0.909596,1.0,1.0,0.0,1992
59,Magic Johnson,0.691,0.905253,1.0,1.0,0.0,1990
2568,Larry Bird,0.347,0.901237,3.0,1.0,2.0,1987
4283,Michael Jordan,0.577,0.860601,3.0,1.0,2.0,1993
21,Magic Johnson,0.94,0.836464,1.0,2.0,-1.0,1987
6518,Charles Barkley,0.852,0.834436,1.0,2.0,-1.0,1993


### Our metric tested how accurate the top 5 players are. I want to look at the top 3 candidates and their accuracy as well as how often the models got the actual MVP right

In [94]:
get_apk_n(all_predictions[all_predictions["Year"] == 2023].sort_values("Rank"), 3)
all_predictions[all_predictions["Year"] == 2018].sort_values("Predicted Rank")

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
13804,LeBron James,0.731,0.457199,2.0,1.0,1.0,2018
12900,Russell Westbrook,0.075,0.216127,5.0,2.0,3.0,2018
12554,Kevin Durant,0.065,0.170247,7.0,3.0,4.0,2018
8089,James Harden,0.955,0.168807,1.0,4.0,-3.0,2018
12562,Stephen Curry,0.005,0.165612,10.0,5.0,5.0,2018
...,...,...,...,...,...,...,...
16946,Marshall Plumlee,0.000,-0.063945,14.0,536.0,-522.0,2018
12548,Chris Boucher,0.000,-0.066427,14.0,537.0,-523.0,2018
13386,MarShon Brooks,0.000,-0.070822,14.0,538.0,-524.0,2018
10452,Andre Ingram,0.000,-0.086954,14.0,539.0,-525.0,2018


In [215]:
prediction_arrays = [all_predictions, normalized_all_predictions, all_predictions_n_vf, all_predictions_rf, 
                     all_predictions_rf_n, all_predictions_rf_votefatigue, all_predictions_lr_votefatigue,
                     all_predictions_xgb_vf, all_predictions_xgb]

In [105]:
def get_apk_from_prediction_array(prediction_array, k):
    new_ar = []
    for year in years:
        value = get_apk_n(prediction_array[prediction_array["Year"] == year].sort_values("Rank"), k)
        new_ar.append(value)
    return new_ar

In [182]:
apk_3_all_predictions = get_apk_from_prediction_array(all_predictions, 3)
apk_3_normalized_all_predictions = get_apk_from_prediction_array(normalized_all_predictions, 3)
apk_3_all_predictions_n_vf = get_apk_from_prediction_array(all_predictions_n_vf, 3)
apk_3_all_predictions_rf = get_apk_from_prediction_array(all_predictions_rf, 3)
apk_3_all_predictions_rf_n = get_apk_from_prediction_array(all_predictions_rf_n, 3)
apk_3_all_predictions_rf_votefatigue = get_apk_from_prediction_array(all_predictions_rf_votefatigue, 3)
apk_3_all_predictions_lr_votefatigue = get_apk_from_prediction_array(all_predictions_lr_votefatigue, 3)
apk_3_all_predictions_xgb_vf = get_apk_from_prediction_array(all_predictions_xgb_vf, 3)
apk_3_all_predictions_xgb = get_apk_from_prediction_array(all_predictions_xgb, 3)

In [183]:
apk_3_prediction_arrays = [apk_3_all_predictions, apk_3_normalized_all_predictions, apk_3_all_predictions_n_vf, apk_3_all_predictions_rf, 
                     apk_3_all_predictions_rf_n, apk_3_all_predictions_rf_votefatigue, apk_3_all_predictions_lr_votefatigue,
                     apk_3_all_predictions_xgb_vf, apk_3_all_predictions_xgb]

In [208]:
apk_3_average = []
for array in apk_3_prediction_arrays:
    apk_3_average.append(sum(array) / len(array))

#### Now accuracy

In [185]:
def check_accuracy(prediction_array):
    acc_sum = 0
    for year in years[5:]:
        actual_winner = prediction_array[prediction_array["Year"] == year].iloc[:1]["Player"].item()
        predicted_winner = prediction_array[prediction_array["Year"] == year].sort_values("Predicted Rank").iloc[:1]["Player"].item()
        acc_sum += actual_winner == predicted_winner
    return acc_sum / len(years[5:])

In [216]:
accuracy_array = []
for array in prediction_arrays:
    accuracy_array.append(check_accuracy(array))

## Assessing performance of all models

In [201]:
mse_arrays = [mse_ar, normalized_mse_ar, mse_ar_n_vf, mse_ar_rf, mse_ar_rf_n, mse_ar_rf_votefatigue,
              mse_ar_lr_votefatigue, mse_ar_xgb_vf, mse_ar_xgb]
apk_arrays = [apk_ar, normalized_apk_ar, apk_ar_n_vf, apk_ar_rf, apk_ar_rf_n, apk_ar_rf_votefatigue, 
              apk_ar_lr_votefatigue, apk_ar_xgb_vf, apk_ar_xgb]

In [202]:
def checking_recent_years(list_of_model_arrays, year_to_start):
    year_index = years.index(year_to_start) - 4
    results = []
    results.append(year_to_start)
    for model in list_of_model_arrays:
        results.append(sum(model[:year_index]) / len(model[:year_index]))
    return results

In [203]:
# Converting this data to dataframe so I can graph
every_year_mse_results = []
every_year_apk_results = []
for year in years[5:]:
    up_to_year_accuracy_mse = checking_recent_years(mse_arrays, year)
    up_to_year_accuracy_apk = checking_recent_years(apk_arrays, year)
    every_year_mse_results.append(up_to_year_accuracy_mse)
    every_year_apk_results.append(up_to_year_accuracy_apk)

In [204]:
graphing_results_mse_df = pd.DataFrame(data=every_year_mse_results, columns = ["Year", "Ridge Regression", "Normalized Ridge Regression", "Normalized Ridge w/Voter Fatigue",  "Random Forest Naive",
                                              "Random Forest Normalized", "Random Forest w/Voter Fatigue",
                                              "Linear Regression w/Voter Fatigue", "XGradient Boost w/Voter Fatigue", "XGradient Boost"])

graphing_results_apk_df = pd.DataFrame(data=every_year_apk_results, columns = ["Year", "Ridge Regression", "Normalized Ridge Regression", "Normalized Ridge w/Voter Fatigue", "Random Forest Naive",
                                              "Random Forest Normalized", "Random Forest w/Voter Fatigue",
                                              "Linear Regression w/Voter Fatigue", "XGradient Boost w/Voter Fatigue", "XGradient Boost"])

In [205]:
important_columns = ["Year", "Ridge Regression", "Linear Regression", "Random Forest w/Voter Fatigue", "XGradient Boost w/Voter Fatigue", "XGradient Boost"]

In [206]:
mse_fig = px.line(graphing_results_mse_df, x='Year', y=graphing_results_mse_df.columns[1:8])
mse_fig

In [207]:
apk_fig = px.line(graphing_results_apk_df, x='Year', y=graphing_results_apk_df.columns[1:8])
apk_fig

In [227]:
all_results_data = {"Average Mean Square Error": [avg_mse, normalized_avg_mse, avg_mse_n_vf, avg_mse_rf, avg_mse_rf_n, avg_mse_rf_votefatigue, avg_mse_lr_votefatigue, avg_mse_xgb_vf, avg_mse_xgb],
                    "Mean Average Precision at k = 5": [avg_apk, normalized_avg_apk, avg_apk_n_vf, avg_apk_rf, avg_apk_rf_n, avg_apk_rf_votefatigue, avg_apk_lr_votefatigue, avg_apk_xgb_vf, avg_apk_xgb], 
                    "Mean Average Precision at k = 3": apk_3_average, "Accuracy": accuracy_array}

In [232]:
ensemble_model_df = pd.DataFrame(data=all_results_data, index=["Ridge Regression", "Normalized Ridge Regression", "Normalized Ridge w/Voter Fatigue",  "Random Forest Naive",
                                              "Random Forest Normalized", "Random Forest w/Voter Fatigue",
                                              "Linear Regression w/Voter Fatigue", "XGradient Boost w/Voter Fatigue", "XGradient Boost"]) 

In [233]:
ensemble_model_df

Unnamed: 0,Average Mean Square Error,Mean Average Precision at k = 5,Mean Average Precision at k = 3,Accuracy
Ridge Regression,236.84359,0.609487,0.539141,0.487179
Normalized Ridge Regression,261.366667,0.651111,0.546717,0.487179
Normalized Ridge w/Voter Fatigue,212.607692,0.463162,0.392677,0.282051
Random Forest Naive,62.338462,0.674103,0.598485,0.641026
Random Forest Normalized,69.353846,0.648547,0.584596,0.666667
Random Forest w/Voter Fatigue,53.025641,0.674701,0.598485,0.641026
Linear Regression w/Voter Fatigue,320.941026,0.451966,0.376263,0.25641
XGradient Boost w/Voter Fatigue,59.738462,0.697521,0.584596,0.717949
XGradient Boost,56.561538,0.69094,0.589646,0.692308


## Neural Networks

##  Saving models so we can work on them and look at them after

In [79]:
# Save RF Model to file in the current working directory

joblib_file_rf = "joblib_RF_Model.pkl"  
joblib.dump(rf, joblib_file)
# This is with voter fatigue as well which has the lowest MSE and the highest APK

['joblib_RF_Normalized_Model.pkl']

In [80]:
joblib_file_rf_naive = "joblib_RF_Naive_Model.pkl"  
joblib.dump(rf_naive, joblib_file_rf_naive)

['joblib_RF_Naive_Model.pkl']

In [81]:
joblib_file_rf_normalized = "joblib_RF_Normalized_Model.pkl"  
joblib.dump(rf_naive, joblib_file_rf_normalized)

['joblib_RF_Normalized_Model.pkl']

In [82]:
filepath = 'rf_session_7-10.pkl'
dill.dump_session(filepath) # Save the session

In [83]:
rf

In [381]:
mvp_df[(mvp_df["STL"] > 2.3) & (mvp_df["Team"] == "Indiana Pacers")]

Unnamed: 0,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Pts Won,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
5238,Dudley Bradley,SG,22,82,31,24.7,3.4,7.4,0.452,0.0,0.1,0.4,3.3,7.4,0.452,0.453,1.7,2.1,0.782,0.8,1.9,2.7,3.1,2.6,0.6,2.0,2.4,8.4,1980,13.5,0.502,0.008,0.286,3.4,8.1,5.7,15.9,4.8,1.3,19.5,16.3,0.2,2.9,3.1,0.073,-2.0,3.0,1.0,1.5,0.0,0.0,0.0,Indiana Pacers,37,45,0.451,13.0,111.2,111.9,-0.54
6829,Micheal Williams,PG,25,79,68,34.8,5.1,10.4,0.49,0.1,0.4,0.242,5.0,10.0,0.501,0.495,4.7,5.4,0.871,0.9,2.6,3.6,8.2,2.9,0.3,3.0,3.3,15.0,1992,18.4,0.587,0.04,0.518,3.1,8.0,5.7,32.2,4.1,0.5,19.2,19.1,5.8,3.2,9.0,0.156,1.9,1.8,3.7,3.9,0.0,0.0,0.0,Indiana Pacers,40,42,0.488,27.0,112.2,110.3,1.85
9190,Metta World Peace,SF,22,55,50,29.9,4.9,11.6,0.423,1.0,3.1,0.312,3.9,8.5,0.464,0.465,2.5,3.7,0.667,1.3,3.6,4.9,2.3,2.6,0.7,2.1,3.9,13.2,2002,15.8,0.501,0.267,0.321,5.1,13.8,9.5,14.1,4.5,1.7,14.0,23.5,0.0,2.7,2.7,0.079,-0.4,3.0,2.6,1.9,0.0,0.0,0.0,Indiana Pacers,42,40,0.512,8.0,96.8,96.5,-0.07
9727,Victor Oladipo,SG,25,75,75,34.0,8.5,17.9,0.477,2.1,5.8,0.371,6.4,12.1,0.528,0.537,3.9,4.9,0.799,0.6,4.6,5.2,4.3,2.4,0.8,2.9,2.3,23.1,2018,23.1,0.577,0.323,0.274,2.1,15.1,8.6,21.2,3.5,2.0,12.7,30.1,4.3,4.0,8.2,0.155,4.1,1.7,5.8,5.0,2.0,1010.0,0.002,Indiana Pacers,48,34,0.585,2.0,105.6,104.2,1.18
