### Now that we have a fully cleaned dataframe, we can start building out our model

In [768]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
from sklearn import svm
import plotly.express as px
import joblib
import dill 
import pdb
#filepath = 'rf_session_7-7.pkl'
#dill.load_session(filepath)
import joblib
#joblib_file = "joblib_RF_Model.pkl"  
#newest_rf = joblib.load(joblib_file) # This is with voter fatigue as well which has the lowest MSE and the highest APK

In [262]:
# Neural Network import
import tensorflow
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [263]:
"""Imported functions from ml_metrics library"""

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [264]:
mvp_df = pd.read_csv("all_player_mvp_stats.csv")

In [265]:
# Dropping previous index as well as Tm since redundant with Team and
# Hornets have two different abbreviations
mvp_df = mvp_df.drop(["Unnamed: 0", "Tm"], axis=1)

In [266]:
years = list(range(1980, 2024))

In [267]:
# setting up everything for our variables
# Removed qualitative variables like Team as well as Year
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP', 'W', 'L', 'W/L%', 'GB',
       'PS/G', 'PA/G', 'SRS']

In [268]:
train = mvp_df[mvp_df["Year"] < 2023]
test = mvp_df[mvp_df["Year"] == 2023]

In [269]:
X_train = train[predictors]
y_train = train["Share"]

X_test = test[predictors]
y_test = test["Share"]

In [270]:
# Setting up first model (Ridge Regression)

reg = Ridge(alpha=.1)
reg.fit(X_train, y_train)
ridge_predictions = reg.predict(X_test)

In [271]:
# Converting to df to be cleaner
ridge_df = pd.DataFrame(ridge_predictions,
                                 columns=["MVP Share Predictions"],
                                 index=X_test.index)
ridge_df

Unnamed: 0,MVP Share Predictions
7312,0.018240
7313,0.029694
7314,0.029982
7315,0.223518
7316,-0.010886
...,...
18695,-0.017917
18696,0.005571
18697,-0.017361
18698,-0.021624


In [272]:
ridge_combination = pd.concat([test[["Player", "Share"]], ridge_df], axis = 1)
ridge_combination.sort_values("MVP Share Predictions", ascending=False).head(20)

Unnamed: 0,Player,Share,MVP Share Predictions
7315,Giannis Antetokounmpo,0.606,0.223518
7823,Nikola Jokić,0.674,0.223
7407,Luka Dončić,0.01,0.193656
18517,Joel Embiid,0.915,0.17444
10690,Domantas Sabonis,0.027,0.130161
9535,Jayson Tatum,0.28,0.129465
8336,Shai Gilgeous-Alexander,0.046,0.120248
18693,Julius Randle,0.0,0.114498
13991,LeBron James,0.0,0.110243
16768,Ja Morant,0.001,0.107225


In [273]:
# generalize this later for all types of models
# outputs two new columns: Rank and Predicted Rank

def convert_to_ranks(df, actual_share_col, predicted_share_col):
    ranked_df = df.copy(deep=True)
    ranked_df["Rank"] = df[actual_share_col].rank(method="min", ascending=False)
    ranked_df["Predicted Rank"] = df[predicted_share_col].rank(method="min", ascending=False)
    return ranked_df.sort_values("Rank")

In [274]:
# create an error metric function that is based on ranking since
# most players' MVP share is 0
# assumption is table looks like ridge_combination
# with cols: Player, Share, MVP Share Predictions
# outdated not using the convert_to_ranks function but not useful
def find_ap(df, actual_share_col, predicted_share_col, num):
    actual_ranking = df.sort_values(actual_share_col, ascending=False).head(num)
    actual_ranking["Rank"] = list(range(1, actual_ranking.shape[0] + 1))
    predicted_ranking = df.sort_values(predicted_share_col, ascending=False)
    predicted_ranking["Pred_Rank"] = list(range(1, predicted_ranking.shape[0] + 1))
    abs_diff = 0
    for index, row in actual_ranking.iterrows():
        player_name = row["Player"]
        predicted_player_row = predicted_ranking[
            predicted_ranking["Player"] == player_name]
        abs_diff += abs(row["Rank"] - predicted_player_row["Pred_Rank"].values[0])
    return abs_diff

In [275]:
"""
Finding mean square error

This will give you a measure of the overall discrepancy or difference in ranks between the two for the top 10 MVP
candidates.

Input is a ranked dataframe after applying the convert_to_ranks function.
"""

def get_mse_top_n(ranked_df, num=10):
    top_n_df = ranked_df.head(num)
    return mean_squared_error(top_n_df["Rank"].to_numpy(), top_n_df["Predicted Rank"].to_numpy(), squared=True)

In [276]:
"""
Determine the AP@K (Average Precision at K) for your predicted ranking. 

Interested in top n since those are the MVP finalists. AP@K will provide an assessment
of the precision and correctness of your predicted ranking for the top candidates.

Input is a ranked dataframe after applying the convert_to_ranks function.
"""

def get_apk_n(ranked_df, k):
    top_k_actual = ranked_df["Player"][:k]
    top_k_predicted = ranked_df.sort_values("Predicted Rank")["Player"][:k]
    return apk(top_k_actual.tolist(), top_k_predicted.tolist(), 5)

In [277]:
""" 
Combines both of our metrics together to provide a clean look at how our model does 
in these two important criteria. """

def evaluate_model(df, actual_share_col, predicted_share_col, k):
    combined_df = convert_to_ranks(df, actual_share_col, predicted_share_col)
    
    #return combined_df
    mse = get_mse_top_10(combined_df)
    apk = get_apk_n(combined_df, k)
    
    return [mse, apk]
    

In [278]:
def print_model_results(mse, apk, k):
    print("Mean Squared Error:", mse)
    print("Average Precision at", str(k) + ":", apk)

In [279]:
# Final output for evaluating how our model did on the two key metrics
naive_results = evaluate_model(ridge_combination, "Share", "MVP Share Predictions", 5)
print_model_results(naive_results[0], naive_results[1], 5)

Mean Squared Error: 16.5
Average Precision at 5: 0.55


## Backtesting

In [280]:
# Following along, we make a function to find the biggest differences each year
# inside code we run function of convert_to_ranks
def difference_in_rank(ranked_df):
    ranked_df["Difference"] = ranked_df["Rank"] - ranked_df["Predicted Rank"]
    return ranked_df

In [281]:
def model_prediction(df, year, model, predictors):
    train = df[df["Year"] < year]
    test = df[df["Year"] == year]
    
    X_train = train[predictors]
    y_train = train["Share"]
    X_test = test[predictors]
    y_test = test["Share"]
    
    model.fit(X_train, y_train)
    model_predictions = model.predict(X_test) # predict for testing set (the current year starting w 2005)

    model_df = pd.DataFrame(model_predictions,
                                     columns=["MVP Share Predictions"],
                                     index=X_test.index) # putting predictions into a dataframe

    model_combination = pd.concat([test[["Player", "Share"]],
                                          model_df], axis = 1) # concatenating predictions with the players and their MVP share
    return model_combination

In [303]:
"""
Creating a backtesting function to run all the code we did and put it in a loop
"""
def backtest(orig_df, model_bt, timeframe, predictors, normalizing=False, voteFatigue=True):
    """
    Looping through years (starting with 2005 because we need at least 5 years of data to come up with a model)
    Then making predictions for every year based off previous traine data
    The more years we can train on, the more data our algorithm can build off to make accurate predictions
    """

    all_predictions = [] # list of dataframes holding our predictions for every year
    mse_ar = []
    apk_ar = []
    
    df = orig_df.copy(deep=True)
    
    
    if normalizing: # if our data needs to be normalized (for certain models where scaling is important)
        predictors_plus_year = predictors.copy()
        predictors_plus_year.append("Year")
        df[predictors] = df[predictors_plus_year].groupby("Year").transform(lambda x: (x-np.mean(x, axis=0))/np.std(x, ddof=1))
    
    if voteFatigue:
        df = add_previous_mvps(df, years)
        predictors = add_previous_mvp_as_predictor(predictors)

    for year in timeframe[5:]:
        model_bt_combination = model_prediction(df, year, model_bt, predictors)

        combined_bt_df = convert_to_ranks(model_bt_combination, "Share", "MVP Share Predictions") # adding actual and predicted rank into list
        with_differences = difference_in_rank(combined_bt_df) # adding difference between actual and predicted rank
        with_differences["Year"] = year # adding year to add more information to overall dataframe
        
        all_predictions.append(with_differences) # appending predictions to outer list

        mse_ar.append(get_mse_top_n(combined_bt_df))
        apk_ar.append(get_apk_n(combined_bt_df, 5))
        
    return ((sum(mse_ar) / len(mse_ar)), mse_ar, (sum(apk_ar) / len(apk_ar)), apk_ar, pd.concat(all_predictions))

In [283]:
normalized_avg_mse, normalized_mse_ar, normalized_avg_apk, normalized_apk_ar, normalized_all_predictions = backtest(mvp_df, reg, years, predictors, normalizing=True, voteFatigue=False)

In [284]:
avg_mse, mse_ar, avg_apk, apk_ar, all_predictions = backtest(mvp_df, reg, years, predictors, normalizing=False, voteFatigue=False)

In [285]:
avg_mse_n_vf, mse_ar_n_vf, avg_apk_n_vf, apk_ar_n_vf, all_predictions_n_vf = backtest(mvp_df, reg, years, predictors, normalizing=True, voteFatigue=True)

In [44]:
# Made it to 35:22 on Part 3
normalized_all_predictions[normalized_all_predictions["Rank"] <=1].sort_values("Difference", ascending=True).head(15)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
11637,Steve Nash,0.839,0.051732,1.0,26.0,-25.0,2005
11655,Steve Nash,0.739,0.127229,1.0,11.0,-10.0,2006
13213,Kobe Bryant,0.873,0.114097,1.0,9.0,-8.0,2008
8117,Allen Iverson,0.904,0.149585,1.0,8.0,-7.0,2001
17262,Derrick Rose,0.977,0.137933,1.0,8.0,-7.0,2011
12546,Stephen Curry,0.922,0.125533,1.0,7.0,-6.0,2015
18517,Joel Embiid,0.915,0.148936,1.0,6.0,-5.0,2023
15168,Dirk Nowitzki,0.882,0.169486,1.0,5.0,-4.0,2007
6518,Charles Barkley,0.852,0.185663,1.0,4.0,-3.0,1993
8089,James Harden,0.955,0.168807,1.0,4.0,-3.0,2018


#### Looking at what are important factors according to our Ridge Model

In [45]:
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.170643,eFG%
48,0.067717,W/L%
53,0.062744,
45,0.024224,VORP
18,0.023973,DRB
16,0.019096,FT%
15,0.016825,FTA
10,0.0145,2P
5,0.013638,FGA
23,0.010311,TOV


## Trying Random Forest

In [248]:
rf_naive = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5)
rf_normalized = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5)
rf = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5)

In [249]:
def add_previous_mvps(df, years):
    mvp_with_previous_df = mvp_df.copy(deep=True)
    mvp_with_previous_df["Previous MVP's Won"] = 0
    for year in years:
        year_winner = mvp_with_previous_df[mvp_with_previous_df["Year"] == year].sort_values("Share", ascending=False).iloc[:1]["Player"].item()
        years_after_winning_for_player = mvp_with_previous_df[(mvp_with_previous_df["Player"] == year_winner) & (mvp_with_previous_df["Year"] > year)]
        years_after_index = years_after_winning_for_player.index.values.tolist()
        mvp_with_previous_df.loc[years_after_index, ["Previous MVP's Won"]] += 1
    return mvp_with_previous_df

In [250]:
def add_previous_mvp_as_predictor(predictors):
    predictors_with_previous = predictors.copy()
    predictors_with_previous.append("Previous MVP's Won")
    return predictors_with_previous

In [255]:
%%time
avg_mse_rf, mse_ar_rf, avg_apk_rf, apk_ar_rf, all_predictions_rf = backtest(
    mvp_df, rf_naive, years, predictors, normalizing=False, voteFatigue=False)

CPU times: user 12min 22s, sys: 3.77 s, total: 12min 26s
Wall time: 12min 30s


In [256]:
%%time
avg_mse_rf_n, mse_ar_rf_n, avg_apk_rf_n, apk_ar_rf_n, all_predictions_rf_n = backtest(
    mvp_df, rf_normalized, years, predictors, normalizing=True, voteFatigue=False)

CPU times: user 21min 11s, sys: 7.03 s, total: 21min 18s
Wall time: 21min 23s


In [257]:
%%time
avg_mse_rf_votefatigue, mse_ar_rf_votefatigue, avg_apk_rf_votefatigue, apk_ar_rf_votefatigue, all_predictions_rf_votefatigue = backtest(
    mvp_df, rf, years, predictors, normalizing=False,voteFatigue=True)

CPU times: user 12min 47s, sys: 5.93 s, total: 12min 53s
Wall time: 13min 11s


## Linear Regression Model

In [288]:
linreg = LinearRegression()

In [292]:
%%time
avg_mse_lr_votefatigue, mse_ar_lr_votefatigue, avg_apk_lr_votefatigue, apk_ar_lr_votefatigue, all_predictions_lr_votefatigue = backtest(
    mvp_df, linreg, years, predictors, normalizing=True, voteFatigue=True)

CPU times: user 6.51 s, sys: 207 ms, total: 6.72 s
Wall time: 1.44 s


## Gradient Boosting Regressor

In [293]:
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    tree_method="hist",
    objective='reg:logistic',
)

In [294]:
%%time
avg_mse_xgb_vf, mse_ar_xgb_vf, avg_apk_xgb_vf, apk_ar_xgb_vf, all_predictions_xgb_vf = backtest(
    mvp_df, xgb_model, years, predictors, normalizing=False, voteFatigue=True)

CPU times: user 27.5 s, sys: 12.2 s, total: 39.7 s
Wall time: 6.37 s


In [295]:
%%time
avg_mse_xgb, mse_ar_xgb, avg_apk_xgb, apk_ar_xgb, all_predictions_xgb = backtest(
    mvp_df, xgb_model, years, predictors, normalizing=False, voteFatigue=False)

CPU times: user 26.6 s, sys: 11 s, total: 37.6 s
Wall time: 5.42 s


## Neural Networks

In [296]:
dnn = MLPRegressor(
    solver='lbfgs',
    hidden_layer_sizes=100,
    max_iter=1000,
    random_state=42,
    activation='logistic',
    learning_rate ='adaptive')

In [297]:
%%time
avg_mse_nn, mse_ar_nn, avg_apk_nn, apk_ar_nn, all_predictions_nn = backtest(
    mvp_df, dnn, years, predictors, normalizing=True, voteFatigue=False)

CPU times: user 17min 22s, sys: 31.1 s, total: 17min 53s
Wall time: 2min 24s


In [298]:
%%time
avg_mse_nn_vf, mse_ar_nn_vf, avg_apk_nn_vf, apk_ar_nn_vf, all_predictions_nn_vf = backtest(
    mvp_df, dnn, years, predictors, normalizing=True,voteFatigue=True)

CPU times: user 28min 32s, sys: 47.2 s, total: 29min 19s
Wall time: 3min 58s


### Our metric tested how accurate the top 5 players are. I want to look at the top 3 candidates and their accuracy as well as how often the models got the actual MVP right

In [94]:
get_apk_n(all_predictions[all_predictions["Year"] == 2023].sort_values("Rank"), 3)
all_predictions[all_predictions["Year"] == 2018].sort_values("Predicted Rank")

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
13804,LeBron James,0.731,0.457199,2.0,1.0,1.0,2018
12900,Russell Westbrook,0.075,0.216127,5.0,2.0,3.0,2018
12554,Kevin Durant,0.065,0.170247,7.0,3.0,4.0,2018
8089,James Harden,0.955,0.168807,1.0,4.0,-3.0,2018
12562,Stephen Curry,0.005,0.165612,10.0,5.0,5.0,2018
...,...,...,...,...,...,...,...
16946,Marshall Plumlee,0.000,-0.063945,14.0,536.0,-522.0,2018
12548,Chris Boucher,0.000,-0.066427,14.0,537.0,-523.0,2018
13386,MarShon Brooks,0.000,-0.070822,14.0,538.0,-524.0,2018
10452,Andre Ingram,0.000,-0.086954,14.0,539.0,-525.0,2018


In [679]:
prediction_arrays = [all_predictions, normalized_all_predictions, all_predictions_n_vf, all_predictions_rf, 
                     all_predictions_rf_n, all_predictions_rf_votefatigue, all_predictions_lr_votefatigue,
                     all_predictions_xgb_vf, all_predictions_xgb, all_predictions_nn, all_predictions_nn_vf, all_predictions_en]

In [680]:
quant_years = list(range(1985, 2024))
past_5_years = list(range(2019,2024))
past_10_years = list(range(2014, 2024))
since_2000 = list(range(2000, 2024))

In [681]:
def check_accuracy(ranked_df):
    actual_winner = ranked_df.iloc[:1]["Player"].item()
    predicted_winner = ranked_df.sort_values("Predicted Rank").iloc[:1]["Player"].item()
    return actual_winner == predicted_winner

In [682]:
## Need to fix this bc getting wrong results

def get_metric_from_prediction_array(metric, prediction_list, num, timeframe):
    new_ar = []
    for model in prediction_list:
        model_ar = []
        for year in timeframe:
            if (metric == get_apk_n):
                value = get_apk_n(model[model["Year"] == year].sort_values("Rank"), num)
                model_ar.append(value)
            elif (metric == get_mse_top_n):
                value = get_mse_top_n(model[model["Year"] == year].sort_values("Rank"), num)
                model_ar.append(value)
            elif (metric == check_accuracy):
                value = check_accuracy(model[model["Year"] == year].sort_values("Rank"))
                model_ar.append(value)
        average = sum(model_ar) / len(model_ar)
        new_ar.append(average)
    return new_ar

In [683]:
testing_ar = []
for year in quant_years:
    test_val = get_apk_n(all_predictions[all_predictions["Year"] == year].sort_values("Rank"), 5)
    testing_ar.append(test_val)

sum(testing_ar) / len(testing_ar)

0.6094871794871798

In [684]:
apk_3_all_years = get_metric_from_prediction_array(get_apk_n, prediction_arrays, 3, quant_years)
apk_3_since_2000 = get_metric_from_prediction_array(get_apk_n, prediction_arrays, 3, since_2000)
apk_3_past_5_years = get_metric_from_prediction_array(get_apk_n, prediction_arrays, 3, past_5_years)
apk_3_past_10_years = get_metric_from_prediction_array(get_apk_n, prediction_arrays, 3, past_10_years)

#### Now accuracy

In [685]:
accuracy_all_years = get_metric_from_prediction_array(check_accuracy, prediction_arrays,0, quant_years)
accuracy_since_2000 = get_metric_from_prediction_array(check_accuracy, prediction_arrays,0, since_2000)
accuracy_past_5_years = get_metric_from_prediction_array(check_accuracy, prediction_arrays,0, past_5_years)
accuracy_past_10_years = get_metric_from_prediction_array(check_accuracy, prediction_arrays,0, past_10_years)

In [686]:
mse_top_5 = get_metric_from_prediction_array(get_mse_top_n, prediction_arrays,5, quant_years)
mse_top_5_past_10_years = get_metric_from_prediction_array(get_mse_top_n, prediction_arrays,5, past_10_years)

## Assessing performance of all models

In [687]:
mse_arrays = [mse_ar, normalized_mse_ar, mse_ar_n_vf, mse_ar_rf, mse_ar_rf_n, mse_ar_rf_votefatigue,
              mse_ar_lr_votefatigue, mse_ar_xgb_vf, mse_ar_xgb, mse_ar_nn, mse_ar_nn_vf, mse_ar_en]
apk_arrays = [apk_ar, normalized_apk_ar, apk_ar_n_vf, apk_ar_rf, apk_ar_rf_n, apk_ar_rf_votefatigue, 
              apk_ar_lr_votefatigue, apk_ar_xgb_vf, apk_ar_xgb, apk_ar_nn, apk_ar_nn_vf, apk_ar_en]

In [688]:
def checking_recent_years(list_of_model_arrays, year_to_start):
    year_index = years.index(year_to_start) - 4
    results = []
    results.append(year_to_start)
    for model in list_of_model_arrays:
        results.append(sum(model[:year_index]) / len(model[:year_index]))
    return results

In [689]:
# Converting this data to dataframe so I can graph
every_year_mse_results = []
every_year_apk_results = []
for year in years[5:]:
    up_to_year_accuracy_mse = checking_recent_years(mse_arrays, year)
    up_to_year_accuracy_apk = checking_recent_years(apk_arrays, year)
    every_year_mse_results.append(up_to_year_accuracy_mse)
    every_year_apk_results.append(up_to_year_accuracy_apk)

In [690]:
model_cols_with_year = ["Year", "Ridge Regression", "Normalized Ridge Regression", "Normalized Ridge w/Voter Fatigue",
              "Random Forest Naive","Random Forest Normalized", "Random Forest w/Voter Fatigue", 
              "Linear Regression w/Voter Fatigue", "XGradient Boost w/Voter Fatigue", "XGradient Boost",
              "Neural Network", "Neural Network w/Voter Fatigue", "Ensemble Model"]
model_cols_without_year = ["Ridge Regression", "Normalized Ridge Regression", "Normalized Ridge w/Voter Fatigue",
              "Random Forest Naive","Random Forest Normalized", "Random Forest w/Voter Fatigue", 
              "Linear Regression w/Voter Fatigue", "XGradient Boost w/Voter Fatigue", "XGradient Boost",
              "Neural Network", "Neural Network w/Voter Fatigue", "Ensemble Model"]

In [691]:
graphing_results_mse_df = pd.DataFrame(data=every_year_mse_results, columns = model_cols_with_year)

graphing_results_apk_df = pd.DataFrame(data=every_year_apk_results, columns = model_cols_with_year)

In [692]:
important_columns = ["Year", "Ridge Regression", "Linear Regression", "Random Forest w/Voter Fatigue", "XGradient Boost w/Voter Fatigue", "XGradient Boost"]

In [693]:
mse_fig = px.line(graphing_results_mse_df, x='Year', y=graphing_results_mse_df.columns[1:8])
mse_fig

In [694]:
apk_fig = px.line(graphing_results_apk_df, x='Year', y=graphing_results_apk_df.columns[1:8])
apk_fig

In [695]:
all_results_data = {"Average Mean Square Error Top 10": [avg_mse, normalized_avg_mse, avg_mse_n_vf, avg_mse_rf, avg_mse_rf_n, avg_mse_rf_votefatigue, avg_mse_lr_votefatigue, avg_mse_xgb_vf, avg_mse_xgb, avg_mse_nn, avg_mse_nn_vf, avg_mse_en],
                    "Mean Square Error Top 5": mse_top_5,
                    "Mean Square Error Top 5 Past 10 Years": mse_top_5_past_10_years,
                    "Mean Average Precision at k = 5": [avg_apk, normalized_avg_apk, avg_apk_n_vf, avg_apk_rf, avg_apk_rf_n, avg_apk_rf_votefatigue, avg_apk_lr_votefatigue, avg_apk_xgb_vf, avg_apk_xgb, avg_apk_nn, avg_apk_nn_vf, avg_apk_en], 
                    "Mean Average Precision at k = 3": apk_3_all_years, "MAP (k=3) since 2000": apk_3_since_2000,
                    "MAP (k=3) past 5 years":apk_3_past_5_years, "MAP (k=3) past 10 years":apk_3_past_10_years,
                    "Overall Accuracy": accuracy_all_years, "Accuracy since 2000": accuracy_since_2000,
                    "Accuracy past 5 years": accuracy_past_5_years, "Accuracy past 10 years": accuracy_past_10_years}

In [696]:
ensemble_model_df = pd.DataFrame(data=all_results_data, index=model_cols_without_year) 

In [697]:
ensemble_model_df

Unnamed: 0,Average Mean Square Error Top 10,Mean Square Error Top 5,Mean Square Error Top 5 Past 10 Years,Mean Average Precision at k = 5,Mean Average Precision at k = 3,MAP (k=3) since 2000,MAP (k=3) past 5 years,MAP (k=3) past 10 years,Overall Accuracy,Accuracy since 2000,Accuracy past 5 years,Accuracy past 10 years
Ridge Regression,236.84359,43.794872,20.1,0.609487,0.608262,0.553241,0.666667,0.594444,0.487179,0.5,0.6,0.5
Normalized Ridge Regression,261.366667,50.671795,16.64,0.651111,0.616809,0.574074,0.666667,0.622222,0.487179,0.5,0.6,0.5
Normalized Ridge w/Voter Fatigue,212.607692,103.492308,23.06,0.463162,0.44302,0.356481,0.455556,0.455556,0.282051,0.166667,0.0,0.0
Random Forest Naive,62.338462,9.887179,10.62,0.674103,0.675214,0.62963,0.8,0.733333,0.641026,0.625,0.4,0.6
Random Forest Normalized,69.353846,13.507692,10.7,0.648547,0.659544,0.618056,0.8,0.733333,0.666667,0.666667,0.8,0.8
Random Forest w/Voter Fatigue,53.025641,10.871795,10.74,0.674701,0.675214,0.634259,0.8,0.733333,0.641026,0.625,0.4,0.6
Linear Regression w/Voter Fatigue,352.812821,152.010256,23.06,0.468718,0.450142,0.358796,0.455556,0.455556,0.282051,0.166667,0.0,0.0
XGradient Boost w/Voter Fatigue,59.738462,10.825641,6.24,0.697521,0.659544,0.615741,0.866667,0.722222,0.717949,0.708333,0.6,0.8
XGradient Boost,56.561538,10.14359,9.58,0.69094,0.665242,0.631944,0.866667,0.711111,0.692308,0.666667,0.6,0.6
Neural Network,2954.315385,26.85641,7.24,0.694017,0.717949,0.648148,0.844444,0.744444,0.641026,0.708333,0.6,0.8


In [707]:
28/39

0.717948717948718

Average Mean Square Error Top 10:
1. Random Forest w/Voter Fatigue (5)
2. XGradient Boost (4)
3. XGradient Boost w/Voter Fatigue (3)
4. Random Forest Naive (2)
5. Random Forest Normalized (1)

Mean Square Error Top 5
1. Random Forest Naive (5)
2. XGradient Boost (4)
3. XGradient Boost w/Voter Fatigue (3)
4. Random Forest w/Voter Fatigue (2)
5. Neural Network (1)

Mean Square Error Top 5 Past 10 Years
1. Neural Network w/Voter Fatigue (5)
2. XGradient Boost w/Voter Fatigue (4)
3. Neural Network (3)
4. XGradient Boost (2)
5. Random Forest Naive (1)

Mean Average Precision at k = 5
1. XGradient Boost w/Voter Fatigue (5)
2. Neural Network (4)
3. XGradient Boost (3)
4. Random Forest w/Voter Fatigue (2)
5. Random Forest Naive (1)

Mean Average Precision at k = 3
1. Neural Network (5)
2. Neural Network w/Voter Fatigue (4)
3. Random Forest Naive (3)
4. Random Forest w/Voter Fatigue (2)
5. XGradient Boost (1)

MAP (k=3) since 2000
1. Neural Network w/Voter Fatigue (5)
2. Neural Network (4)
3. Random Forest w/Voter Fatigue (3)
4. XGradient Boost (2)
5. Random Forest Naive (1)

MAP (k=3) past 5 years
1. XGradient Boost w/Voter Fatigue (4.5)
2. XGradient Boost (4.5)
3. Neural Network (2.5)
4. Neural Network w/Voter Fatigue (2.5)
5. Random Forest Models all (0.33 each)

MAP (k=3) past 10 years
1. Neural Network w/Voter Fatigue (5)
2. Neural Network (4)
3. Random Forest Models all (2 each)

Overall Accuracy
1. XGradient Boost w/Voter Fatigue (5)
2. XGradient Boost (4)
3. Random Forest Normalized (3)
4. Random Forest other 2 and Neural Networks (0.75 each)

Accuracy past 5 years
1. Random Forest Normalized (4.5)
2. Neural Network w/Voter Fatigue (4.5)
3. XGradient Boost both + Neural Network (2 each)

Accuracy past 10 years
1. Random Forest Normalized, XGradient Boost w/Voter Fatigue, both Neurals (3.5 each)
2. Other Random Forests + XGradient Boost (0.33 each)

In [445]:
RandomForestWithVoterFatigue = 5 + 2 + 2 + 2 + 3 + .33 + 2 + .75
XGradientBoost = 4 + 4 + 2 + 3 + 1 + 2 + 4.5 + 4 + 2 + .33
XGradientBoostWithVoterFatigue = 3 + 3 + 4 + 5 + 4.5 + 5 + 2 + 3.5
RandomForestNaive = 2 + 5 + 1 + 1 + 3 + 1 + .33 + 2 + .75 + .33
RandomForestNormalized = 1 + .33 + 2 + 3 + 4.5 + 3.5 + .33
NeuralNetwork = 1 + 3 + 4 + 5 + 4 + 2.5 + 5 + .75 + 2 + 3.5
NeuralNetworkWithVoterFatigue = 5 + 4 + 5 + 2.5 + 5 + .75 + 4.5 + 3.5

In [446]:
model_scores = [RandomForestNaive, RandomForestNormalized, RandomForestWithVoterFatigue, XGradientBoost,
                XGradientBoostWithVoterFatigue, NeuralNetwork, NeuralNetworkWithVoterFatigue]

In [635]:
np.array(model_scores) / 165.98

array([0.09886733, 0.08832389, 0.10290396, 0.16164598, 0.18074467,
       0.18526328, 0.18225087])

### All 3 are good at different things, but seems like Neural Networks are more precise at the top as well as X Gradient Boost (especially with Voter Fatigue)

In [461]:
all_predictions_nn_vf[all_predictions_nn_vf["Year"] == 2022].sort_values("Predicted Rank").head(10)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
7807,Nikola Jokić,0.875,0.885087,1.0,1.0,0.0,2022
16232,Giannis Antetokounmpo,0.595,0.418069,3.0,2.0,1.0,2022
7997,Joel Embiid,0.706,0.369637,2.0,3.0,-1.0,2022
16109,Luka Dončić,0.146,0.219377,5.0,4.0,1.0,2022
8067,Devin Booker,0.216,0.086132,4.0,5.0,-1.0,2022
8065,Chris Paul,0.002,0.080245,9.0,6.0,3.0,2022
8266,Jayson Tatum,0.043,0.046944,6.0,7.0,-1.0,2022
16748,Ja Morant,0.01,0.040069,7.0,8.0,-1.0,2022
16374,Trae Young,0.0,0.018373,13.0,9.0,4.0,2022
7996,James Harden,0.0,0.018049,13.0,10.0,3.0,2022


In [488]:
all_predictions_xgb_vf[all_predictions_xgb_vf["Year"] == 2023].sort_values("Rank").head(10)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
18517,Joel Embiid,0.915,0.36777,1.0,2.0,-1.0,2023
7823,Nikola Jokić,0.674,0.690522,2.0,1.0,1.0,2023
7315,Giannis Antetokounmpo,0.606,0.210678,3.0,3.0,0.0,2023
9535,Jayson Tatum,0.28,0.138125,4.0,4.0,0.0,2023
8336,Shai Gilgeous-Alexander,0.046,0.044135,5.0,7.0,-2.0,2023
17623,Donovan Mitchell,0.03,0.051468,6.0,6.0,0.0,2023
10690,Domantas Sabonis,0.027,0.015215,7.0,8.0,-1.0,2023
7407,Luka Dončić,0.01,0.060247,8.0,5.0,3.0,2023
12631,Stephen Curry,0.005,0.003489,9.0,15.0,-6.0,2023
15559,Jimmy Butler,0.003,0.005803,10.0,12.0,-2.0,2023


### Can we make an ensemble to create potentially "the best prediction"? Will it improve accuracy?

In [676]:
best_models = [all_predictions_nn, all_predictions_nn_vf, all_predictions_xgb_vf, all_predictions_xgb, all_predictions_rf_votefatigue, all_predictions_rf]

In [677]:
# Code to combine models for a holistic prediction for a single year's MVP race
def ensemble_predictions_single_year(model_predictions, year, num_candidates):
    mvp_dict = {} # make dictionary of candidates with key being their name and their value being their MVP Share Prediction
    for model in model_predictions: # going through each model
        top_predicted = model[model["Year"] == year].sort_values("Predicted Rank").head(num_candidates)
        for index, row in top_predicted.iterrows(): #iterating through each model's rows
            player, predicted_share = row["Player"], row["MVP Share Predictions"]
            if (player in mvp_dict):
                mvp_dict[player] += predicted_share
            else:
                mvp_dict[player] = predicted_share
    
    # Now that we have the dictionary, we can append this data to the actual rank for that year
    ensemble_df = mvp_df[["Player", "Share", "Year"]]
    ensemble_df = ensemble_df[(ensemble_df["Year"] == year) & ensemble_df["Player"].isin(mvp_dict.keys())]
    filtered_df = ensemble_df.copy(deep=True)
    filtered_df.loc[:, "MVP Share Predictions"] = 0
    
    for player, share in mvp_dict.items():
        player_row = filtered_df[filtered_df["Player"] == player].index.values[0]
        filtered_df.loc[player_row, "MVP Share Predictions"] = share
    
    # Now that we have assigned our predictions we can use our previous function to convert it to rankings
    converted_df = convert_to_ranks(filtered_df, "Share", "MVP Share Predictions")
    with_differences_df = difference_in_rank(converted_df)
    with_differences_df = with_differences_df[["Player", "Share", "MVP Share Predictions", "Rank", "Predicted Rank", "Difference", "Year"]]
    
    return with_differences_df

#### Now, time to do all the metrics on this ensemble model

In [678]:
%%time
"""Need to do 'backtesting' for the ensemble as well"""
mse_ar_en = []
apk_ar_en = []
predictions_en = []
for year in quant_years:
    ensemble_year_df = ensemble_predictions_single_year(best_models, year, 15)
    mse_ar_en.append(get_mse_top_n(ensemble_year_df))
    apk_ar_en.append(get_apk_n(ensemble_year_df, 5))
    predictions_en.append(ensemble_year_df)

avg_mse_en, avg_apk_en, all_predictions_en = (sum(mse_ar_en) / len(mse_ar_en)), (sum(apk_ar_en) / len(apk_ar_en)), pd.concat(predictions_en)

CPU times: user 304 ms, sys: 11.3 ms, total: 315 ms
Wall time: 329 ms


In [None]:
# good: 31, 24, 23
# bad: 29, 20, 17
apk_ar_en

In [730]:
ensemble_calc = np.sum(0 + 1 + 1 + 2 + 1 + 1 + 1 + 1 + 16 + 1) / 10
bball_ref_calc = np.sum(0 + 0 + 0 + 4 + 1 + 1 + 4 + 25 + 16 + 4) / 10
ensemble_calc, bball_ref_calc

(2.5, 5.5)

##  Saving models so we can work on them and look at them after

In [733]:
# Save RF Model to file in the current working directory

joblib_file_rf = "joblib_RF_Model.pkl"  
joblib.dump(rf, joblib_file_rf)
# This is with voter fatigue as well which has the lowest MSE and the highest APK

['joblib_RF_Model.pkl']

In [734]:
joblib_file_rf_naive = "joblib_RF_Naive_Model.pkl"  
joblib.dump(rf_naive, joblib_file_rf_naive)

['joblib_RF_Naive_Model.pkl']

In [735]:
joblib_file_rf_normalized = "joblib_RF_Normalized_Model.pkl"  
joblib.dump(rf_naive, joblib_file_rf_normalized)

['joblib_RF_Normalized_Model.pkl']

In [736]:
filepath = 'rf_session_7-11.pkl'
dill.dump_session(filepath) # Save the session

In [83]:
rf

In [381]:
mvp_df[(mvp_df["STL"] > 2.3) & (mvp_df["Team"] == "Indiana Pacers")]

Unnamed: 0,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Pts Won,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
5238,Dudley Bradley,SG,22,82,31,24.7,3.4,7.4,0.452,0.0,0.1,0.4,3.3,7.4,0.452,0.453,1.7,2.1,0.782,0.8,1.9,2.7,3.1,2.6,0.6,2.0,2.4,8.4,1980,13.5,0.502,0.008,0.286,3.4,8.1,5.7,15.9,4.8,1.3,19.5,16.3,0.2,2.9,3.1,0.073,-2.0,3.0,1.0,1.5,0.0,0.0,0.0,Indiana Pacers,37,45,0.451,13.0,111.2,111.9,-0.54
6829,Micheal Williams,PG,25,79,68,34.8,5.1,10.4,0.49,0.1,0.4,0.242,5.0,10.0,0.501,0.495,4.7,5.4,0.871,0.9,2.6,3.6,8.2,2.9,0.3,3.0,3.3,15.0,1992,18.4,0.587,0.04,0.518,3.1,8.0,5.7,32.2,4.1,0.5,19.2,19.1,5.8,3.2,9.0,0.156,1.9,1.8,3.7,3.9,0.0,0.0,0.0,Indiana Pacers,40,42,0.488,27.0,112.2,110.3,1.85
9190,Metta World Peace,SF,22,55,50,29.9,4.9,11.6,0.423,1.0,3.1,0.312,3.9,8.5,0.464,0.465,2.5,3.7,0.667,1.3,3.6,4.9,2.3,2.6,0.7,2.1,3.9,13.2,2002,15.8,0.501,0.267,0.321,5.1,13.8,9.5,14.1,4.5,1.7,14.0,23.5,0.0,2.7,2.7,0.079,-0.4,3.0,2.6,1.9,0.0,0.0,0.0,Indiana Pacers,42,40,0.512,8.0,96.8,96.5,-0.07
9727,Victor Oladipo,SG,25,75,75,34.0,8.5,17.9,0.477,2.1,5.8,0.371,6.4,12.1,0.528,0.537,3.9,4.9,0.799,0.6,4.6,5.2,4.3,2.4,0.8,2.9,2.3,23.1,2018,23.1,0.577,0.323,0.274,2.1,15.1,8.6,21.2,3.5,2.0,12.7,30.1,4.3,4.0,8.2,0.155,4.1,1.7,5.8,5.0,2.0,1010.0,0.002,Indiana Pacers,48,34,0.585,2.0,105.6,104.2,1.18
