### Now that we have a fully cleaned dataframe, we can start building out our model

In [220]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from catboost import Pool, CatBoostRegressor
import xgboost as xgb
from sklearn import svm
import plotly.express as px
import joblib
import dill 
import pdb
#filepath = 'rf_session_7-7.pkl'
#dill.load_session(filepath)
import joblib
import shap
#joblib_file = "joblib_RF_Model.pkl"  
#newest_rf = joblib.load(joblib_file) # This is with voter fatigue as well which has the lowest MSE and the highest APK

In [47]:
# Neural Network import
import tensorflow
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [48]:
"""Imported functions from ml_metrics library"""

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [49]:
mvp_df = pd.read_csv("csv_files/all_player_mvp_stats.csv")

In [50]:
# Dropping previous index as well as Tm since redundant with Team and
# Hornets have two different abbreviations
mvp_df = mvp_df.drop(["Unnamed: 0", "Tm"], axis=1)

In [51]:
years = list(range(1980, 2024))

In [52]:
# setting up everything for our variables
# Removed qualitative variables like Team as well as Year
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP', 'W', 'L', 'W/L%', 'GB',
       'PS/G', 'PA/G', 'SRS']

In [53]:
# generalize this later for all types of models
# outputs two new columns: Rank and Predicted Rank

def convert_to_ranks(df, actual_share_col, predicted_share_col):
    ranked_df = df.copy(deep=True)
    ranked_df["Rank"] = df[actual_share_col].rank(method="min", ascending=False)
    ranked_df["Predicted Rank"] = df[predicted_share_col].rank(method="min", ascending=False)
    return ranked_df.sort_values("Rank")

In [54]:
# create an error metric function that is based on ranking since
# most players' MVP share is 0
# assumption is table looks like ridge_combination
# with cols: Player, Share, MVP Share Predictions
# outdated not using the convert_to_ranks function but not useful
def find_ap(df, actual_share_col, predicted_share_col, num):
    actual_ranking = df.sort_values(actual_share_col, ascending=False).head(num)
    actual_ranking["Rank"] = list(range(1, actual_ranking.shape[0] + 1))
    predicted_ranking = df.sort_values(predicted_share_col, ascending=False)
    predicted_ranking["Pred_Rank"] = list(range(1, predicted_ranking.shape[0] + 1))
    abs_diff = 0
    for index, row in actual_ranking.iterrows():
        player_name = row["Player"]
        predicted_player_row = predicted_ranking[
            predicted_ranking["Player"] == player_name]
        abs_diff += abs(row["Rank"] - predicted_player_row["Pred_Rank"].values[0])
    return abs_diff

In [55]:
"""
Finding mean square error

This will give you a measure of the overall discrepancy or difference in ranks between the two for the top 10 MVP
candidates.

Input is a ranked dataframe after applying the convert_to_ranks function.
"""

def get_mse_top_n(ranked_df, num=10):
    top_n_df = ranked_df.head(num)
    return mean_squared_error(top_n_df["Rank"].to_numpy(), top_n_df["Predicted Rank"].to_numpy(), squared=True)

In [56]:
"""
Determine the AP@K (Average Precision at K) for your predicted ranking. 

Interested in top n since those are the MVP finalists. AP@K will provide an assessment
of the precision and correctness of your predicted ranking for the top candidates.

Input is a ranked dataframe after applying the convert_to_ranks function.
"""

def get_apk_n(ranked_df, k):
    top_k_actual = ranked_df["Player"][:k]
    top_k_predicted = ranked_df.sort_values("Predicted Rank")["Player"][:k]
    return apk(top_k_actual.tolist(), top_k_predicted.tolist(), 5)

In [57]:
""" 
Combines both of our metrics together to provide a clean look at how our model does 
in these two important criteria. """

def evaluate_model(df, actual_share_col, predicted_share_col, k):
    combined_df = convert_to_ranks(df, actual_share_col, predicted_share_col)
    
    #return combined_df
    mse = get_mse_top_10(combined_df)
    apk = get_apk_n(combined_df, k)
    
    return [mse, apk]
    

In [58]:
def print_model_results(mse, apk, k):
    print("Mean Squared Error:", mse)
    print("Average Precision at", str(k) + ":", apk)

In [216]:
all_predictions_xgb

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
6481,Larry Bird,0.978,0.748873,1.0,1.0,0.0,1985
5680,Magic Johnson,0.338,0.502177,2.0,2.0,0.0,1985
2903,Moses Malone,0.279,0.039608,3.0,7.0,-4.0,1985
5677,Kareem Abdul-Jabbar,0.264,0.061043,4.0,5.0,-1.0,1985
2026,Terry Cummings,0.177,0.032766,5.0,8.0,-3.0,1985
...,...,...,...,...,...,...,...
10559,Patrick Williams,0.000,0.000003,14.0,491.0,-477.0,2023
10558,Patrick Beverley,0.000,0.000004,14.0,374.0,-360.0,2023
10557,Nikola Vučević,0.000,0.000100,14.0,41.0,-27.0,2023
10694,Keon Ellis,0.000,0.000007,14.0,189.0,-175.0,2023


## If we filter our dataframe to look at only realistic MVP candidates, will that improve performance?

In [113]:
top_candidates.sort_values("G")

Unnamed: 0,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
1939,Alonzo Mourning,C,28,46,12,38.1,7.0,13.8,0.511,0.0,...,1180.0,0.655,Miami Heat,33,17,0.660,0.0,89.0,84.0,5.11
302,Allen Iverson,SG,23,48,75,41.5,9.1,22.0,0.412,1.2,...,1180.0,0.270,Philadelphia 76ers,28,22,0.560,5.0,89.7,87.6,2.56
574,Karl Malone,PF,35,49,0,37.4,8.0,16.3,0.493,0.0,...,1180.0,0.701,Utah Jazz,37,13,0.740,0.0,93.3,86.8,5.54
7154,Shaquille O'Neal,C,26,49,12,34.8,10.4,18.1,0.576,0.0,...,1180.0,0.075,Los Angeles Lakers,31,19,0.620,4.0,99.0,96.0,2.68
2433,Jason Kidd,PG,25,50,69,41.2,6.2,14.0,0.444,0.9,...,1180.0,0.135,Phoenix Suns,27,23,0.540,8.0,95.6,93.3,2.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13213,Kobe Bryant,SG,29,82,82,38.9,9.5,20.6,0.459,1.8,...,1260.0,0.873,Los Angeles Lakers,57,25,0.695,0.0,108.6,101.3,7.34
4519,Michael Jordan,SG,23,82,76,40.0,13.4,27.8,0.482,0.1,...,780.0,0.576,Chicago Bulls,40,42,0.488,17.0,104.8,103.9,1.26
5004,Isiah Thomas,PG,22,82,46,36.7,8.2,17.7,0.462,0.3,...,760.0,0.151,Detroit Pistons,49,33,0.598,1.0,117.1,113.5,3.52
2851,Julius Erving,SF,30,82,0,35.0,9.7,18.6,0.521,0.0,...,690.0,0.658,Philadelphia 76ers,62,20,0.756,0.0,111.7,103.8,7.76


## Backtesting

In [61]:
# Following along, we make a function to find the biggest differences each year
# inside code we run function of convert_to_ranks
def difference_in_rank(ranked_df):
    ranked_df["Difference"] = ranked_df["Rank"] - ranked_df["Predicted Rank"]
    return ranked_df

In [130]:
def model_prediction(df, year, model, predictors, prefiltering=False):
    
    if prefiltering: ##
        filtered_df = df[(df["FGA"] > 10.0) & (df["TS%"] > .48) & (df["PTS"] > 12.5) 
                   & (df["PER"] > 17.0) & (df["WS/48"] > .1) & (df["BPM"] > 1.5) & (df["W/L%"] > .3) & (df["G"] > 45)]
        """train = df[(df["FGA"] > 10.0) & (df["TS%"] > .48) & (df["PTS"] > 12.5) 
                   & (df["PER"] > 17.0) & (df["WS/48"] > .1) & (df["BPM"] > 1.5) & (df["W/L%"] > .3) & (df["G"] > 45)]
        #test = df[df["Year"] == year]"""
        train = filtered_df[filtered_df["Year"] < year]
        test = filtered_df[filtered_df["Year"] == year]
    
    else:
        train = df[df["Year"] < year]
        test = df[df["Year"] == year]
    
    X_train = train[predictors]
    y_train = train["Share"]
    X_test = test[predictors]
    y_test = test["Share"]
    
    model.fit(X_train, y_train)
    model_predictions = model.predict(X_test) # predict for testing set (the current year starting w 2005)

    model_df = pd.DataFrame(model_predictions,
                                     columns=["MVP Share Predictions"],
                                     index=X_test.index) # putting predictions into a dataframe

    model_combination = pd.concat([test[["Player", "Share"]],
                                          model_df], axis = 1) # concatenating predictions with the players and their MVP share
    return model_combination

In [131]:
"""
Creating a backtesting function to run all the code we did and put it in a loop
"""
def backtest(orig_df, model_bt, timeframe, predictors, normalizing=False, voteFatigue=True, prefiltering=False):
    """
    Looping through years (starting with 2005 because we need at least 5 years of data to come up with a model)
    Then making predictions for every year based off previous traine data
    The more years we can train on, the more data our algorithm can build off to make accurate predictions
    """

    all_predictions = [] # list of dataframes holding our predictions for every year
    mse_ar = []
    apk_ar = []
    
    df = orig_df.copy(deep=True)
    
    
    if normalizing: # if our data needs to be normalized (for certain models where scaling is important)
        predictors_plus_year = predictors.copy()
        predictors_plus_year.append("Year")
        df[predictors] = df[predictors_plus_year].groupby("Year").transform(lambda x: (x-np.mean(x, axis=0))/np.std(x, ddof=1))
    
    if voteFatigue:
        df = add_previous_mvps(df, years)
        predictors = add_previous_mvp_as_predictor(predictors)

    for year in timeframe[5:]:
        if prefiltering:
            model_bt_combination = model_prediction(df, year, model_bt, predictors)
        else:
            model_bt_combination = model_prediction(df, year, model_bt, predictors)

        combined_bt_df = convert_to_ranks(model_bt_combination, "Share", "MVP Share Predictions") # adding actual and predicted rank into list
        with_differences = difference_in_rank(combined_bt_df) # adding difference between actual and predicted rank
        with_differences["Year"] = year # adding year to add more information to overall dataframe
        
        all_predictions.append(with_differences) # appending predictions to outer list

        mse_ar.append(get_mse_top_n(combined_bt_df))
        apk_ar.append(get_apk_n(combined_bt_df, 5))
        
    return ((sum(mse_ar) / len(mse_ar)), mse_ar, (sum(apk_ar) / len(apk_ar)), apk_ar, pd.concat(all_predictions))

In [132]:
reg = Ridge(alpha=.1)

In [133]:
normalized_avg_mse, normalized_mse_ar, normalized_avg_apk, normalized_apk_ar, normalized_all_predictions = backtest(mvp_df, reg, years, predictors, normalizing=True, voteFatigue=False, prefiltering=True)

In [134]:
avg_mse, mse_ar, avg_apk, apk_ar, all_predictions = backtest(mvp_df, reg, years, predictors, normalizing=False, voteFatigue=False, prefiltering=True)

In [135]:
avg_mse_n_vf, mse_ar_n_vf, avg_apk_n_vf, apk_ar_n_vf, all_predictions_n_vf = backtest(mvp_df, reg, years, predictors, normalizing=True, voteFatigue=True, prefiltering=True)

## Trying Random Forest

In [187]:
rf_naive = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5)
rf_normalized = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5)
rf = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5)

In [222]:
def add_previous_mvps(df, years):
    mvp_with_previous_df = mvp_df.copy(deep=True)
    mvp_with_previous_df["Previous MVP's Won"] = 0
    for year in years:
        year_winner = mvp_with_previous_df[mvp_with_previous_df["Year"] == year].sort_values("Share", ascending=False).iloc[:1]["Player"].item()
        years_after_winning_for_player = mvp_with_previous_df[(mvp_with_previous_df["Player"] == year_winner) & (mvp_with_previous_df["Year"] > year)]
        years_after_index = years_after_winning_for_player.index.values.tolist()
        mvp_with_previous_df.loc[years_after_index, ["Previous MVP's Won"]] += 1
    mvp_dummies = pd.get_dummies(mvp_with_previous_df, columns=["Previous MVP's Won"])
    return mvp_dummies

In [223]:
add_previous_mvps(mvp_df, years)

Unnamed: 0,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,...,GB,PS/G,PA/G,SRS,Previous MVP's Won_0,Previous MVP's Won_1,Previous MVP's Won_2,Previous MVP's Won_3,Previous MVP's Won_4,Previous MVP's Won_5
0,A.C. Green,PF,22,82,24,18.8,2.5,4.7,0.539,0.0,...,0.0,117.3,109.5,6.84,1,0,0,0,0,0
1,Byron Scott,SG,24,76,13,28.8,6.7,13.0,0.513,0.3,...,0.0,117.3,109.5,6.84,1,0,0,0,0,0
2,James Worthy,SF,24,75,0,32.7,8.4,14.5,0.579,0.0,...,0.0,117.3,109.5,6.84,1,0,0,0,0,0
3,Jerome Henderson,C,26,1,60,3.0,2.0,3.0,0.667,0.0,...,0.0,117.3,109.5,6.84,1,0,0,0,0,0
4,Kareem Abdul-Jabbar,C,38,79,7,33.3,9.6,16.9,0.564,0.0,...,0.0,117.3,109.5,6.84,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18775,Spencer Hawes,PF,28,54,1,14.8,2.5,5.1,0.484,0.5,...,9.0,103.6,103.8,-0.45,1,0,0,0,0,0
18776,Steve Novak,PF,33,8,0,2.8,0.3,0.9,0.286,0.1,...,9.0,103.6,103.8,-0.45,1,0,0,0,0,0
18777,Terrence Jones,PF,25,54,12,23.5,4.3,9.1,0.470,0.4,...,9.0,103.6,103.8,-0.45,1,0,0,0,0,0
18778,Thon Maker,C,19,57,34,9.9,1.5,3.2,0.459,0.5,...,9.0,103.6,103.8,-0.45,1,0,0,0,0,0


In [176]:
def add_previous_mvp_as_predictor(predictors):
    predictors_with_previous = predictors.copy()
    predictors_with_previous.append("Previous MVP's Won_0")
    predictors_with_previous.append("Previous MVP's Won_1")
    predictors_with_previous.append("Previous MVP's Won_2")
    predictors_with_previous.append("Previous MVP's Won_3")
    predictors_with_previous.append("Previous MVP's Won_4")
    predictors_with_previous.append("Previous MVP's Won_5")
    return predictors_with_previous

In [188]:
%%time
avg_mse_rf, mse_ar_rf, avg_apk_rf, apk_ar_rf, all_predictions_rf = backtest(
    mvp_df, rf_naive, years, predictors, normalizing=False, voteFatigue=False, prefiltering=True)

CPU times: user 12min 26s, sys: 2.88 s, total: 12min 29s
Wall time: 12min 30s


In [189]:
%%time
avg_mse_rf_votefatigue, mse_ar_rf_votefatigue, avg_apk_rf_votefatigue, apk_ar_rf_votefatigue, all_predictions_rf_votefatigue = backtest(
    mvp_df, rf, years, predictors, normalizing=False,voteFatigue=True, prefiltering=True)

CPU times: user 12min 53s, sys: 3.45 s, total: 12min 57s
Wall time: 13min 1s


In [191]:
all_predictions_rf_n

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
6481,Larry Bird,0.978,0.680149,1.0,1.0,0.0,1985
5680,Magic Johnson,0.338,0.322346,2.0,3.0,-1.0,1985
2903,Moses Malone,0.279,0.051050,3.0,7.0,-4.0,1985
5677,Kareem Abdul-Jabbar,0.264,0.110375,4.0,5.0,-1.0,1985
2026,Terry Cummings,0.177,0.040355,5.0,9.0,-4.0,1985
...,...,...,...,...,...,...,...
10559,Patrick Williams,0.000,0.000000,14.0,99.0,-85.0,2023
10558,Patrick Beverley,0.000,0.000000,14.0,99.0,-85.0,2023
10557,Nikola Vučević,0.000,0.006986,14.0,22.0,-8.0,2023
10694,Keon Ellis,0.000,0.000000,14.0,99.0,-85.0,2023


## Linear Regression Model

In [192]:
linreg = LinearRegression()

In [193]:
%%time
avg_mse_lr_votefatigue, mse_ar_lr_votefatigue, avg_apk_lr_votefatigue, apk_ar_lr_votefatigue, all_predictions_lr_votefatigue = backtest(
    mvp_df, linreg, years, predictors, normalizing=True, voteFatigue=True, prefiltering=True)

CPU times: user 6.46 s, sys: 425 ms, total: 6.88 s
Wall time: 1.3 s


## Gradient Boosting Regressor

In [181]:
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    tree_method="hist",
    objective='reg:logistic',
)

In [182]:
%%time
avg_mse_xgb_vf, mse_ar_xgb_vf, avg_apk_xgb_vf, apk_ar_xgb_vf, all_predictions_xgb_vf = backtest(
    mvp_df, xgb_model, years, predictors, normalizing=False, voteFatigue=True, prefiltering=True)

CPU times: user 27.6 s, sys: 11.6 s, total: 39.2 s
Wall time: 5.77 s


In [183]:
%%time
avg_mse_xgb, mse_ar_xgb, avg_apk_xgb, apk_ar_xgb, all_predictions_xgb = backtest(
    mvp_df, xgb_model, years, predictors, normalizing=False, voteFatigue=False, prefiltering=True)

CPU times: user 26 s, sys: 10.7 s, total: 36.8 s
Wall time: 5.15 s


## Neural Networks

In [184]:
dnn = MLPRegressor(
    solver='lbfgs',
    hidden_layer_sizes=100,
    max_iter=1000,
    random_state=42,
    activation='logistic',
    learning_rate ='adaptive')

In [185]:
%%time
avg_mse_nn, mse_ar_nn, avg_apk_nn, apk_ar_nn, all_predictions_nn = backtest(
    mvp_df, dnn, years, predictors, normalizing=True, voteFatigue=False, prefiltering=True)

CPU times: user 17min 53s, sys: 2min 49s, total: 20min 43s
Wall time: 3min 3s


In [186]:
%%time
avg_mse_nn_vf, mse_ar_nn_vf, avg_apk_nn_vf, apk_ar_nn_vf, all_predictions_nn_vf = backtest(
    mvp_df, dnn, years, predictors, normalizing=True,voteFatigue=True, prefiltering=True)

CPU times: user 35min 58s, sys: 1min 53s, total: 37min 52s
Wall time: 5min 11s


### Our metric tested how accurate the top 5 players are. I want to look at the top 3 candidates and their accuracy as well as how often the models got the actual MVP right

In [197]:
prediction_arrays = [all_predictions, normalized_all_predictions, all_predictions_n_vf, all_predictions_rf, 
                     all_predictions_rf_n, all_predictions_rf_votefatigue, all_predictions_lr_votefatigue,
                     all_predictions_xgb_vf, all_predictions_xgb, all_predictions_nn, all_predictions_nn_vf, all_predictions_en]

In [198]:
quant_years = list(range(1985, 2024))
past_5_years = list(range(2019,2024))
past_10_years = list(range(2014, 2024))
since_2000 = list(range(2000, 2024))

In [199]:
def check_accuracy(ranked_df):
    actual_winner = ranked_df.iloc[:1]["Player"].item()
    predicted_winner = ranked_df.sort_values("Predicted Rank").iloc[:1]["Player"].item()
    return actual_winner == predicted_winner

In [200]:
## Need to fix this bc getting wrong results

def get_metric_from_prediction_array(metric, prediction_list, num, timeframe):
    new_ar = []
    for model in prediction_list:
        model_ar = []
        for year in timeframe:
            if (metric == get_apk_n):
                value = get_apk_n(model[model["Year"] == year].sort_values("Rank"), num)
                model_ar.append(value)
            elif (metric == get_mse_top_n):
                value = get_mse_top_n(model[model["Year"] == year].sort_values("Rank"), num)
                model_ar.append(value)
            elif (metric == check_accuracy):
                value = check_accuracy(model[model["Year"] == year].sort_values("Rank"))
                model_ar.append(value)
        average = sum(model_ar) / len(model_ar)
        new_ar.append(average)
    return new_ar

In [201]:
apk_3_all_years = get_metric_from_prediction_array(get_apk_n, prediction_arrays, 3, quant_years)
apk_3_since_2000 = get_metric_from_prediction_array(get_apk_n, prediction_arrays, 3, since_2000)
apk_3_past_5_years = get_metric_from_prediction_array(get_apk_n, prediction_arrays, 3, past_5_years)
apk_3_past_10_years = get_metric_from_prediction_array(get_apk_n, prediction_arrays, 3, past_10_years)

#### Now accuracy and recent MSE for top 5 candidates

In [202]:
accuracy_all_years = get_metric_from_prediction_array(check_accuracy, prediction_arrays,0, quant_years)
accuracy_since_2000 = get_metric_from_prediction_array(check_accuracy, prediction_arrays,0, since_2000)
accuracy_past_5_years = get_metric_from_prediction_array(check_accuracy, prediction_arrays,0, past_5_years)
accuracy_past_10_years = get_metric_from_prediction_array(check_accuracy, prediction_arrays,0, past_10_years)

In [203]:
mse_top_5 = get_metric_from_prediction_array(get_mse_top_n, prediction_arrays,5, quant_years)
mse_top_5_past_10_years = get_metric_from_prediction_array(get_mse_top_n, prediction_arrays,5, past_10_years)

## Assessing performance of all models

In [204]:
mse_arrays = [mse_ar, normalized_mse_ar, mse_ar_n_vf, mse_ar_rf, mse_ar_rf_n, mse_ar_rf_votefatigue,
              mse_ar_lr_votefatigue, mse_ar_xgb_vf, mse_ar_xgb, mse_ar_nn, mse_ar_nn_vf, mse_ar_en]
apk_arrays = [apk_ar, normalized_apk_ar, apk_ar_n_vf, apk_ar_rf, apk_ar_rf_n, apk_ar_rf_votefatigue, 
              apk_ar_lr_votefatigue, apk_ar_xgb_vf, apk_ar_xgb, apk_ar_nn, apk_ar_nn_vf, apk_ar_en]

In [205]:
def checking_recent_years(list_of_model_arrays, year_to_start):
    year_index = years.index(year_to_start) - 4
    results = []
    results.append(year_to_start)
    for model in list_of_model_arrays:
        results.append(sum(model[:year_index]) / len(model[:year_index]))
    return results

In [206]:
# Converting this data to dataframe so I can graph
every_year_mse_results = []
every_year_apk_results = []
for year in years[5:]:
    up_to_year_accuracy_mse = checking_recent_years(mse_arrays, year)
    up_to_year_accuracy_apk = checking_recent_years(apk_arrays, year)
    every_year_mse_results.append(up_to_year_accuracy_mse)
    every_year_apk_results.append(up_to_year_accuracy_apk)

In [207]:
model_cols_with_year = ["Year", "Ridge Regression", "Normalized Ridge Regression", "Normalized Ridge w/Voter Fatigue",
              "Random Forest Naive","Random Forest Normalized", "Random Forest w/Voter Fatigue", 
              "Linear Regression w/Voter Fatigue", "XGradient Boost w/Voter Fatigue", "XGradient Boost",
              "Neural Network", "Neural Network w/Voter Fatigue", "Ensemble Model"]
model_cols_without_year = ["Ridge Regression", "Normalized Ridge Regression", "Normalized Ridge w/Voter Fatigue",
              "Random Forest Naive","Random Forest Normalized", "Random Forest w/Voter Fatigue", 
              "Linear Regression w/Voter Fatigue", "XGradient Boost w/Voter Fatigue", "XGradient Boost",
              "Neural Network", "Neural Network w/Voter Fatigue", "Ensemble Model"]

In [208]:
all_results_data = {"Average Mean Square Error Top 10": [avg_mse, normalized_avg_mse, avg_mse_n_vf, avg_mse_rf, avg_mse_rf_n, avg_mse_rf_votefatigue, avg_mse_lr_votefatigue, avg_mse_xgb_vf, avg_mse_xgb, avg_mse_nn, avg_mse_nn_vf, avg_mse_en],
                    "Mean Square Error Top 5": mse_top_5,
                    "Mean Square Error Top 5 Past 10 Years": mse_top_5_past_10_years,
                    "Mean Average Precision at k = 5": [avg_apk, normalized_avg_apk, avg_apk_n_vf, avg_apk_rf, avg_apk_rf_n, avg_apk_rf_votefatigue, avg_apk_lr_votefatigue, avg_apk_xgb_vf, avg_apk_xgb, avg_apk_nn, avg_apk_nn_vf, avg_apk_en], 
                    "Mean Average Precision at k = 3": apk_3_all_years, "MAP (k=3) since 2000": apk_3_since_2000,
                    "MAP (k=3) past 5 years":apk_3_past_5_years, "MAP (k=3) past 10 years":apk_3_past_10_years,
                    "Overall Accuracy": accuracy_all_years, "Accuracy since 2000": accuracy_since_2000,
                    "Accuracy past 5 years": accuracy_past_5_years, "Accuracy past 10 years": accuracy_past_10_years}

In [209]:
ensemble_model_df = pd.DataFrame(data=all_results_data, index=model_cols_without_year) 

In [210]:
ensemble_model_df

Unnamed: 0,Average Mean Square Error Top 10,Mean Square Error Top 5,Mean Square Error Top 5 Past 10 Years,Mean Average Precision at k = 5,Mean Average Precision at k = 3,MAP (k=3) since 2000,MAP (k=3) past 5 years,MAP (k=3) past 10 years,Overall Accuracy,Accuracy since 2000,Accuracy past 5 years,Accuracy past 10 years
Ridge Regression,236.84359,43.794872,20.1,0.609487,0.608262,0.553241,0.666667,0.594444,0.487179,0.5,0.6,0.5
Normalized Ridge Regression,261.366667,50.671795,16.64,0.651111,0.616809,0.574074,0.666667,0.622222,0.487179,0.5,0.6,0.5
Normalized Ridge w/Voter Fatigue,212.607692,103.492308,23.06,0.463162,0.44302,0.356481,0.455556,0.455556,0.282051,0.166667,0.0,0.0
Random Forest Naive,62.338462,9.887179,10.62,0.674103,0.675214,0.62963,0.8,0.733333,0.641026,0.625,0.4,0.6
Random Forest Normalized,69.353846,13.507692,10.7,0.648547,0.659544,0.618056,0.8,0.733333,0.666667,0.666667,0.8,0.8
Random Forest w/Voter Fatigue,54.607692,11.364103,11.46,0.647009,0.663818,0.625,0.8,0.722222,0.641026,0.625,0.4,0.6
Linear Regression w/Voter Fatigue,706.992308,1116.758974,23.24,0.453932,0.421652,0.361111,0.388889,0.433333,0.282051,0.208333,0.0,0.0
XGradient Boost w/Voter Fatigue,91.628205,10.235897,8.24,0.68735,0.643875,0.604167,0.866667,0.722222,0.692308,0.666667,0.6,0.8
XGradient Boost,56.561538,10.14359,9.58,0.69094,0.665242,0.631944,0.866667,0.711111,0.692308,0.666667,0.6,0.6
Neural Network,2954.315385,26.85641,7.24,0.694017,0.717949,0.648148,0.844444,0.744444,0.641026,0.708333,0.6,0.8


Average Mean Square Error Top 10:
1. Random Forest w/Voter Fatigue (5)
2. XGradient Boost (4)
3. XGradient Boost w/Voter Fatigue (3)
4. Random Forest Naive (2)
5. Random Forest Normalized (1)

Mean Square Error Top 5
1. Random Forest Naive (5)
2. XGradient Boost (4)
3. XGradient Boost w/Voter Fatigue (3)
4. Random Forest w/Voter Fatigue (2)
5. Neural Network (1)

Mean Square Error Top 5 Past 10 Years
1. Neural Network w/Voter Fatigue (5)
2. XGradient Boost w/Voter Fatigue (4)
3. Neural Network (3)
4. XGradient Boost (2)
5. Random Forest Naive (1)

Mean Average Precision at k = 5
1. XGradient Boost w/Voter Fatigue (5)
2. Neural Network (4)
3. XGradient Boost (3)
4. Random Forest w/Voter Fatigue (2)
5. Random Forest Naive (1)

Mean Average Precision at k = 3
1. Neural Network (5)
2. Neural Network w/Voter Fatigue (4)
3. Random Forest Naive (3)
4. Random Forest w/Voter Fatigue (2)
5. XGradient Boost (1)

MAP (k=3) since 2000
1. Neural Network w/Voter Fatigue (5)
2. Neural Network (4)
3. Random Forest w/Voter Fatigue (3)
4. XGradient Boost (2)
5. Random Forest Naive (1)

MAP (k=3) past 5 years
1. XGradient Boost w/Voter Fatigue (4.5)
2. XGradient Boost (4.5)
3. Neural Network (2.5)
4. Neural Network w/Voter Fatigue (2.5)
5. Random Forest Models all (0.33 each)

MAP (k=3) past 10 years
1. Neural Network w/Voter Fatigue (5)
2. Neural Network (4)
3. Random Forest Models all (2 each)

Overall Accuracy
1. XGradient Boost w/Voter Fatigue (5)
2. XGradient Boost (4)
3. Random Forest Normalized (3)
4. Random Forest other 2 and Neural Networks (0.75 each)

Accuracy past 5 years
1. Random Forest Normalized (4.5)
2. Neural Network w/Voter Fatigue (4.5)
3. XGradient Boost both + Neural Network (2 each)

Accuracy past 10 years
1. Random Forest Normalized, XGradient Boost w/Voter Fatigue, both Neurals (3.5 each)
2. Other Random Forests + XGradient Boost (0.33 each)

In [None]:
RandomForestWithVoterFatigue = 5 + 2 + 2 + 2 + 3 + .33 + 2 + .75
XGradientBoost = 4 + 4 + 2 + 3 + 1 + 2 + 4.5 + 4 + 2 + .33
XGradientBoostWithVoterFatigue = 3 + 3 + 4 + 5 + 4.5 + 5 + 2 + 3.5
RandomForestNaive = 2 + 5 + 1 + 1 + 3 + 1 + .33 + 2 + .75 + .33
RandomForestNormalized = 1 + .33 + 2 + 3 + 4.5 + 3.5 + .33
NeuralNetwork = 1 + 3 + 4 + 5 + 4 + 2.5 + 5 + .75 + 2 + 3.5
NeuralNetworkWithVoterFatigue = 5 + 4 + 5 + 2.5 + 5 + .75 + 4.5 + 3.5

In [None]:
model_scores = [RandomForestNaive, RandomForestNormalized, RandomForestWithVoterFatigue, XGradientBoost,
                XGradientBoostWithVoterFatigue, NeuralNetwork, NeuralNetworkWithVoterFatigue]

In [None]:
np.array(model_scores) / 165.98

### All 3 are good at different things, but seems like Neural Networks are more precise at the top as well as X Gradient Boost (especially with Voter Fatigue)

### Can we make an ensemble to create potentially "the best prediction"? Will it improve accuracy?

In [194]:
best_models = [all_predictions_nn, all_predictions_nn_vf, all_predictions_xgb_vf, all_predictions_xgb, all_predictions_rf_votefatigue, all_predictions_rf]

In [195]:
# Code to combine models for a holistic prediction for a single year's MVP race
def ensemble_predictions_single_year(model_predictions, year, num_candidates):
    mvp_dict = {} # make dictionary of candidates with key being their name and their value being their MVP Share Prediction
    for model in model_predictions: # going through each model
        top_predicted = model[model["Year"] == year].sort_values("Predicted Rank").head(num_candidates)
        for index, row in top_predicted.iterrows(): #iterating through each model's rows
            player, predicted_share = row["Player"], row["MVP Share Predictions"]
            if (player in mvp_dict):
                mvp_dict[player] += predicted_share
            else:
                mvp_dict[player] = predicted_share
    
    # Now that we have the dictionary, we can append this data to the actual rank for that year
    ensemble_df = mvp_df[["Player", "Share", "Year"]]
    ensemble_df = ensemble_df[(ensemble_df["Year"] == year) & ensemble_df["Player"].isin(mvp_dict.keys())]
    filtered_df = ensemble_df.copy(deep=True)
    filtered_df.loc[:, "MVP Share Predictions"] = 0
    
    for player, share in mvp_dict.items():
        player_row = filtered_df[filtered_df["Player"] == player].index.values[0]
        filtered_df.loc[player_row, "MVP Share Predictions"] = share
    
    # Now that we have assigned our predictions we can use our previous function to convert it to rankings
    converted_df = convert_to_ranks(filtered_df, "Share", "MVP Share Predictions")
    with_differences_df = difference_in_rank(converted_df)
    with_differences_df = with_differences_df[["Player", "Share", "MVP Share Predictions", "Rank", "Predicted Rank", "Difference", "Year"]]
    
    return with_differences_df

#### Now, time to do all the metrics on this ensemble model

In [196]:
%%time
"""Need to do 'backtesting' for the ensemble as well"""
mse_ar_en = []
apk_ar_en = []
predictions_en = []
for year in quant_years:
    ensemble_year_df = ensemble_predictions_single_year(best_models, year, 15)
    mse_ar_en.append(get_mse_top_n(ensemble_year_df))
    apk_ar_en.append(get_apk_n(ensemble_year_df, 5))
    predictions_en.append(ensemble_year_df)

avg_mse_en, avg_apk_en, all_predictions_en = (sum(mse_ar_en) / len(mse_ar_en)), (sum(apk_ar_en) / len(apk_ar_en)), pd.concat(predictions_en)

CPU times: user 293 ms, sys: 10.1 ms, total: 303 ms
Wall time: 308 ms


In [None]:
# good: 31, 24, 23
# bad: 29, 20, 17
apk_ar_en

In [None]:
ensemble_calc = np.sum(0 + 1 + 1 + 2 + 1 + 1 + 1 + 1 + 16 + 1) / 10
bball_ref_calc = np.sum(0 + 0 + 0 + 4 + 1 + 1 + 4 + 25 + 16 + 4) / 10
ensemble_calc, bball_ref_calc

##  Saving models so we can work on them and look at them after

In [733]:
# Save RF Model to file in the current working directory

joblib_file_rf = "joblib_RF_Model.pkl"  
joblib.dump(rf, joblib_file_rf)
# This is with voter fatigue as well which has the lowest MSE and the highest APK

['joblib_RF_Model.pkl']

In [734]:
joblib_file_rf_naive = "joblib_RF_Naive_Model.pkl"  
joblib.dump(rf_naive, joblib_file_rf_naive)

['joblib_RF_Naive_Model.pkl']

In [735]:
joblib_file_rf_normalized = "joblib_RF_Normalized_Model.pkl"  
joblib.dump(rf_naive, joblib_file_rf_normalized)

['joblib_RF_Normalized_Model.pkl']

In [736]:
filepath = 'rf_session_7-11.pkl'
dill.dump_session(filepath) # Save the session

## SHAP

In [215]:
explainer_xgb = shap.TreeExplainer(xgb_model)
shap_values_xgb = shap.TreeExplainer(model_prediction(df, year, model_bt, predictors)
explainer_xgb.

<shap.explainers._tree.Tree at 0x16a2d04c0>

In [217]:
model_prediction(add_previous_mvps(mvp_df, years), year, xgb_model, add_previous_mvp_as_predictor(predictors))

Unnamed: 0,Player,Share,MVP Share Predictions
7312,A.J. Green,0.000,0.000005
7313,Bobby Portis,0.000,0.000012
7314,Brook Lopez,0.000,0.000058
7315,Giannis Antetokounmpo,0.606,0.229270
7316,Goran Dragić,0.000,0.000012
...,...,...,...
18695,Mitchell Robinson,0.000,0.000033
18696,Obi Toppin,0.000,0.000004
18697,Quentin Grimes,0.000,0.000006
18698,RJ Barrett,0.000,0.000057


## Catboost

In [221]:
cb = CatBoostRegressor(iterations=2, 
                          depth=2, 
                          learning_rate=1, 
                          loss_function='RMSE')

Unnamed: 0,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,22,82,24,18.8,2.5,4.7,0.539,0.0,...,0.0,0.000,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
1,Byron Scott,SG,24,76,13,28.8,6.7,13.0,0.513,0.3,...,0.0,0.000,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
2,James Worthy,SF,24,75,0,32.7,8.4,14.5,0.579,0.0,...,780.0,0.009,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
3,Jerome Henderson,C,26,1,60,3.0,2.0,3.0,0.667,0.0,...,0.0,0.000,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
4,Kareem Abdul-Jabbar,C,38,79,7,33.3,9.6,16.9,0.564,0.0,...,780.0,0.173,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18775,Spencer Hawes,PF,28,54,1,14.8,2.5,5.1,0.484,0.5,...,0.0,0.000,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
18776,Steve Novak,PF,33,8,0,2.8,0.3,0.9,0.286,0.1,...,0.0,0.000,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
18777,Terrence Jones,PF,25,54,12,23.5,4.3,9.1,0.470,0.4,...,0.0,0.000,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
18778,Thon Maker,C,19,57,34,9.9,1.5,3.2,0.459,0.5,...,0.0,0.000,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
