### Now that we have a fully cleaned dataframe, we can start building out our model

In [109]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [110]:
"""Imported functions from ml_metrics library"""

def apk(actual, predicted, k=10):
    """
    Average precision at k for a single query.

    Only the first ``k`` entries of ``predicted`` are scored. A prediction
    counts as a hit when it appears in ``actual`` and has not already
    appeared earlier in the (truncated) prediction list.

    Parameters
    ----------
    actual : list
             The relevant items (order is irrelevant)
    predicted : list
                The ranked predictions (order is significant)
    k : int, optional
        Cut-off: how many predictions are considered

    Returns
    -------
    score : double
            Sum of precision-at-hit values divided by ``min(len(actual), k)``,
            or 0.0 when ``actual`` is empty

    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for position, item in enumerate(top_k, start=1):
        if item not in actual:
            continue
        # A repeated prediction must not be rewarded a second time.
        if item in top_k[:position - 1]:
            continue
        hits += 1.0
        precision_sum += hits / position

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over several queries.

    Pairs up the entries of ``actual`` and ``predicted`` positionally, scores
    each pair with ``apk`` and averages the per-query scores.

    Parameters
    ----------
    actual : list
             One list of relevant items per query
             (order inside each list is irrelevant)
    predicted : list
                One ranked list of predictions per query
                (order inside each list is significant)
    k : int, optional
        Cut-off passed through to ``apk``

    Returns
    -------
    score : double
            Mean of the per-query average precision values

    """
    per_query_scores = []
    for relevant_items, ranked_items in zip(actual, predicted):
        per_query_scores.append(apk(relevant_items, ranked_items, k))
    return np.mean(per_query_scores)

In [111]:
# Load the cleaned player stats / MVP voting table from disk into mvp_df.
mvp_df = pd.read_csv("player_mvp_stats.csv")

In [112]:
# Drop the previous index column ("Unnamed: 0") and "Tm", which is redundant
# with "Team" (the Hornets also appear under two different abbreviations).
# errors="ignore" keeps this cell idempotent: because it reassigns mvp_df from
# itself, re-running it would otherwise raise KeyError on the already-dropped columns.
mvp_df = mvp_df.drop(columns=["Unnamed: 0", "Tm"], errors="ignore")

In [113]:
# Seasons covered by the analysis: 2000 through 2023 inclusive.
years = [*range(2000, 2024)]

In [114]:
# Running count of MVP awards a player had already won before each season.
# Everyone starts at zero; the column is rebuilt from scratch on every run.
# NOTE(review): only seasons listed in `years` are scanned, so any award won
# outside that range is not counted — confirm this is acceptable.
mvp_df["Previous MVP's Won"] = 0
for year in years:
    # The season's winner is the player with the highest "Share" that year.
    year_winner = mvp_df[mvp_df["Year"] == year].sort_values("Share", ascending=False).iloc[:1]["Player"].item()
    # Rows for that same player in strictly later seasons.
    years_after_winning_for_player = mvp_df[(mvp_df["Player"] == year_winner) & (mvp_df["Year"] > year)]
    years_after_index = years_after_winning_for_player.index.values.tolist()
    # Credit each of those later rows with one more prior award.
    mvp_df.loc[years_after_index, ["Previous MVP's Won"]] += 1

In [116]:
# Sanity check: inspect one player's rows to confirm the running award count.
nash = mvp_df.loc[mvp_df["Player"] == "Steve Nash"]
nash.loc[:, ["Player", "Year", "Previous MVP's Won"]]

Unnamed: 0,Player,Year,Previous MVP's Won
317,Steve Nash,2011,2
1770,Steve Nash,2002,0
1786,Steve Nash,2003,0
2935,Steve Nash,2008,2
2948,Steve Nash,2009,2
4373,Steve Nash,2005,0
4391,Steve Nash,2006,1
4406,Steve Nash,2007,2
4419,Steve Nash,2010,2
6078,Steve Nash,2013,2


In [5]:
# Feature columns fed to the models below.
# Qualitative columns such as Team, as well as Year, are deliberately left out.
# NOTE(review): "Previous MVP's Won" is not in this list, so models trained on
# `predictors` never see it — confirm whether that is intentional.
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP', 'W', 'L', 'W/L%', 'GB',
       'PS/G', 'PA/G', 'SRS']

In [6]:
# Hold out the 2023 season for testing; train on every season before it.
train = mvp_df.loc[mvp_df["Year"].lt(2023)]
test = mvp_df.loc[mvp_df["Year"].eq(2023)]

In [7]:
# Feature matrix and target ("Share") for each side of the split.
X_train, y_train = train[predictors], train["Share"]
X_test, y_test = test[predictors], test["Share"]

In [8]:
# First model: Ridge regression fit on the training seasons,
# then used to predict MVP share for the held-out season.
reg = Ridge(alpha=0.1).fit(X_train, y_train)
ridge_predictions = reg.predict(X_test)

In [9]:
# Wrap the raw prediction array in a one-column frame that shares the test
# rows' index, so it can be lined up with the player names later.
ridge_df = pd.DataFrame(
    {"MVP Share Predictions": ridge_predictions},
    index=X_test.index,
)
ridge_df

Unnamed: 0,MVP Share Predictions
48,0.017688
49,0.022474
50,0.029433
51,0.229242
52,-0.004037
...,...
11431,-0.012121
11432,0.007565
11433,-0.007648
11434,-0.018394


In [10]:
# Put actual share and predicted share side by side, then show the 20 players
# the model rates highest.
ridge_combination = pd.concat(
    [test.loc[:, ["Player", "Share"]], ridge_df],
    axis="columns",
)
ridge_combination.sort_values(by="MVP Share Predictions", ascending=False).head(20)

Unnamed: 0,Player,Share,MVP Share Predictions
51,Giannis Antetokounmpo,0.606,0.229242
559,Nikola Jokić,0.674,0.227922
143,Luka Dončić,0.01,0.196454
11253,Joel Embiid,0.915,0.161215
3426,Domantas Sabonis,0.027,0.13535
2271,Jayson Tatum,0.28,0.119459
1072,Shai Gilgeous-Alexander,0.046,0.112464
11429,Julius Randle,0.0,0.112139
9504,Ja Morant,0.001,0.109441
6727,LeBron James,0.0,0.108317


In [11]:
# generalize this later for all types of models
# outputs two new columns: Rank and Predicted Rank

def convert_to_ranks(df, actual_share_col, predicted_share_col):
    """
    Return a copy of ``df`` with "Rank" and "Predicted Rank" columns added.

    Both ranks are computed in descending order of the given share column
    (largest share -> rank 1) using the "min" tie rule, so tied rows share the
    lowest rank of their group. The result is sorted by "Rank"; ``df`` itself
    is left untouched.
    """
    def descending_rank(column):
        return df[column].rank(method="min", ascending=False)

    ranked = df.assign(**{
        "Rank": descending_rank(actual_share_col),
        "Predicted Rank": descending_rank(predicted_share_col),
    })
    return ranked.sort_values(by="Rank")

In [12]:
# create an error metric function that is based on ranking since
# most players' MVP share is 0
# assumption is table looks like ridge_combination
# with cols: Player, Share, MVP Share Predictions
# outdated not using the convert_to_ranks function but not useful
def find_ap(df, actual_share_col, predicted_share_col, num):
    """
    Total absolute rank error for the top ``num`` players by actual share.

    Players are ordered once by ``actual_share_col`` and once by
    ``predicted_share_col`` (both descending, positions starting at 1). For
    each of the first ``num`` players in the actual ordering, the absolute
    difference between their actual and predicted positions is added up.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Player" column plus the two share columns.
    actual_share_col : str
        Column holding the true share.
    predicted_share_col : str
        Column holding the predicted share.
    num : int
        How many of the top actual finishers to score.

    Returns
    -------
    int
        Sum of absolute position differences (0 when there are no rows).

    Notes
    -----
    ``df`` is never modified and no column is assigned onto a sliced frame,
    which avoids pandas' SettingWithCopyWarning. If a player name occurs more
    than once, the first (best) predicted position for that name is used.
    """
    actual_top = df.sort_values(actual_share_col, ascending=False)["Player"].head(num)
    predicted_order = df.sort_values(predicted_share_col, ascending=False)["Player"]

    # One pass to build name -> predicted position; setdefault keeps the first
    # occurrence when a name is duplicated.
    predicted_position = {}
    for position, player in enumerate(predicted_order, start=1):
        predicted_position.setdefault(player, position)

    return sum(
        abs(actual_position - predicted_position[player])
        for actual_position, player in enumerate(actual_top, start=1)
    )

In [13]:
"""
Finding mean square error

This will give you a measure of the overall discrepancy or difference in ranks between the two for the top 10 MVP
candidates.

Input is a ranked dataframe after applying the convert_to_ranks function.
"""

def get_mse_top_10(ranked_df, num=10):
    """
    Mean squared error between "Rank" and "Predicted Rank" over the first
    ``num`` rows of ``ranked_df``.

    Parameters
    ----------
    ranked_df : pandas.DataFrame
        Frame with "Rank" and "Predicted Rank" columns, expected to be the
        output of ``convert_to_ranks`` (i.e. already sorted by "Rank") so that
        the first ``num`` rows are the top actual finishers.
    num : int, optional
        Number of leading rows to score (default 10).

    Returns
    -------
    float
        The mean squared rank error.
    """
    top_n_df = ranked_df.head(num)
    # The ``squared`` keyword is deprecated in recent scikit-learn releases;
    # plain MSE is the default, so the argument is simply omitted.
    return mean_squared_error(
        top_n_df["Rank"].to_numpy(),
        top_n_df["Predicted Rank"].to_numpy(),
    )

In [14]:
"""
Determine the AP@K (Average Precision at K) for your predicted ranking. 

Interested in top n since those are the MVP finalists. AP@K will provide an assessment
of the precision and correctness of your predicted ranking for the top candidates.

Input is a ranked dataframe after applying the convert_to_ranks function.
"""

def get_apk_n(ranked_df, k):
    """
    Average precision at ``k`` for the predicted ranking in ``ranked_df``.

    Parameters
    ----------
    ranked_df : pandas.DataFrame
        Frame with "Player" and "Predicted Rank" columns, expected to be the
        output of ``convert_to_ranks`` (already sorted by actual "Rank"), so
        its first ``k`` rows are the actual top-k players.
    k : int
        Cut-off used both to pick the top players and to score them.

    Returns
    -------
    float
        AP@k of the predicted top-k players against the actual top-k players.
    """
    actual_top_k = ranked_df["Player"].head(k).tolist()
    predicted_top_k = ranked_df.sort_values("Predicted Rank")["Player"].head(k).tolist()
    # Pass k through: a hard-coded cut-off here would silently score any
    # request with k > 5 as AP@5.
    return apk(actual_top_k, predicted_top_k, k)

In [15]:
""" 
Combines both of our metrics together to provide a clean look at how our model does 
in these two important criteria. """

def evaluate_model(df, actual_share_col, predicted_share_col, k):
    """
    Rank the rows of ``df`` and score the predicted ranking on both metrics.

    Returns a two-element list: the top-10 rank MSE from ``get_mse_top_10``
    followed by the average precision at ``k`` from ``get_apk_n``.
    """
    ranked = convert_to_ranks(df, actual_share_col, predicted_share_col)
    rank_mse = get_mse_top_10(ranked)
    precision_at_k = get_apk_n(ranked, k)
    return [rank_mse, precision_at_k]
    

In [16]:
def print_model_results(mse, apk, k):
    """Print the two evaluation metrics, one labelled line each."""
    report_lines = (
        ("Mean Squared Error:", mse),
        ("Average Precision at", str(k) + ":", apk),
    )
    for parts in report_lines:
        print(*parts)

In [17]:
# Score the single-season Ridge predictions on both metrics (top-5 precision).
naive_results = evaluate_model(ridge_combination, "Share", "MVP Share Predictions", 5)
print_model_results(*naive_results, 5)

Mean Squared Error: 13.4
Average Precision at 5: 0.55


## Backtesting

In [19]:
# Following along, we make a function to find the biggest differences each year
# inside code we run function of convert_to_ranks
def difference_in_rank(ranked_df):
    """
    Add a "Difference" column (actual "Rank" minus "Predicted Rank").

    The column is written onto ``ranked_df`` itself, and that same object is
    returned. Positive values mean the model placed the player higher than
    their actual finish.
    """
    actual_rank, predicted_rank = ranked_df["Rank"], ranked_df["Predicted Rank"]
    ranked_df["Difference"] = actual_rank.sub(predicted_rank)
    return ranked_df

In [53]:
# Scratch check: add a constant placeholder column, then view the ranked frame.
# NOTE(review): the assignment mutates ridge_combination in place, and the
# column name ("Previous MVP's") differs from mvp_df's "Previous MVP's Won" —
# confirm whether this cell is still needed.
ridge_combination["Previous MVP's"] = 0
convert_to_ranks(ridge_combination, "Share", "MVP Share Predictions")


Unnamed: 0,Player,Share,MVP Share Predictions,Previous MVP's,Rank,Predicted Rank
11253,Joel Embiid,0.915,0.161215,0,1.0,4.0
559,Nikola Jokić,0.674,0.227922,0,2.0,2.0
51,Giannis Antetokounmpo,0.606,0.229242,0,3.0,1.0
2271,Jayson Tatum,0.280,0.119459,0,4.0,6.0
1072,Shai Gilgeous-Alexander,0.046,0.112464,0,5.0,7.0
...,...,...,...,...,...,...
3295,Patrick Williams,0.000,-0.036102,0,14.0,520.0
3294,Patrick Beverley,0.000,-0.026887,0,14.0,502.0
3293,Nikola Vučević,0.000,0.052492,0,14.0,27.0
3430,Keon Ellis,0.000,0.015045,0,14.0,98.0


In [20]:
"""
Creating a backtesting function to run all the code we did and put it in a loop
"""
def backtest(orig_df, model_bt, timeframe, predictors, normalizing=False,
             min_train_years=5, k=5):
    """
    Walk-forward backtest: for every evaluated season, fit on all earlier
    seasons and score the predictions made for that season.

    Parameters
    ----------
    orig_df : pandas.DataFrame
        Must contain "Year", "Player", "Share" and every column named in
        ``predictors``. It is copied up front and never modified.
    model_bt : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refit in place once per evaluated season, so after the call it holds
        the fit produced for the last evaluated season.
    timeframe : sequence
        Candidate seasons (assumed to be in chronological order). The first
        ``min_train_years`` entries are used for training only.
    predictors : list of str
        Feature column names.
    normalizing : bool, optional
        When True, every predictor is z-scored within its own season
        (sample standard deviation, ``ddof=1``) before any model is fit.
    min_train_years : int, optional
        Number of leading entries of ``timeframe`` that are skipped as
        evaluation targets (default 5, i.e. 2005 is the first scored season
        when ``timeframe`` starts at 2000).
    k : int, optional
        Cut-off for the average-precision metric (default 5).

    Returns
    -------
    tuple
        ``(mean MSE, per-season MSE list, mean AP@k, per-season AP@k list,
        concatenated per-season prediction frames)``. Each prediction frame
        carries "Rank", "Predicted Rank", "Difference" and "Year" columns.

    Raises
    ------
    ValueError
        If ``timeframe`` leaves no season to evaluate after skipping the first
        ``min_train_years`` entries.
    """
    eval_years = timeframe[min_train_years:]
    if len(eval_years) == 0:
        # Fail with a clear message instead of a ZeroDivisionError when the
        # averages are computed at the end.
        raise ValueError(
            "timeframe must contain more than min_train_years entries to backtest"
        )

    all_predictions = []  # one ranked prediction frame per evaluated season
    mse_ar = []
    apk_ar = []

    df = orig_df.copy(deep=True)

    if normalizing:  # for models where feature scale matters
        predictors_plus_year = predictors.copy()
        predictors_plus_year.append("Year")
        # "Year" is only the grouping key; transform returns just the predictor columns.
        df[predictors] = df[predictors_plus_year].groupby("Year").transform(
            lambda x: (x - np.mean(x, axis=0)) / np.std(x, ddof=1))

    for year in eval_years:
        train_bt = df[df["Year"] < year]
        test_bt = df[df["Year"] == year]

        X_train_bt = train_bt[predictors]
        y_train_bt = train_bt["Share"]
        X_test_bt = test_bt[predictors]

        model_bt.fit(X_train_bt, y_train_bt)
        model_bt_predictions = model_bt.predict(X_test_bt)

        # Keep the test rows' index so predictions line up with the players.
        model_bt_df = pd.DataFrame(model_bt_predictions,
                                   columns=["MVP Share Predictions"],
                                   index=X_test_bt.index)

        model_bt_combination = pd.concat([test_bt[["Player", "Share"]],
                                          model_bt_df], axis=1)

        combined_bt_df = convert_to_ranks(model_bt_combination, "Share", "MVP Share Predictions")
        # difference_in_rank writes onto combined_bt_df and returns it, so
        # with_differences and combined_bt_df are the same object.
        with_differences = difference_in_rank(combined_bt_df)
        with_differences["Year"] = year

        all_predictions.append(with_differences)

        mse_ar.append(get_mse_top_10(combined_bt_df))
        apk_ar.append(get_apk_n(combined_bt_df, k))

    return ((sum(mse_ar) / len(mse_ar)), mse_ar, (sum(apk_ar) / len(apk_ar)), apk_ar, pd.concat(all_predictions))

In [21]:
# Ridge backtest on per-season normalized predictors.
# backtest() calls reg.fit() for every evaluated season, so `reg` is left
# holding the fit from the final season of this run.
normalized_avg_mse, normalized_mse_ar, normalized_avg_apk, normalized_apk_ar, normalized_all_predictions = backtest(mvp_df, reg, years, predictors, normalizing=True)

In [22]:
# Ridge backtest on the raw (un-normalized) predictors.
# `reg` is refit in place by backtest(), so any later read of reg.coef_
# reflects whichever backtest call ran most recently.
avg_mse, mse_ar, avg_apk, apk_ar, all_predictions = backtest(mvp_df, reg, years, predictors)

In [23]:
# Among actual top-10 finishers in the normalized Ridge backtest, show the 15
# rows with the largest "Difference" (actual rank minus predicted rank), i.e.
# the players the model placed furthest above their real finish.
(
    normalized_all_predictions
    .loc[normalized_all_predictions["Rank"].le(10)]
    .sort_values(by="Difference", ascending=False)
    .head(15)
)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
7275,Tim Duncan,0.002,0.131269,10.0,5.0,5.0,2009
6085,James Harden,0.007,0.180261,9.0,4.0,5.0,2016
10757,Dwight Howard,0.011,0.219161,7.0,2.0,5.0,2012
2500,Blake Griffin,0.004,0.160822,10.0,5.0,5.0,2011
6709,LeBron James,0.001,0.144871,10.0,5.0,5.0,2022
8513,Tim Duncan,0.026,0.175868,8.0,3.0,5.0,2006
143,Luka Dončić,0.01,0.210401,8.0,3.0,5.0,2023
4729,LeBron James,0.142,0.218189,5.0,1.0,4.0,2007
6365,Damian Lillard,0.023,0.201897,8.0,4.0,4.0,2020
9194,Nikola Jokić,0.018,0.14779,9.0,6.0,3.0,2020


In [24]:
# Pair each Ridge coefficient (column 0) with its predictor name (column 1),
# largest coefficient first. The values come from the most recent fit of `reg`.
pd.DataFrame({0: reg.coef_, 1: predictors}).sort_values(by=0, ascending=False)

Unnamed: 0,0,1
13,0.142618,eFG%
48,0.080804,W/L%
18,0.034384,DRB
45,0.027272,VORP
15,0.024229,FTA
16,0.020869,FT%
17,0.019185,ORB
41,0.018951,WS/48
23,0.018599,TOV
4,0.0082,FG


## Trying Random Forest

In [25]:
# Random forest with a fixed random_state so repeated runs build the same trees.
rf = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=5,
    random_state=42,
)

In [26]:
# Random forest backtest on the raw predictors.
avg_mse_rf, mse_ar_rf, avg_apk_rf, apk_ar_rf, all_predictions_rf = backtest(
    mvp_df, rf, years, predictors)

In [28]:
# Random forest backtest on per-season normalized predictors (same `rf`
# instance as the raw run; backtest() refits it for every season).
avg_mse_rf_n, mse_ar_rf_n, avg_apk_rf_n, apk_ar_rf_n, all_predictions_rf_n = backtest(
    mvp_df, rf, years, predictors, normalizing=True)

In [30]:
# Average top-10 rank MSE, in order: random forest (raw), random forest
# (normalized), Ridge (raw). Lower is better.
avg_mse_rf, avg_mse_rf_n, avg_mse

(68.08947368421055, 78.3, 198.50000000000006)

In [31]:
# Average precision at 5, in order: random forest (raw), random forest
# (normalized), Ridge (raw). Higher is better.
avg_apk_rf, avg_apk_rf_n, avg_apk

(0.672982456140351, 0.6380701754385965, 0.5857894736842105)

In [32]:
# Random forest results for the actual top-5 finishers of the 2023 season.
all_predictions_rf.loc[
    all_predictions_rf["Rank"].le(5) & all_predictions_rf["Year"].eq(2023)
]

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
11253,Joel Embiid,0.915,0.565313,1.0,2.0,-1.0,2023
559,Nikola Jokić,0.674,0.708653,2.0,1.0,1.0,2023
51,Giannis Antetokounmpo,0.606,0.3622,3.0,4.0,-1.0,2023
2271,Jayson Tatum,0.28,0.247537,4.0,5.0,-1.0,2023
1072,Shai Gilgeous-Alexander,0.046,0.211718,5.0,6.0,-1.0,2023


In [46]:
# The 15 players the random forest ranked highest for the 2019 season.
(
    all_predictions_rf
    .loc[all_predictions_rf["Year"].eq(2019)]
    .sort_values(by="Predicted Rank")
    .head(15)
)

Unnamed: 0,Player,Share,MVP Share Predictions,Rank,Predicted Rank,Difference,Year
8032,James Harden,0.768,0.668332,2.0,1.0,1.0,2019
9268,Giannis Antetokounmpo,0.932,0.641124,1.0,2.0,-1.0,2019
9529,Nikola Jokić,0.21,0.17767,4.0,3.0,1.0,2019
3681,Stephen Curry,0.173,0.175869,5.0,4.0,1.0,2019
3218,LeBron James,0.001,0.164379,11.0,5.0,6.0,2019
6134,Anthony Davis,0.0,0.139355,13.0,6.0,7.0,2019
10689,Kawhi Leonard,0.013,0.133182,9.0,7.0,2.0,2019
3675,Kevin Durant,0.025,0.132218,8.0,8.0,0.0,2019
11241,Rudy Gobert,0.001,0.106314,11.0,9.0,2.0,2019
1251,Paul George,0.352,0.083944,3.0,10.0,-7.0,2019
