# testing PCA and then Regression

## Table of Contents
* [Import data/modules](#import-data)
* [Exploratory Data Analysis](#eda)
* [Preprocessing](#preprocessing)
* [Modeling](#modeling)
* [Forecasting 2022 MVP](#forecasting)

## Import data/modules <a class="anchor" id="import-data"></a>


In [3]:
#essentials
import os
import numpy as np
import pandas as pd
import dataframe_image as dfi
from tqdm import tqdm
import pickle
import warnings
warnings.filterwarnings('ignore')

#visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#tools/metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
import shap

#modeling
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

#pandas show all columns
pd.set_option('display.max_columns', None)

#%load_ext autoreload
#%autoreload 2
#%run ./__init__

### Read Data

In [4]:
# Location of the prepared dataset. The default is the original author's local path;
# set the CAPSTONE_DATA_PATH environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get(
    "CAPSTONE_DATA_PATH",
    "D:\\Github\\Capstone-project\\Data folder\\final_dataset.csv",
)
master_table = pd.read_csv(DATA_PATH)

In [5]:
master_table.head()

Unnamed: 0,Unnamed: 0_x,Player Name,season,Team,conf_abbr,games,games_started,mp_per_g,fg_per_g,fga_per_g,fg_pct,fg2_per_g,fg2a_per_g,fg2_pct,fg3_per_g,fg3a_per_g,fg3_pct,ft_per_g,fta_per_g,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g,sos,mp,fg,fga,fg2,fg2a,fg2_pct.1,fg3,fg3a,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg_per_min,fga_per_min,fg2_per_min,fg2a_per_min,fg3_per_min,fg3a_per_min,ft_per_min,fta_per_min,trb_per_min,ast_per_min,stl_per_min,blk_per_min,tov_per_min,pf_per_min,pts_per_min,fg_per_poss,fga_per_poss,fg2_per_poss,fg2a_per_poss,fg3_per_poss,fg3a_per_poss,ft_per_poss,fta_per_poss,trb_per_poss,ast_per_poss,stl_per_poss,blk_per_poss,tov_per_poss,pf_per_poss,pts_per_poss,off_rtg,def_rtg,per,ts_pct,efg_pct,fg3a_per_fga_pct,fta_per_fga_pct,pprod,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws,ws,ws_per_40,obpm,dbpm,bpm,year,Ht,Wt,Yr,G,S,%Min,ORtg,%Poss,%Shots,eFG%,TS%,OR%,DR%,ARate,TORate,Blk%,Stl%,FC/40,FD/40,FTRate,FTM-A,Pct,2PM-A,Pct.1,3PM-A,Pct.2,%Pct,%2PM-A,%Pct.1
0,3.0,chris-clemons-2,2018-19,CAMPBELL,Big South,33,33,36.6,9.2,20.6,0.448,5.0,8.8,0.569,4.2,11.8,0.357,7.5,8.6,0.869,0.8,4.3,5.1,2.8,1.5,0.3,2.5,1.9,30.1,-4.39,1208,304.0,679.0,165.0,290.0,0.569,139.0,389.0,246.0,283.0,25.0,142.0,167,94.0,49.0,11.0,84.0,63.0,993.0,10.1,22.5,5.5,9.6,4.6,12.9,8.1,9.4,5.5,3.1,1.6,0.4,2.8,2.1,32.9,14.8,33.1,8.0,14.1,6.8,18.9,12.0,13.8,8.1,4.6,2.4,0.5,4.1,3.1,48.3,120.1,104.4,33.0,0.61,0.55,0.573,0.417,888.0,2.4,14.5,8.3,19.3,2.4,1.1,9.4,39.0,6.1,1.4,7.5,0.25,9.3,-1.6,7.7,2019,69,180.0,Sr,30,30.0,93.9,115.6,37.5,39.4,53.6,59.9,2.5,14.0,19.4,10.8,1.0,2.5,2.2,7.6,42.5,233-270,.863 56,149-266,.560 383,128-370,0.346,0.863,149-266,0.56
1,8.0,antoine-davis-2,2018-19,DETROIT MERCY,Horizon,30,29,37.4,8.8,21.9,0.4,4.4,10.3,0.423,4.4,11.6,0.38,4.2,4.9,0.857,0.7,2.4,3.1,3.6,0.9,0.0,3.3,2.3,26.1,-0.36,1122,263.0,657.0,131.0,310.0,0.423,132.0,347.0,126.0,147.0,22.0,72.0,94,107.0,28.0,0.0,99.0,70.0,784.0,9.4,23.4,4.7,11.1,4.7,12.4,4.5,5.2,3.4,3.8,1.0,0.0,3.5,2.5,28.0,13.3,33.3,6.6,15.7,6.7,17.6,6.4,7.4,4.8,5.4,1.4,0.0,5.0,3.5,39.7,109.0,118.9,22.4,0.539,0.501,0.528,0.224,738.0,2.1,7.5,4.7,24.0,1.4,0.0,12.0,36.4,3.4,-0.2,3.1,0.112,5.8,-3.6,2.2,2019,73,170.0,Fr,30,30.0,90.4,107.7,34.8,38.8,50.1,53.9,2.1,7.5,24.0,14.5,0.0,1.4,2.5,5.0,22.4,126-147,.857 69,131-310,0.423,132-347,0.38,0.857,131-310,0.423
2,9.0,antoine-davis-2,2019-20,DETROIT MERCY,Horizon,30,30,36.7,7.9,20.8,0.38,4.5,10.4,0.437,3.4,10.4,0.324,5.1,5.7,0.901,0.3,2.8,3.1,4.5,1.7,0.1,4.5,2.5,24.3,-1.9,1102,237.0,623.0,136.0,311.0,0.437,101.0,312.0,154.0,171.0,9.0,85.0,94,134.0,52.0,3.0,136.0,75.0,729.0,8.6,22.6,4.9,11.3,3.7,11.3,5.6,6.2,3.4,4.9,1.9,0.1,4.9,2.7,26.5,12.2,32.0,7.0,16.0,5.2,16.0,7.9,8.8,4.8,6.9,2.7,0.2,7.0,3.9,37.5,101.8,112.5,21.0,0.518,0.461,0.501,0.274,702.0,0.9,8.8,4.7,33.2,2.7,0.3,16.2,37.2,2.5,0.2,2.7,0.097,3.4,-2.4,1.1,2020,73,160.0,So,30,30.0,88.6,100.0,36.1,38.0,46.1,51.8,0.9,8.8,33.1,19.4,0.3,2.7,2.7,5.6,27.4,154-171,.901 16,136-311,0.437,101-312,0.324,0.901,136-311,0.437
3,10.0,antoine-davis-2,2020-21,DETROIT MERCY,Horizon,22,22,38.5,8.1,19.1,0.424,4.3,9.0,0.482,3.8,10.1,0.372,4.0,4.4,0.917,0.2,2.7,2.9,4.8,1.5,0.0,3.3,1.6,24.0,-6.21,846,178.0,420.0,95.0,197.0,0.482,83.0,223.0,88.0,96.0,4.0,60.0,64,105.0,32.0,0.0,72.0,36.0,527.0,8.4,19.9,4.5,9.3,3.9,10.5,4.2,4.5,3.0,5.0,1.5,0.0,3.4,1.7,24.9,12.3,29.1,6.6,13.7,5.8,15.5,6.1,6.7,4.4,7.3,2.2,0.0,5.0,2.5,36.5,112.0,111.7,23.5,0.566,0.523,0.531,0.229,509.0,0.6,8.6,4.6,26.7,2.2,0.0,13.4,33.1,3.0,0.3,3.3,0.155,3.9,-3.1,0.9,2021,73,165.0,Jr,22,22.0,95.6,110.5,32.0,34.1,52.3,56.6,0.6,8.5,26.7,15.6,0.0,2.2,1.7,4.7,22.9,88-96,.917 7,95-197,0.482,83-223,0.372,0.917,95-197,0.482
4,15.0,carsen-edwards-1,2018-19,PURDUE,Big Ten,36,36,35.4,7.7,19.5,0.394,3.9,9.0,0.44,3.8,10.6,0.355,5.1,6.1,0.837,0.4,3.2,3.6,2.9,1.3,0.3,3.1,2.0,24.3,11.99,1275,277.0,703.0,142.0,323.0,0.44,135.0,380.0,185.0,221.0,14.0,116.0,130,104.0,48.0,10.0,113.0,73.0,874.0,8.7,22.1,4.5,10.1,4.2,11.9,5.8,6.9,4.1,3.3,1.5,0.3,3.5,2.3,27.4,13.1,33.4,6.7,15.3,6.4,18.0,8.8,10.5,6.2,4.9,2.3,0.5,5.4,3.5,41.5,109.9,103.4,23.2,0.541,0.49,0.541,0.314,787.0,1.3,10.6,6.0,18.3,2.3,1.0,12.3,37.3,4.0,1.6,5.6,0.176,7.5,1.0,8.5,2019,73,200.0,Jr,36,36.0,87.3,108.0,34.7,37.5,49.0,54.1,1.3,10.7,18.3,15.5,1.0,2.3,2.3,5.9,31.4,185-221,.837 131,142-323,0.44,135-380,0.355,0.837,142-323,0.44


## Exploratory Data Analysis <a class="anchor" id="eda"></a>


In [None]:
master_table.info()

In [None]:
print(len(list(master_table.columns)))
print(len(list(master_table.columns[(master_table. dtypes == 'float64') | (master_table. dtypes == 'int64') ])))
print(len(list(master_table.columns[(master_table. dtypes != 'float64') & (master_table. dtypes != 'int64') ])))

In [None]:
columns_to_use = list(master_table.columns[(master_table. dtypes == 'float64') | (master_table. dtypes == 'int64') ])
print(columns_to_use)

In [None]:
columns_not_used = list(master_table.columns[(master_table. dtypes != 'float64') & (master_table. dtypes != 'int64') ])
print(columns_not_used)

In [None]:
master_table.describe()

In [None]:

master_table['year'] = master_table['year'].astype(int)
list(master_table['year'].unique())

In [None]:
master_table['prev_year'] = master_table['year']-1
master_table[['Player Name', 'season', 'Team','year', 'prev_year','ows', 'dws']]

In [None]:
filtered_df = master_table[master_table['games']<40]
df_to_use = filtered_df.dropna()

In [None]:
print (master_table.shape)
print(filtered_df.shape)
print(df_to_use.shape)

In [None]:
# Self-join that pairs each player-season with the same player's following season.
# Left rows (suffix _x) are matched on `year`; right rows (suffix _y) are matched on
# `prev_year` (= year - 1), so every _y column describes season year_x + 1.
# The inner join keeps only players that have rows for both seasons.
merged_df = pd.merge(df_to_use, df_to_use , how = 'inner',
                     left_on = ['Player Name','year'] ,
                     right_on = ['Player Name','prev_year'])

In [None]:
print(merged_df.shape)

In [None]:
#master_table = merged_df
# Regression target: product of the following-season (_y) '%Min' and 'ORtg' columns.
# NOTE(review): '%Min_y' and 'ORtg_y' (and the other _y columns) are also present in the
# column list used to build the modeling table, and only 'offense' is removed from the
# features later on, so the target can be reconstructed from the inputs. Confirm whether
# the _y columns should be excluded before modeling.
merged_df['offense'] = merged_df['%Min_y']*merged_df['ORtg_y']

In [None]:
merged_df[['%Min_y','ORtg_y', 'offense' ]].head(3)

In [None]:
columns_to_use = list(merged_df.columns[(merged_df. dtypes == 'float64') | (merged_df. dtypes == 'int64') ])
print(columns_to_use)

In [None]:
columns = ['Unnamed: 0_x_x', 'games_x', 'games_started_x', 'mp_per_g_x', 'fg_per_g_x', 'fga_per_g_x', 'fg2_per_g_x', 'fg2a_per_g_x', 'fg3_per_g_x', 'fg3a_per_g_x', 'ft_per_g_x', 'fta_per_g_x', 'orb_per_g_x', 'drb_per_g_x', 'trb_per_g_x', 'ast_per_g_x', 'stl_per_g_x', 'blk_per_g_x', 'tov_per_g_x', 'pf_per_g_x', 'pts_per_g_x', 'sos_x', 'mp_x', 'fg_x', 'fga_x', 'fg2_x', 'fg2a_x', 'fg3_x', 'fg3a_x', 'ft_x', 'fta_x', 'orb_x', 'drb_x', 'trb_x', 'ast_x', 'stl_x', 'blk_x', 'tov_x', 'pf_x', 'pts_x', 'fg_per_min_x', 'fga_per_min_x', 'fg2_per_min_x', 'fg2a_per_min_x', 'fg3_per_min_x', 'fg3a_per_min_x', 'ft_per_min_x', 'fta_per_min_x', 'trb_per_min_x', 'ast_per_min_x', 'stl_per_min_x', 'blk_per_min_x', 'tov_per_min_x', 'pf_per_min_x', 'pts_per_min_x', 'fg_per_poss_x', 'fga_per_poss_x', 'fg2_per_poss_x', 'fg2a_per_poss_x', 'fg3_per_poss_x', 'fg3a_per_poss_x', 'ft_per_poss_x', 'fta_per_poss_x', 'trb_per_poss_x', 'ast_per_poss_x', 'stl_per_poss_x', 'blk_per_poss_x', 'tov_per_poss_x', 'pf_per_poss_x', 'pts_per_poss_x', 'off_rtg_x', 'def_rtg_x', 'per_x', 'ts_pct_x', 'efg_pct_x', 'fg3a_per_fga_pct_x', 'fta_per_fga_pct_x', 'pprod_x', 'orb_pct_x', 'drb_pct_x', 'trb_pct_x', 'ast_pct_x', 'stl_pct_x', 'blk_pct_x', 'tov_pct_x', 'usg_pct_x', 'ows_x', 'dws_x', 'ws_x', 'ws_per_40_x', 'obpm_x', 'dbpm_x', 'bpm_x', 'Ht_x', 'Wt_x', 'G_x', 'S_x', '%Min_x', 'ORtg_x', '%Poss_x', '%Shots_x', 'eFG%_x', 'TS%_x', 'OR%_x', 'DR%_x', 'ARate_x', 'TORate_x', 'Blk%_x', 'Stl%_x', 'FC/40_x', 'FD/40_x', 'FTRate_x', 'Pct.2_x', '%Pct_x', '%Pct.1_x', 'Unnamed: 0_x_y', 'games_y', 'games_started_y', 'mp_per_g_y', 'fg_per_g_y', 'fga_per_g_y', 'fg2_per_g_y', 'fg2a_per_g_y', 'fg3_per_g_y', 'fg3a_per_g_y', 'ft_per_g_y', 'fta_per_g_y', 'orb_per_g_y', 'drb_per_g_y', 'trb_per_g_y', 'ast_per_g_y', 'stl_per_g_y', 'blk_per_g_y', 'tov_per_g_y', 'pf_per_g_y', 'pts_per_g_y', 'sos_y', 'mp_y', 'fg_y', 'fga_y', 'fg2_y', 'fg2a_y', 'fg3_y', 'fg3a_y', 'ft_y', 'fta_y', 'orb_y', 'drb_y', 'trb_y', 'ast_y', 'stl_y', 'blk_y', 'tov_y', 
'pf_y', 'pts_y', 'fg_per_min_y', 'fga_per_min_y', 'fg2_per_min_y', 'fg2a_per_min_y', 'fg3_per_min_y', 'fg3a_per_min_y', 'ft_per_min_y', 'fta_per_min_y', 'trb_per_min_y', 'ast_per_min_y', 'stl_per_min_y', 'blk_per_min_y', 'tov_per_min_y', 'pf_per_min_y', 'pts_per_min_y', 'fg_per_poss_y', 'fga_per_poss_y', 'fg2_per_poss_y', 'fg2a_per_poss_y', 'fg3_per_poss_y', 'fg3a_per_poss_y', 'ft_per_poss_y', 'fta_per_poss_y', 'trb_per_poss_y', 'ast_per_poss_y', 'stl_per_poss_y', 'blk_per_poss_y', 'tov_per_poss_y', 'pf_per_poss_y', 'pts_per_poss_y', 'off_rtg_y', 'def_rtg_y', 'per_y', 'ts_pct_y', 'efg_pct_y', 'fg3a_per_fga_pct_y', 'fta_per_fga_pct_y', 'pprod_y', 'orb_pct_y', 'drb_pct_y', 'trb_pct_y', 'ast_pct_y', 'stl_pct_y', 'blk_pct_y', 'tov_pct_y', 'usg_pct_y', 'ows_y', 'dws_y', 'ws_y', 'ws_per_40_y', 'obpm_y', 'dbpm_y', 'bpm_y', 'Ht_y', 'Wt_y', 'G_y', 'S_y', '%Min_y', 'ORtg_y', '%Poss_y', '%Shots_y', 'eFG%_y', 'TS%_y', 'OR%_y', 'DR%_y', 'ARate_y', 'TORate_y', 'Blk%_y', 'Stl%_y', 'FC/40_y', 'FD/40_y', 'FTRate_y', 'Pct.2_y', '%Pct_y', '%Pct.1_y', 'offense']

In [None]:
master_table  = merged_df[columns]
print(master_table.shape)

In [None]:
#master_table.to_csv('D:\\Github\\Capstone-project\\Data folder\\zain_testing.csv', index = False)

In [None]:
master_table[['%Min_x','ORtg_x', 'offense' ]].head(3)

### Correlation Matrix

In [None]:
# pairwise correlations between every column of the modeling table
corr_matrix = master_table.corr()

# heat map of the lower triangle only: the mask is 1 on and above the diagonal,
# which hides those cells so each pair of features is drawn once
mask = np.triu(np.ones_like(corr_matrix))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(15, 15))
    ax = sns.heatmap(corr_matrix, mask=mask, vmax=.3, square=True, cmap="RdYlGn")


### Mutual Information

Mutual information describes relationships in terms of uncertainty. The mutual information (MI) between two quantities is a measure of the extent to which knowledge of one quantity reduces uncertainty about the other.

The least possible mutual information between quantities is 0.0. When MI is zero, the quantities are independent: neither can tell you anything about the other. Conversely, in theory there's no upper bound to what MI can be. In practice though values above 2.0 or so are uncommon. (Mutual information is a logarithmic quantity, so it increases very slowly.)

* MI can help you to understand the relative potential of a feature as a predictor of the target, considered by itself.
* It's possible for a feature to be very informative when interacting with other features, but not so informative all alone. MI can't detect interactions between features. It is a univariate metric.
* The actual usefulness of a feature depends on the model you use it with. A feature is only useful to the extent that its relationship with the target is one your model can learn. Just because a feature has a high MI score doesn't mean your model will be able to do anything with that information. You may need to transform the feature first to expose the association.



In [None]:
def calculate_ml_scores(df, target="offense", random_state=0):
    """Compute mutual-information scores of every feature against the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the target column and the candidate features. It is not modified.
    target : str, default "offense"
        Name of the column used as the regression target.
    random_state : int or None, default 0
        Seed forwarded to ``mutual_info_regression`` so repeated runs give the same
        scores. Pass ``None` to restore unseeded behaviour.

    Returns
    -------
    X : pandas.DataFrame
        Copy of ``df`` without the target column, with object columns integer-encoded.
    y : pandas.Series
        The target column.
    mi_scores : pandas.Series
        MI score per feature (name "MI Scores"), sorted from highest to lowest.
    """
    X = df.copy()
    y = X.pop(target)

    # Label encoding for categoricals
    for colname in X.select_dtypes("object"):
        X[colname], _ = X[colname].factorize()

    # Every feature is scored with the estimator's default (continuous) treatment, exactly
    # as before; the previously computed-but-unused `discrete_features` mask was removed.
    mi_scores = mutual_info_regression(X, y, random_state=random_state)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return X, y, mi_scores

In [None]:
'''
#drop columns for mutual information
to_drop_mi = ['Rank','Player','Age','year','Tm','team','First','Pts Won','Pts Max','WS','WS/48']
master_table_mi = master_table.copy()
master_table_mi.drop(to_drop_mi, axis=1, inplace=True)
'''

In [None]:
X, y, mi_scores = calculate_ml_scores(df=master_table)

In [None]:
def plot_mi_scores(scores, figsize):
    """Draw a horizontal bar chart of MI scores with the value printed beside each bar.

    scores  : pandas Series of scores indexed by feature name
    figsize : (width, height) passed to plt.subplots
    Bars are ordered ascending from the bottom of the chart.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))

    fig, ax = plt.subplots(figsize=figsize)
    ax.barh(positions, ordered)

    # label every bar with its score rounded to two decimals, nudged right of the bar end
    for row, score in enumerate(ordered):
        ax.text(score + 0.005, row, str(round(score, 2)))

    ax.set_yticks(positions)
    ax.set_yticklabels(list(ordered.index))
    ax.set_title("Mutual Information Scores")

plot_mi_scores(mi_scores, figsize=(14,11))

### Visualize significant features vs. MVP Shares

In [None]:
def add_win_lose_col(df):
    """Return a copy of df with a 'Win/Lose' column.

    A row is labelled 'won' when its 'Rank' value equals the string '1',
    otherwise 'lost'. The input frame is left untouched.
    """
    outcomes = ['won' if rank == '1' else 'lost' for rank in df['Rank']]
    labelled = df.copy()
    labelled['Win/Lose'] = outcomes
    return labelled

In [None]:
def show_feature_vs_share(feature, df):
    """Show an interactive scatter of `feature` (x) against the 'Share' column (y).

    Points are coloured by the 'Win/Lose' column; the hover box lists
    'Player', 'year', 'seed', 'W/L%' and 'W' and hides 'Win/Lose'.
    """
    hover_fields = {
        'Win/Lose': False,
        'Player': True,
        'year': True,
        'seed': True,
        'W/L%': True,
        'W': True,
    }
    scatter = px.scatter(
        df,
        x=feature,
        y='Share',
        color='Win/Lose',
        color_discrete_sequence=['blue','gray'],
        hover_data=hover_fields,
    )
    scatter.update_layout(title=f"{feature} vs. MVP share", height=500)
    scatter.show()

In [None]:
'''
features = ['win_shares', 
            'player_efficiency_rating',
            'value_over_replacement_player',
            'box_plus_minus',
            'offensive_box_plus_minus',
            'usage_percentage',
            'seed',
            'W',
            'W/L%',
            'PTS']

master_table_rank = add_win_lose_col(df=master_table)

for feature in features:
    show_feature_vs_share(feature=feature, df=master_table_rank)
'''

At this point, these variables appear to have a somewhat linear relationship with the MVP share metric. It could be valid to consider them as model features as the experiments are conducted.

# Preprocessing <a class="anchor" id="preprocessing"></a>

drop unnecessary or redundant features

In [None]:
# Columns removed from the data before modeling. This module-level name is read by
# train_test_split_by_year and prep_train_test. It is currently empty, so no column is
# dropped; the triple-quoted list that follows is an unassigned string literal and has
# no effect.
to_drop = []
'''

#drop columns 

to_drop = [
    'Rank',
    'Player',
    'Age',
    'year',
    'Tm',
    'team',
    'First',
    'Pts Won',
    'Pts Max',
    'WS/48',
    'WS',
    'MP',
    'G',
    'W', 
    'FG%',
    '3P%',
    'STL', 
    'BLK',
    'three_point_attempt_rate',
    'total_rebound_percentage',
    'offensive_rebound_percentage',
    'block_percentage',
    'defensive_rebound_percentage',
    'steal_percentage',
    'turnover_percentage',
    'assist_percentage',
    'AST',
    'TRB',
    #'free_throw_attempt_rate', ######### Experiment
    'FT%',
    'win_shares', 
    #'value_over_replacement_player', 
    'box_plus_minus', 
    #'offensive_box_plus_minus', 
    'defensive_box_plus_minus',
    'offensive_win_shares', 
    'defensive_win_shares', 
    'true_shooting_percentage' 
]
'''


In [None]:
#run another Mutual Information Score analysis, this time without the to_drop columns
master_table_mi2 = master_table.drop(to_drop, axis=1)
X, y, mi_scores2 = calculate_ml_scores(df=master_table_mi2)
plot_mi_scores(mi_scores2, figsize=(14,4))

## Modeling <a class="anchor" id="modeling"></a>

### Train/Test Split

test on selected year, train on all other years that weren't selected

In [None]:
def train_test_split_by_year(year, df, scaling=False):
    """Split df into a held-out test year and a training set of all other years.

    Parameters
    ----------
    year : int
        Value of df['year'] used as the test set; every other row is training data.
    df : pandas.DataFrame
        Must contain 'year', the target column 'offense' and every column named in the
        module-level `to_drop` list. It is not modified.
    scaling : bool, default False
        When True, features are standardised with a StandardScaler fitted on the
        training features only and then applied to the test features. The target is
        never scaled.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
    cols : pandas.Index
        Feature names, in the column order of X_train / X_test.
    """
    #test year = selected year, train year = other years outside of selected year
    # drop(columns=...) returns new frames, so the caller's df is left untouched.
    # NOTE: 'year' itself stays among the features unless it is listed in `to_drop`.
    train_df = df[df['year'] != year].drop(columns=to_drop)
    test_df = df[df['year'] == year].drop(columns=to_drop)

    y_train = np.array(train_df["offense"])
    y_test = np.array(test_df["offense"])

    train_features = train_df.drop(columns='offense')
    test_features = test_df.drop(columns='offense')
    cols = train_features.columns

    X_train = np.array(train_features)
    X_test = np.array(test_features)

    if scaling:
        # Fit on the training features only; reusing those statistics on the test set
        # avoids leaking test-year information and keeps both sets on the same scale.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        if len(X_test):
            X_test = scaler.transform(X_test)

    return X_train, y_train, X_test, y_test, cols

### Model Helper Functions

train, predict, calculate MAE & R squared, show actual vs. predicted in a dataframe

In [None]:
def run_model(regressor, X_train, y_train, X_test, y_test, df, year):
    """Fit `regressor`, score it on the test split and attach predictions to that year's rows.

    Parameters
    ----------
    regressor : estimator with fit / predict
        Fitted in place on (X_train, y_train) and returned as `model`.
    X_train, y_train, X_test, y_test : array-like
        Splits as produced by train_test_split_by_year for `year`.
    df : pandas.DataFrame
        Table the splits came from; rows with df['year'] == year must be in the same
        order as X_test so the predictions line up.
    year : int
        Test year.

    Returns
    -------
    model : the fitted regressor
    mae : float, mean absolute error on the test split
    r2 : float, R squared on the test split
    mvp_race : pandas.DataFrame
        Copy of the test-year rows with a 'predicted_offense' column, sorted by
        'offense' then 'predicted_offense', both descending.
    """
    model = regressor
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # assign() builds a new frame, so nothing is written into a slice of df
    mvp_race = df[df['year'] == year].assign(predicted_offense=predictions)
    mvp_race = mvp_race.sort_values(["offense", "predicted_offense"], ascending = (False, False))

    return model, mae, r2, mvp_race

find average metrics and overall accuracy

In [None]:
years = [2019, 2020, 2021, 2022, 2023]
#years = [year for year in range(1980, 2022)]

def run_model_average(df, regressor, scaling=False, print_metrics=False):
    """Leave-one-year-out evaluation of `regressor` over the module-level `years` list.

    For each year that has rows in df, a fresh copy of `regressor` is trained on all
    other years and scored on that year.

    Parameters
    ----------
    df : pandas.DataFrame
        Modeling table; must contain 'year' and 'offense'.
    regressor : scikit-learn compatible estimator
        Template estimator; it is cloned for every year and is not fitted itself.
    scaling : bool, default False
        Accepted for interface compatibility.
        NOTE(review): it is not forwarded to train_test_split_by_year (the split is
        always requested unscaled, as before). Forward it once the scaled split path
        has been validated.
    print_metrics : bool, default False
        Print the averaged MAE and R squared.

    Returns
    -------
    avg_mae : float
    avg_r2 : float
    summary : pandas.DataFrame with columns 'year', 'MAE', 'R squared' (one row per evaluated year)
    model_lst : list of fitted estimators, one per evaluated year
    cols : feature names from the last evaluated split (None if no year was evaluated)
    """
    from sklearn.base import clone

    evaluated_years = []
    mae_lst = []
    r2_lst = []
    model_lst = []
    cols = None
    for year in tqdm(years):
        # A year without rows would make predict() fail on an empty test set.
        if not (df['year'] == year).any():
            print(f"Skipping {year}: no rows for this year")
            continue

        X_train, y_train, X_test, y_test, cols = train_test_split_by_year(year=year, df=df, scaling=False)
        # Clone so every year gets its own fitted model; refitting one shared instance
        # would leave model_lst holding the same (last-fitted) object repeatedly.
        model, mae, r2, mvp_race = run_model(clone(regressor),
                                             X_train,
                                             y_train,
                                             X_test,
                                             y_test,
                                             df=df,
                                             year=year)
        evaluated_years.append(year)
        mae_lst.append(mae)
        r2_lst.append(r2)
        model_lst.append(model)

    summary = pd.DataFrame({
        'year': evaluated_years,
        'MAE': mae_lst,
        'R squared': r2_lst,
    })
    avg_mae = summary['MAE'].mean()
    avg_r2  = summary['R squared'].mean()

    if print_metrics == True:
        print(f"Average MAE: {avg_mae}")
        print(f"Average R squared: {avg_r2}")
    return avg_mae, avg_r2, summary, model_lst, cols

### Models
* Linear Regression
* Random Forest Regressor
* XGBoost Regressor
* LightGBM Regressor

(see parameter_tuning.ipynb for parameter tuning scripts)

In [None]:

# The modeling functions below split on a 'year' column. The fixed column list used to
# build master_table does not include 'year_x', so a plain rename would be a no-op and
# df['year'] would raise KeyError later.
if 'year_x' in master_table.columns:
    master_table = master_table.rename(columns={'year_x': 'year'})
elif 'year' not in master_table.columns:
    # master_table is a column subset of merged_df, so the two share the same row index;
    # year_x is the season of the _x (feature) side of the self-join.
    master_table = master_table.assign(year=merged_df['year_x'])
master_table.head(2)

#### Linear Regression

In [None]:
# run_model_average returns five values: (avg_mae, avg_r2, summary, models, cols)
lr_avg_mae, lr_avg_r2, lr_summary, lr_models, cols = run_model_average(df=master_table,
                  regressor = LinearRegression(),
                 scaling=True,
                print_metrics=True)

#### Random Forest Regressor

In [None]:
#display feature importance for tree algorithms (RF, XGB, LGBM)
def avg_feature_importance(models, cols, top_n=None):
    """Plot the feature importances averaged over several fitted models.

    Parameters
    ----------
    models : iterable of fitted estimators exposing `feature_importances_`
    cols : sequence of feature names, in the column order the models were trained on
    top_n : int or None, default None
        When given, only the `top_n` highest-scoring features are drawn;
        None draws every feature.
    """
    importances = pd.DataFrame(
        [list(model.feature_importances_) for model in models],
        columns=cols,
    )

    # One row per feature with a numeric 'Score' column, highest first
    scores = (
        importances.mean()
        .rename_axis('Feature')
        .reset_index(name='Score')
        .sort_values(by='Score', ascending=False)
    )
    if top_n is not None:
        scores = scores.head(top_n)

    # Grow the figure with the number of bars so long feature lists stay readable
    fig, ax = plt.subplots(figsize=(7, max(4, 0.25 * len(scores))))
    sns.barplot(x='Score',
                y='Feature',
                data=scores,
                ax=ax)
    ax.set_title('Feature Importance Score')
    plt.show()

In [None]:
# run_model_average returns five values: (avg_mae, avg_r2, summary, models, cols)
rf_avg_mae, rf_avg_r2, rf_summary, rf_models, rf_cols = run_model_average(df=master_table,
                  regressor=RandomForestRegressor(n_estimators = 23, 
                                                  random_state = 0, 
                                                  max_depth=7, 
                                                  min_samples_leaf=1,
                                                  min_samples_split=2),
                print_metrics=True)

In [None]:
avg_feature_importance(models=rf_models, cols=rf_cols)

#### XGBoost

In [None]:
#BEST MODEL
# 16 5, 0.2745  (n_estimators, max_depth, learning_rate)

# run_model_average returns five values: (avg_mae, avg_r2, summary, models, cols)
xgb_avg_mae, xgb_avg_r2, xgb_summary, xgb_models, xgb_cols = run_model_average(df=master_table,
                  regressor = XGBRegressor(n_estimators=16, max_depth=5, learning_rate = 0.2745, subsample=1, colsample_bytree=1),
                 scaling=False, print_metrics=True)

In [None]:
xgb_summary

In [None]:
avg_feature_importance(models=xgb_models, cols=xgb_cols)

In [None]:
xgb_summary

In [None]:
#export as image
dfi.export(xgb_summary,'xgboost_summary.png')

#### LightGBM

In [None]:
# run_model_average returns five values: (avg_mae, avg_r2, summary, models, cols)
lgbm_avg_mae, lgbm_avg_r2, lgbm_summary, lgbm_models, lgbm_cols = run_model_average(df=master_table,
                  regressor = LGBMRegressor(n_estimators=23,
                                            max_depth=4,
                                            learning_rate=0.15,
                                            num_leaves=28,
                                            boosting_type='goss',
                                            random_state = 0,
                                           ),
                 scaling=False, print_metrics=True)


In [None]:
avg_feature_importance(models=lgbm_models, cols=lgbm_cols)

#### Model Summaries

In [None]:
# The regression runs produce MAE and R squared only; there is no accuracy metric to report.
d = {
    'Model': ['Linear Regression', 'Random Forest Regressor', 'XGBoost Regressor', 'LGBM Regressor'],
    'average MAE': [lr_avg_mae,rf_avg_mae, xgb_avg_mae, lgbm_avg_mae],
    'average R squared': [lr_avg_r2,rf_avg_r2, xgb_avg_r2, lgbm_avg_r2],
}
model_summary_df = pd.DataFrame(d)
# Best value per metric: lowest MAE, highest R squared
(model_summary_df.style
    .highlight_min(subset = ['average MAE'], color = 'lightgreen', axis = 0)
    .highlight_max(subset = ['average R squared'], color = 'lightgreen', axis = 0))

Best Models: 

In [None]:
#all models in models list have same parameters
best_xgb_model = xgb_models[0]
best_rf_model = rf_models[0]
best_lgbm_model = lgbm_models[0]

### Validate specific year 

The following function can be used to check details on a specific year's MVP race along with its predictions from the model 
* see 'offense' for the actual value from the specific year
* see 'predicted_offense' for the model's predicted value

In [None]:
def validate_year(year):
    """Train the best XGBoost model on every year except `year` and inspect that year.

    Prints the test-year MAE and R squared, plots the model's feature importances, and
    returns the details needed for further inspection.

    Parameters
    ----------
    year : int
        Year held out as the test set; it must be present in master_table['year'].

    Returns
    -------
    model : the fitted model returned by run_model
    X_test_df : pandas.DataFrame of the test features, with feature names as columns
    mvp_race : pandas.DataFrame of the test-year rows, with 'offense' and
        'predicted_offense' moved to the first two columns
    """
    X_train, y_train, X_test, y_test, cols = train_test_split_by_year(year, df=master_table, scaling=False)
    # run_model returns four values: (model, mae, r2, rows-with-predictions)
    model, mae, r2, mvp_race = run_model(best_xgb_model,
                                         X_train, y_train, X_test, y_test, df=master_table, year=year)

    # Put actual and predicted values side by side at the front of the table
    predicted_column = mvp_race.pop('predicted_offense')
    actual_column = mvp_race.pop('offense')
    mvp_race.insert(0, 'predicted_offense', predicted_column)
    mvp_race.insert(0, 'offense', actual_column)
    mvp_race = mvp_race.reset_index(drop=True)

    X_test_df = pd.DataFrame(columns=cols, data = X_test)

    print(f'MAE: {mae}')
    print(f'R squared: {r2}')
    avg_feature_importance(models=[model], cols=cols)

    return model, X_test_df, mvp_race

In [None]:
def visualize_shap_values(mvp_race, model):
    """Display a SHAP force plot for each of the first three rows of `mvp_race`.

    Parameters
    ----------
    mvp_race : pandas.DataFrame
        Rows to explain, already sorted so the rows of interest come first. It must
        contain every column in the module-level `xgb_cols`. When a 'Player' column is
        present it is used as the printed label; otherwise the row's index label is used.
    model : fitted tree-based model accepted by shap.TreeExplainer
    """
    # The explainer and the JS setup do not depend on the row, so build them once
    explainer = shap.TreeExplainer(model)
    shap.initjs()

    top_candidates = mvp_race.head(3)
    has_player_names = 'Player' in top_candidates.columns

    for position in range(len(top_candidates)):
        # Select by position so exactly one row is explained, even if labels repeat
        candidate = top_candidates.iloc[[position]]
        label = candidate['Player'].iloc[0] if has_player_names else candidate.index[0]

        data_for_prediction = candidate[list(xgb_cols)]
        data_for_prediction_array = data_for_prediction.values.reshape(1, -1)
        print(f"Rank: {position + 1}: {label}")

        # Calculate Shap values
        shap_values = explainer.shap_values(data_for_prediction_array)
        display(shap.force_plot(explainer.expected_value, shap_values, data_for_prediction))

In [None]:
# Validate on the most recent year present in the modeling table (a fixed year such as
# 2011 is outside the seasons evaluated in this notebook and would give an empty test set).
model, X_test_df, mvp_race = validate_year(year=int(master_table['year'].max()))

In [None]:
visualize_shap_values(mvp_race, model)

## Forecasting 2022 MVP <a class="anchor" id="forecasting"></a>

In [None]:
#data to be forecasted: 2022 mvp candidates from NBA's MVP ladder
# The path resolves to <parent of the current working directory>/data/data_2022.csv
data_path_2022 = os.path.dirname(os.getcwd()) + '/data' + '/data_2022.csv'
data_2022 = pd.read_csv(data_path_2022)
data_2022_cleaned = data_2022.copy()
# Keep only the feature columns returned by the XGBoost run, in that order.
# NOTE(review): this raises KeyError unless data_2022.csv contains every column in
# xgb_cols. Confirm the file's schema matches the modeling table.
data_2022_cleaned = data_2022_cleaned[list(xgb_cols)]

In [None]:
data_2022

In [None]:
def prep_train_test(df, target='offense'):
    """Build the training features/target and pick up the forecasting features.

    Parameters
    ----------
    df : pandas.DataFrame
        Historical modeling table; must contain `target` and every column named in the
        module-level `to_drop` list. It is not modified.
    target : str, default 'offense'
        Name of the target column, matching the target used by the other modeling
        helpers in this notebook.

    Returns
    -------
    forecast_X_train_df : pandas.DataFrame, df without the `to_drop` and target columns
    forecast_y_train_df : pandas.Series, the target column
    forecast_X_test_df : the module-level `data_2022_cleaned` frame (returned as is)
    """
    #train; using historical data
    forecast_train_df = df.drop(columns=to_drop)
    forecast_y_train_df = forecast_train_df[target]
    forecast_X_train_df = forecast_train_df.drop(columns=[target])

    #data to be forecasted: 2022 mvp candidates from NBA's MVP ladder
    forecast_X_test_df = data_2022_cleaned

    print(f'Training dataset columns: \n{list(forecast_X_train_df.columns)} \n')
    print(f'Forecasting dataset columns: \n{list(forecast_X_test_df.columns)}')
    return forecast_X_train_df, forecast_y_train_df, forecast_X_test_df

In [None]:
def fit_forecast_model(regressor):
    """Fit `regressor` on the forecast training data and rank the 2022 rows by prediction.

    Reads the module-level frames forecast_X_train_df, forecast_y_train_df,
    forecast_X_test_df, data_2022 and data_2022_cleaned. Plots the fitted model's
    feature importances as a side effect.

    Returns (model, table) where table holds 'Player', 'Share Prediction' and a fixed
    set of stat columns from data_2022, sorted by 'Share Prediction' descending.
    NOTE(review): the stat column names below must all exist in data_2022.csv; confirm
    against that file.
    """
    model = regressor
    model.fit(forecast_X_train_df, forecast_y_train_df)

    ranked = data_2022.copy()
    ranked['Share Prediction'] = model.predict(forecast_X_test_df)
    ranked = ranked.sort_values(["Share Prediction"], ascending=False)

    # columns shown in the forecast table, in display order
    display_columns = [
        'Player',
        'Share Prediction',
        'PTS',
        'value_over_replacement_player',
        'seed',
        'W/L%',
        'player_efficiency_rating',
        'win_shares_per_48_minutes',
        'offensive_box_plus_minus',
        'usage_percentage',
        'free_throw_attempt_rate',
    ]
    mvp_race_forecast_sub = ranked[display_columns].reset_index(drop=True)

    avg_feature_importance(models=[model], cols=data_2022_cleaned.columns)
    return model, mvp_race_forecast_sub

In [None]:
def show_highlighted_df(df):
    return df.style.highlight_max(subset = ['value_over_replacement_player',
                                                    'player_efficiency_rating',
                                                    'W/L%',
                                                    'win_shares_per_48_minutes',
                                                    'usage_percentage',
                                                    'free_throw_attempt_rate',
                                                    'offensive_box_plus_minus',
                                                    'PTS',
                                                    'Share Prediction'], color = 'lightgreen', axis = 0)

In [None]:
#train on historical data, predict on 2022 data
forecast_X_train_df, forecast_y_train_df, forecast_X_test_df = prep_train_test(df=master_table)

NOTE: **VORP** (value_over_replacement_player) metric for 2022 candidates has been adjusted as a projection considering the games left in the season. 

### Model 1: XGBoost

In [None]:
best_xgb_model

In [None]:
#best xgb model
xgb_model, xgb_mvp_race_forecast = fit_forecast_model(regressor = XGBRegressor(
                                            n_estimators=16,
                                            max_depth=5,
                                            learning_rate=0.2745))

# 16 5, 0.2745



In [None]:
show_highlighted_df(df=xgb_mvp_race_forecast)

In [None]:
visualize_shap_values(model= xgb_model, mvp_race=xgb_mvp_race_forecast)

### Model 2: Random Forest

In [None]:
best_rf_model

In [None]:
#best random forest model
rf_model, rf_mvp_race_forecast = fit_forecast_model(regressor = RandomForestRegressor(n_estimators = 23, 
                                                  random_state = 0, 
                                                  max_depth=7, 
                                                  min_samples_leaf=1,
                                                  min_samples_split=2)
                  )

In [None]:
show_highlighted_df(df=rf_mvp_race_forecast)

In [None]:
visualize_shap_values(model = rf_model, mvp_race= rf_mvp_race_forecast)

#### Model 3 LightGBM

In [None]:
best_lgbm_model

In [None]:
#best LightGBM model
lgbm_model, lgbm_mvp_race_forecast = fit_forecast_model(
                                                regressor = LGBMRegressor(
                                                                    n_estimators=23,
                                                                    max_depth=4,
                                                                    learning_rate=0.15,
                                                                    num_leaves=28,
                                                                    boosting_type='goss',
                                                                    random_state = 0,
                                           ))

In [None]:
show_highlighted_df(df=lgbm_mvp_race_forecast)

In [None]:
visualize_shap_values(model = lgbm_model, mvp_race= lgbm_mvp_race_forecast)

#### MVP Prediction Summary

In [None]:
forecast_tables = [xgb_mvp_race_forecast, lgbm_mvp_race_forecast, rf_mvp_race_forecast]
model_names = ['XGBoost', 'LightGBM', 'Random Forest']
rank_labels = ['1st Place', '2nd Place', '3rd Place']

# One summary row per model: the top-ranked players and their predicted shares.
# Rows are built as plain dicts and turned into a frame once, which keeps the player and
# share selections the same length and avoids overwriting the earlier `merged_df`.
summary_rows = []
for name, forecast_table in zip(model_names, forecast_tables):
    top_ranked = forecast_table.head(len(rank_labels))
    row = {'Model': name}
    for label, player, share in zip(rank_labels,
                                    top_ranked['Player'],
                                    top_ranked['Share Prediction']):
        row[label] = player
        row[f'{label} Share'] = share
    summary_rows.append(row)

# Columns come out as: Model, 1st Place, 1st Place Share, 2nd Place, 2nd Place Share, ...
final_summary_table = pd.DataFrame(summary_rows)

In [None]:
final_summary_table

In [None]:
#updated 4/15/2022