In [1]:
import sys
import csv
import operator
import pandas as pd
import warnings
import numpy as np
import sklearn
from sklearn import linear_model
from sklearn import svm
from sklearn import preprocessing
from sklearn import utils
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
np.set_printoptions(threshold=sys.maxsize)
warnings.filterwarnings('ignore')

# Data Load and Clean-Up

In [2]:
# Every file is keyed by a season code: 19 is the 2018-19 season and 1 is the
# 2000-01 season. All lists below are ordered newest-first, so list index
# `k` holds season code `19 - k`.

#Reading in normal player stats for season codes 19..1
normal_stats_file_name = 'data/normal_stats/normal_stats_{}.csv'
normal_stats = [pd.read_csv(normal_stats_file_name.format(i)) for i in range(19, 0, -1)]

#Reading in advanced player stats for season codes 19..1
adv_stats_file_name = 'data/advanced_stats/adv_stats_{}.csv'
adv_stats = [pd.read_csv(adv_stats_file_name.format(i)) for i in range(19, 0, -1)]

#Reading in team ratings for season codes 19..1
team_ratings_file_name = 'data/team_ratings/teams_{}.csv'
team_ratings = [pd.read_csv(team_ratings_file_name.format(i)) for i in range(19, 0, -1)]

#Reading in mvp voting results for season codes 18..1
# Season 19 has no results yet, so a placeholder string occupies index 0 to
# keep this list aligned with the three lists above.
mvp_votes_file_name = 'data/mvp_votes/mvp_{}.csv'
mvp_votes = ['THIS IS WHAT WE ARE GONNA PREDICT!']
mvp_votes += [pd.read_csv(mvp_votes_file_name.format(i)) for i in range(18, 0, -1)]

In [3]:
#helper method to clean player names for data from 'Basketball-Reference'
def clean_player_names(str):
    """Return the display name from a raw 'Name*\\player_id' style string.

    Everything from the first backslash onward is dropped, then every '*' is
    removed. A string without a backslash is returned whole (minus any '*');
    the previous slice-by-`find` version cut off its last character in that
    case because `find` returns -1.

    The parameter keeps its original name `str` for compatibility with
    existing callers, even though it shadows the builtin inside this function.
    """
    name = str.split('\\', 1)[0]
    return name.replace('*', '')

def clean_data(normal_stats, adv_stats, team_ratings, mvp_votes):
    """Merge the per-season raw frames into one player table and one team table.

    Parameters
    ----------
    normal_stats, adv_stats, team_ratings : list of DataFrame
        One frame per season, newest season first.
    mvp_votes : list
        Same ordering; an entry that is a plain string marks a season with no
        voting results and is skipped.

    Returns
    -------
    (player_data, team_data) : tuple of DataFrame
        Both indexed by 'Year' (the season code). `player_data` holds the
        merged normal + advanced stats, the columns '1st Place MVP Votes' and
        'MVP Score' (0 where a player has no voting row), and an integer 'id'
        per player name.

    The input frames are left untouched; the 'Year' column is added to copies.
    """
    n_seasons = len(normal_stats)
    player_vals = []
    team_vals = []
    mvp_vals = []
    for yr in range(n_seasons):
        # Lists are newest-first, so position 0 is the highest season code.
        year = n_seasons - yr

        norm = normal_stats[yr].drop_duplicates(subset='Player', keep='first')
        adv = adv_stats[yr].drop_duplicates(subset='Player', keep='first')
        merged = norm.merge(adv, left_on="Player", right_on='Player')
        # Drop the duplicated right-hand columns and the unnamed filler columns.
        merged = merged.drop(['Rk_x', 'Rk_y', 'Pos_y', 'Age_y', 'Tm_y', 'G_y', 'MP_y', 'Unnamed: 24', 'Unnamed: 19'], axis=1)
        merged = merged.rename(index=str, columns={"Pos_x": "Pos", "Age_x": "Age", "Tm_x": "Tm", "G_x": "G", "MP_x": "MP"})
        merged['Player'] = merged['Player'].apply(clean_player_names)
        merged['Year'] = year
        player_vals.append(merged)

        # assign() returns a new frame, so the caller's team_ratings stay as loaded.
        team_vals.append(team_ratings[yr].assign(Year=year))

        if not isinstance(mvp_votes[yr], str):
            # Explicit copy: the columns below are written to, not just read.
            mvp_updated = mvp_votes[yr][['Player', 'First', 'Share']].copy()
            mvp_updated['Year'] = year
            mvp_updated['Player'] = mvp_updated['Player'].apply(clean_player_names)
            mvp_vals.append(mvp_updated)

    player_data = pd.concat(player_vals)
    team_data = pd.concat(team_vals)
    mvp_data = pd.concat(mvp_vals)
    player_data = pd.merge(player_data, mvp_data, how='left', left_on=['Player', 'Year'], right_on=['Player', 'Year'])
    player_data = player_data.fillna(0)
    player_data = player_data.rename(index=str, columns={"First": "1st Place MVP Votes", "Share": "MVP Score"})
    # NOTE(review): these names are removed outright; presumably each is shared
    # by more than one player, which would make the name-based joins and the
    # name-based 'id' ambiguous -- confirm against the raw data.
    excluded_names = ['Tony Mitchell', 'Chris Johnson', 'Marcus Williams', 'Trevon Duval']
    player_data = player_data[~player_data['Player'].isin(excluded_names)]
    player_data = player_data.assign(id=(player_data['Player']).astype('category').cat.codes)
    player_data = player_data.set_index('Year')
    team_data = team_data.set_index('Year')
    return player_data, team_data

player_data, team_data = clean_data(normal_stats, adv_stats, team_ratings, mvp_votes)

# Getting Accustomed to the Data

After data clean up, we now have player and team data from the 2000-2001 NBA season to this past season(18-19). The data is formatted as such: starting with year 19 and down, it contains all players who played in each season. Along with each player's counting stats and advanced stats for each season, there are two columns that will be particularly useful for our analysis: 1st Place MVP Votes representing the number of first place votes they received in that season and MVP Score which represents the share of MVP votes they received out of all that were given. Thus, the MVP in a particular year is simply the player with the MAX('MVP Score'). We can check this intuition below:

In [4]:
# Season code 17: keep that season's rows, then the row(s) holding the top MVP share.
in_season_17 = player_data.index == 17
seventeen_season = player_data[in_season_17]
best_share_17 = max(seventeen_season['MVP Score'])
mvp_17_season = seventeen_season[seventeen_season['MVP Score'] == best_share_17]
mvp_17_season

Unnamed: 0_level_0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,1st Place MVP Votes,MVP Score,id
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17,Russell Westbrook,PG,28,OKC,81,81,34.6,10.2,24.0,0.425,...,4.6,13.1,0.224,10.9,4.7,15.6,12.4,69.0,0.879,1508


Russell Westbrook was indeed the MVP of the 2016-2017 NBA season, so the data passes this sanity check and we can start extracting useful results! Let's play around a bit more to get accustomed to what we have. Below, we can see every season in our data for my favorite player: Kevin Durant.

In [5]:
# One row per season for a single player, selected by exact name match.
is_fave_player = player_data['Player'] == 'Kevin Durant'
my_fave_player = player_data[is_fave_player]
my_fave_player

Unnamed: 0_level_0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,1st Place MVP Votes,MVP Score,id
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19,Kevin Durant,PF,30,GSW,78,78,34.6,9.2,17.7,0.521,...,2.9,11.5,0.204,4.2,0.1,4.3,4.3,0.0,0.0,1023
18,Kevin Durant,PF,29,GSW,68,68,34.2,9.3,18.0,0.516,...,2.9,10.4,0.215,5.0,0.7,5.6,4.5,0.0,0.065,1023
17,Kevin Durant,PF,28,GSW,62,62,33.4,8.9,16.5,0.537,...,4.0,12.0,0.278,5.4,2.6,8.0,5.2,0.0,0.002,1023
16,Kevin Durant,SF,27,OKC,72,72,35.8,9.7,19.2,0.505,...,3.5,14.5,0.27,7.0,0.9,7.9,6.4,0.0,0.112,1023
15,Kevin Durant,SF,26,OKC,27,27,33.8,8.8,17.3,0.51,...,1.0,4.8,0.252,6.3,-0.2,6.1,1.9,0.0,0.0,1023
14,Kevin Durant,SF,25,OKC,81,81,38.5,10.5,20.8,0.503,...,4.4,19.2,0.295,8.4,0.4,8.8,8.5,119.0,0.986,1023
13,Kevin Durant,SF,24,OKC,81,81,38.5,9.0,17.7,0.51,...,5.3,18.9,0.291,6.2,1.4,7.7,7.6,0.0,0.632,1023
12,Kevin Durant,SF,23,OKC,66,66,38.6,9.7,19.7,0.496,...,3.7,12.2,0.23,4.7,0.5,5.2,4.6,24.0,0.735,1023
11,Kevin Durant,SF,22,OKC,78,78,38.9,9.1,19.7,0.462,...,3.3,12.0,0.189,3.7,-0.8,2.9,3.7,0.0,0.157,1023
10,Kevin Durant,SF,21,OKC,82,82,39.5,9.7,20.3,0.476,...,5.0,16.1,0.238,4.9,0.2,5.1,5.8,4.0,0.495,1023


Additionally, we can see all the players who received MVP votes in all our data: AKA which players have MVP Scores that are nonzero. NOTE: MVP Votes being 0 does not imply not getting any votes as MVP Votes corresponds to 1st place votes, which are not given to every player who was considered for voting. As you'll see below, there are 83 players since 2000 who have received some share of MVP votes.

In [6]:
# Any non-zero share means the player appeared in that season's voting.
has_vote_share = player_data['MVP Score'] != 0.0
players_who_got_mvp_votes = player_data[has_vote_share].groupby('Player')
n_vote_getters = len(players_who_got_mvp_votes.groups)
print("Number of Players who received MVP Votes: " + str(n_vote_getters))
players_who_got_mvp_votes.groups

Number of Players who received MVP Votes: 83


{'Al Jefferson': Int64Index([14], dtype='int64', name='Year'),
 'Allen Iverson': Int64Index([6, 5, 3, 2, 1], dtype='int64', name='Year'),
 "Amar'e Stoudemire": Int64Index([11, 10, 8, 7, 5], dtype='int64', name='Year'),
 'Andrei Kirilenko': Int64Index([4], dtype='int64', name='Year'),
 'Antawn Jamison': Int64Index([8], dtype='int64', name='Year'),
 'Anthony Davis': Int64Index([18, 17, 15], dtype='int64', name='Year'),
 'Anthony Mason': Int64Index([1], dtype='int64', name='Year'),
 'Baron Davis': Int64Index([7, 4], dtype='int64', name='Year'),
 'Ben Wallace': Int64Index([4, 3, 2], dtype='int64', name='Year'),
 'Blake Griffin': Int64Index([15, 14, 11], dtype='int64', name='Year'),
 'Brandon Roy': Int64Index([9], dtype='int64', name='Year'),
 'Carlos Boozer': Int64Index([8, 7], dtype='int64', name='Year'),
 'Carmelo Anthony': Int64Index([14, 13, 10, 8, 7, 4], dtype='int64', name='Year'),
 'Chauncey Billups': Int64Index([10, 9, 7, 6], dtype='int64', name='Year'),
 'Chris Bosh': Int64Index([

As you can see below, only 29 players received first-place votes over the past 18 seasons. Some could argue that these have been the best 29 players over that span, but that's a subject for another day!

In [7]:
# Same grouping as the previous cell, restricted to seasons with at least one first-place vote.
has_first_place_vote = player_data['1st Place MVP Votes'] != 0.0
players_who_got_mvp_first_place = player_data[has_first_place_vote].groupby('Player')
n_first_place_getters = len(players_who_got_mvp_first_place.groups)
print("Number of Players who received 1st Place Votes: " + str(n_first_place_getters))
players_who_got_mvp_first_place.groups

Number of Players who received 1st Place Votes: 29


{'Allen Iverson': Int64Index([5, 1], dtype='int64', name='Year'),
 "Amar'e Stoudemire": Int64Index([5], dtype='int64', name='Year'),
 'Ben Wallace': Int64Index([3], dtype='int64', name='Year'),
 'Carmelo Anthony': Int64Index([13], dtype='int64', name='Year'),
 'Chauncey Billups': Int64Index([6], dtype='int64', name='Year'),
 'Chris Paul': Int64Index([12, 9, 8], dtype='int64', name='Year'),
 'Chris Webber': Int64Index([1], dtype='int64', name='Year'),
 'Derrick Rose': Int64Index([11], dtype='int64', name='Year'),
 'Dirk Nowitzki': Int64Index([7, 6], dtype='int64', name='Year'),
 'Dwight Howard': Int64Index([11, 10, 9], dtype='int64', name='Year'),
 'Dwyane Wade': Int64Index([9], dtype='int64', name='Year'),
 'Elton Brand': Int64Index([6], dtype='int64', name='Year'),
 'Gary Payton': Int64Index([2], dtype='int64', name='Year'),
 'James Harden': Int64Index([18, 17, 15], dtype='int64', name='Year'),
 'Jason Kidd': Int64Index([2], dtype='int64', name='Year'),
 "Jermaine O'Neal": Int64Index(

# Feature Selection

Next, since our goal is to predict the 2019 MVP, we will turn this into a supervised learning problem. Supervised learning problems involve a simple formula: Y = f(X), where X is our input, Y is our output, and f() is our mapping function. Our goal is to train a model such that the mapping function f() accurately predicts the output (Y) for a given input (X). Our setup is as follows: X will be a d-dimensional vector representing a player's stats from a certain year, and Y will be our output (a vector containing MVP scores for all players from that year, the max being the player who should win the MVP in that season).


To train our model, however, it is imperative we select good features, so before we choose some, we will utilize three different feature selection algorithms to determine the best features to use: Feature Dependency based on mutual information regression, Random Forest Regressor feature importance (both thanks to sklearn's feature selection package), and the Pearson coefficient matrix determining what values most correlate to our target of 'MVP Score'<br/>


In [8]:
# Target is the MVP vote share; features are every remaining non-label column.
y = player_data['MVP Score']
X = player_data.drop(['MVP Score', 'Player', 'Tm', 'Pos'], axis=1)
# NOTE(review): X still contains '1st Place MVP Votes' (another outcome of the
# same voting the target comes from) and 'id' (a code derived from the player
# name). Both rank in the lists below but neither is a real predictor; they are
# kept here only so the rankings stay comparable with the recorded outputs.

# Mutual information estimation is randomized; seed it so the cell is repeatable.
mi = mutual_info_regression(X, y, random_state=0)
mi /= np.max(mi)  # scale so the strongest feature scores 1.0
indices_mi = np.argwhere(mi > 0.5)
feature_names = X.columns.values.tolist()
top_features_mutual_info = [feature_names[i[0]] for i in indices_mi]
top_features_mutual_info

['MP',
 'FG',
 'FGA',
 '2P',
 '2PA',
 'FTA',
 'TOV',
 'PTS',
 'PER',
 'USG%',
 'OWS',
 'WS',
 'WS/48',
 'BPM',
 'VORP']

In [9]:
# Rank every feature by the importance a shallow, seeded random forest assigns to it.
regr = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100)
regr.fit(X, y)
random_forest = regr.feature_importances_
r_f = dict(zip(X.columns.values.tolist(), random_forest))
# Sort ascending by importance, then walk the result backwards for most-important-first.
sorted_r_f = sorted(r_f.items(), key=operator.itemgetter(1))
top_features_r_f = [feature for feature, importance in reversed(sorted_r_f)]
top_features_r_f

['1st Place MVP Votes',
 'WS',
 'VORP',
 'DWS',
 'OWS',
 'USG%',
 'PER',
 'PF',
 'FG',
 'Age',
 '3PAr',
 'BLK%',
 'PTS',
 'WS/48',
 'G',
 'GS',
 '2P',
 'DBPM',
 'TS%',
 'BLK',
 'FT%',
 'id',
 '2P%',
 'TOV%',
 'MP',
 'AST%',
 'FGA',
 '2PA',
 'OBPM',
 'BPM',
 'FG%',
 '3P%',
 'STL%',
 'STL',
 'TOV',
 'AST',
 'eFG%',
 '3PA',
 'FTr',
 '3P',
 'FTA',
 'ORB%',
 'TRB',
 'DRB%',
 'TRB%',
 'FT',
 'ORB',
 'DRB']

In [10]:
# Restrict to numeric columns explicitly: recent pandas versions raise when
# corr() meets string columns ('Player', 'Pos', 'Tm') instead of skipping them.
corr = player_data.select_dtypes(include=[np.number]).corr()
# Most positively correlated first; the target itself heads the list with 1.0.
top_features_pearson = corr['MVP Score'].sort_values(ascending=False).index.tolist()
top_features_pearson

['MVP Score',
 '1st Place MVP Votes',
 'VORP',
 'OWS',
 'WS',
 'FTA',
 'FT',
 'PTS',
 'FG',
 '2P',
 'DWS',
 'TOV',
 'FGA',
 '2PA',
 'AST',
 'PER',
 'USG%',
 'BPM',
 'OBPM',
 'DRB',
 'STL',
 'AST%',
 'GS',
 'MP',
 'TRB',
 'WS/48',
 '3P',
 'BLK',
 '3PA',
 'G',
 'DBPM',
 'TS%',
 'ORB',
 '2P%',
 'DRB%',
 'FG%',
 'eFG%',
 'FTr',
 'PF',
 'FT%',
 'STL%',
 '3P%',
 'TRB%',
 'BLK%',
 'id',
 'Age',
 '3PAr',
 'ORB%',
 'TOV%']

In [95]:
# A plain intersection of the three rankings was tried and dropped in favour of
# this hand-picked list:
#   list(set(top_features_mutual_info) & set(top_features_pearson) & set(top_features_r_f))

# Look-up columns carried alongside the stats.
_identity_columns = ['Player', 'Tm', 'id']
# Per-player box-score and advanced stats.
_player_stat_columns = [
    'PTS', 'TRB', 'AST', 'STL', 'BLK', 'TS%', 'WS', 'WS/48', 'VORP',
    'BPM', 'USG%', 'G', 'MP', '2P%', '3P%', 'FT%', 'PER',
]
# Target, followed by the four team-level columns. The team columns must stay
# last: feature_based_season_vector slices off the final four entries.
_target_and_team_columns = ['MVP Score', 'Team Wins', 'Team Losses', 'Team Rank', 'W/L%']

finalized_features = _identity_columns + _player_stat_columns + _target_and_team_columns



As you can see above, we have obtained three different sets of "best features" we can use to train our models. Now, we have to remove redundant information from all three and finalize a set of features. The finalized set I chose can be seen above; I also removed some features that the feature selection algorithms chose but that I felt aren't exactly useful in today's NBA. Additionally, I decided to use four team_data features: 'Team Wins', 'Team Losses', 'Team Rank', and 'W/L%'.

In [96]:
# Display the final column list for reference.
finalized_features

['Player',
 'Tm',
 'id',
 'PTS',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TS%',
 'WS',
 'WS/48',
 'VORP',
 'BPM',
 'USG%',
 'G',
 'MP',
 '2P%',
 '3P%',
 'FT%',
 'PER',
 'MVP Score',
 'Team Wins',
 'Team Losses',
 'Team Rank',
 'W/L%']

In [97]:
def feature_based_season_vector(player, year):
    """Build the feature row for `player` in season `year`.

    Parameters
    ----------
    player : str
        Name exactly as it appears in player_data['Player'].
    year : int
        Season code, matching the 'Year' index of player_data and team_data.

    Returns
    -------
    DataFrame
        The player's row(s) for that season restricted to the player-level
        columns of `finalized_features`, plus 'Team Wins', 'Team Losses',
        'Team Rank' and 'W/L%'. When the player's team code is 'TOT' the team
        columns are filled with 0 / 0 / 30 / 0.0. When no team row matches the
        name and season, the team columns are NaN.

    Raises
    ------
    KeyError
        If the player has no row in that season, or the team abbreviation is
        not in the conversion table.
    """
    team_data_conv = ['MIL', 'GSW', 'TOR', 'UTA', 'HOU', 'POR', 'DEN', 'BOS', 'OKC', 'IND', 'PHI', 'SAS', 'LAC', 'ORL', 'MIA',
                      'BRK', 'DET', 'SAC', 'DAL', 'MIN', 'NOP', 'LAL', 'CHA', 'MEM', 'WAS', 'ATL', 'CHI', 'PHO', 'NYK', 'CLE']
    # zip() stops after the 30 abbreviations, so only the first 30 entries of
    # team_data['Team'] are paired up. NOTE(review): this assumes those rows are
    # in exactly the order of team_data_conv -- confirm against the ratings file.
    team_conv = dict(zip(team_data_conv, team_data['Team']))
    # Abbreviations that do not appear among those 30.
    # NOTE(review): 'CHA' is paired positionally above while 'CHO' maps to
    # 'Charlotte Bobcats'; verify both (and the exact spelling of every full
    # name) against the team ratings files, since a mismatch yields NaN team stats.
    team_conv.update({
        'NJN': 'New Jersey Nets',
        'SEA': 'Seattle Supersonics',
        'CHO': 'Charlotte Bobcats',
        'CHH': 'Charlotte Hornets',
        'NOH': 'New Orleans Hornets',
        'NOK': 'New Orleans/Oklahoma City Hornets',
        'VAN': 'Vancouver Grizzlies',
    })

    this_guys_data = player_data[(player_data['Player'] == player) & (player_data.index == year)]
    if this_guys_data.empty:
        raise KeyError("no row for player {!r} in season {!r}".format(player, year))
    this_guys_team = this_guys_data['Tm'][year]

    # Copy so the team columns are added to an independent frame rather than
    # to a slice of player_data.
    season_vec = this_guys_data[finalized_features[:len(finalized_features) - 4]].copy()

    # Compare the team code itself; `Series.any() == 'TOT'` only held on numpy
    # versions where any() on an object column returned the element.
    if (this_guys_data['Tm'] == 'TOT').any():
        season_vec['Team Wins'] = 0
        season_vec['Team Losses'] = 0
        season_vec['Team Rank'] = 30
        season_vec['W/L%'] = 0.0
    else:
        team_name = team_conv[str(this_guys_team)]
        # Select the team's row for this season explicitly instead of relying
        # on index alignment to discard the other seasons.
        his_team_data = team_data[(team_data['Team'] == team_name) & (team_data.index == year)]
        season_vec['Team Wins'] = his_team_data['W']
        season_vec['Team Losses'] = his_team_data['L']
        season_vec['Team Rank'] = his_team_data['Rk']
        season_vec['W/L%'] = his_team_data['W/L%']
    return season_vec
    

Now, with the helper function above, we can extract the season vector for a player during a particular season that he played in. For example, Stephen Curry's season vector for 2017 is displayed below.

In [98]:
# Worked example: one player, one season (season code 17).
steph_17 = feature_based_season_vector(player='Stephen Curry', year=17)
steph_17

Unnamed: 0_level_0,Player,Tm,id,PTS,TRB,AST,STL,BLK,TS%,WS,...,MP,2P%,3P%,FT%,PER,MVP Score,Team Wins,Team Losses,Team Rank,W/L%
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17,Stephen Curry,GSW,1606,25.3,4.5,6.6,1.8,0.2,0.624,12.6,...,33.4,0.537,0.411,0.898,24.6,0.051,67,15,1,0.817


In [None]:
#||||| Building every season vector takes a really long time, so the result is
#||||| cached in 'season_vectors.npy' and only rebuilt when that file is missing.
#||||| Delete the file to force a rebuild after changing the data or features. |||||
def calculate_all_season_vectors(df):
    """Return one season vector (a plain list) per row of `df`.

    `df` is iterated row by row; the first column of each row is passed as the
    player name and the row's index label as the season code to
    `feature_based_season_vector`. Only the first row of each returned frame
    is kept.
    """
    s_v = []
    for index, row in df.iterrows():
        s_v.append(feature_based_season_vector(row.iloc[0], index).values.tolist()[0])
    return s_v

try:
    season_vectors = np.load('season_vectors.npy')
except IOError:
    # No cache yet: compute once and persist for later runs.
    season_vectors = calculate_all_season_vectors(player_data)
    np.save('season_vectors', season_vectors)

In [101]:
# Re-attach the season index and column names to the cached array.
# Rows are matched to player_data purely by position.
season_vectors = pd.DataFrame(
    np.load('season_vectors.npy'),
    index=player_data.index,
    columns=finalized_features,
)

# Train/Test Split

In [102]:
def create_train_set(list_of_years):
    """Return (features, targets) arrays for the seasons in `list_of_years`.

    Rows of `season_vectors` whose index is in `list_of_years` are used.
    Targets are the 'MVP Score' column; features are every column except
    'Player', 'Tm' and 'MVP Score'.
    """
    in_requested_years = season_vectors.index.isin(list_of_years)
    training_rows = season_vectors[in_requested_years]
    targets = training_rows['MVP Score'].values.tolist()
    features = training_rows.drop(['Player', 'Tm', 'MVP Score'], axis=1).values.tolist()
    return np.array(features), np.array(targets)

In [103]:
def create_test_set(prediction_year):
    """Return the feature array for every row of season `prediction_year`.

    Uses the same columns as `create_train_set`: everything in
    `season_vectors` except 'Player', 'Tm' and 'MVP Score'.
    """
    is_prediction_year = season_vectors.index == prediction_year
    feature_rows = season_vectors[is_prediction_year].drop(['Player', 'Tm', 'MVP Score'], axis=1)
    return np.array(feature_rows.values.tolist())

In [104]:
# Train on every season before the one being predicted.
prediction_year = 19
list_of_years = list(np.arange(1, prediction_year))
x_train, y_train = create_train_set(list_of_years)
x_test = create_test_set(prediction_year)
# The helpers return the values as loaded from the cache; cast all three to float for the models.
x_train, y_train, x_test = (arr.astype(float) for arr in (x_train, y_train, x_test))

In [105]:
# Persist the training arrays as x_train.npy / y_train.npy.
for _file_stem, _array in (('x_train', x_train), ('y_train', y_train)):
    np.save(_file_stem, _array)

In [106]:
# Sanity check: the feature matrix and target vector should have the same number of rows.
print(x_train.shape, y_train.shape)

(8329, 22) (8329,)


# Predicting the MVP for the 2018-2019 Season w/ Different Models

In [164]:
# Six regressors compared on the same features. The CV variants pick their own
# regularisation strength from a log-spaced grid (1e-6 .. 1e6) with 5 folds.
lin_reg = linear_model.LinearRegression()
lasso = linear_model.LassoCV(alphas=np.logspace(-6, 6, 13), cv=5)
ridge = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13), cv=5)
linear_svr = svm.LinearSVR(random_state=0, tol=1e-05)
dec_tree = tree.DecisionTreeRegressor(random_state=0,  max_depth=5, max_features=10)
# Seeded like the other randomized estimators so reruns give the same ranking.
grad_boost = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=0)

In [165]:
# Zero out missing values in place; the estimators below reject NaN input.
where_are_NaNs_train = np.isnan(x_train)
where_are_NaNs_test = np.isnan(x_test)
where_are_NaNs_y_train = np.isnan(y_train)
for _array, _nan_mask in ((x_train, where_are_NaNs_train),
                          (x_test, where_are_NaNs_test),
                          (y_train, where_are_NaNs_y_train)):
    _array[_nan_mask] = 0

# Each estimator's fit() returns the estimator itself, so fitting and
# predicting are chained; the fitted objects keep their original names.

#Linear Regression (normal least squares)
lin_reg_predictions_19 = lin_reg.fit(x_train, y_train).predict(x_test)

#Lasso Regression w/ Built in Cross Validation
lasso_predictions_19 = lasso.fit(x_train, y_train).predict(x_test)

#Ridge Regression w/ Built in Cross Validation
ridge_predictions_19 = ridge.fit(x_train, y_train).predict(x_test)

#Linear Support Vector Regression
linear_svr_predictions_19 = linear_svr.fit(x_train, y_train).predict(x_test)

#Decision Tree Regressor
dec_tree_predictions_19 = dec_tree.fit(x_train, y_train).predict(x_test)

# Gradient Boosting Regressor
grad_boost_predictions_19 = grad_boost.fit(x_train, y_train).predict(x_test)

In [166]:
def calculate_top_10_predictions(predictions, year, test_set=None):
    """Print and return the ten players with the highest predicted score.

    Parameters
    ----------
    predictions : array-like
        One predicted score per row of the test matrix.
    year : int
        Season code used to look the players up in `player_data`.
    test_set : array-like, optional
        The feature matrix `predictions` was computed from; column 0 must be
        the player 'id'. Defaults to the module-level `x_test`, which is only
        correct when `year` is the season `x_test` was built for. Pass the
        matching matrix explicitly for any other season.

    Returns
    -------
    list of str
        Player names, best predicted score first.
    """
    if test_set is None:
        test_set = x_test
    # The season filter does not depend on the loop variable; do it once.
    year_results = player_data[player_data.index == year]
    results = []
    # argsort is ascending: take the last ten positions and flip to best-first.
    for row in np.argsort(predictions)[-10:][::-1]:
        p_id = test_set[row][0]
        name = str(year_results[year_results['id'] == p_id]['Player'][year])
        print("Player: " + name + ", MVP Score: " + str(predictions[row]))
        results.append(name)
    return results

In [167]:
def calculate_top_10_year(year):
    """Return the names of the ten players with the highest actual 'MVP Score' in `year`."""
    season_rows = player_data[player_data.index == year]
    by_share_desc = season_rows.sort_values('MVP Score', ascending=False)
    return by_share_desc['Player'].head(10).values.tolist()

In [168]:
# Rank the season-19 field by the linear-regression predictions; the helper
# prints each line and returns the ten names in rank order.
print("Linear Regression Predictions for 2018-2019 MVP\n")
top_10_lin_reg_19 = calculate_top_10_predictions(lin_reg_predictions_19, 19)

Linear Regression Predictions for 2018-2019 MVP

Player: James Harden, MVP Score: 0.260480543318
Player: Giannis Antetokounmpo, MVP Score: 0.198278318301
Player: Nikola Jokic, MVP Score: 0.168612818867
Player: LeBron James, MVP Score: 0.145690687639
Player: Russell Westbrook, MVP Score: 0.144276340845
Player: Damian Lillard, MVP Score: 0.141642269852
Player: Karl-Anthony Towns, MVP Score: 0.140821391829
Player: Stephen Curry, MVP Score: 0.132511037046
Player: Kevin Durant, MVP Score: 0.127255339689
Player: Anthony Davis, MVP Score: 0.125147971838


In [169]:
# Same ranking step, using the lasso predictions.
print("Lasso Regression Predictions for 2018-2019 MVP\n")
top_10_lasso_19 = calculate_top_10_predictions(lasso_predictions_19, 19)

Lasso Regression Predictions for 2018-2019 MVP

Player: James Harden, MVP Score: 0.242555285918
Player: Giannis Antetokounmpo, MVP Score: 0.183532431278
Player: Nikola Jokic, MVP Score: 0.153389444496
Player: Damian Lillard, MVP Score: 0.13053915404
Player: Paul George, MVP Score: 0.128880359333
Player: LeBron James, MVP Score: 0.12831574052
Player: Karl-Anthony Towns, MVP Score: 0.127513264016
Player: Stephen Curry, MVP Score: 0.126168556347
Player: Russell Westbrook, MVP Score: 0.125502171584
Player: Anthony Davis, MVP Score: 0.118452547937


In [170]:
# Same ranking step, using the ridge predictions.
print("Ridge Regression Predictions for 2018-2019 MVP\n")
top_10_ridge_19 = calculate_top_10_predictions(ridge_predictions_19, 19)

Ridge Regression Predictions for 2018-2019 MVP

Player: James Harden, MVP Score: 0.23869500145
Player: Giannis Antetokounmpo, MVP Score: 0.188261983073
Player: Nikola Jokic, MVP Score: 0.153115324877
Player: Damian Lillard, MVP Score: 0.133726671809
Player: LeBron James, MVP Score: 0.132626214683
Player: Karl-Anthony Towns, MVP Score: 0.128836878865
Player: Paul George, MVP Score: 0.128400590353
Player: Russell Westbrook, MVP Score: 0.127418407672
Player: Stephen Curry, MVP Score: 0.126841210807
Player: Kevin Durant, MVP Score: 0.12306819993


In [171]:
# Same ranking step, using the linear SVR predictions.
print("Linear Support Vector Regression Predictions for 2018-2019 MVP\n")
top_10_linear_svr_19 = calculate_top_10_predictions(linear_svr_predictions_19, 19)

Linear Support Vector Regression Predictions for 2018-2019 MVP

Player: James Harden, MVP Score: 0.12475314841
Player: Paul George, MVP Score: 0.0816023253067
Player: Giannis Antetokounmpo, MVP Score: 0.079641485651
Player: Anthony Davis, MVP Score: 0.0680944292338
Player: Stephen Curry, MVP Score: 0.066221599684
Player: Damian Lillard, MVP Score: 0.0661514477552
Player: Kawhi Leonard, MVP Score: 0.0641141468512
Player: Bradley Beal, MVP Score: 0.0616046695561
Player: Karl-Anthony Towns, MVP Score: 0.0582478585835
Player: Kyrie Irving, MVP Score: 0.0579885399385


In [172]:
# Same ranking step, using the decision-tree predictions.
print("Decision Tree Predictions for 2018-2019 MVP\n")
top_10_dec_tree_19 = calculate_top_10_predictions(dec_tree_predictions_19, 19)

Decision Tree Predictions for 2018-2019 MVP

Player: Giannis Antetokounmpo, MVP Score: 0.888
Player: James Harden, MVP Score: 0.472666666667
Player: Stephen Curry, MVP Score: 0.307571428571
Player: Kevin Durant, MVP Score: 0.307571428571
Player: Kawhi Leonard, MVP Score: 0.307571428571
Player: Joel Embiid, MVP Score: 0.0875
Player: Damian Lillard, MVP Score: 0.08055
Player: Nikola Jokic, MVP Score: 0.08055
Player: Paul George, MVP Score: 0.08055
Player: Rudy Gobert, MVP Score: 0.0256842105263


In [173]:
# Same ranking step, using the gradient-boosting predictions.
print("Gradient Boosting Regressor Predictions for 2018-2019 MVP\n")
top_10_grad_boost_19 = calculate_top_10_predictions(grad_boost_predictions_19, 19)

Gradient Boosting Regressor Predictions for 2018-2019 MVP

Player: James Harden, MVP Score: 0.910670193031
Player: Giannis Antetokounmpo, MVP Score: 0.838336441914
Player: Stephen Curry, MVP Score: 0.193492122123
Player: Paul George, MVP Score: 0.137057122492
Player: Damian Lillard, MVP Score: 0.0973584951985
Player: Rudy Gobert, MVP Score: 0.0961601058003
Player: Kevin Durant, MVP Score: 0.0944173252204
Player: Anthony Davis, MVP Score: 0.0802893464811
Player: LeBron James, MVP Score: 0.0709236937505
Player: Joel Embiid, MVP Score: 0.0668184319523


In [174]:
def determine_accuracy(year, predictions, real):
    """Return the fraction of `real` that also appears in `predictions`.

    Order is ignored: the score is the size of the overlap between the two
    collections divided by `len(real)`.

    Parameters
    ----------
    year : unused
        Accepted so existing call sites keep working; it does not affect the result.
    predictions : iterable of hashable
        Predicted names.
    real : sized iterable of hashable
        Reference names.

    Returns
    -------
    float
        Overlap ratio, or 0.0 when `real` is empty (instead of dividing by zero).
    """
    if len(real) == 0:
        return 0.0
    return len(set(real) & set(predictions)) / len(real)

In [175]:
# Reference top 10 used to score the season-19 predictions, since that season
# has no voting results in the data. The notebook text describes these names
# as Basketball-Reference's top MVP candidates.
basketball_reference_mvp_candidates = ['Giannis Antetokounmpo', 'James Harden', 'Nikola Jokic', 'Kawhi Leonard', 'Kevin Durant', 
                                       'Joel Embiid', 'Damian Lillard', 'Stephen Curry', 'Paul George', 'Russell Westbrook']

In [176]:
# Score each model's predicted top 10 against the reference candidate list.
for _prefix, _predicted_top_10 in [
    ("Linear Regression Accuracy for 19: ", top_10_lin_reg_19),
    ("Lasso Regression Accuracy for 19: ", top_10_lasso_19),
    ("Ridge Regression Accuracy for 19: ", top_10_ridge_19),
    ("Linear SVR Accuracy for 19: ", top_10_linear_svr_19),
    ("Decision Tree Regressor Accuracy for 19: ", top_10_dec_tree_19),
    ("Gradient Boosting Accuracy for 19: ", top_10_grad_boost_19),
]:
    print(_prefix + str(determine_accuracy(19, _predicted_top_10, basketball_reference_mvp_candidates)))

Linear Regression Accuracy for 19: 0.7
Lasso Regression Accuracy for 19: 0.7
Ridge Regression Accuracy for 19: 0.8
Linear SVR Accuracy for 19: 0.6
Decision Tree Regressor Accuracy for 19: 0.9
Gradient Boosting Accuracy for 19: 0.7


After running tests with 6 different models (namely, Linear Regression, Lasso Regression, Ridge Regression, Linear SVR, Decision Tree Regression, and Gradient Boosting Regression), we have 6 different sets of MVP predictions for 2019. A common theme we see is that James Harden appears at the top of the list 5/6 times (the Decision Tree model ranks Giannis Antetokounmpo first). We can also compare the different models' accuracy versus the BasketballReference Top Candidates for MVP, since results for voting for the 2018-2019 MVP will be out June 24th. Based on this modeling process, I predict James Harden will be the 2018-2019 NBA Most Valuable Player, as he appeared at the top spot in 5 of our 6 models' predictions!

# Checking all Models' Accuracy Against the Last 5 MVP Results

In [177]:
# Back-test: for each of the five seasons 18, 17, ..., 14, train every model on
# all earlier seasons and compare its predicted top 10 with the actual top 10
# by MVP share.
models = [lin_reg, lasso, ridge, linear_svr, dec_tree, grad_boost]
model_names = ['lin_reg', 'lasso', 'ridge', 'linear_svr', 'dec_tree', 'grad_boost']
# Keyed by the estimator object; each value collects one single-element list per season.
accuracies = {model: [] for model in models}
max_year = 18
for i in range(5):
    # The split depends only on the season, so build it once per season rather
    # than once per model.
    list_of_years = list(np.arange(1, max_year))
    x_train_i, y_train_i = create_train_set(list_of_years)
    x_test_i = create_test_set(max_year)
    x_train_i = x_train_i.astype(float)
    y_train_i = y_train_i.astype(float)
    x_test_i = x_test_i.astype(float)
    # Zero out NaNs in features and target, matching the season-19 preparation.
    x_train_i[np.isnan(x_train_i)] = 0
    x_test_i[np.isnan(x_test_i)] = 0
    y_train_i[np.isnan(y_train_i)] = 0

    yr = player_data[player_data.index == max_year]
    real = calculate_top_10_year(max_year)

    for model in models:
        # Fit an unfitted copy with the same hyper-parameters so the estimators
        # already trained for the season-19 predictions are not overwritten.
        fitted = sklearn.clone(model).fit(x_train_i, y_train_i)
        predictions = fitted.predict(x_test_i)
        # argsort is ascending: last ten positions, flipped to best-first.
        top_10 = np.argsort(predictions)[-10:][::-1]
        # Column 0 of the test matrix is the player 'id'; map it back to a name.
        results = [yr[yr['id'] == x_test_i[row][0]]['Player'][max_year] for row in top_10]
        accuracies[model].append([determine_accuracy(max_year, results, real)])
    max_year -= 1

In [178]:
# Report each model's mean back-test accuracy. Names and estimators are paired
# through the two parallel lists, so the labels do not depend on the iteration
# order of the `accuracies` dict.
for name, model in zip(model_names, models):
    per_season = accuracies[model]  # one single-element list per season
    accuracy = sum([item for sublist in per_season for item in sublist]) / len(per_season)
    print("Accuracy for " + str(name) + " model: " + str(accuracy))

Accuracy for lin_reg model: 0.8200000000000001
Accuracy for lasso model: 0.8200000000000001
Accuracy for ridge model: 0.78
Accuracy for linear_svr model: 0.74
Accuracy for dec_tree model: 0.72
Accuracy for grad_boost model: 0.6799999999999999


Our models do a pretty good job over the last 5 years!
<br> Linear Regression: **82%**
<br> Lasso Regression: **82%**
<br> Ridge Regression: **78%**
<br> Linear SVR Regression: **74%**
<br> Decision Tree Regression: **72%**
<br> Gradient Boosting Regression: **68%**

# Finalizing Predictions

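After averaging each player's rank across the top 10 lists he appears in for 2019, we can settle down and finalize a top list. Note that a player's average only counts the lists that include him, so a player who appears in just one or two lists can still end up with a favorable average.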

In [180]:
# Collect, for every player, the rank (1-10) he holds in each model's top 10.
model_results = [top_10_lin_reg_19, top_10_lasso_19, top_10_ridge_19, top_10_linear_svr_19, 
                 top_10_dec_tree_19, top_10_grad_boost_19]
ranks_by_player = {}
for results in model_results:
    for rank, player in enumerate(results, start=1):
        ranks_by_player.setdefault(player, []).append(rank)

# Average only over the lists a player appears in, then order best (lowest) first.
top_players = sorted(
    ((player, np.mean(ranks)) for player, ranks in ranks_by_player.items()),
    key=operator.itemgetter(1),
)
top_players

[('James Harden', 1.1666666666666667),
 ('Giannis Antetokounmpo', 2.0),
 ('Nikola Jokic', 4.25),
 ('Damian Lillard', 5.333333333333333),
 ('Paul George', 5.4000000000000004),
 ('LeBron James', 6.0),
 ('Stephen Curry', 6.0),
 ('Kawhi Leonard', 6.0),
 ('Karl-Anthony Towns', 7.25),
 ('Russell Westbrook', 7.333333333333333),
 ('Kevin Durant', 7.5),
 ('Anthony Davis', 8.0),
 ('Bradley Beal', 8.0),
 ('Joel Embiid', 8.0),
 ('Rudy Gobert', 8.0),
 ('Kyrie Irving', 10.0)]