In [4]:
# Mount Google Drive inside the Colab VM at /content/gdrive so the CSV inputs
# stored on Drive can be read like local files.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Input CSVs on the Drive that the mount cell attaches at /content/gdrive.
# The paths are absolute so that reading them does not depend on the kernel's
# current working directory happening to be /content.
DRIVE_ROOT = '/content/gdrive/My Drive'
player_path = DRIVE_ROOT + '/nba_summary_17_19.csv'
team_path = DRIVE_ROOT + '/team_boxscores_2017_2018.csv'

In [6]:
import pandas as pd
# Read the player CSV and the team CSV into separate DataFrames
player_df = pd.read_csv(player_path)
team_df = pd.read_csv(team_path)
# Print the column index of each frame, separated by blank lines, to compare schemas
print(player_df.columns, '\n\n',team_df.columns)

Index(['id', 'created_on', 'updated_on', 'source', 'is_duplicated',
       'display_name', 'match_id', 'team_id', 'team_abbr', 'team_city',
       'player_id', 'player_name', 'start_position', 'comment', 'min', 'fgm',
       'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta', 'ft_pct',
       'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk', 'to', 'pf', 'pts',
       'plus_minus', 'ast_pct', 'ast_ratio', 'ast_tov', 'blka', 'def_rating',
       'dreb_pct', 'e_def_rating', 'e_net_rating', 'e_off_rating', 'e_pace',
       'e_usg_pct', 'efg_pct', 'net_rating', 'off_rating',
       'opp_pts_2nd_chance', 'opp_pts_fb', 'opp_pts_off_tov', 'opp_pts_paint',
       'oreb_pct', 'pace', 'pfd', 'pie', 'pts_2nd_chance', 'pts_fb',
       'pts_off_tov', 'pts_paint', 'reb_pct', 'tm_tov_pct', 'ts_pct',
       'usg_pct', 'granularity', 'per_mode', 'season_id', 'dd2', 'fp', 'gp',
       'td3', 'hometeam_id', 'hometeam_abbr', 'visitorteam_id',
       'visitorteam_abbr', 'final_hscore', 'final_vscore',
     

In [7]:
# (rows, columns) of the raw player frame and the raw team frame
print(player_df.shape, team_df.shape)

(69863, 81) (5538, 62)


In [8]:
# Pool the match ids of both frames, then collect the ones whose first
# character is not '2' to see how many rows a "starts with 2" filter would remove.
match_id_field = [*player_df['match_id'], *team_df['match_id']]
unwanted_ids = [match_id for match_id in match_id_field if str(match_id)[0] != '2']
print("unwanted match_id number: ", len(unwanted_ids))

unwanted match_id number:  9665


In [9]:
# Assumption check: every pooled match id (player + team frames) is exactly
# 8 characters long when converted to a string. If this holds, testing the
# first digit of the id and testing a numeric range on the id are equivalent.
match_id_len = [len(str(i)) == 8 for i in match_id_field]
print("Assumption is tested to be: ", all(match_id_len))

Assumption is tested to be:  True


In [0]:
# Regular-season match ids start with the digit 2. With 8-digit ids that is
# the numeric range [20000000, 30000000); only rows inside it are kept.
def remove_non_nba(data):
  """Return the rows of `data` whose match_id is >= 20000000 and < 30000000."""
  match_ids = data['match_id']
  in_range = match_ids.ge(20000000) & match_ids.lt(30000000)
  return data[in_range]

In [11]:
# Apply the match_id filter to both frames and count the removed rows; the
# count should equal the "unwanted match_id number" printed earlier.
nba_player_df = remove_non_nba(player_df)  # player rows whose match_id is in the kept range
nba_team_df = remove_non_nba(team_df)  # team rows whose match_id is in the kept range
print("Rows filtered out: ", (player_df.shape[0] + team_df.shape[0]) - (nba_player_df.shape[0] + nba_team_df.shape[0]))


Rows filtered out:  9665


In [12]:
# List the distinct season labels of each frame side by side: the player frame
# is read through its 'season' column, the team frame through 'season_id'.
season_id_field = [*set(player_df['season']), *set(team_df['season_id'])]
print("Season ids are: ", season_id_field)

Season ids are:  ['2017-18', '2018-19', 22017, 22018]


In [0]:
# convert season_id for consistency
# The player frame labels seasons as strings ('2017-18') while the team frame
# uses numeric ids (22017); rewrite the player frame's season_id to the numeric form.
season_to_id = {'2017-18': 22017, '2018-19': 22018}

# nba_player_df is a filtered slice of player_df. Take an explicit copy before
# writing to it instead of silencing pandas' chained-assignment warning globally.
nba_player_df = nba_player_df.copy()
mapped_season_ids = nba_player_df['season'].map(season_to_id)
# Season labels missing from the mapping give NaN here; those rows keep the
# season_id they already had, exactly as the conditional assignments did.
nba_player_df['season_id'] = mapped_season_ids.fillna(nba_player_df['season_id'])
nba_player_df = nba_player_df.astype({'season_id':int})

In [0]:
# The data covers two seasons, told apart by the season_id column:
# 22017 and 22018.
def separate_season(data):
  """Split `data` by season_id and return (rows with 22017, rows with 22018)."""
  season_ids = data['season_id']
  first_season = data[season_ids == 22017]
  second_season = data[season_ids == 22018]
  return first_season, second_season


In [15]:
# Split each filtered frame into its season_id 22017 part and its 22018 part,
# then print the four shapes as a sanity check.
player17_df, player18_df = separate_season(nba_player_df)
team17_df, team18_df = separate_season(nba_team_df)
print(player17_df.shape, player18_df.shape, team17_df.shape, team18_df.shape)

(30020, 81) (30796, 81) (2460, 62) (2460, 62)


In [0]:
# Add a home/away flag (isHome) to the player frame and carry it to the team frame.
def bool_home_away(player, team):
  """Return (player, team), each with a boolean isHome column added.

  A player row is "home" when its team_id equals its hometeam_id. The flag is
  then attached to the team rows with a left merge on (match_id, team_id).
  That merge yields one row per matching player row, so exact duplicate rows
  are dropped afterwards. Neither input frame is modified.
  """
  home_flags = (player['team_id'] == player['hometeam_id']).values
  player = player.assign(isHome=home_flags)

  # Only the merge keys and the flag are needed from the player side
  flag_lookup = player[['match_id', 'team_id', 'isHome']]
  team = team.merge(flag_lookup, how='left', on=['match_id', 'team_id'])
  team = team.drop_duplicates()
  return player, team

In [17]:
# Add the isHome flag to the player and team frames of each season; the shapes
# printed below should show one extra column and unchanged row counts.
player17, team17 = bool_home_away(player17_df, team17_df)
player18, team18 = bool_home_away(player18_df, team18_df)
print(player17.shape, player18.shape, team17.shape, team18.shape)

(30020, 82) (30796, 82) (2460, 63) (2460, 63)


In [0]:
def pts_scored_recieved(player):
  """Add pts_scored and pts_received columns to `player` in place and return it.

  For rows with isHome == True the team's own score is final_hscore and the
  opponent's is final_vscore; for rows with isHome == False the two are swapped.
  Rows matching neither value keep the initial 0 in both columns.
  """
  player['pts_scored'] = 0
  player['pts_received'] = 0

  # (isHome value, column holding the team's own score, column holding the opponent's score)
  score_columns_by_side = (
    (True, 'final_hscore', 'final_vscore'),
    (False, 'final_vscore', 'final_hscore'),
  )
  for home_flag, own_score_col, opp_score_col in score_columns_by_side:
    on_this_side = player['isHome'] == home_flag
    player.loc[on_this_side, ['pts_scored']] = player[own_score_col]
    player.loc[on_this_side, ['pts_received']] = player[opp_score_col]
  return player

In [19]:
# Add pts_scored / pts_received to both seasons' player frames. The function
# returns the frame it was given, so rebinding the name makes the data flow explicit.
player17 = pts_scored_recieved(player17)
player18 = pts_scored_recieved(player18)
print('Done')

Done


In [20]:
# Preview the first rows of the season_id 22018 team frame after isHome was added
team18.head()

Unnamed: 0,match_id,team_id,team_name_x,team_abbr,team_city,min,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,to,pf,pts,plus_minus,source,team_name_y,e_off_rating,off_rating,e_def_rating,def_rating,e_net_rating,net_rating,ast_pct,ast_tov,ast_ratio,oreb_pct,dreb_pct,reb_pct,e_tm_tov_pct,tm_tov_pct,efg_pct,ts_pct,usg_pct,e_usg_pct,e_pace,pace,pie,team_name,pts_off_tov,pts_2nd_chance,pts_fb,pts_paint,opp_pts_off_tov,opp_pts_2nd_chance,opp_pts_fb,opp_pts_paint,blka,pfd,granularity,season_id,per_mode,isHome
0,21800729,1610612762,Jazz,UTA,Utah,240.0,37,94,0.394,13,40,0.325,19,25,0.76,16,31,47,25,9,8,11,19,106,4.0,https://stats.nba.com,Jazz,106.0,110.4,99.8,106.3,6.2,4.2,0.676,2.27,17.7,0.356,0.648,0.496,11.0,11.5,0.463,0.505,1.0,0.198,101.12,96.0,0.525,Jazz,12.0,12.0,8.0,38.0,13.0,19.0,12.0,48.0,5,22,team_game,22018,PerGame,True
13,21800729,1610612750,Timberwolves,MIN,Minnesota,240.0,38,90,0.422,8,26,0.308,18,21,0.857,12,38,50,20,5,5,14,22,102,-4.0,https://stats.nba.com,Timberwolves,99.8,106.3,106.0,110.4,-6.2,-4.2,0.526,1.33,14.9,0.352,0.644,0.504,14.671,15.6,0.467,0.514,1.0,0.196,101.12,96.0,0.475,Timberwolves,13.0,19.0,12.0,48.0,12.0,12.0,8.0,38.0,8,19,team_game,22018,PerGame,False
26,21800428,1610612758,Kings,SAC,Sacramento,240.0,45,96,0.469,15,44,0.341,20,25,0.8,7,35,42,33,6,3,13,28,125,-5.0,https://stats.nba.com,Kings,110.6,108.7,113.2,112.1,-2.6,-3.4,0.733,2.54,21.6,0.132,0.691,0.417,11.504,11.3,0.547,0.584,1.0,0.202,113.92,115.5,0.477,Kings,25.0,9.0,31.0,48.0,15.0,18.0,20.0,46.0,0,23,team_game,22018,PerGame,True
39,21800428,1610612744,Warriors,GSW,Golden State,240.0,44,96,0.458,15,40,0.375,27,36,0.75,15,45,60,32,5,0,18,23,130,5.0,https://stats.nba.com,Warriors,113.2,112.1,110.6,108.7,2.6,3.4,0.727,1.78,19.8,0.309,0.868,0.583,15.674,15.5,0.536,0.581,1.0,0.2,113.92,115.5,0.523,Warriors,15.0,18.0,20.0,46.0,25.0,9.0,31.0,48.0,3,28,team_game,22018,PerGame,False
50,21800081,1610612753,Magic,ORL,Orlando,240.0,33,101,0.327,10,43,0.233,15,17,0.882,11,30,41,21,9,6,11,22,91,-22.0,https://stats.nba.com,Magic,83.9,85.8,105.5,106.6,-21.6,-20.8,0.636,1.91,14.9,0.235,0.81,0.455,10.14,10.4,0.376,0.419,1.0,0.201,107.8,106.0,0.325,Magic,16.0,14.0,12.0,30.0,14.0,9.0,10.0,54.0,7,19,team_game,22018,PerGame,False


In [21]:
# Preview the first rows of the season_id 22017 team frame after isHome was added
team17.head()


Unnamed: 0,match_id,team_id,team_name_x,team_abbr,team_city,min,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,to,pf,pts,plus_minus,source,team_name_y,e_off_rating,off_rating,e_def_rating,def_rating,e_net_rating,net_rating,ast_pct,ast_tov,ast_ratio,oreb_pct,dreb_pct,reb_pct,e_tm_tov_pct,tm_tov_pct,efg_pct,ts_pct,usg_pct,e_usg_pct,e_pace,pace,pie,team_name,pts_off_tov,pts_2nd_chance,pts_fb,pts_paint,opp_pts_off_tov,opp_pts_2nd_chance,opp_pts_fb,opp_pts_paint,blka,pfd,granularity,season_id,per_mode,isHome
0,21700366,1610612762,Jazz,UTA,Utah,240.0,34,77,0.442,11,30,0.367,22,27,0.815,4,28,32,17,10,5,13,21,101,-11.0,https://stats.nba.com,Jazz,103.2,105.2,111.4,114.3,-8.2,-9.1,0.5,1.31,14.3,0.13,0.696,0.413,13.282,13.5,0.513,0.568,1.0,0.2,99.22,97.0,0.446,Jazz,24.0,11.0,15.0,42.0,15.0,15.0,11.0,36.0,5,25,team_game,22017,PerGame,True
12,21700366,1610612745,Rockets,HOU,Houston,240.0,38,81,0.469,18,42,0.429,18,24,0.75,8,38,46,22,10,5,17,25,112,11.0,https://stats.nba.com,Rockets,111.4,114.3,103.2,105.2,8.2,9.1,0.579,1.29,16.9,0.304,0.87,0.587,16.905,17.3,0.58,0.612,1.0,0.199,99.22,97.0,0.554,Rockets,15.0,15.0,11.0,36.0,24.0,11.0,15.0,42.0,5,21,team_game,22017,PerGame,False
24,21700507,1610612754,Pacers,IND,Indiana,240.0,38,79,0.481,10,25,0.4,8,10,0.8,7,32,39,27,8,4,15,17,94,-4.0,https://stats.nba.com,Pacers,102.8,105.6,108.2,110.1,-5.3,-4.5,0.711,1.8,21.5,0.262,0.833,0.548,16.411,16.9,0.544,0.564,1.0,0.199,91.0,89.0,0.477,Pacers,8.0,8.0,3.0,34.0,18.0,4.0,15.0,46.0,3,12,team_game,22017,PerGame,True
36,21700507,1610612742,Mavericks,DAL,Dallas,240.0,40,78,0.513,8,29,0.276,10,15,0.667,4,31,35,26,7,3,9,12,98,4.0,https://stats.nba.com,Mavericks,108.2,110.1,102.8,105.6,5.3,4.5,0.65,2.6,21.6,0.167,0.738,0.452,11.038,11.2,0.564,0.579,1.0,0.197,91.0,89.0,0.523,Mavericks,18.0,4.0,15.0,46.0,8.0,8.0,3.0,34.0,4,17,team_game,22017,PerGame,False
47,21700105,1610612749,Bucks,MIL,Milwaukee,240.0,45,91,0.495,19,36,0.528,12,15,0.8,9,31,40,29,3,3,14,25,121,-5.0,https://stats.nba.com,Bucks,117.9,123.5,125.0,128.6,-7.1,-5.1,0.644,2.07,20.6,0.277,0.75,0.505,13.645,14.3,0.599,0.62,1.0,0.195,101.68,98.0,0.468,Bucks,11.0,17.0,13.0,40.0,19.0,15.0,5.0,52.0,3,18,team_game,22017,PerGame,False


In [0]:
def get_rolling(data, window=5, include_current=True): # team data as input
  """Return per-team rolling means of the box-score stats, one row per team and match.

  Parameters
  ----------
  data : DataFrame
    Team rows. Must contain match_id, team_id, isHome and plus_minus. Of the
    columns named in `relevant_stats`, those present in `data` are averaged;
    every other column (team names, source, season_id, ...) is left out of
    the result, so text columns never reach the rolling mean.
  window : int, default 5
    Number of consecutive rows per team, in match_id order, in each average.
  include_current : bool, default True
    True keeps the original behaviour: the window ends at the row's own match,
    so that match's stats are part of its average. Pass False to average only
    the `window` rows that precede the match.
    NOTE(review): if these averages are meant as pre-game predictors, the
    default lets the game's own stats into its features - confirm.

  Returns
  -------
  DataFrame
    Columns match_id, team_id, isHome, plus_minus (original values, not
    averaged) followed by the averaged stat columns, with a fresh 0..n-1
    index. Rows without a complete window are dropped, and so is any match
    left with a single row afterwards.
  """
  relevant_stats = ['match_id', 'team_id', 'isHome', 'plus_minus', 'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a',
                  'fg3_pct', 'ftm', 'fta', 'ft_pct', 'oreb', 'dreb', 'reb',
                  'ast', 'stl', 'blk', 'to', 'pf', 'pts', 'e_off_rating',
                  'off_rating', 'e_def_rating', 'def_rating', 'e_net_rating',
                  'net_rating', 'ast_pct', 'ast_tov', 'ast_ratio', 'oreb_pct',
                  'dreb_pct', 'reb_pct', 'e_tm_tov_pct', 'tm_tov_pct', 'efg_pct',
                  'ts_pct', 'usg_pct', 'e_usg_pct', 'e_pace', 'pace', 'pie',
                  'pts_off_tov', 'pts_2nd_chance', 'pts_fb', 'pts_paint', 'opp_pts_off_tov',
                  'opp_pts_2nd_chance', 'opp_pts_fb', 'opp_pts_paint', 'blka', 'pfd']

  constant_attr=['match_id', 'team_id', 'isHome', 'plus_minus']

  # Columns to average: the listed stats that exist in this frame, minus the
  # identifying columns that must keep their per-match values.
  stat_cols = [col for col in relevant_stats if col not in constant_attr and col in data.columns]

  data = data.sort_values('match_id', ascending=True) # sort by date (assumes match_id order is chronological)

  def _trailing_mean(values):
    # shift(1) moves each team's rows down one slot, so the window ending at a
    # match then covers only the rows before it.
    if not include_current:
      values = values.shift(1)
    return values.rolling(window=window).mean()

  # transform keeps the rows and their order, so the identifying columns can be
  # put back by position without relying on index alignment.
  result = data.groupby('team_id')[stat_cols].transform(_trailing_mean)
  for position, col in enumerate(constant_attr):
    result.insert(position, col, data[col].values)

  # Rows whose window is incomplete (or contains a missing stat) have NaN averages
  result = result.dropna(subset=stat_cols)
  # Keep a match only if more than one of its rows survived
  result = result[result.duplicated(subset='match_id', keep=False)]
  return result.reset_index(drop=True)

In [0]:
def filter_single(x):
  """Return `x` unchanged when it holds more than one item, otherwise None."""
  return x if len(x) > 1 else None

In [0]:
def get_logReg_test(teamdf): # testing dataset
  """Build the one-row-per-match test table from rolling team averages.

  `teamdf` is first passed through get_rolling. Each match's home row and
  visitor row are then joined on match_id, giving the home features
  (ast_tov, oreb_pct, efg_pct, e_net_rating), the same four stats for the
  visitor (v_ast_tov, v_oreb_pct, v_efg_pct, v_net_rating) and an integer
  `winner` label: 1 when the home row's plus_minus is positive, 0 when it is
  negative. Matches whose home plus_minus is 0 or missing have no winner and
  are dropped, so the label column is always integer 0/1.

  Assumes at most one home and one visitor row per match_id; duplicates would
  multiply rows in the merge.
  """
  team = get_rolling(teamdf) # get moving avgs
  features = ['ast_tov', 'oreb_pct', 'efg_pct', 'e_net_rating']
  visitor_names = {'ast_tov' : 'v_ast_tov', 'oreb_pct': 'v_oreb_pct',
                   'efg_pct': 'v_efg_pct', 'e_net_rating': 'v_net_rating'} # identify visitor stats

  # divide as home - away, keeping only the columns each side contributes
  home = team.loc[team['isHome']==True, ['match_id'] + features + ['plus_minus']]
  visitor = team.loc[team['isHome']==False, ['match_id'] + features].rename(columns=visitor_names)
  data = home.merge(visitor, on='match_id') # merge home&visitor stats

  # A label exists only when the home margin is a non-zero number
  decided = data[data['plus_minus'].notna() & (data['plus_minus'] != 0)]
  decided = decided.assign(winner=(decided['plus_minus'] > 0).astype(int)) # 1 if home won, 0 if visitor won
  return decided.drop(['plus_minus', 'match_id'], axis=1)

In [0]:
def get_logReg_train(teamdf): # training dataset
  """Build the one-row-per-match training table from the team rows as given.

  No rolling average is applied here: the stats in `teamdf` are used directly.
  Each match's home row and visitor row are joined on match_id, giving the
  home features (ast_tov, oreb_pct, efg_pct, e_net_rating), the same four
  stats for the visitor (v_ast_tov, v_oreb_pct, v_efg_pct, v_net_rating) and
  an integer `winner` label: 1 when the home row's plus_minus is positive,
  0 when it is negative. Matches whose home plus_minus is 0 or missing have
  no winner and are dropped, so the label column is always integer 0/1.

  Assumes at most one home and one visitor row per match_id; duplicates would
  multiply rows in the merge.
  """
  team = teamdf
  features = ['ast_tov', 'oreb_pct', 'efg_pct', 'e_net_rating']
  visitor_names = {'ast_tov' : 'v_ast_tov', 'oreb_pct': 'v_oreb_pct',
                   'efg_pct': 'v_efg_pct', 'e_net_rating': 'v_net_rating'} # identify visitor stats

  # divide as home - away, keeping only the columns each side contributes
  home = team.loc[team['isHome']==True, ['match_id'] + features + ['plus_minus']]
  visitor = team.loc[team['isHome']==False, ['match_id'] + features].rename(columns=visitor_names)
  data = home.merge(visitor, on='match_id') # merge home&visitor stats

  # A label exists only when the home margin is a non-zero number
  decided = data[data['plus_minus'].notna() & (data['plus_minus'] != 0)]
  decided = decided.assign(winner=(decided['plus_minus'] > 0).astype(int)) # 1 if home won, 0 if visitor won
  return decided.drop(['plus_minus', 'match_id'], axis=1)

In [0]:

# Training table from the season_id 22017 team frame, test table from the 22018 one.
# NOTE(review): get_logReg_train uses each match's own stats, while
# get_logReg_test first replaces them with rolling averages (get_rolling), so
# the two tables are built from different kinds of features - confirm this is intended.
train = get_logReg_train(team17)
test = get_logReg_test(team18)

In [46]:
from sklearn.linear_model import LogisticRegression

# Features are every column except the 'winner' label
x_train, y_train = train.drop(columns=['winner']), train['winner']

# Logistic regression with scikit-learn's default settings
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train) #train

x_test, y_test = test.drop(columns=['winner']), test['winner']

# score() reports the classifier's accuracy on the test table
acc=log_reg.score(x_test, y_test) #test
print(acc)
# print("Baseline accuracy is {} on our test data".format(acc))

0.7570815450643776


In [47]:
# Fitted weights, one per feature column of x_train in column order, then the intercept
print(log_reg.coef_)
print(log_reg.intercept_)

[[ 0.25026277  1.22952908 -0.02257944  0.41729546 -0.30639857 -1.60479844
   0.19335808 -0.41729546]]
[0.34056972]
