In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

KeyboardInterrupt: ignored

In [0]:
player_path = 'gdrive/My Drive/nba_summary_17_19.csv'
team_path = 'gdrive/My Drive/team_boxscores_2017_2018.csv'

In [0]:
import pandas as pd
player_df = pd.read_csv(player_path)
team_df = pd.read_csv(team_path)
print(player_df.columns, '\n\n',team_df.columns)

In [0]:
# dataframe property
print(player_df.shape, team_df.shape)

In [0]:
# filter match ids
match_id_field = list(player_df['match_id']) + list(team_df['match_id'])
unwanted_ids = [i for i in match_id_field if str(i)[0] != '2']
print("unwanted match_id number: ", len(unwanted_ids))

In [0]:
# assumption testing -> match id is always 8 digits
match_id_len = [len(str(i)) == 8 for i in match_id_field]
print("Assumption is tested to be: ", all(match_id_len))

In [0]:
# regular season match ids start with 2
# Filter all those starting with 2
def remove_non_nba(data):
  return data[(data['match_id'] >= 20000000) & (data['match_id'] < 30000000)]

In [0]:
# test match_id filter
nba_player_df = remove_non_nba(player_df)  # nba df is all match within nba(match id starts with 2)
nba_team_df = remove_non_nba(team_df)
print("Rows filtered out: ", (player_df.shape[0] + team_df.shape[0]) - (nba_player_df.shape[0] + nba_team_df.shape[0]))


In [0]:
# Now we try to understand the season data
season_id_field = list(set(player_df['season'])) + list(set(team_df['season_id']))
print("Season ids are: ", season_id_field)

In [0]:
# convert season_id for consistency
pd.options.mode.chained_assignment = None

nba_player_df.loc[nba_player_df['season'] == '2018-19', 'season_id'] = 22018
nba_player_df.loc[nba_player_df['season'] == '2017-18', 'season_id'] = 22017
nba_player_df = nba_player_df.astype({'season_id':int})

In [0]:
# The data contains 2 seasons
# Separate 2017 and 2018 season
# Look at season_id attribute
def separate_season(data):
  return data[data['season_id'] == 22017], data[data['season_id'] == 22018]


In [0]:
# separate data based on season
player17_df, player18_df = separate_season(nba_player_df)
team17_df, team18_df = separate_season(nba_team_df)
print(player17_df.shape, player18_df.shape, team17_df.shape, team18_df.shape)

In [0]:
# add a home/ away column to team df
def bool_home_away(player, team):
  # Use the players dataset
  # Compare team_id with hometeam_id
  # Then use match_ids in the two datasets to take home/away boolean
  # from players dataset to team dataset
  isHome = player['team_id'] == player['hometeam_id']
  player = player.assign(isHome = isHome.values)
  team = team.merge(player[['match_id', 'team_id', 'isHome']], on=['match_id', 'team_id'], how = 'left')
  team.drop_duplicates(inplace=True)
  return player, team

In [0]:
# add isHome to both player and team df
player17, team17 = bool_home_away(player17_df, team17_df)
player18, team18 = bool_home_away(player18_df, team18_df)
print(player17.shape, player18.shape, team17.shape, team18.shape)

In [0]:
def pts_scored_recieved(player):
  player['pts_scored'] = 0 
  player['pts_received'] = 0
  player.loc[player['isHome'] == True, ['pts_scored']] = player['final_hscore'] # points scored = home score for home teams.
  player.loc[player['isHome'] == True, ['pts_received']] = player['final_vscore'] # points received = away score for home teams.
  player.loc[player['isHome'] == False, ['pts_scored']] = player['final_vscore']
  player.loc[player['isHome'] == False, ['pts_received']] = player['final_hscore']
  return player

In [0]:
pts_scored_recieved(player17)
pts_scored_recieved(player18)
print('Done')

In [0]:
team17.head()

Correct syntax: 

h_team = data[(data['match_id']==match) and (data['isHome']==True)]['team_id']

In [0]:
def get_rolling(data): # team data
  relevant_stats = ['match_id', 'team_id', 'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a',
                  'fg3_pct', 'ftm', 'fta', 'ft_pct', 'oreb', 'dreb', 'reb', 
                  'ast', 'stl', 'blk', 'to', 'pf', 'pts', 'plus_minus', 'e_off_rating', 
                  'off_rating', 'e_def_rating', 'def_rating', 'e_net_rating', 
                  'net_rating', 'ast_pct', 'ast_tov', 'ast_ratio', 'oreb_pct', 
                  'dreb_pct', 'reb_pct', 'e_tm_tov_pct', 'tm_tov_pct', 'efg_pct', 
                  'ts_pct', 'usg_pct', 'e_usg_pct', 'e_pace', 'pace', 'pie', 
                  'pts_off_tov', 'pts_2nd_chance', 'pts_fb', 'pts_paint', 'opp_pts_off_tov', 
                  'opp_pts_2nd_chance', 'opp_pts_fb', 'opp_pts_paint', 'blka', 'pfd']

  numericals = list(set(relevant_stats) - set(['match_id', 'team_id']))
  teams = data[relevant_stats]
  teams = teams.sort_values('match_id', ascending=True)
  teams[numericals] = teams.groupby("team_id").apply(lambda x:x.iloc[:,2:].rolling(window=5).mean())
  dropped = teams.dropna()
  result = dropped.groupby(dropped['match_id']).apply(filter_single)
  result[result['match_id']==data['match_id'] and result['team_id']==data['team_id']]['isHome'] = data['isHome']
  result[result['match_id']==data['match_id'] and result['team_id']==data['team_id']]['plus_minus'] = data['isHome']
  return result

In [0]:
def filter_single(x):
  if  len(x)>1:
    return x

In [0]:
def get_logReg_data(teamdf):
  team = get_rolling(teamdf)
  home = team.loc[team['isHome']==True]
  visitor = team.loc[team['isHome']==False]
  visitor = visitor.rename(columns={"team_id": "v_team_id", 
                                    "e_net_rating": "v_net_rating", "ast_tov": "v_ast_tov", 
                                    "oreb_pct": "v_oreb_pct", "efg_pct": "v_efg_pct"}) # identify visitor stats
  visitor = visitor[['match_id', 'v_team_id', 'v_net_rating', 'v_ast_tov', 'v_oreb_pct', 'v_efg_pct']]
  home = home[['match_id', 'team_id', 'e_net_rating', 'ast_tov', 'oreb_pct', 'efg_pct', 'plus_minus']]
  data = pd.merge(home, visitor, on='match_id') # merge home&visitor stats
  data[data['match_id']==result['match_id'] and data['team_id']==result['team_id']]['e_net_rating', 'ast_tov', 'oreb_pct', 'efg_pct'] = result1['e_net_rating', 'ast_tov', 'oreb_pct', 'efg_pct']
  data['winner'] = 'N/A'
  data.loc[data['plus_minus']>0, ['winner']] = 1 #add winner column (1 if home, 0 if away)
  data.loc[data['plus_minus']<0, ['winner']] = 0
  data = data.drop(['plus_minus'], axis=1)
  return(data)

In [133]:
train, test = get_logReg_data(team17), get_logReg_data(team18)

ValueError: ignored

In [0]:
'''from sklearn.linear_model import LogisticRegression

x_train, y_train = train.drop(columns=['winner']), train[['winner']]

log_reg = LogisticRegression()
log_reg.fit(x_train, y_train) #train

x_test, y_test = test.drop(columns=['winner']), test[['winner']]

acc=log_reg.score(x_test, y_test) #test
print("Baseline accuracy is {} on our test data".format(acc))'''