In [1]:
import json
import numpy as np
from collections import defaultdict
from collections import OrderedDict

In [2]:
# Load the boxscore JSON into memory. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open('prediction/all_boxscores_2000_2017.json') as f:
  boxscore_data = json.load(f)

In [3]:
# Re-key the loaded data in ascending key order so later iteration is deterministic.
boxscore_data_ordered_by_key = OrderedDict(
  (file_name, boxscore_data[file_name]) for file_name in sorted(boxscore_data)
)

In [4]:
# Running averages of a team's stats within a season.
# Nesting: team ID -> season -> team's game number -> stat type -> average,
# where the average at a game number includes that game itself.
team_boxscore_rolling_averages = defaultdict(
  lambda: defaultdict(
    lambda: defaultdict(dict)
  )
)

# Names of the boxscore categories, in the order they are visited.
boxscore_categories = [
  'boxscoretraditionalv2',
  'boxscoreadvancedv2',
  'boxscoremiscv2',
  'boxscorescoringv2',
  'boxscoreusagev2',
  'boxscorefourfactorsv2',
  'boxscoreplayertrackv2',
  'hustlestatsboxscore',
  'boxscoresummaryv2',
]

# Placeholder list; nothing is added to it in this cell.
BOXSCORE_TRADITIONAL_FEATURES = list()

def prune_team_stats(team_stats):
  """Extract the stat columns from one team-level boxscore row.

  Args:
    team_stats: indexable row; positions 6 through 24 are read.
      (Presumably a 'boxscoretraditionalv2' team row -- verify against caller.)

  Returns:
    numpy array with the 19 selected values, ordered as in `stat_names`.
  """
  # Column names, listed in row order; FGM sits at position 6 and the rest
  # follow contiguously. The names only document what each position holds.
  stat_names = (
    'FGM', 'FGA', 'FG_PCT',
    'FG3M', 'FG3A', 'FG3_PCT',
    'FTM', 'FTA', 'FT_PCT',
    'OREB', 'DREB', 'REB',
    'AST', 'STL', 'BLK', 'TO', 'PF',
    'PTS', 'PLUS_MINUS',
  )
  first_stat_position = 6

  selected_values = [
    team_stats[position]
    for position, _ in enumerate(stat_names, first_stat_position)
  ]
  return np.asarray(selected_values)

def prune_starter_bench_stats(team_starter_bench_stats):
  """Extract the stat columns from one starters/bench boxscore row.

  Args:
    team_starter_bench_stats: indexable row; positions 7 through 24 are read.
      (Presumably a 'boxscoretraditionalv2' starters/bench row -- verify
      against caller.)

  Returns:
    numpy array with the 18 selected values, ordered as in `stat_names`.
  """
  # Column names, listed in row order; FGM sits at position 7 and the rest
  # follow contiguously. The names only document what each position holds.
  stat_names = (
    'FGM', 'FGA', 'FG_PCT',
    'FG3M', 'FG3A', 'FG3_PCT',
    'FTM', 'FTA', 'FT_PCT',
    'OREB', 'DREB', 'REB',
    'AST', 'STL', 'BLK', 'TO', 'PF',
    'PTS',
  )
  first_stat_position = 7

  selected_values = [
    team_starter_bench_stats[position]
    for position, _ in enumerate(stat_names, first_stat_position)
  ]
  return np.asarray(selected_values)

# Per-team game counter for the season being processed; replaced with a fresh
# counter whenever a game numbered '0001' is encountered.
# TEAM_ID -> 1-based number of the team's next game (defaults to 1)
team_games_played_dict = defaultdict(lambda: 1)

def _update_team_running_average(team_id, season, game_stats):
  """Fold one game's stats into a team's running average for a season.

  Writes the new average to
  team_boxscore_rolling_averages[team_id][season][game number]['team_stats']
  and then advances the team's counter in team_games_played_dict.

  Args:
    team_id: key identifying the team.
    season: season key, as sliced from the game ID.
    game_stats: numpy array of this game's pruned team stats.
  """
  curr_game_num = team_games_played_dict[team_id]
  season_averages = team_boxscore_rolling_averages[team_id][season]
  if curr_game_num == 1:
    # first counted game: the average is simply this game's stats
    season_averages[1]['team_stats'] = game_stats
  else:
    prev_average = season_averages[curr_game_num - 1]['team_stats']
    # incremental mean: avg_n = avg_(n-1) + (x_n - avg_(n-1)) / n
    # float() keeps this a true division regardless of interpreter or dtype
    season_averages[curr_game_num]['team_stats'] = \
      prev_average + (game_stats - prev_average) / float(curr_game_num)
  team_games_played_dict[team_id] += 1

# .items() (rather than the Python 2-only .iteritems()) runs on Python 2 and 3
for json_file_name, game_data in boxscore_data_ordered_by_key.items():
  # the game ID is the file name without its extension
  game_id = json_file_name.split('.')[0]
  game_num = game_id[-4:]
  season = game_id[3:5]
  if game_num == '0001':
    # game '0001' marks a new season: every team starts counting from 1 again
    team_games_played_dict = defaultdict(lambda: 1)
  for boxscore_category in boxscore_categories:
    if boxscore_category != 'boxscoretraditionalv2':
      # ignore other categories for now
      break
    category_data = game_data[boxscore_category]
    if 'resultSets' not in category_data:
      continue
    result_sets = category_data['resultSets']
    team_rows = result_sets[1]['rowSet']
    if len(team_rows) == 0:
      # no team-level rows for this game; nothing to accumulate
      continue
    # first team in list is away, second is home
    team_stats_away = team_rows[0]
    team_stats_home = team_rows[1]
    away_team_id = team_stats_away[1]
    home_team_id = team_stats_home[1]
    pruned_team_stats_away_features = prune_team_stats(team_stats_away)
    pruned_team_stats_home_features = prune_team_stats(team_stats_home)

    # rows are read as: away starters, away bench, home starters, home bench
    # NOTE(review): row order is assumed from the indexing below -- confirm
    starter_bench_rows = result_sets[2]['rowSet']
    team_starter_stats_away = starter_bench_rows[0]
    team_bench_stats_away = starter_bench_rows[1]
    team_starter_stats_home = starter_bench_rows[2]
    team_bench_stats_home = starter_bench_rows[3]
    pruned_starter_stats_away = prune_starter_bench_stats(team_starter_stats_away)
    pruned_bench_stats_away = prune_starter_bench_stats(team_bench_stats_away)
    pruned_starter_stats_home = prune_starter_bench_stats(team_starter_stats_home)
    # the home bench must be pruned from the home bench row (was the away row)
    pruned_bench_stats_home = prune_starter_bench_stats(team_bench_stats_home)

    _update_team_running_average(away_team_id, season, pruned_team_stats_away_features)
    _update_team_running_average(home_team_id, season, pruned_team_stats_home_features)

In [5]:
# running season averages, keyed by game number, for the OKC Thunder in the 2016-17 season
okc_thunder_team_id = 1610612760
season_2016_17 = '16'
team_boxscore_rolling_averages[okc_thunder_team_id][season_2016_17]

defaultdict(dict,
            {1: {'team_stats': array([  39.   ,   94.   ,    0.415,    6.   ,   22.   ,    0.273,
                       19.   ,   26.   ,    0.731,   13.   ,   40.   ,   53.   ,
                       19.   ,    7.   ,    3.   ,   10.   ,   23.   ,  103.   ,    6.   ])},
             2: {'team_stats': array([  39.5   ,   95.    ,    0.416 ,    5.5   ,   21.5   ,    0.2555,
                       23.5   ,   32.    ,    0.734 ,   13.5   ,   41.5   ,   55.    ,
                       16.    ,    9.    ,    3.    ,   16.    ,   23.5   ,  108.    ,
                        4.5   ])},
             3: {'team_stats': array([  41.33333333,   95.        ,    0.43533333,    6.66666667,
                       23.        ,    0.28566667,   20.33333333,   27.        ,
                        0.764     ,   12.33333333,   39.33333333,   51.66666667,
                       20.33333333,   11.        ,    4.        ,   17.        ,
                       23.        ,  109.66666667,    8