In [4]:
import json
import pandas as pd
import os

#Tidy data for one single game, hepler function of tidy_data
def tidy_one_game_data(raw_data: dict):
    '''
    To extract the features wanted from one game into a pandas dataframe, helper function of tidyData
    returns: pd.DataFrame, or None if liveData doesn't exist
    '''
    
    
    if 'liveData' not in raw_data:
        return None
    
    df = pd.DataFrame(columns=['gameID_eventID', 'period', 'dateTime', 'gameID', 'team', 'goal', 'x', 'y', 'shooter', 'goalie', 'shotType', 'emptyNet', 'strength', 'gameType','home','away', 'season'])
    types = ['Shot', 'Goal']
    gameID = raw_data['gamePk']
    gameType = raw_data['gameData']['game']['type']
    home = raw_data['gameData']['teams']['home']['name']
    away = raw_data['gameData']['teams']['away']['name']
    season = raw_data['gameData']['game']['season']
    i = 0
    
    #For each play, if event is type of 'Shot' or 'Goal', acquire the features needed
    for play in raw_data['liveData']['plays']['allPlays']:
        if play['result']['event'] in types:
            about = play['about']
            gameID_eventID = str(gameID)+'_'+str(about['eventId'])
            period = about['period']
            dateTime = about['dateTime']
            team = play['team']['name']
            goal = False
            shooter = ''
            goalie = ''
            emptyNet = ''
            strength = ''
            
            if play['result']['event'] == 'Goal':
                goal = True
                emptyNet = play['result'].get('emptyNet', None)
                strength = play['result'].get('strength',None).get('name', None)
                for player in play['players']:
                    if player['playerType'] == 'Scorer':
                        shooter = player['player']['fullName']
                    if player['playerType'] == 'Goalie':
                        goalie =  player['player']['fullName']
            else:
                for player in play['players']:
                    if player['playerType'] == 'Shooter':
                        shooter = player['player']['fullName']
                    if player['playerType'] == 'Goalie':
                        goalie =  player['player']['fullName']
            x = play['coordinates'].get('x', None)
            y = play['coordinates'].get('y',None)
            shotType = play['result'].get('secondaryType',None)
            
            df.loc[i] = [gameID_eventID, period, dateTime, gameID, team, goal, x, y, shooter, goalie, shotType, emptyNet, strength, gameType,home,away, season]
            i += 1
            
    return df
    

In [5]:
def tidy_data(dir_path: str) -> pd.DataFrame:
    '''
    To extract the features wanted from all games stored in the JSON files of directory dir_path (nested or not) into a pandas dataframe
    returns: pd.DataFrame
    '''
    
    
    df = pd.DataFrame()
    init = True
    
    for dir, subdirs, files in os.walk(dir_path):
           for name in files:
                file_path = os.path.join(dir, name)
                with open(file_path, 'r') as file:
                    data = file.read()
                raw_data = json.loads(data)
                if init:
                    df = tidy_one_game_data(raw_data)
                    if df is not None:
                        init = False
                else:
                    new_df = tidy_one_game_data(raw_data)
                    if new_df is not None:
                        df = pd.concat([df, new_df],ignore_index=True)
    #write the dataframe to CSV so we don't have to re-run this code everytime
    df.to_csv('data/tidy.csv', index=False)                     
    return df

df_tidy = tidy_data('data')
   

In [6]:
df_tidy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387829 entries, 0 to 387828
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   gameID_eventID  387829 non-null  object
 1   period          387829 non-null  object
 2   dateTime        387829 non-null  object
 3   gameID          387829 non-null  object
 4   team            387829 non-null  object
 5   goal            387829 non-null  object
 6   x               387813 non-null  object
 7   y               387814 non-null  object
 8   shooter         387829 non-null  object
 9   goalie          387829 non-null  object
 10  shotType        387780 non-null  object
 11  emptyNet        386826 non-null  object
 12  strength        387829 non-null  object
 13  gameType        387829 non-null  object
 14  home            387829 non-null  object
 15  away            387829 non-null  object
 16  season          387829 non-null  object
dtypes: object(17)
memory usage: 5

In [7]:
df_tidy

Unnamed: 0,gameID_eventID,period,dateTime,gameID,team,goal,x,y,shooter,goalie,shotType,emptyNet,strength,gameType,home,away,season
0,2016030111_53,1,2017-04-12T23:11:54Z,2016030111,Montréal Canadiens,False,-8.0,-36.0,Dwight King,Henrik Lundqvist,Wrist Shot,,,P,Montréal Canadiens,New York Rangers,20162017
1,2016030111_55,1,2017-04-12T23:13:24Z,2016030111,New York Rangers,False,85.0,-6.0,Rick Nash,Carey Price,Wrist Shot,,,P,Montréal Canadiens,New York Rangers,20162017
2,2016030111_56,1,2017-04-12T23:13:32Z,2016030111,Montréal Canadiens,False,-69.0,-35.0,Max Pacioretty,Henrik Lundqvist,Wrist Shot,,,P,Montréal Canadiens,New York Rangers,20162017
3,2016030111_57,1,2017-04-12T23:15:08Z,2016030111,Montréal Canadiens,False,-55.0,-17.0,Dwight King,Henrik Lundqvist,Slap Shot,,,P,Montréal Canadiens,New York Rangers,20162017
4,2016030111_62,1,2017-04-12T23:18:02Z,2016030111,Montréal Canadiens,False,-58.0,-28.0,Paul Byron,Henrik Lundqvist,Wrist Shot,,,P,Montréal Canadiens,New York Rangers,20162017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387824,2020020868_490,3,2021-05-09T05:02:46Z,2020020868,San Jose Sharks,False,71.0,-16.0,Noah Gregor,Adin Hill,Backhand,,,R,San Jose Sharks,Arizona Coyotes,20202021
387825,2020020868_498,4,2021-05-09T05:05:28Z,2020020868,San Jose Sharks,False,-68.0,-14.0,Erik Karlsson,Adin Hill,Wrist Shot,,,R,San Jose Sharks,Arizona Coyotes,20202021
387826,2020020868_500,4,2021-05-09T05:06:21Z,2020020868,Arizona Coyotes,False,56.0,11.0,Conor Garland,Alexei Melnichuk,Wrist Shot,,,R,San Jose Sharks,Arizona Coyotes,20202021
387827,2020020868_751,4,2021-05-09T05:07:10Z,2020020868,Arizona Coyotes,False,73.0,6.0,Conor Garland,Alexei Melnichuk,Wrist Shot,,,R,San Jose Sharks,Arizona Coyotes,20202021


In [8]:
df_tidy.head(10)

Unnamed: 0,gameID_eventID,period,dateTime,gameID,team,goal,x,y,shooter,goalie,shotType,emptyNet,strength,gameType,home,away,season
0,2016030111_53,1,2017-04-12T23:11:54Z,2016030111,Montréal Canadiens,False,-8.0,-36.0,Dwight King,Henrik Lundqvist,Wrist Shot,,,P,Montréal Canadiens,New York Rangers,20162017
1,2016030111_55,1,2017-04-12T23:13:24Z,2016030111,New York Rangers,False,85.0,-6.0,Rick Nash,Carey Price,Wrist Shot,,,P,Montréal Canadiens,New York Rangers,20162017
2,2016030111_56,1,2017-04-12T23:13:32Z,2016030111,Montréal Canadiens,False,-69.0,-35.0,Max Pacioretty,Henrik Lundqvist,Wrist Shot,,,P,Montréal Canadiens,New York Rangers,20162017
3,2016030111_57,1,2017-04-12T23:15:08Z,2016030111,Montréal Canadiens,False,-55.0,-17.0,Dwight King,Henrik Lundqvist,Slap Shot,,,P,Montréal Canadiens,New York Rangers,20162017
4,2016030111_62,1,2017-04-12T23:18:02Z,2016030111,Montréal Canadiens,False,-58.0,-28.0,Paul Byron,Henrik Lundqvist,Wrist Shot,,,P,Montréal Canadiens,New York Rangers,20162017
5,2016030111_64,1,2017-04-12T23:18:39Z,2016030111,Montréal Canadiens,False,-45.0,-33.0,Andrei Markov,Henrik Lundqvist,Wrist Shot,,,P,Montréal Canadiens,New York Rangers,20162017
6,2016030111_67,1,2017-04-12T23:20:15Z,2016030111,Montréal Canadiens,False,54.0,-27.0,Jordie Benn,Henrik Lundqvist,Wrist Shot,,,P,Montréal Canadiens,New York Rangers,20162017
7,2016030111_68,1,2017-04-12T23:20:25Z,2016030111,New York Rangers,False,49.0,1.0,Derek Stepan,Carey Price,Wrist Shot,,,P,Montréal Canadiens,New York Rangers,20162017
8,2016030111_71,1,2017-04-12T23:21:50Z,2016030111,Montréal Canadiens,False,-35.0,11.0,Nikita Nesterov,Henrik Lundqvist,Wrist Shot,,,P,Montréal Canadiens,New York Rangers,20162017
9,2016030111_72,1,2017-04-12T23:22:01Z,2016030111,Montréal Canadiens,False,-54.0,-33.0,Alexander Radulov,Henrik Lundqvist,Slap Shot,,,P,Montréal Canadiens,New York Rangers,20162017
