In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt

pd.options.display.max_columns = None

More more helpful information on navigating the NHL Hockey API, see:

https://gitlab.com/dword4/nhlapi/tree/master/

## Play-by-Play Data

`GET https://statsapi.web.nhl.com/api/v1/game/ID/feed/live Returns` all data about a specified game id including play data with on-ice coordinates and post-game details like first, second and third stars and any details about shootouts. The data returned is simply too large at often over 30k lines and is best explored with a JSON viewer.

### Game ID Dictionary
* First 4 digits signify the season start year, ex: `2018` (for the 2018-2019 season)
* Next 2 digits signify the following:
    - 01: Preseason
    - 02: Regular Season
    - 03: Post-Season (Playoffs)
    - 04: All-Star Games
* The final 4 digits signify the game number. Valid range is `0001`-`1271` (until 2020, when the NHL will mandate that there will be 1312 games per season)

Note: for help visualizing JSON see - http://jsonviewer.stack.hu

## Buffalo Sabres vs Carolina Hurricanes

In [2]:
year = '2018'
season = '02'
game_number = '0683'
game_id = year + season + game_number

url = f'https://statsapi.web.nhl.com/api/v1/game/{game_id}/feed/live'
response = requests.get(url)

In [3]:
url

'https://statsapi.web.nhl.com/api/v1/game/2018020683/feed/live'

In [4]:
json = response.json()

## Team Data
First, we will start simple and extract the names and codes of the home and visiting teams.

In [5]:
away_team = json.get('gameData').get('teams').get('away').get('name')
home_team = json.get('gameData').get('teams').get('home').get('name')

away_tri_code = json.get('gameData').get('teams').get('away').get('triCode')
home_tri_code = json.get('gameData').get('teams').get('home').get('triCode')

print(f'Home: {home_team} ({home_tri_code})')
print(f'Away: {away_team} ({away_tri_code})')

Home: Carolina Hurricanes (CAR)
Away: Buffalo Sabres (BUF)


## Player Data
### Part 1: Player Game Stats
Next, we will create a pandas DataFrame of all player information, both home and visitors, for the given game. This will be useful down the line when analyzing each event or play. First we will begin with game-specific stats for each team.

In [6]:
away_players = json.get('liveData').get('boxscore').get('teams').get('away').get('players')

In [7]:
home_players = json.get('liveData').get('boxscore').get('teams').get('home').get('players')

In [8]:
home_player_stats = {}

for vals in home_players.values():
    for key, info in vals.items():
        # add to player stats
        if (key == 'person') or (key == 'position'):
            for sub_key, value in info.items():
                if sub_key in home_player_stats:
                    home_player_stats[sub_key].append(value)
                else:
                    home_player_stats[sub_key] = []
        elif key == 'jerseyNumber':
            if key in home_player_stats:
                home_player_stats[key].append(info)
            else:
                home_player_stats[key] = []
        elif key == 'stats':
            if info.get('skaterStats') is not None:
                sub_info = info.get('skaterStats')
            for sub_key, value in sub_info.items():
                if sub_key in home_player_stats:
                    home_player_stats[sub_key].append(value)
                else:
                    home_player_stats[sub_key] = []

# delete this column due to uneven number of entries
del home_player_stats['faceOffPct']

# construct pandas DataFrame
home_player_stats = pd.DataFrame(home_player_stats)
home_player_stats['team'] = home_tri_code
home_player_stats['home_away'] = 'Home'

In [9]:
home_player_stats.head()

Unnamed: 0,id,fullName,link,shootsCatches,rosterStatus,jerseyNumber,code,name,type,abbreviation,timeOnIce,assists,goals,shots,hits,powerPlayGoals,powerPlayAssists,penaltyMinutes,faceOffWins,faceoffTaken,takeaways,giveaways,shortHandedGoals,shortHandedAssists,blocked,plusMinus,evenTimeOnIce,powerPlayTimeOnIce,shortHandedTimeOnIce,team,home_away
0,8476934,Brock McGinn,/api/v1/people/8476934,L,Y,23,L,Left Wing,Forward,LW,16:45,0,0,2,3,0,0,0,0,1,2,1,0,0,0,0,13:47,0:00,2:58,CAR,Home
1,8475735,Greg McKegg,/api/v1/people/8475735,L,Y,42,C,Center,Forward,C,14:39,0,0,2,1,0,0,0,3,11,1,0,0,0,0,0,14:39,0:00,0:00,CAR,Home
2,8476958,Jaccob Slavin,/api/v1/people/8476958,L,Y,74,D,Defenseman,Defenseman,D,23:27,1,0,1,3,0,0,0,0,0,0,0,0,0,1,2,18:58,1:20,3:09,CAR,Home
3,8477998,Warren Foegele,/api/v1/people/8477998,L,Y,13,L,Left Wing,Forward,LW,10:35,0,0,3,5,0,0,0,0,0,3,0,0,0,0,-2,9:16,0:00,1:19,CAR,Home
4,8478427,Sebastian Aho,/api/v1/people/8478427,L,Y,20,C,Center,Forward,C,21:42,0,2,2,1,0,0,0,13,20,2,3,0,0,0,4,16:49,2:45,2:08,CAR,Home


In [10]:
away_player_stats = {}

for vals in away_players.values():
    for key, info in vals.items():
        # add to player stats
        if (key == 'person') or (key == 'position'):
            for sub_key, value in info.items():
                if sub_key in away_player_stats:
                    away_player_stats[sub_key].append(value)
                else:
                    away_player_stats[sub_key] = []
        elif key == 'jerseyNumber':
            if key in away_player_stats:
                away_player_stats[key].append(info)
            else:
                away_player_stats[key] = []
        elif key == 'stats':
            if info.get('skaterStats') is not None:
                sub_info = info.get('skaterStats')
            for sub_key, value in sub_info.items():
                if sub_key in away_player_stats:
                    away_player_stats[sub_key].append(value)
                else:
                    away_player_stats[sub_key] = []

# delete this column due to uneven number of entries
del away_player_stats['faceOffPct']

# construct pandas DataFrame
away_player_stats = pd.DataFrame(away_player_stats)
away_player_stats['team'] = away_tri_code
away_player_stats['home_away'] = 'Away'

In [11]:
away_player_stats.head()

Unnamed: 0,id,fullName,link,shootsCatches,rosterStatus,jerseyNumber,code,name,type,abbreviation,timeOnIce,assists,goals,shots,hits,powerPlayGoals,powerPlayAssists,penaltyMinutes,faceOffWins,faceoffTaken,takeaways,giveaways,shortHandedGoals,shortHandedAssists,blocked,plusMinus,evenTimeOnIce,powerPlayTimeOnIce,shortHandedTimeOnIce,team,home_away
0,8475728,Johan Larsson,/api/v1/people/8475728,L,Y,22,C,Center,Forward,C,12:41,0,0,0,1,0,0,2,8,13,0,0,0,0,1,2,10:52,0:00,1:49,BUF,Away
1,8474618,Marco Scandella,/api/v1/people/8474618,L,Y,6,D,Defenseman,Defenseman,D,14:03,1,0,2,1,0,0,0,0,0,0,0,0,0,1,-1,12:33,0:08,1:22,BUF,Away
2,8477839,Conor Sheary,/api/v1/people/8477839,L,Y,43,L,Left Wing,Forward,LW,15:49,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,14:08,1:41,0:00,BUF,Away
3,8471436,Matt Hunwick,/api/v1/people/8471436,L,Y,48,,Unknown,Unknown,,15:49,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,14:08,1:41,0:00,BUF,Away
4,8476931,Jake McCabe,/api/v1/people/8476931,L,Y,19,D,Defenseman,Defenseman,D,15:49,0,0,0,0,0,0,0,0,0,1,0,0,0,1,-2,12:42,0:00,3:07,BUF,Away


In [12]:
game_player_stats = pd.concat([home_player_stats, away_player_stats])

Finally, before presenting the finalized table, we will convert all the `camelCase` columns to be in `snake_case` format.

In [13]:
col_names = ['id', 'full_name', 'link', 'shoots_catches', 'roster_status',
       'jersey_number', 'code', 'name', 'type', 'abbreviation', 'time_on_ice',
       'assists', 'goals', 'shots', 'hits', 'power_play_goals',
       'power_play_assists', 'penalty_minutes', 'face_off_wins', 'face_off_taken',
       'takeaways', 'giveaways', 'short_handed_goals', 'short_handed_assists',
       'blocked', 'plus_minus', 'even_time_on_ice', 'power_play_time_on_ice',
       'short_handed_time_on_ice', 'team', 'home_away']
game_player_stats.columns = col_names
game_player_stats.head()

Unnamed: 0,id,full_name,link,shoots_catches,roster_status,jersey_number,code,name,type,abbreviation,time_on_ice,assists,goals,shots,hits,power_play_goals,power_play_assists,penalty_minutes,face_off_wins,face_off_taken,takeaways,giveaways,short_handed_goals,short_handed_assists,blocked,plus_minus,even_time_on_ice,power_play_time_on_ice,short_handed_time_on_ice,team,home_away
0,8476934,Brock McGinn,/api/v1/people/8476934,L,Y,23,L,Left Wing,Forward,LW,16:45,0,0,2,3,0,0,0,0,1,2,1,0,0,0,0,13:47,0:00,2:58,CAR,Home
1,8475735,Greg McKegg,/api/v1/people/8475735,L,Y,42,C,Center,Forward,C,14:39,0,0,2,1,0,0,0,3,11,1,0,0,0,0,0,14:39,0:00,0:00,CAR,Home
2,8476958,Jaccob Slavin,/api/v1/people/8476958,L,Y,74,D,Defenseman,Defenseman,D,23:27,1,0,1,3,0,0,0,0,0,0,0,0,0,1,2,18:58,1:20,3:09,CAR,Home
3,8477998,Warren Foegele,/api/v1/people/8477998,L,Y,13,L,Left Wing,Forward,LW,10:35,0,0,3,5,0,0,0,0,0,3,0,0,0,0,-2,9:16,0:00,1:19,CAR,Home
4,8478427,Sebastian Aho,/api/v1/people/8478427,L,Y,20,C,Center,Forward,C,21:42,0,2,2,1,0,0,0,13,20,2,3,0,0,0,4,16:49,2:45,2:08,CAR,Home


## Player Data
### Part 2: Player Info
Now we will also get player-specific information such as their nationality, height, weight etc.

In [14]:
players = json.get('gameData').get('players')

In [15]:
player_data = {}

for player, data in players.items():
    for key, val in data.items():
        if key in player_data:
            player_data[key].append(val)
        else:
            player_data[key] = []

It looks like we have captured all the information we needed. However, the `currentTeam` and the `primaryPosition` keys don't contain the data in the format we need, so we will need to clean those two columns a little bit more.

In [16]:
for key, val in player_data.items():
    print(key, val[:5])

id [8477839, 8471436, 8477998, 8478427, 8475753]
fullName ['Conor Sheary', 'Matt Hunwick', 'Warren Foegele', 'Sebastian Aho', 'Justin Faulk']
link ['/api/v1/people/8477839', '/api/v1/people/8471436', '/api/v1/people/8477998', '/api/v1/people/8478427', '/api/v1/people/8475753']
firstName ['Conor', 'Matt', 'Warren', 'Sebastian', 'Justin']
lastName ['Sheary', 'Hunwick', 'Foegele', 'Aho', 'Faulk']
primaryNumber ['43', '48', '13', '20', '27']
birthDate ['1992-06-08', '1985-05-21', '1996-04-01', '1997-07-26', '1992-03-20']
currentAge [26, 33, 22, 21, 26]
birthCity ['Winchester', 'Warren', 'Markham', 'Rauma', 'South St.Paul']
birthStateProvince ['MA', 'MI', 'ON', 'MN', 'NY']
birthCountry ['USA', 'USA', 'CAN', 'FIN', 'USA']
nationality ['USA', 'USA', 'CAN', 'FIN', 'USA']
height ['5\' 8"', '5\' 11"', '6\' 2"', '6\' 0"', '6\' 0"']
weight [176, 194, 198, 176, 217]
active [True, True, True, True, True]
alternateCaptain [False, False, False, False, True]
captain [False, False, False, False, False]


In [17]:
current_team = [x.get('name') for x in player_data['currentTeam']]
current_team_code = [x.get('triCode') for x in player_data['currentTeam']]
primary_position = [x.get('abbreviation') for x in player_data['primaryPosition']]

We will now re-assign the cleaned data back to our player data dictionary, and then use it to construct a pandas DataFrame to be able to explore the data easier.

In [18]:
player_data['currentTeam'] = current_team
player_data['currentTeamCode'] = current_team_code
player_data['primaryPosition'] = primary_position

We notice that because some players are not born in the US/Canada, there is not field for `birthStateProvince` for them. Therefore we will just drop this key from our data.

In [19]:
for key, val in player_data.items():
    print(key, ' : ', len(val))

id  :  41
fullName  :  41
link  :  41
firstName  :  41
lastName  :  41
primaryNumber  :  41
birthDate  :  41
currentAge  :  41
birthCity  :  41
birthStateProvince  :  29
birthCountry  :  41
nationality  :  41
height  :  41
weight  :  41
active  :  41
alternateCaptain  :  41
captain  :  41
rookie  :  41
shootsCatches  :  41
rosterStatus  :  41
currentTeam  :  41
primaryPosition  :  41
currentTeamCode  :  41


In [20]:
del player_data['birthStateProvince']

Construct a pandas DataFrame

In [21]:
columns = [
    'player_id',
    'full_name',
    'link',
    'first_name',
    'last_name',
    'primary_number',
    'birth_date',
    'current_age',
    'birth_city',
    'birth_country',
    'nationality',
    'height',
    'weight',
    'active',
    'alternate_captain',
    'captain',
    'rookie',
    'shoots_catches',
    'roster_status',
    'current_team',
    'primary_position',
    'current_team_code'
]

player_data = pd.DataFrame(player_data)
player_data.columns = columns

In [22]:
player_data.head()

Unnamed: 0,player_id,full_name,link,first_name,last_name,primary_number,birth_date,current_age,birth_city,birth_country,nationality,height,weight,active,alternate_captain,captain,rookie,shoots_catches,roster_status,current_team,primary_position,current_team_code
0,8477839,Conor Sheary,/api/v1/people/8477839,Conor,Sheary,43,1992-06-08,26,Winchester,USA,USA,"5' 8""",176,True,False,False,False,L,Y,Buffalo Sabres,LW,BUF
1,8471436,Matt Hunwick,/api/v1/people/8471436,Matt,Hunwick,48,1985-05-21,33,Warren,USA,USA,"5' 11""",194,True,False,False,False,L,Y,Buffalo Sabres,D,BUF
2,8477998,Warren Foegele,/api/v1/people/8477998,Warren,Foegele,13,1996-04-01,22,Markham,CAN,CAN,"6' 2""",198,True,False,False,True,L,Y,Carolina Hurricanes,LW,CAR
3,8478427,Sebastian Aho,/api/v1/people/8478427,Sebastian,Aho,20,1997-07-26,21,Rauma,FIN,FIN,"6' 0""",176,True,False,False,False,L,Y,Carolina Hurricanes,C,CAR
4,8475753,Justin Faulk,/api/v1/people/8475753,Justin,Faulk,27,1992-03-20,26,South St.Paul,USA,USA,"6' 0""",217,True,True,False,False,R,Y,Carolina Hurricanes,D,CAR


The data looks good, so we'll now join the player info to the player game stats data that we saved earlier. Before doing so, we'll drop a few columns in each table that we don't want duplicating when we merge the two tables together.

In [23]:
drop_cols = ['link', 'full_name', 'current_team', 'current_team_code',
             'shoots_catches', 'roster_status', 'primary_number', 'primary_position']
player_data.drop(columns=drop_cols, inplace=True)

In [24]:
player_data = game_player_stats.merge(player_data, how='left',
                        left_on='id', right_on='player_id')

Finally, we will drop one of the two `id` columns so that we don't have any confusion moving forward.

In [25]:
player_data.drop(columns=['player_id'], inplace=True)
player_data.rename({'id' : 'player_id'}, axis=1, inplace=True)

In [26]:
player_data.head()

Unnamed: 0,player_id,full_name,link,shoots_catches,roster_status,jersey_number,code,name,type,abbreviation,time_on_ice,assists,goals,shots,hits,power_play_goals,power_play_assists,penalty_minutes,face_off_wins,face_off_taken,takeaways,giveaways,short_handed_goals,short_handed_assists,blocked,plus_minus,even_time_on_ice,power_play_time_on_ice,short_handed_time_on_ice,team,home_away,first_name,last_name,birth_date,current_age,birth_city,birth_country,nationality,height,weight,active,alternate_captain,captain,rookie
0,8476934,Brock McGinn,/api/v1/people/8476934,L,Y,23,L,Left Wing,Forward,LW,16:45,0,0,2,3,0,0,0,0,1,2,1,0,0,0,0,13:47,0:00,2:58,CAR,Home,Brock,McGinn,1994-02-02,24.0,Fergus,CAN,CAN,"6' 0""",187.0,True,False,False,False
1,8475735,Greg McKegg,/api/v1/people/8475735,L,Y,42,C,Center,Forward,C,14:39,0,0,2,1,0,0,0,3,11,1,0,0,0,0,0,14:39,0:00,0:00,CAR,Home,,,,,,,,,,,,,
2,8476958,Jaccob Slavin,/api/v1/people/8476958,L,Y,74,D,Defenseman,Defenseman,D,23:27,1,0,1,3,0,0,0,0,0,0,0,0,0,1,2,18:58,1:20,3:09,CAR,Home,Jaccob,Slavin,1994-05-01,24.0,Denver,USA,USA,"6' 3""",207.0,True,False,False,False
3,8477998,Warren Foegele,/api/v1/people/8477998,L,Y,13,L,Left Wing,Forward,LW,10:35,0,0,3,5,0,0,0,0,0,3,0,0,0,0,-2,9:16,0:00,1:19,CAR,Home,Warren,Foegele,1996-04-01,22.0,Markham,CAN,CAN,"6' 2""",198.0,True,False,False,True
4,8478427,Sebastian Aho,/api/v1/people/8478427,L,Y,20,C,Center,Forward,C,21:42,0,2,2,1,0,0,0,13,20,2,3,0,0,0,4,16:49,2:45,2:08,CAR,Home,Sebastian,Aho,1997-07-26,21.0,Rauma,FIN,FIN,"6' 0""",176.0,True,False,False,False


Everything looks good, with the exception of some partially missing player information for **Greg McKegg**. This shouldn't be an issue moving forward though. Now that we have a table of all the player information, let's further explore the live game data.
___
## Play Data

In [27]:
plays = json.get('liveData').get('plays').get('allPlays')

An example of a faceoff event would look like this:

In [28]:
plays[3]

{'players': [{'player': {'id': 8478403,
    'fullName': 'Jack Eichel',
    'link': '/api/v1/people/8478403'},
   'playerType': 'Winner'},
  {'player': {'id': 8478427,
    'fullName': 'Sebastian Aho',
    'link': '/api/v1/people/8478427'},
   'playerType': 'Loser'}],
 'result': {'event': 'Faceoff',
  'eventCode': 'CAR53',
  'eventTypeId': 'FACEOFF',
  'description': 'Jack Eichel faceoff won against Sebastian Aho'},
 'about': {'eventIdx': 3,
  'eventId': 53,
  'period': 1,
  'periodType': 'REGULAR',
  'ordinalNum': '1st',
  'periodTime': '00:00',
  'periodTimeRemaining': '20:00',
  'dateTime': '2019-01-12T00:39:19Z',
  'goals': {'away': 0, 'home': 0}},
 'coordinates': {'x': 0.0, 'y': 0.0},
 'team': {'id': 7,
  'name': 'Buffalo Sabres',
  'link': '/api/v1/teams/7',
  'triCode': 'BUF'}}

We will now create a pandas DataFrame to track all of the faceoffs.

In [29]:
faceoffs = {
    'event_id' : [],
    'period' : [],
    'period_time' : [],
    'winning_player_id' : [],
    'winning_player_name' : [],
    'losing_player_id' : [],
    'losing_player_name' : [],
    'x_coord' : [],
    'y_coord' : []
}

for play in plays:
    if play.get('result').get('event') != 'Faceoff':
        continue
        
    faceoffs['event_id'].append(play.get('about').get('eventId'))
    faceoffs['period'].append(play.get('about').get('period'))
    faceoffs['period_time'].append(play.get('about').get('periodTime'))
    
    winning_player = play.get('players')[0]
    losing_player = play.get('players')[1]
    
    faceoffs['winning_player_id'].append(winning_player.get('player').get('id'))
    faceoffs['winning_player_name'].append(winning_player.get('player').get('fullName'))
    
    faceoffs['losing_player_id'].append(losing_player.get('player').get('id'))
    faceoffs['losing_player_name'].append(losing_player.get('player').get('fullName'))
    
    faceoffs['x_coord'].append(play.get('coordinates').get('x'))
    faceoffs['y_coord'].append(play.get('coordinates').get('y'))
    
faceoffs = pd.DataFrame(faceoffs)

In [30]:
faceoffs.head(20)

Unnamed: 0,event_id,period,period_time,winning_player_id,winning_player_name,losing_player_id,losing_player_name,x_coord,y_coord
0,53,1,00:00,8478403,Jack Eichel,8478427,Sebastian Aho,0.0,0.0
1,66,1,04:15,8471743,Vladimir Sobotka,8478027,Lucas Wallmark,69.0,-22.0
2,71,1,04:56,8475735,Greg McKegg,8478542,Evan Rodrigues,69.0,-22.0
3,25,1,08:04,8478027,Lucas Wallmark,8473449,Kyle Okposo,69.0,-22.0
4,29,1,09:09,8476437,Victor Rask,8475728,Johan Larsson,-69.0,22.0
5,84,1,09:19,8478542,Evan Rodrigues,8476437,Victor Rask,-69.0,-22.0
6,88,1,10:25,8478403,Jack Eichel,8478427,Sebastian Aho,0.0,0.0
7,91,1,10:40,8478427,Sebastian Aho,8478403,Jack Eichel,69.0,-22.0
8,34,1,11:46,8478027,Lucas Wallmark,8471743,Vladimir Sobotka,69.0,-22.0
9,98,1,12:09,8475728,Johan Larsson,8468508,Justin Williams,-69.0,22.0


Now we have a clean dataframe with all of the data that we are interested in for the purpose of this analysis, namely: faceoffs. Let's dig into it a bit and see if we can visualize any trends.

Let's start off by seeing how many faceoffs occurred during this game.

In [31]:
faceoffs.shape

(62, 9)

Of those 62 faceoffs, who was the most common winner, and who was the most common loser?

In [32]:
faceoffs['winning_player_name'].value_counts()

Sebastian Aho       13
Vladimir Sobotka    10
Jack Eichel          8
Johan Larsson        8
Lucas Wallmark       7
Victor Rask          6
Greg McKegg          3
Justin Williams      2
Jeff Skinner         1
Micheal Ferland      1
Evan Rodrigues       1
Kyle Okposo          1
Conor Sheary         1
Name: winning_player_name, dtype: int64

In [33]:
faceoffs['losing_player_name'].value_counts()

Jack Eichel         10
Greg McKegg          8
Sebastian Aho        7
Lucas Wallmark       6
Johan Larsson        5
Evan Rodrigues       5
Jeff Skinner         4
Justin Williams      4
Vladimir Sobotka     4
Victor Rask          3
Sam Reinhart         2
Kyle Okposo          2
Brock McGinn         1
Micheal Ferland      1
Name: losing_player_name, dtype: int64

Now, if we want to analyze faceoff outcomes by teams, we'll need to join team information according to the player. Luckily we have the player ID so we can use that to join on. Although we can use `pd.merge()` or `df.merge()`, we will instead use a `dict` with a `map` to simplify the process.

In [34]:
player_data.head()

Unnamed: 0,player_id,full_name,link,shoots_catches,roster_status,jersey_number,code,name,type,abbreviation,time_on_ice,assists,goals,shots,hits,power_play_goals,power_play_assists,penalty_minutes,face_off_wins,face_off_taken,takeaways,giveaways,short_handed_goals,short_handed_assists,blocked,plus_minus,even_time_on_ice,power_play_time_on_ice,short_handed_time_on_ice,team,home_away,first_name,last_name,birth_date,current_age,birth_city,birth_country,nationality,height,weight,active,alternate_captain,captain,rookie
0,8476934,Brock McGinn,/api/v1/people/8476934,L,Y,23,L,Left Wing,Forward,LW,16:45,0,0,2,3,0,0,0,0,1,2,1,0,0,0,0,13:47,0:00,2:58,CAR,Home,Brock,McGinn,1994-02-02,24.0,Fergus,CAN,CAN,"6' 0""",187.0,True,False,False,False
1,8475735,Greg McKegg,/api/v1/people/8475735,L,Y,42,C,Center,Forward,C,14:39,0,0,2,1,0,0,0,3,11,1,0,0,0,0,0,14:39,0:00,0:00,CAR,Home,,,,,,,,,,,,,
2,8476958,Jaccob Slavin,/api/v1/people/8476958,L,Y,74,D,Defenseman,Defenseman,D,23:27,1,0,1,3,0,0,0,0,0,0,0,0,0,1,2,18:58,1:20,3:09,CAR,Home,Jaccob,Slavin,1994-05-01,24.0,Denver,USA,USA,"6' 3""",207.0,True,False,False,False
3,8477998,Warren Foegele,/api/v1/people/8477998,L,Y,13,L,Left Wing,Forward,LW,10:35,0,0,3,5,0,0,0,0,0,3,0,0,0,0,-2,9:16,0:00,1:19,CAR,Home,Warren,Foegele,1996-04-01,22.0,Markham,CAN,CAN,"6' 2""",198.0,True,False,False,True
4,8478427,Sebastian Aho,/api/v1/people/8478427,L,Y,20,C,Center,Forward,C,21:42,0,2,2,1,0,0,0,13,20,2,3,0,0,0,4,16:49,2:45,2:08,CAR,Home,Sebastian,Aho,1997-07-26,21.0,Rauma,FIN,FIN,"6' 0""",176.0,True,False,False,False


In [35]:
faceoffs.head()

Unnamed: 0,event_id,period,period_time,winning_player_id,winning_player_name,losing_player_id,losing_player_name,x_coord,y_coord
0,53,1,00:00,8478403,Jack Eichel,8478427,Sebastian Aho,0.0,0.0
1,66,1,04:15,8471743,Vladimir Sobotka,8478027,Lucas Wallmark,69.0,-22.0
2,71,1,04:56,8475735,Greg McKegg,8478542,Evan Rodrigues,69.0,-22.0
3,25,1,08:04,8478027,Lucas Wallmark,8473449,Kyle Okposo,69.0,-22.0
4,29,1,09:09,8476437,Victor Rask,8475728,Johan Larsson,-69.0,22.0


Effectively, the `winning_players` and `losing_players` sub-df's are exactly the same. However, we are creating two subsets of the same data so that we can join each on the `faceoffs` dataframe for the winning and losing players.

In [36]:
# join stats for the "winning player" of the faceoff
cols = ['player_id', 'jersey_number', 'team', 'home_away']
winning_players = player_data[cols]
winning_players.columns = ['winning_player_id', 'winning_player_number',
                           'winning_player_team', 'winning_player_home_away']

# join stats for the "losing player" of the faceoff
losing_players = player_data[cols]
losing_players.columns = ['losing_player_id', 'losing_player_number',
                          'losing_player_team', 'losing_player_home_away']

In [37]:
faceoffs = faceoffs.merge(winning_players, how='left', on='winning_player_id')
faceoffs = faceoffs.merge(losing_players, how='left', on='losing_player_id')

In [38]:
faceoffs.head()

Unnamed: 0,event_id,period,period_time,winning_player_id,winning_player_name,losing_player_id,losing_player_name,x_coord,y_coord,winning_player_number,winning_player_team,winning_player_home_away,losing_player_number,losing_player_team,losing_player_home_away
0,53,1,00:00,8478403,Jack Eichel,8478427,Sebastian Aho,0.0,0.0,9,BUF,Away,20,CAR,Home
1,66,1,04:15,8471743,Vladimir Sobotka,8478027,Lucas Wallmark,69.0,-22.0,17,BUF,Away,71,CAR,Home
2,71,1,04:56,8475735,Greg McKegg,8478542,Evan Rodrigues,69.0,-22.0,42,CAR,Home,71,BUF,Away
3,25,1,08:04,8478027,Lucas Wallmark,8473449,Kyle Okposo,69.0,-22.0,71,CAR,Home,21,BUF,Away
4,29,1,09:09,8476437,Victor Rask,8475728,Johan Larsson,-69.0,22.0,49,CAR,Home,22,BUF,Away


Inspecting the two merges - we see that the data was successfully joined together.
___

## Joining On-Ice Data
We will now bring in a separate dataset which scraped a different set of play-by-play data (also from NHL.com) that has all the on-ice player info. We will use this to better analyze each faceoff. This `pandas.DataFrame` was generated from `nhl-data-scrape.ipynb`, and can be found at: https://github.com/yanniskatsaros/hockey-analytics

In [39]:
on_ice = pd.read_csv('on_ice.csv')

In [40]:
on_ice.head(5)

Unnamed: 0,event_id,period,strength,time_elapsed,time_remaining,event_type,event,visitor_on_ice,home_on_ice,home_1,home_2,home_3,home_4,home_5,home_6,visitor_1,visitor_2,visitor_3,visitor_4,visitor_5,visitor_6
0,5,1,EV,0:00,20:00,FAC,BUF won Neu. Zone - BUF #9 EICHEL vs CAR #20 AHO,9C 53C 72C 6D 19D 40G,20R 79L 86L 22D 74D 35G,20.0,79.0,86.0,22.0,74.0,35.0,9.0,53.0,72.0,6.0,19.0,40.0
1,6,1,EV,0:22,19:38,GIVE,"CAR GIVEAWAY - #86 TERAVAINEN, Def. Zone",9C 53C 72C 6D 19D 40G,20R 79L 86L 22D 74D 35G,20.0,79.0,86.0,22.0,74.0,35.0,9.0,53.0,72.0,6.0,19.0,40.0
2,7,1,EV,0:29,19:31,TAKE,"CAR TAKEAWAY - #20 AHO, Def. Zone",9C 53C 72C 6D 19D 40G,20R 79L 86L 22D 74D 35G,20.0,79.0,86.0,22.0,74.0,35.0,9.0,53.0,72.0,6.0,19.0,40.0
3,8,1,EV,1:17,18:43,HIT,"CAR #27 FAULK HIT BUF #43 SHEARY, Neu. Zone",17C 23C 43L 24D 55D 40G,71C 37R 48L 27D 44D 35G,71.0,37.0,48.0,27.0,44.0,35.0,17.0,23.0,43.0,24.0,55.0,40.0
4,9,1,EV,1:32,18:28,GIVE,"CAR GIVEAWAY - #44 DE HAAN, Neu. Zone",71C 21R 22L 4D 26D 40G,42C 14R 23L 44D 57D 35G,42.0,14.0,23.0,44.0,57.0,35.0,71.0,21.0,22.0,4.0,26.0,40.0


We will join the two datasets together using the `time_elapsed` column since they are both generated from the same data source (NHL.com). We will subset only the faceoffs from the play-by-play data. First we need to make sure to zero-pad the `event_times` in order for the `join` to properly work.

In [41]:
fac_plays = on_ice[on_ice['event_type'] == 'FAC'].reset_index(drop=True)

In [42]:
fac_plays['time_elapsed'] = fac_plays['time_elapsed'].str.zfill(5)

In [43]:
combined_faceoffs = faceoffs.merge(right=fac_plays,
                       how='left',
                       left_on=['period_time', 'period'],
                       right_on=['time_elapsed', 'period'])

In [44]:
combined_faceoffs.head()

Unnamed: 0,event_id_x,period,period_time,winning_player_id,winning_player_name,losing_player_id,losing_player_name,x_coord,y_coord,winning_player_number,winning_player_team,winning_player_home_away,losing_player_number,losing_player_team,losing_player_home_away,event_id_y,strength,time_elapsed,time_remaining,event_type,event,visitor_on_ice,home_on_ice,home_1,home_2,home_3,home_4,home_5,home_6,visitor_1,visitor_2,visitor_3,visitor_4,visitor_5,visitor_6
0,53,1,00:00,8478403,Jack Eichel,8478427,Sebastian Aho,0.0,0.0,9,BUF,Away,20,CAR,Home,5,EV,00:00,20:00,FAC,BUF won Neu. Zone - BUF #9 EICHEL vs CAR #20 AHO,9C 53C 72C 6D 19D 40G,20R 79L 86L 22D 74D 35G,20.0,79.0,86.0,22.0,74.0,35.0,9.0,53.0,72.0,6.0,19.0,40.0
1,66,1,04:15,8471743,Vladimir Sobotka,8478027,Lucas Wallmark,69.0,-22.0,17,BUF,Away,71,CAR,Home,25,EV,04:15,15:45,FAC,BUF won Off. Zone - BUF #17 SOBOTKA vs CAR #71...,17C 23C 43L 4D 26D 40G,71C 37R 86L 19D 57D 35G,71.0,37.0,86.0,19.0,57.0,35.0,17.0,23.0,43.0,4.0,26.0,40.0
2,71,1,04:56,8475735,Greg McKegg,8478542,Evan Rodrigues,69.0,-22.0,42,CAR,Home,71,BUF,Away,31,EV,04:56,15:04,FAC,CAR won Def. Zone - BUF #71 RODRIGUES vs CAR #...,71C 21R 22L 6D 19D 40G,42C 14R 23L 27D 44D 35G,42.0,14.0,23.0,27.0,44.0,35.0,71.0,21.0,22.0,6.0,19.0,40.0
3,25,1,08:04,8478027,Lucas Wallmark,8473449,Kyle Okposo,69.0,-22.0,71,CAR,Home,21,BUF,Away,48,EV,08:04,11:56,FAC,CAR won Def. Zone - BUF #21 OKPOSO vs CAR #71 ...,71C 21R 22L 24D 55D 40G,71C 37R 48L 27D 44D 35G,71.0,37.0,48.0,27.0,44.0,35.0,71.0,21.0,22.0,24.0,55.0,40.0
4,29,1,09:09,8476437,Victor Rask,8475728,Johan Larsson,-69.0,22.0,49,CAR,Home,22,BUF,Away,57,EV,09:09,10:51,FAC,CAR won Off. Zone - BUF #22 LARSSON vs CAR #49...,71C 21R 22L 24D 55D 40G,49C 8R 13L 19D 57D 35G,49.0,8.0,13.0,19.0,57.0,35.0,71.0,21.0,22.0,24.0,55.0,40.0


Now, using the `combined_faceoffs` data, we will investigate the distribution of faceoffs by the Offensive, Defensive, and Neutral zone and investigate any trends or differences in the data. This data is not explicitly state in our dataset. However, we can extract it from the `event` column. For example: __"BUF won Neu. Zone - BUF #9 EICHEL vs CAR #20 AHO"__ tells us who the winning team was, and where the faceoff took place which is what we want to extract.

In [45]:
# create a new, empty column
combined_faceoffs['faceoff_zone'] = '-'

# Offensive Zone
mask = combined_faceoffs['event'].str.contains('Off.')
combined_faceoffs.loc[mask, 'faceoff_zone'] = 'Offensive'

# Defensive Zone
mask = combined_faceoffs['event'].str.contains('Def.')
combined_faceoffs.loc[mask, 'faceoff_zone'] = 'Defensive'

# Neutral Zone
mask = combined_faceoffs['event'].str.contains('Neu.')
combined_faceoffs.loc[mask, 'faceoff_zone'] = 'Neutral'

In [46]:
combined_faceoffs.head()

Unnamed: 0,event_id_x,period,period_time,winning_player_id,winning_player_name,losing_player_id,losing_player_name,x_coord,y_coord,winning_player_number,winning_player_team,winning_player_home_away,losing_player_number,losing_player_team,losing_player_home_away,event_id_y,strength,time_elapsed,time_remaining,event_type,event,visitor_on_ice,home_on_ice,home_1,home_2,home_3,home_4,home_5,home_6,visitor_1,visitor_2,visitor_3,visitor_4,visitor_5,visitor_6,faceoff_zone
0,53,1,00:00,8478403,Jack Eichel,8478427,Sebastian Aho,0.0,0.0,9,BUF,Away,20,CAR,Home,5,EV,00:00,20:00,FAC,BUF won Neu. Zone - BUF #9 EICHEL vs CAR #20 AHO,9C 53C 72C 6D 19D 40G,20R 79L 86L 22D 74D 35G,20.0,79.0,86.0,22.0,74.0,35.0,9.0,53.0,72.0,6.0,19.0,40.0,Neutral
1,66,1,04:15,8471743,Vladimir Sobotka,8478027,Lucas Wallmark,69.0,-22.0,17,BUF,Away,71,CAR,Home,25,EV,04:15,15:45,FAC,BUF won Off. Zone - BUF #17 SOBOTKA vs CAR #71...,17C 23C 43L 4D 26D 40G,71C 37R 86L 19D 57D 35G,71.0,37.0,86.0,19.0,57.0,35.0,17.0,23.0,43.0,4.0,26.0,40.0,Offensive
2,71,1,04:56,8475735,Greg McKegg,8478542,Evan Rodrigues,69.0,-22.0,42,CAR,Home,71,BUF,Away,31,EV,04:56,15:04,FAC,CAR won Def. Zone - BUF #71 RODRIGUES vs CAR #...,71C 21R 22L 6D 19D 40G,42C 14R 23L 27D 44D 35G,42.0,14.0,23.0,27.0,44.0,35.0,71.0,21.0,22.0,6.0,19.0,40.0,Defensive
3,25,1,08:04,8478027,Lucas Wallmark,8473449,Kyle Okposo,69.0,-22.0,71,CAR,Home,21,BUF,Away,48,EV,08:04,11:56,FAC,CAR won Def. Zone - BUF #21 OKPOSO vs CAR #71 ...,71C 21R 22L 24D 55D 40G,71C 37R 48L 27D 44D 35G,71.0,37.0,48.0,27.0,44.0,35.0,71.0,21.0,22.0,24.0,55.0,40.0,Defensive
4,29,1,09:09,8476437,Victor Rask,8475728,Johan Larsson,-69.0,22.0,49,CAR,Home,22,BUF,Away,57,EV,09:09,10:51,FAC,CAR won Off. Zone - BUF #22 LARSSON vs CAR #49...,71C 21R 22L 24D 55D 40G,49C 8R 13L 19D 57D 35G,49.0,8.0,13.0,19.0,57.0,35.0,71.0,21.0,22.0,24.0,55.0,40.0,Offensive


A quick-look shows us that there's a pretty even distribution of faceoffs won by each time and a relatively even distribution of faceoffs taken by zone.

In [47]:
combined_faceoffs['winning_player_team'].value_counts()

CAR    31
BUF    30
Name: winning_player_team, dtype: int64

In [48]:
combined_faceoffs['faceoff_zone'].value_counts()

Defensive    25
Offensive    19
Neutral      18
Name: faceoff_zone, dtype: int64

However, it would be more interesting to investigate the number of faceoffs won or lost by each team, grouped by each zone.