In [28]:
#requests is needed to connect to the API
import requests
import pandas as pd
import numpy as np

In [2]:
def get_request(game, year='2018', season_type='02', timeout=30):
    '''Input - game: integer game number within the season, without leading 0's
                    (it is zero-padded to four digits when the URL is built)
               year: first year of the season as a string, i.e. '2018' = 2018/2019 season.
                    '2018' is the default
               season_type: string code for the type of season. The default '02'
                    represents the regular season
               timeout: seconds to wait on the server before requests gives up and
                    raises a timeout error. Defaults to 30
       Output - Returns the decoded JSON body of the API response for the requested game
       Desc - Builds the live-feed URL for the game id year+season_type+game, performs a
              GET request against it and returns the parsed JSON.
              The HTTP status is not checked here: whatever JSON body comes back is
              returned, so callers should inspect it (for example, look for 'liveData').
    '''
    url = ('http://statsapi.web.nhl.com/api/v1/game/'
           f'{year}{season_type}{str(game).zfill(4)}/feed/live')
    # Without a timeout a single stalled connection would block forever, which
    # matters when this is called in a loop over a whole season.
    r = requests.get(url=url, timeout=timeout)
    return r.json()

def get_meta_stats(data):
    '''Input - Data from the NHL_API (the parsed live-feed dictionary)
       Output - returns a 4-tuple: home triCode, home goals, away triCode, away goals
       Desc - reads the boxscore section of the feed and pulls out, for the home side
       and then the away side, the team's triCode and its goal count
    '''
    teams = data['liveData']['boxscore']['teams']

    def code_and_goals(side):
        # Same lookups for either side; only the 'home'/'away' key changes.
        entry = teams[side]
        return entry['team']['triCode'], entry['teamStats']['teamSkaterStats']['goals']

    return code_and_goals('home') + code_and_goals('away')
    

def get_corsi(data, home_team, away_team):
    '''Input - Data from NHL_API, the home and away teams triCode id
       Output - Returns a 2-tuple: corsi event count for the home team, then the away team
       Description - Walks the play-by-play list (liveData -> plays -> allPlays) and counts
       every play whose result event is 'Shot', 'Blocked Shot' or 'Missed Shot',
       crediting it to whichever team's triCode is recorded on the play.
       Plays with any other event are skipped without reading their team, so
       plays that carry no 'team' entry are safe as long as they are not one
       of the three counted events.
       NOTE(review): 'Goal' is not among the counted events, and a blocked shot is
       credited to the team listed on the play - confirm both match the intended
       Corsi definition for this feed.
    '''
    event_types = ('Shot', 'Blocked Shot', 'Missed Shot')
    home_corsi = 0
    away_corsi = 0
    for play in data['liveData']['plays']['allPlays']:
        if play['result']['event'] not in event_types:
            continue
        tri_code = play['team']['triCode']
        # The counters incremented here are the same ones that were initialised
        # above and that are returned below.
        if tri_code == home_team:
            home_corsi += 1
        if tri_code == away_team:
            away_corsi += 1

    return home_corsi, away_corsi

def make_a_list(game_id, home_team, home_score, home_corsi, away_team, away_score, away_corsi):
    '''Input - the seven values collected for one game
       Output - A new list holding those values in argument order:
                game id, home team/score/corsi, away team/score/corsi
    '''
    return [
        game_id,
        home_team, home_score, home_corsi,
        away_team, away_score, away_corsi,
    ]

In [3]:
# Build one summary row per game id. Ids whose response has no 'liveData'
# section are reported and skipped.
to_df_list = []
for i in range(1, 1302):
    data = get_request(game=i)
    if 'liveData' in data:
        home_team, home_score, away_team, away_score = get_meta_stats(data)
        home_corsi, away_corsi = get_corsi(data, home_team, away_team)
        to_df_list.append(
            make_a_list(i, home_team, home_score, home_corsi,
                        away_team, away_score, away_corsi)
        )
        # Progress marker on every 100th id; only reached for ids that had data.
        if not i % 100:
            print(f'{i} of 1302 complete')
    else:
        print(f'game number {i} does not have live data')

100 of 1302 complete
200 of 1302 complete
300 of 1302 complete
400 of 1302 complete
500 of 1302 complete
600 of 1302 complete
700 of 1302 complete
800 of 1302 complete
900 of 1302 complete
1000 of 1302 complete
1100 of 1302 complete
1200 of 1302 complete
game number 1272 does not have live data
game number 1273 does not have live data
game number 1274 does not have live data
game number 1275 does not have live data
game number 1276 does not have live data
game number 1277 does not have live data
game number 1278 does not have live data
game number 1279 does not have live data
game number 1280 does not have live data
game number 1281 does not have live data
game number 1282 does not have live data
game number 1283 does not have live data
game number 1284 does not have live data
game number 1285 does not have live data
game number 1286 does not have live data
game number 1287 does not have live data
game number 1288 does not have live data
game number 1289 does not have live data
game nu

In [19]:
# Column labels, listed in the same order as the values in each collected row.
col_names = [
    'gameId',
    'homeTeam', 'homeScore', 'homeCorsi',
    'awayTeam', 'awayScore', 'awayCorsi',
]
df = pd.DataFrame(data=to_df_list, columns=col_names)

In [20]:
# Preview the first five rows of the assembled frame.
df.head(n=5)

Unnamed: 0,gameId,homeTeam,homeScore,homeCorsi,awayTeam,awayScore,awayCorsi
0,1,TOR,3,65,MTL,2,66
1,2,WSH,7,60,BOS,0,50
2,3,VAN,5,41,CGY,2,59
3,4,SJS,2,66,ANA,5,32
4,5,BUF,0,57,BOS,4,46


In [41]:
# CorsiWinner is 1 when the team with the higher score also had at least as many
# corsi events as its opponent, otherwise 0. Rows where homeScore equals
# awayScore satisfy neither strict comparison and therefore get 0.
df['CorsiWinner'] = np.where(
    ((df['homeScore'] > df['awayScore']) & (df['homeCorsi'] >= df['awayCorsi']))
    | ((df['homeScore'] < df['awayScore']) & (df['homeCorsi'] <= df['awayCorsi'])),
    1,
    0,
)

In [44]:
# Preview the first fifteen rows, now including the CorsiWinner column.
df.head(n=15)

Unnamed: 0,gameId,homeTeam,homeScore,homeCorsi,awayTeam,awayScore,awayCorsi,CorsiWinner
0,1,TOR,3,65,MTL,2,66,0
1,2,WSH,7,60,BOS,0,50,1
2,3,VAN,5,41,CGY,2,59,0
3,4,SJS,2,66,ANA,5,32,0
4,5,BUF,0,57,BOS,4,46,0
5,6,NYR,2,65,NSH,3,49,0
6,7,PIT,7,62,WSH,6,56,1
7,8,CAR,1,74,NYI,2,51,0
8,9,OTT,3,55,CHI,4,71,1
9,10,DET,2,46,CBJ,3,70,1


In [43]:
# Share of rows flagged as CorsiWinner, expressed as a percentage of all rows.
print(df['CorsiWinner'].sum() / len(df) * 100)

44.374508261211645
