<a href="https://colab.research.google.com/github/twointum/TheVegasFlu/blob/main/The_Vegas_Flu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import pandas as pd
import json

# Make a GET request to the NHL daily schedule endpoint. Details here: https://github.com/dword4/nhlapi
# A timeout keeps the cell from hanging forever if the server never answers.
# NOTE(review): confirm statsapi.web.nhl.com is still being served before relying on this cell.
response = requests.get(
    'https://statsapi.web.nhl.com/api/v1/schedule?startDate=2023-04-27&endDate=2023-04-27',
    timeout=30,
)

# Stop here with a clear HTTP error instead of failing later on a missing 'dates' key.
response.raise_for_status()

# Convert the JSON response into a pandas DataFrame. Each date will be in a row.
df = pd.DataFrame(response.json()['dates'])

# Print the first 5 rows of the DataFrame, just to check it works.
print(df.head())

         date  totalItems  totalEvents  totalGames  totalMatches  \
0  2023-04-27           3            0           3             0   

                                               games events matches  
0  [{'gamePk': 2022030125, 'link': '/api/v1/game/...     []      []  


In [None]:
# Every gamePk found in the schedule ends up here; extract_gamePk fills it in.
game_ids = []


# All we need from the schedule is the gamePk (the game ID used in the box score link).
def extract_gamePk(row):
    """Append the gamePk of each game in one schedule row to ``game_ids``.

    ``row['games']`` is iterated and ``['gamePk']`` is read from every item;
    a date can hold several games, or none. Nothing is returned: the ids are
    collected in the module-level ``game_ids`` list.
    """
    game_ids.extend(game['gamePk'] for game in row['games'])

# Use that function for every row, aka every date. A plain loop is used because
# extract_gamePk is called only for its side effect on game_ids; there is no
# result worth collecting.
for _, schedule_row in df.iterrows():
    extract_gamePk(schedule_row)

# Number of games found in the requested date range (one id per game).
print(len(game_ids))

3


In [None]:
# For testing, limit the number of games to loop. Comment this out when you're ready to roll!
#game_ids = [2017020013, 2017020019, 2017020031, 2017020032, 2017020039, 2017020046, 2017020051,]

In [None]:
# Each game gets its own dataframe. They are all combined into one big
# dataframe (big_df) once the loop is done.
# NOTE: the loop below rebinds `df`, which earlier held the schedule table, so
# the schedule cell must be re-run before the game-id cell can be run again.
big_df = pd.DataFrame()
game_counter = 1

# Seconds to wait for the box score server before giving up on a request.
REQUEST_TIMEOUT_SECONDS = 30

# (output column, key inside the player's stats['skaterStats']) in output order.
# The data doesn't exist for goalies, so players without 'skaterStats' get 0.
SKATER_STAT_COLUMNS = (
    ('timeOnIce', 'timeOnIce'),
    ('assists', 'assists'),
    ('goals', 'goals'),
    ('shots', 'shots'),
    ('hits', 'hits'),
    ('penaltyMinutes', 'penaltyMinutes'),
    ('powerPlayGoals', 'powerPlayGoals'),
    ('powerPlayAssists', 'powerPlayAssists'),
    ('faceOffWins', 'faceOffWins'),
    ('faceOffTaken', 'faceoffTaken'),  # the lookup key really is spelled with a lower-case 'o'
    ('takeaways', 'takeaways'),
    ('giveaways', 'giveaways'),
    ('shortHandedGoals', 'shortHandedGoals'),
    ('shortHandedAssists', 'shortHandedAssists'),
    ('blocked', 'blocked'),
    ('plusMinus', 'plusMinus'),
    ('evenTimeOnIce', 'evenTimeOnIce'),
    ('powerPlayTimeOnIce', 'powerPlayTimeOnIce'),
    ('shortHandedTimeOnIce', 'shortHandedTimeOnIce'),
)

# (output column, key inside the player's stats['goalieStats']) in output order.
# Players without 'goalieStats' get 0.
GOALIE_STAT_COLUMNS = (
    ('goalieTimeOnIce', 'timeOnIce'),
    ('goaliePenaltyMinutes', 'pim'),
    ('shotsAgainst', 'shots'),
    ('saves', 'saves'),
    ('powerPlaySaves', 'powerPlaySaves'),
    ('shortHandedSaves', 'shortHandedSaves'),
    ('evenSaves', 'evenSaves'),
    ('shortHandedShotsAgainst', 'shortHandedShotsAgainst'),
    ('evenShotsAgainst', 'evenShotsAgainst'),
    ('powerPlayShotsAgainst', 'powerPlayShotsAgainst'),
)
# NOTE: 'faceOffPct' (skaters) and 'decision' (goalies) are deliberately not
# extracted. The original author found that faceOffPct "doesn't work" and that
# decision isn't entered for each game - presumably the keys are missing for
# some players, TODO confirm. goalieWin and shutouts depend on decision, so
# they are left out of fantasyPoints as well.


def _stat_series(stats, group, key):
    """Return stats[group][key] per player, or 0 where the player has no such group."""
    return stats.apply(lambda s: s[group][key] if s.get(group) is not None else 0)


def _team_frame(team, team_win, is_home_team):
    """Build one row per player of ``team`` and attach the team-level columns.

    ``team`` is the 'away' or 'home' entry of the box score's 'teams' dict.
    """
    # The players dict is keyed by player, so the frame comes out with players
    # as columns; transpose it to get one row per player.
    frame = pd.DataFrame(team['players']).T
    frame['team_id'] = team['team']['id']
    frame['team_name'] = team['team']['name']
    frame['team_goals'] = team['teamStats']['teamSkaterStats']['goals']
    frame['team_win'] = team_win
    frame['is_home_team'] = is_home_team
    return frame


# Per-game frames are collected here and concatenated once after the loop,
# instead of re-copying the growing big_df on every iteration.
game_frames = []

# Loop through each game, make an API call and store that data in a dataframe.
for game_id in game_ids:
    # Same source as the schedule, different endpoint. This is the box score endpoint.
    api_url = f"https://statsapi.web.nhl.com/api/v1/game/{game_id}/boxscore"
    response = requests.get(api_url, timeout=REQUEST_TIMEOUT_SECONDS)
    print(game_counter, ': ', response, api_url)
    game_counter += 1

    # Fail with a clear HTTP error rather than a KeyError on a missing 'teams' key.
    response.raise_for_status()

    # Parse the JSON payload once and reuse it for everything below.
    teams = response.json()['teams']
    away = teams['away']
    home = teams['home']
    away_goals = away['teamStats']['teamSkaterStats']['goals']
    home_goals = home['teamStats']['teamSkaterStats']['goals']
    away_win = 1 if away_goals > home_goals else 0
    home_win = 1 if home_goals > away_goals else 0

    # Combine both teams' player rows: away first, then home.
    df = pd.concat(
        [_team_frame(away, away_win, 0), _team_frame(home, home_win, 1)],
        axis=0,
    )
    df['game_id'] = game_id

    # Flatten the nested 'person' dict into plain columns.
    df['id'] = df['person'].apply(lambda p: p['id'])
    df['fullName'] = df['person'].apply(lambda p: p['fullName'])
    df['position'] = df['person'].apply(lambda p: p['primaryPosition']['type'])

    # Flatten the nested 'stats' dict, one output column per table entry.
    for column, key in SKATER_STAT_COLUMNS:
        df[column] = _stat_series(df['stats'], 'skaterStats', key)
    for column, key in GOALIE_STAT_COLUMNS:
        df[column] = _stat_series(df['stats'], 'goalieStats', key)
    df['goalsAllowed'] = df['shotsAgainst'] - df['saves']

    # Possible Y's
    df['points'] = df['goals'] + df['assists']
    df['fantasyPoints'] = (
        df['goals'] * 3
        + df['assists'] * 2
        + df['plusMinus']
        + df['penaltyMinutes'] * .5
        + df['powerPlayGoals']
        + df['powerPlayAssists']
        + df['shotsAgainst'] * .4
        + df['goalsAllowed'] * -1
        + df['saves'] * .2
    )

    # Remove the unformatted columns.
    df.drop(['person', 'jerseyNumber', 'stats'], axis=1, inplace=True)

    game_frames.append(df)

if game_frames:
    big_df = pd.concat(game_frames, axis=0)

    # Finally, add a row number for panel data t's: the 1-based count of each
    # player's rows so far, in the order the games were processed.
    big_df['t'] = big_df.groupby('id')['game_id'].cumcount() + 1

    # Saved to this ethereal session. Download it or you have to run this again!
    big_df.to_csv('TheVegasFlu.csv', index=False)
else:
    # With no games there is no 'id' column to group on, so skip the export.
    print('No games were processed; TheVegasFlu.csv was not written.')



1 :  <Response [200]> https://statsapi.web.nhl.com/api/v1/game/2022030125/boxscore
2 :  <Response [200]> https://statsapi.web.nhl.com/api/v1/game/2022030145/boxscore
3 :  <Response [200]> https://statsapi.web.nhl.com/api/v1/game/2022030175/boxscore
