In [188]:
# Pandas
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np

# Polars (Arrow)
from pyarrow.dataset import dataset
import polars as pl
pl.Config.set_tbl_rows(n=-1)
pl.Config.set_tbl_cols(n=-1)

# Hit API
import requests

# Tools
from itertools import chain
from datetime import datetime, timedelta
from math import pi

# Save
import pickle
import os

# Get Game ID's From Previous Seasons

- Saves two lists (Current Game IDs and All Game IDs)

In [2]:
# Get Dates
yday = datetime.today() - timedelta(days=1)
end_date = yday.strftime('%Y%m%d')

# Check and Load Dates
date_file_name = 'dates_loaded.pkl'

if os.path.exists(date_file_name):
    with open(date_file_name, "rb") as file:
        season_dates = pickle.load(file)
else:
    season_dates = []

load_dates = pd.date_range(start='20121001', end=end_date, freq='D')
load_dates = [d for d in load_dates if d not in season_dates]

# Chcek and Load Game IDs
g_id_file_name  = 'game_ids.pkl'

if os.path.exists(g_id_file_name):
    with open(g_id_file_name, "rb") as file:
        game_ids = pickle.load(file)
else:
    game_ids = []

In [3]:
# Loop For Loading Games By Date
f_g_id = []
for i in load_dates:
    i_str = i.strftime('%Y-%m-%d')
    sched_link = "https://api-web.nhle.com/v1/schedule/"+i_str
    response = requests.get(sched_link)

    # Parse the JSON content of the response
    raw_data = pd.json_normalize(response.json())
    sched_data = pd.json_normalize(raw_data['gameWeek'][0])
    sched_data = pd.json_normalize(sched_data['games'][0])

    if len(sched_data) == 0:
        pass
    else:
        sched_data = sched_data[sched_data['gameType'] != 1]
        f_g_id.append(sched_data['id'].tolist())

# Create Lists (Game ID and Dates Loaded):
f_g_id = list(chain(*f_g_id))
game_ids = game_ids + f_g_id
current_game_ids = [game_id for game_id in game_ids if str(game_id).startswith("2023")]
previous_game_ids = [game_id for game_id in game_ids if game_id not in current_game_ids]

loaded_dates = sorted(load_dates + season_dates)

# Save
with open('game_ids.pkl', 'wb') as file:
    pickle.dump(game_ids, file)

with open('dates_loaded.pkl', 'wb') as file:
    pickle.dump(loaded_dates, file)

In [4]:
df_list = []
for i in game_ids:
    pbp_link = 'https://api-web.nhle.com/v1/gamecenter/'+str(i)+'/play-by-play'

    pbp_response = requests.get(pbp_link)
    pbp_data = pd.json_normalize(pbp_response.json())
    pbp_data = pbp_data[pbp_data['gameType'] != 1]

    ## GAME DATA
    game_data = pbp_data[['id', 'season', 'gameDate', 'gameType', 'awayTeam.id', 'awayTeam.abbrev', 'homeTeam.id', 'homeTeam.abbrev']]

    # PLAYS DATA
    plays_1 = pd.json_normalize(pbp_data['plays'])

    # Create an empty DataFrame to store the normalized plays
    normalized_plays = pd.DataFrame()

    # Iterate over each row in plays_1 and normalize the JSON data
    for _, row in plays_1.iterrows():
        # Normalize the JSON data in the current row
        normalized_row = pd.json_normalize(row)

        # Concatenate the normalized row to the result DataFrame
        normalized_plays = pd.concat([normalized_plays, normalized_row], ignore_index=True)

    result_df = pd.merge(game_data.assign(key=1), normalized_plays.assign(key=1), on='key').drop('key', axis=1)
    result_df = pl.DataFrame(result_df)

    df_list.append(result_df)
    
print('Loading Complete -- Begin Diagonal Union of', len(df_list),'Games')

Loading Complete -- Begin Diagonal Union of 14181 Games


# Cleaning Functions For PBP Data

In [20]:
def min_to_sec(time_str):
    """This function will help to convert time's formatted like MM:SS to a round seconds number"""
    minutes, seconds = map(int, time_str.split(':'))
    return minutes * 60 + seconds

In [217]:
def reconcile_api_data(data):
    """ This Function will take a polars dataframe and reconcile column names, values, and data types to match SDV cleaning functions to save time and effort in building more tweak functions"""

    # Create Dictionaries For Column Name/Value Rename
    drop_cols = ['details.scoringPlayerTotal', 'periodDescriptor.number', 'periodDescriptor.otPeriods', 'details.assist1PlayerTotal', 'details.assist2PlayerTotal', 'details.homeSOG', 'details.awaySOG']
    # Column Names
    rename_dict = {
        "id": "game_id",
        "gameDate": "game_date",
        "awayTeam.id": "away_id",
        "awayTeam.abbrev": "away_abbreviation",
        "homeTeam.id": "home_id",
        "homeTeam.abbrev": "home_abbreviation",
        "gameType": "season_type",
        "eventId": "event_id",
        "typeDescKey": "event_type",
        "sortOrder": "event_idx",
        "periodDescriptor.periodType": "period_type",
        "details.eventOwnerTeamId": "event_team_id",
        "details.xCoord": "x",
        "details.yCoord": "y",
        "details.zoneCode": "event_zone",
        "details.shotType": "secondary_type",
        "details.awayScore": "away_score",
        "details.homeScore": "home_score",
        "details.goalieInNetId": "event_goalie_id",
        "details.blockingPlayerId": "blocking_player_id",
        "details.drawnByPlayerId": "drawnby_player_id",
        "details.servedByPlayerId": "servedby_player_id",
        "details.committedByPlayerId": "committedby_player_id",
        "details.hittingPlayerId": "hitting_player_id",
        "details.hitteePlayerId": "hittee_player_id",
        "details.assist1PlayerId": "assist_1_player_id",
        "details.assist2PlayerId": "assist_2_player_id",
        "details.shootingPlayerId": "shooting_player_id",
        "details.reason": "reason",
        "details.scoringPlayerId": "scoring_player_id",
        "details.duration": "penalty_minutes",
        "details.winningPlayerId": "winning_player_id",
        "details.losingPlayerId": "losing_player_id"
    }

    # Event Type
    event_type_dict = {
        "faceoff": "FACEOFF",
        "shot-on-goal": "SHOT",
        "stoppage": "STOPPAGE",
        "hit": "HIT",
        "blocked-shot": "BLOCKED_SHOT",
        "missed-shot": "MISSED_SHOT",
        "giveaway": "GIVEAWAY",
        "takeaway": "TAKEAWAY",
        "penalty": "PENALTY",
        "goal": "GOAL",
        "period-start": "PERIOD_START",
        "period-end": "PERIOD_END",
        "delayed-penalty": "DELAYED_PENALTY",
        "game-end": "GAME_END",
        "shootout-complete": "SHOOTOUT_COMPLETE",
        "failed-shot-attempt": "FAILED_SHOT",
        None:None
    }

    # Season Type
    season_type_dict = {
        2: "R",
        3: "P",
        4: "I",
        None:None
    }

    # Shot Type
    shot_type_dict = {
        "snap": "Snap",
        "between-legs": "Between Legs",
        "wrap-around": "Wrap-Around",
        "tip-in": "Tip-In",
        "cradle": "Wrap-Around",
        "poke": 'Poked',
        "bat": 'Batted',
        "deflected": "Deflected",
        "wrist": "Wrist",
        "slap":	"Slap",
        "backhand": "Backhand",
        None: None
    }

    # Rename Columns + Values AND Add Event/Season Type Helpers
    data = data.rename(rename_dict).filter(pl.col('period_type') != 'SO').drop(drop_cols)

    data = (
        data
        .with_columns([
            (pl.col('season_type').map_dict(season_type_dict, default = pl.col('season_type'))).alias('season_type'),
            (pl.col('event_type').map_dict(event_type_dict,default = pl.col('event_type'))).alias('event_type'),
            (pl.col('secondary_type').map_dict(shot_type_dict,default = pl.col('secondary_type'))).alias('secondary_type'),
            pl.when(pl.col('event_team_id') == pl.col('home_id')).then(pl.lit('home')).otherwise(pl.lit('away')).alias('event_team_type'),
            pl.when(pl.col('event_team_id') == pl.col('home_id')).then(pl.lit('home_abbreviation')).otherwise(pl.lit('away_abbreviation')).alias('event_team_abbr')
            ])
        #.drop('gameType', 'typeDescKey', 'details.shotType')
        .filter(~pl.col('situationCode').is_in(["PERIOD_START", "PERIOD_END", "GAME_START", "GAME_END"]))
    )

    # Create Game and Period Seconds Remaining from timeInPeriod, timeRemaining: 'period', 'period_seconds', 'period_seconds_remaining', 'game_seconds', 'game_seconds_remaining'
    data = (
        data
        .with_columns((pl.col('timeInPeriod').apply(min_to_sec)).alias('period_seconds'))
        .with_columns([
            (1200 - pl.col('period_seconds')).alias('period_seconds_remaining'),
            (pl.col('period_seconds') + ((pl.col('period')-1)*1200)).alias('game_seconds'),
            ((3600 - pl.col('period_seconds')) + ((pl.col('period') - 3) * 1200)).alias('game_seconds_remaining')
        ])
    )

    # Create event_player_1_id and event_player_2_id columns based on event_type and corresponding columns
    remove_ply_ids = ['winning_player_id', 'hitting_player_id', 'scoring_player_id', 'shooting_player_id', 'committedby_player_id',
                      'details.playerId', 'losing_player_id', 'hittee_player_id', 'drawnby_player_id', 'assist_1_player_id', 'assist_2_player_id',
                      'blocking_player_id']
    data = (
        data
        .with_columns([
            (pl.when(pl.col('event_type') == 'FACEOFF').then(pl.col('winning_player_id'))
               .when(pl.col('event_type') == 'HIT').then(pl.col('hitting_player_id'))
               .when(pl.col('event_type') == 'GOAL').then(pl.col('scoring_player_id'))
               .when(pl.col('event_type').is_in(['SHOT', 'MISSED_SHOT', "BLOCKED_SHOT"])).then(pl.col('shooting_player_id'))
               .when(pl.col('event_type') == 'PENALTY').then(pl.col('committedby_player_id'))
               .when(pl.col('event_type') == 'GIVEAWAY').then(pl.col('details.playerId'))
               .when(pl.col('event_type') == 'TAKEAWAY').then(pl.col('details.playerId'))
               .otherwise(pl.lit(None))
             ).alias("event_player_1_id"),
             (pl.when(pl.col('event_type') == 'FACEOFF').then(pl.col('losing_player_id'))
               .when(pl.col('event_type') == 'HIT').then(pl.col('hittee_player_id'))
               .when(pl.col('event_type').is_in(['GOAL','SHOT', 'MISSED_SHOT', 'BLOCKED_SHOT'])).then(pl.col('event_goalie_id'))
               .when(pl.col('event_type') == 'PENALTY').then(pl.col('drawnby_player_id'))
               .otherwise(pl.lit(None))
             ).alias("event_player_2_id"),
             (pl.when((pl.col('event_type') == 'GOAL') & (~pl.col('assist_1_player_id').is_null())).then(pl.col('assist_1_player_id'))
               .when((pl.col('event_type') == 'PENALTY') & (~pl.col('servedby_player_id').is_null())).then(pl.col('servedby_player_id'))
               .when((pl.col('event_type') == 'BLOCKED_SHOT') & (~pl.col('blocking_player_id').is_null())).then(pl.col('blocking_player_id'))
               .otherwise(pl.lit(None))
             ).alias("event_player_3_id"),
             (pl.when((pl.col('event_type') == 'GOAL') & (~pl.col('assist_2_player_id').is_null())).then(pl.col('assist_2_player_id'))
               .otherwise(pl.lit(None))
             ).alias("event_player_4_id"),
             (pl.when(pl.col('event_type') == 'FACEOFF').then(pl.lit('Winner'))
               .when(pl.col('event_type') == 'HIT').then(pl.lit('Hitter'))
               .when(pl.col('event_type') == 'GOAL').then(pl.lit('Scorer'))
               .when(pl.col('event_type').is_in(['SHOT', 'MISSED_SHOT', "BLOCKED_SHOT"])).then(pl.lit('Shooter'))
               .when(pl.col('event_type') == 'PENALTY').then(pl.lit('PenaltyOn'))
               .when(pl.col('event_type') == 'GIVEAWAY').then(pl.lit('PlayerID'))
               .when(pl.col('event_type') == 'TAKEAWAY').then(pl.lit('PlayerID'))
               .otherwise(pl.lit(None))
             ).alias("event_player_1_type"),
             (pl.when(pl.col('event_type') == 'FACEOFF').then(pl.lit('Loser'))
               .when(pl.col('event_type') == 'HIT').then(pl.lit('Hittee'))
               .when((pl.col('event_type') == 'GOAL') & (~pl.col('event_goalie_id').is_null())).then(pl.lit('Goalie'))
               .when((pl.col('event_type') == 'GOAL') & (pl.col('event_goalie_id').is_null())).then(pl.lit('EmptyNet'))
               .when(pl.col('event_type').is_in(['SHOT', 'MISSED_SHOT', 'BLOCKED_SHOT'])).then(pl.lit('Goalie'))
               .when(pl.col('event_type') == 'PENALTY').then(pl.lit('DrewBy'))
               .when(pl.col('event_type') == 'GIVEAWAY').then(pl.lit('PlayerID'))
               .when(pl.col('event_type') == 'TAKEAWAY').then(pl.lit('PlayerID'))
               .otherwise(pl.lit(None))
             ).alias("event_player_2_type"),
             (pl.when((pl.col('event_type') == 'GOAL') & (~pl.col('assist_1_player_id').is_null())).then(pl.lit('Assist'))
               .when((pl.col('event_type') == 'PENALTY') & (~pl.col('servedby_player_id').is_null())).then(pl.lit('ServedBy'))
               .when((pl.col('event_type') == 'BLOCKED_SHOT') & (~pl.col('blocking_player_id').is_null())).then(pl.lit('Blocker'))
               .otherwise(pl.lit(None))
             ).alias("event_player_3_type"),
             (pl.when((pl.col('event_type') == 'GOAL') & (~pl.col('assist_2_player_id').is_null())).then(pl.lit('Assist'))
               .otherwise(pl.lit(None))
             ).alias("event_player_4_type")
        ])
        .drop(remove_ply_ids)
    )
    # Parse Situation Code For Home/Away Skaters/EmptyNet
    data = (
        data
        .sort('season', 'game_id', 'period', 'event_idx')
        .with_columns(
            pl.when(pl.col('situationCode').is_null()).then(pl.col("situationCode").fill_null(strategy="forward")).otherwise(pl.col('situationCode')).alias('situationCode')
        )
        .filter(~pl.col('situationCode').is_in(['0101', '1010']))
        .with_columns([
            pl.col("situationCode").str.slice(0, 1).cast(pl.Int32).alias("away_en"),
            pl.col("situationCode").str.slice(3, 1).cast(pl.Int32).alias("home_en"),
            pl.col("situationCode").str.slice(1, 1).cast(pl.Int32).alias("away_skaters"),
            pl.col("situationCode").str.slice(2, 1).cast(pl.Int32).alias("home_skaters")
        ])
        .with_columns([
            (pl.concat_str([pl.col('home_skaters'), pl.lit('v'), pl.col('away_skaters')])).alias('strength_state'),
            (pl.concat_str([pl.col('home_skaters'), pl.lit('v'), pl.col('away_skaters')])).alias('true_strength_state')
        ])
    )

    # Create x_fixed and y_fixed. These coordinates will be relative to the event team's attacking zone (i.e., x_abs is positive)
    data = (
        data
        .with_columns([
            pl.when((pl.col('event_zone') == 'O') & (pl.col('x').mean() > 0)).then(pl.lit(1)).otherwise(pl.lit(-1)).alias('flipped_coords')
        ])
        .with_columns([
            # Where homeTeamDefendingSide Exists
            (pl.when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'left') &
                     ( pl.col('event_team_type') == 'home'))
                     .then(pl.col('x'))
               .when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'right') &
                     ( pl.col('event_team_type') == 'home'))
                     .then(pl.col('x')*-1)
               .when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'left') &
                     ( pl.col('event_team_type') == 'away'))
                     .then(pl.col('x')*-1)
               .when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'right') &
                     ( pl.col('event_team_type') == 'away'))
                     .then(pl.col('x'))
              # Where homeTeamDefendingSide does not exist
              .when((pl.col('homeTeamDefendingSide').is_null()) &
                    (pl.col('event_zone') == 'O'))
                    .then(pl.col('x').abs())
              .when((pl.col('homeTeamDefendingSide').is_null()) &
                    (pl.col('event_zone') == 'D'))
                    .then((pl.col('x').abs())*-1)
              .when((pl.col('homeTeamDefendingSide').is_null()) &
                    (pl.col('event_zone') == 'N'))
                    .then((pl.col('x')) * (pl.col('flipped_coords').max().over(['season', 'game_id', 'period'])))
              .otherwise(pl.lit(None)).alias('x_abs')
            ),
            (pl.when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'left') &
                     ( pl.col('event_team_type') == 'home'))
                     .then(pl.col('y'))
               .when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'right') &
                     ( pl.col('event_team_type') == 'home'))
                     .then(pl.col('y')*-1)
               .when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'left') &
                     ( pl.col('event_team_type') == 'away'))
                     .then(pl.col('y')*-1)
               .when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'right') &
                     ( pl.col('event_team_type') == 'away'))
                     .then(pl.col('y'))
              # Where homeTeamDefendingSide does not exist
              .when((pl.col('homeTeamDefendingSide').is_null()) &
                    (pl.col('event_zone') == 'O'))
                    .then(pl.col('y').abs())
              .when((pl.col('homeTeamDefendingSide').is_null()) &
                    (pl.col('event_zone') == 'D'))
                    .then((pl.col('y').abs())*-1)
              .when((pl.col('homeTeamDefendingSide').is_null()) &
                    (pl.col('event_zone') == 'N'))
                    .then((pl.col('y')) * (pl.col('flipped_coords').max().over(['season', 'game_id', 'period'])))
              .otherwise(pl.lit(None)).alias('y_abs')
            )
        ])
        .drop("flipped_coords")
    )

    # Create Event Distance Calculation
    data = data.with_columns(
        pl.when(pl.col('x_abs') >= 0).then(pl.Series.sqrt((89 - pl.Series.abs(data['x_abs']))**2 + data['y_abs']**2))
          .when(pl.col('x_abs') <  0).then(pl.Series.sqrt((pl.Series.abs(data['x_abs']) + 89)**2 + data['y_abs']**2))
          .alias('event_distance')
    )

    # Create Event Angle Calculation
    data = (
        data
        .with_columns(
        pl.when(data['x_abs'] >= 0)
          .then(pl.Series.arctan(data['y_abs'] / (89 - pl.Series.abs(data['x_abs'])))
                .apply(lambda x: abs(x * (180 / pi))))
          .when(data['x_abs'] < 0)
          .then(pl.Series.arctan(data['y_abs'] / (pl.Series.abs(data['x_abs']) + 89))
                .apply(lambda x: abs(x * (180 / pi))))
          .alias('event_angle')
        )
        .with_columns(
            pl.when(pl.col('x_abs') > 89).then((180 - pl.col('event_angle'))).otherwise(pl.col('event_angle')).alias('event_angle')
        )
    )

    return data


In [31]:
# Your master column list
master_columns = ['id', 'gameDate', 'season', 'details.eventOwnerTeamId', 'gameType', 'details.scoringPlayerTotal', 'awayTeam.id', 'situationCode', 'details.goalieInNetId', 'details.blockingPlayerId', 'details.descKey', 'details.drawnByPlayerId', 'homeTeam.id', 'details.servedByPlayerId', 'awayTeam.abbrev', 'periodDescriptor.periodType', 'details.typeCode', 'details.homeScore', 'homeTeam.abbrev', 'period', 'details.committedByPlayerId', 'periodDescriptor.number', 'details.zoneCode', 'details.xCoord', 'periodDescriptor.otPeriods', 'timeRemaining', 'details.assist1PlayerTotal', 'timeInPeriod', 'details.hitteePlayerId', 'details.assist2PlayerId', 'details.shotType', 'details.shootingPlayerId', 'details.awayScore', 'details.reason', 'details.homeSOG', 'typeDescKey', 'details.hittingPlayerId', 'details.scoringPlayerId', 'details.assist1PlayerId', 'details.assist2PlayerTotal', 'details.duration', 'typeCode', 'eventId', 'details.playerId', 'details.awaySOG', 'details.secondaryReason', 'details.winningPlayerId', 'details.losingPlayerId', 'sortOrder', 'homeTeamDefendingSide', 'details.yCoord']

# Initialize an empty DataFrame with the master column list
union_df = pl.DataFrame({}, schema=master_columns)

# Iterate over each dataframe and align columns
for df in df_list:
    # Identify missing columns
    missing_columns = set(master_columns) - set(df.columns)
    
    # Fill missing columns with null values
    for col in missing_columns:
        df = df.with_columns(pl.lit(None).alias(col))

    df = df.select(master_columns)

    # Concatenate aligned dataframe to the union_df
    union_df = pl.concat([union_df, df], how='vertical_relaxed')

# Rename + Clean + Create Certain Columns
#PBP_RAW = reconcile_api_data(union_df)

In [218]:
PBP_RAW = reconcile_api_data(union_df)

In [220]:
import pathlib
xG_path = "API_RAW_PBP_Data.parquet"
PBP_RAW.write_parquet(
    xG_path,
    use_pyarrow=True,
)

In [219]:
PBP_RAW.filter(pl.col('event_type')=='GOAL')['strength_state'].value_counts().sort("counts")

strength_state,counts
str,u32
"""6v3""",5
"""3v6""",10
"""3v4""",177
"""4v6""",228
"""4v3""",231
"""6v4""",247
"""3v5""",385
"""5v3""",460
"""4v4""",1361
"""3v3""",1464


In [94]:
sample_sitCode = (
    union_df
    .filter((pl.col('period_type') != 'SO') & (~pl.col('event_type').is_in(['period-start', 'period-end', 'game-start', 'game-end'])))
    .filter((pl.col('event_type') == 'penalty'))
    .with_columns([
        pl.col('situationCode').fill_null(strategy="forward").alias('fwd_sitCode'),
        pl.col('situationCode').fill_null(strategy="backward").alias('back_sitCode')
        ])
     .filter((pl.col('situationCode').is_null()))
    
)

sample_sitCode.head(20)

game_id,game_date,season,event_team_id,season_type,away_id,situationCode,event_goalie_id,blocking_player_id,details.descKey,drawnby_player_id,home_id,servedby_player_id,away_abbreviation,period_type,details.typeCode,home_score,home_abbreviation,period,committedby_player_id,periodDescriptor.number,event_zone,x,timeRemaining,timeInPeriod,hittee_player_id,assist_2_player_id,secondary_type,shooting_player_id,away_score,reason,event_type,hitting_player_Id,scoring_player_id,assist_1_player_id,duration,typeCode,event_id,details.playerId,details.secondaryReason,winning_player_id,losing_player_id,event_idx,homeTeamDefendingSide,y,fwd_sitCode,back_sitCode
f64,str,f64,f64,f64,f64,str,f64,f64,str,f64,f64,f64,str,str,str,f64,str,f64,f64,f64,str,f64,str,str,f64,f64,str,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,str,f64,str,str
2012000000.0,"""2013-01-19""",20122013.0,21.0,2.0,21.0,,,,"""high-sticking-…",8469820.0,30.0,,"""COL""","""REG""","""MIN""",,"""MIN""",2.0,8470699.0,2.0,"""D""",95.0,"""02:45""","""17:15""",,,,,,,"""penalty""",,,,4.0,509.0,573.0,,,,,437.0,,32.0,"""0651""","""1551"""
2012000000.0,"""2013-01-22""",20122013.0,1.0,2.0,4.0,,,,"""roughing""",8474190.0,1.0,,"""PHI""","""REG""","""MIN""",,"""NJD""",2.0,8470609.0,2.0,"""D""",88.0,"""07:10""","""12:50""",,,,,,,"""penalty""",,,,2.0,509.0,287.0,,,,,345.0,,-10.0,"""1451""","""1551"""
2012000000.0,"""2013-02-01""",20122013.0,27.0,2.0,27.0,,,,"""fighting""",8475455.0,25.0,,"""PHX""","""REG""","""MAJ""",,"""DAL""",2.0,8471231.0,2.0,"""O""",-93.0,"""06:07""","""13:53""",,,,,,,"""penalty""",,,,5.0,509.0,297.0,,,,,351.0,,14.0,"""1341""","""1551"""
2012000000.0,"""2013-02-01""",20122013.0,27.0,2.0,27.0,,,,"""misconduct""",8475455.0,25.0,,"""PHX""","""REG""","""MIS""",,"""DAL""",2.0,8471231.0,2.0,"""O""",-93.0,"""06:07""","""13:53""",,,,,,,"""penalty""",,,,10.0,509.0,299.0,,,,,353.0,,12.0,"""1341""","""1551"""
2012000000.0,"""2013-02-05""",20122013.0,17.0,2.0,20.0,,,,"""ps-hooking-on-…",8462042.0,17.0,,"""CGY""","""REG""","""PS""",,"""DET""",2.0,8470318.0,2.0,"""D""",-76.0,"""06:48""","""13:12""",,,,,,,"""penalty""",,,,0.0,509.0,403.0,,,,,364.0,,0.0,"""1551""","""1551"""
2012000000.0,"""2013-02-07""",20122013.0,23.0,2.0,23.0,,,,"""tripping""",8474618.0,30.0,,"""VAN""","""REG""","""MIN""",,"""MIN""",2.0,8474668.0,2.0,"""O""",-75.0,"""10:25""","""09:35""",,,,,,,"""penalty""",,,,2.0,509.0,504.0,,,,,378.0,,-35.0,"""1451""","""1560"""
2012000000.0,"""2013-02-09""",20122013.0,8.0,2.0,10.0,,,,"""interference""",8474497.0,8.0,8476851.0,"""TOR""","""REG""","""MIN""",,"""MTL""",3.0,8475848.0,3.0,"""O""",-78.0,"""12:01""","""07:59""",,,,,,,"""penalty""",,,,2.0,509.0,638.0,,,,,616.0,,-1.0,"""1451""","""1551"""
2012000000.0,"""2013-02-09""",20122013.0,10.0,2.0,10.0,,,,"""misconduct""",8471504.0,8.0,,"""TOR""","""REG""","""MIS""",,"""MTL""",3.0,8468778.0,3.0,"""N""",11.0,"""04:59""","""15:01""",,,,,,,"""penalty""",,,,10.0,509.0,777.0,,,,,706.0,,29.0,"""1351""","""1551"""
2012000000.0,"""2013-02-17""",20122013.0,25.0,2.0,20.0,,,,"""holding""",8462042.0,25.0,,"""CGY""","""REG""","""MIN""",,"""DAL""",1.0,8462196.0,1.0,,,"""12:14""","""07:46""",,,,,,,"""penalty""",,,,2.0,509.0,29.0,,,,,103.0,,,"""1551""","""1551"""
2012000000.0,"""2013-02-19""",20122013.0,10.0,2.0,10.0,,,,"""misconduct""",,14.0,,"""TOR""","""REG""","""MIS""",,"""TBL""",3.0,8468778.0,3.0,"""N""",13.0,"""08:52""","""11:08""",,,,,,,"""penalty""",,,,10.0,509.0,636.0,,,,,566.0,,33.0,"""1441""","""1551"""


In [191]:
sdv_cols = ['event_type', 'event', 'description', 'period', 'period_seconds', 'period_seconds_remaining', 'game_seconds', 'game_seconds_remaining', 'home_score', 'away_score', 'strength_state', 'event_idx', 'extra_attacker', 'home_skaters', 'away_skaters', 'game_id', 'period_type', 'ordinal_num', 'period_time', 'period_time_remaining', 'date_time', 'home_final', 'away_final', 'season', 'season_type', 'game_date', 'game_start', 'game_end', 'game_length', 'game_state', 'detailed_state', 'venue_name', 'venue_link', 'home_name', 'home_abbreviation', 'home_division_name', 'home_conference_name', 'home_id', 'away_name', 'away_abbreviation', 'away_division_name', 'away_conference_name', 'away_id', 'event_id', 'event_team', 'event_team_type', 'num_on', 'players_on', 'players_off', 'away_on_1', 'away_on_2', 'away_on_3', 'away_on_4', 'away_on_5', 'away_goalie', 'ids_on', 'ids_off', 'secondary_type', 'home_on_1', 'home_on_2', 'home_on_3', 'home_on_4', 'home_on_5', 'home_goalie', 'event_player_1_name', 'event_player_1_type', 'event_player_2_name', 'event_player_2_type', 'strength_code', 'strength', 'x', 'y', 'x_fixed', 'y_fixed', 'event_player_1_id', 'event_player_1_link', 'event_player_2_id', 'event_player_2_link', 'event_team_id', 'event_team_link', 'event_team_abbr', 'num_off', 'penalty_severity', 'penalty_minutes', 'away_on_6', 'shot_distance', 'shot_angle', 'event_goalie_name', 'event_goalie_id', 'event_goalie_link', 'event_goalie_type', 'event_player_3_name', 'event_player_3_type', 'game_winning_goal', 'empty_net', 'event_player_3_id', 'event_player_3_link', 'event_player_4_type', 'event_player_4_id', 'event_player_4_name', 'event_player_4_link', 'home_on_6', 'venue_id']

# Check Similar Columns:
sim_cols = [l for l in sdv_cols if l in PBP_RAW.columns]

# Check SDV Not In API Cols
ignore_cols = ['event', 'description', 'extra_attacker', 'ordinal_num', 'period_time', 'period_time_remaining', 'home_final', 'away_final', 'game_start', 'game_end', 'game_length', 'game_state', 'detailed_state']
sdv_not_api = [l for l in sdv_cols if (l not in PBP_RAW.columns) & (~l.endswith('_link'))]

# Check API Not In SDV Cols
api_not_sdv = [l for l in PBP_RAW.columns if l not in sdv_cols]

print(sim_cols)
print(len(sim_cols))

print(sdv_not_api)
print(len(sdv_not_api))

print(api_not_sdv)
print(len(api_not_sdv))

## Columns To Create:

# From Raw DF:
    # Strength State (Find Code Translation online)
    # Extra Attacker
    # X and Y Fixed (and ABS)
    # *event_1_player_type
    # *event_2_player_type

# Shift Related
    # away_goalie / home_goalie
    # event_player_1_name, event_player_2_name, event_player_3_name, event_player_4_name, event_goalie_name
    # home_on_1, home_on_2, home_on_3, home_on_5, home_on_6
    # away_on_1, away_on_2, away_on_3, away_on_5, away_on_6

    # *num_on
    # *players_on/players_off
    # *ids_on/ids_off

   

# Ignore

    # event
    # description
    # ordinal_num
    # home_final
    # away_final
    # num_off
    # penalty_severity


['event_type', 'period', 'period_seconds', 'period_seconds_remaining', 'game_seconds', 'game_seconds_remaining', 'home_score', 'away_score', 'strength_state', 'event_idx', 'home_skaters', 'away_skaters', 'game_id', 'period_type', 'season', 'season_type', 'game_date', 'home_abbreviation', 'home_id', 'away_abbreviation', 'away_id', 'event_id', 'event_team_type', 'secondary_type', 'event_player_1_type', 'event_player_2_type', 'x', 'y', 'event_player_1_id', 'event_player_2_id', 'event_team_id', 'event_team_abbr', 'penalty_minutes', 'event_goalie_id', 'event_player_3_type', 'event_player_3_id', 'event_player_4_type', 'event_player_4_id']
38
['event', 'description', 'extra_attacker', 'ordinal_num', 'period_time', 'period_time_remaining', 'date_time', 'home_final', 'away_final', 'game_start', 'game_end', 'game_length', 'game_state', 'detailed_state', 'venue_name', 'home_name', 'home_division_name', 'home_conference_name', 'away_name', 'away_division_name', 'away_conference_name', 'event_team'

period,counts
f64,u32
1.0,1487654
2.0,1486584
3.0,1396093
4.0,64138
5.0,16332
6.0,495
7.0,218
8.0,60


In [None]:
# Convert homeTeamDefendingSide and x + y to create x_abs and y_abs. (Use code from model createion for logic)


# Create event_player_2_XXX cols from details.goalieInNetId, details.losingPlayerId, details.blockingPlayerId, details.hitteePlayerId, 

In [152]:
## shift For More Information
shift_link = "https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId=2021020001"
shift_response = requests.get(shift_link)
shift_raw = pd.json_normalize(shift_response.json())
shift_raw = pd.json_normalize(shift_raw['data'])


# Create an empty DataFrame to store the normalized plays
normalized_shift = pd.DataFrame()

# Iterate over each row in plays_1 and normalize the JSON data
for _, row in shift_raw.iterrows():
    # Normalize the JSON data in the current row
    normalized_row_g = pd.json_normalize(row)

    # Concatenate the normalized row to the result DataFrame
    normalized_shift = pd.concat([normalized_shift, normalized_row_g], ignore_index=True)
    shift_keep_cols = ['gameId', 'id', 'duration', 'period', 'startTime', 'endTime', 'eventNumber', 'firstName', 'lastName', 'playerId', 'teamId', 'teamName', 'shiftNumber', 'typeCode']
    normalized_shift = normalized_shift[shift_keep_cols]

print(normalized_shift.head())

       gameId        id duration  period startTime endTime  eventNumber  \
0  2021020001  11053104    00:32       1     00:00   00:32            7   
1  2021020001  11053105    00:42       1     02:30   03:12          107   
2  2021020001  11053106    00:39       1     04:39   05:18          112   
3  2021020001  11053191    00:43       1     06:48   07:31          116   
4  2021020001  11053192    00:50       1     09:33   10:23          122   

  firstName lastName  playerId  teamId             teamName  shiftNumber  \
0      Jeff   Carter   8470604       5  Pittsburgh Penguins            1   
1      Jeff   Carter   8470604       5  Pittsburgh Penguins            2   
2      Jeff   Carter   8470604       5  Pittsburgh Penguins            3   
3      Jeff   Carter   8470604       5  Pittsburgh Penguins            4   
4      Jeff   Carter   8470604       5  Pittsburgh Penguins            5   

   typeCode  
0       517  
1       517  
2       517  
3       517  
4       517  
