In [1]:
# Pandas
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np

# Polars (Arrow)
from pyarrow.dataset import dataset
import polars as pl
pl.Config.set_tbl_rows(n=-1)
pl.Config.set_tbl_cols(n=-1)

# Hit API
import requests

# Tools
from itertools import chain
from datetime import datetime, timedelta
from math import pi

# Save
import pickle
import os
import pathlib

# Get Game ID's From Previous Seasons

- Saves two lists (Current Game IDs and All Game IDs)

In [2]:
# Get Dates
yday = datetime.today() - timedelta(days=1)
end_date = yday.strftime('%Y%m%d')

# Check and Load Dates
date_file_name = 'dates_loaded.pkl'

if os.path.exists(date_file_name):
    with open(date_file_name, "rb") as file:
        season_dates = pickle.load(file)
else:
    season_dates = []

load_dates = pd.date_range(start='20121001', end=end_date, freq='D')
load_dates = [d for d in load_dates if d not in season_dates]

# Chcek and Load Game IDs
g_id_file_name  = 'game_ids.pkl'

if os.path.exists(g_id_file_name):
    with open(g_id_file_name, "rb") as file:
        game_ids = pickle.load(file)
else:
    game_ids = []

In [3]:
# Loop For Loading Games By Date
f_g_id = []
for i in load_dates:
    i_str = i.strftime('%Y-%m-%d')
    sched_link = "https://api-web.nhle.com/v1/schedule/"+i_str
    response = requests.get(sched_link)

    # Parse the JSON content of the response
    raw_data = pd.json_normalize(response.json())
    sched_data = pd.json_normalize(raw_data['gameWeek'][0])
    sched_data = pd.json_normalize(sched_data['games'][0])

    if len(sched_data) == 0:
        pass
    else:
        sched_data = sched_data[~sched_data['gameType'].isin([1,4,6,7,8,12])]
        f_g_id.append(sched_data['id'].tolist())

# Create Lists (Game ID and Dates Loaded):
f_g_id = list(chain(*f_g_id))
game_ids = game_ids + f_g_id
current_game_ids = [game_id for game_id in game_ids if str(game_id).startswith("2023")]
previous_game_ids = [game_id for game_id in game_ids if game_id not in current_game_ids]

loaded_dates = sorted(load_dates + season_dates)

# Save
with open('game_ids.pkl', 'wb') as file:
    pickle.dump(game_ids, file)

with open('dates_loaded.pkl', 'wb') as file:
    pickle.dump(loaded_dates, file)

In [10]:
df_list = []
for i in game_ids:
    pbp_link = 'https://api-web.nhle.com/v1/gamecenter/'+str(i)+'/play-by-play'

    pbp_response = requests.get(pbp_link)
    pbp_data = pd.json_normalize(pbp_response.json())
    pbp_data = pbp_data[~pbp_data['gameType'].isin([1,4,6,7,8,12])]

    ## GAME DATA
    game_data = pbp_data[['id', 'season', 'gameDate', 'gameType', 'awayTeam.id', 'awayTeam.abbrev', 'homeTeam.id', 'homeTeam.abbrev']]

    # PLAYS DATA
    plays_1 = pd.json_normalize(pbp_data['plays'])

    # Create an empty DataFrame to store the normalized plays
    normalized_plays = pd.DataFrame()

    # Iterate over each row in plays_1 and normalize the JSON data
    for _, row in plays_1.iterrows():
        # Normalize the JSON data in the current row
        normalized_row = pd.json_normalize(row)

        # Concatenate the normalized row to the result DataFrame
        normalized_plays = pd.concat([normalized_plays, normalized_row], ignore_index=True)

    result_df = pd.merge(game_data.assign(key=1), normalized_plays.assign(key=1), on='key').drop('key', axis=1)
    result_df = pl.DataFrame(result_df)

    df_list.append(result_df)
    
print('Loading Complete -- Begin Diagonal Union of', len(df_list),'Games')

Loading Complete -- Begin Diagonal Union of 14198 Games


# Cleaning Functions For PBP Data

In [11]:
def min_to_sec(time_str):
    """This function will help to convert time's formatted like MM:SS to a round seconds number"""
    minutes, seconds = map(int, time_str.split(':'))
    return minutes * 60 + seconds

In [13]:
def reconcile_api_data(data):
    """ This Function will take a polars dataframe and reconcile column names, values, and data types to match SDV cleaning functions to save time and effort in building more tweak functions"""

    # Create Dictionaries For Column Name/Value Rename
    drop_cols = ['details.scoringPlayerTotal', 'periodDescriptor.number', 'periodDescriptor.otPeriods', 'details.assist1PlayerTotal', 'details.assist2PlayerTotal', 'details.homeSOG', 'details.awaySOG']
    # Column Names
    rename_dict = {
        "id": "game_id",
        "gameDate": "game_date",
        "awayTeam.id": "away_id",
        "awayTeam.abbrev": "away_abbreviation",
        "homeTeam.id": "home_id",
        "homeTeam.abbrev": "home_abbreviation",
        "gameType": "season_type",
        "eventId": "event_id",
        "typeDescKey": "event_type",
        "sortOrder": "event_idx",
        "periodDescriptor.periodType": "period_type",
        "details.eventOwnerTeamId": "event_team_id",
        "details.xCoord": "x",
        "details.yCoord": "y",
        "details.zoneCode": "event_zone",
        "details.shotType": "secondary_type",
        "details.awayScore": "away_score",
        "details.homeScore": "home_score",
        "details.goalieInNetId": "event_goalie_id",
        "details.blockingPlayerId": "blocking_player_id",
        "details.drawnByPlayerId": "drawnby_player_id",
        "details.servedByPlayerId": "servedby_player_id",
        "details.committedByPlayerId": "committedby_player_id",
        "details.hittingPlayerId": "hitting_player_id",
        "details.hitteePlayerId": "hittee_player_id",
        "details.assist1PlayerId": "assist_1_player_id",
        "details.assist2PlayerId": "assist_2_player_id",
        "details.shootingPlayerId": "shooting_player_id",
        "details.reason": "reason",
        "details.scoringPlayerId": "scoring_player_id",
        "details.duration": "penalty_minutes",
        "details.winningPlayerId": "winning_player_id",
        "details.losingPlayerId": "losing_player_id"
    }

    # Event Type
    event_type_dict = {
        "faceoff": "FACEOFF",
        "shot-on-goal": "SHOT",
        "stoppage": "STOPPAGE",
        "hit": "HIT",
        "blocked-shot": "BLOCKED_SHOT",
        "missed-shot": "MISSED_SHOT",
        "giveaway": "GIVEAWAY",
        "takeaway": "TAKEAWAY",
        "penalty": "PENALTY",
        "goal": "GOAL",
        "period-start": "PERIOD_START",
        "period-end": "PERIOD_END",
        "delayed-penalty": "DELAYED_PENALTY",
        "game-end": "GAME_END",
        "shootout-complete": "SHOOTOUT_COMPLETE",
        "failed-shot-attempt": "FAILED_SHOT",
        None:None
    }

    # Season Type
    season_type_dict = {
        2: "R",
        3: "P",
        None:None
    }

    # Shot Type
    shot_type_dict = {
        "snap": "Snap",
        "between-legs": "Between Legs",
        "wrap-around": "Wrap-Around",
        "tip-in": "Tip-In",
        "cradle": "Wrap-Around",
        "poke": 'Poked',
        "bat": 'Batted',
        "deflected": "Deflected",
        "wrist": "Wrist",
        "slap":	"Slap",
        "backhand": "Backhand",
        None: None
    }

    # Rename Columns + Values AND Add Event/Season Type Helpers
    data = data.rename(rename_dict).filter((pl.col('period_type') != 'SO') & (pl.col('season_type').is_in([2, 3]))).drop(drop_cols)

    data = (
        data
        .with_columns([
            (pl.col('season_type').map_dict(season_type_dict, default = pl.col('season_type'))).alias('season_type'),
            (pl.col('event_type').map_dict(event_type_dict,default = pl.col('event_type'))).alias('event_type'),
            (pl.col('secondary_type').map_dict(shot_type_dict,default = pl.col('secondary_type'))).alias('secondary_type'),
            pl.when(pl.col('event_team_id') == pl.col('home_id')).then(pl.lit('home')).otherwise(pl.lit('away')).alias('event_team_type'),
            pl.when(pl.col('event_team_id') == pl.col('home_id')).then(pl.lit('home_abbreviation')).otherwise(pl.lit('away_abbreviation')).alias('event_team_abbr')
            ])
        #.drop('gameType', 'typeDescKey', 'details.shotType')
        .filter(~pl.col('situationCode').is_in(["PERIOD_START", "PERIOD_END", "GAME_START", "GAME_END"]))
    )

    # Create Game and Period Seconds Remaining from timeInPeriod, timeRemaining: 'period', 'period_seconds', 'period_seconds_remaining', 'game_seconds', 'game_seconds_remaining'
    data = (
        data
        .with_columns((pl.col('timeInPeriod').apply(min_to_sec)).alias('period_seconds'))
        .with_columns([
            (1200 - pl.col('period_seconds')).alias('period_seconds_remaining'),
            (pl.col('period_seconds') + ((pl.col('period')-1)*1200)).alias('game_seconds'),
            ((3600 - pl.col('period_seconds')) + ((pl.col('period') - 3) * 1200)).alias('game_seconds_remaining')
        ])
    )

    # Create event_player_1_id and event_player_2_id columns based on event_type and corresponding columns
    remove_ply_ids = ['winning_player_id', 'hitting_player_id', 'scoring_player_id', 'shooting_player_id', 'committedby_player_id',
                      'details.playerId', 'losing_player_id', 'hittee_player_id', 'drawnby_player_id', 'assist_1_player_id', 'assist_2_player_id',
                      'blocking_player_id']
    data = (
        data
        .with_columns([
            (pl.when(pl.col('event_type') == 'FACEOFF').then(pl.col('winning_player_id'))
               .when(pl.col('event_type') == 'HIT').then(pl.col('hitting_player_id'))
               .when(pl.col('event_type') == 'GOAL').then(pl.col('scoring_player_id'))
               .when(pl.col('event_type').is_in(['SHOT', 'MISSED_SHOT', "BLOCKED_SHOT"])).then(pl.col('shooting_player_id'))
               .when(pl.col('event_type') == 'PENALTY').then(pl.col('committedby_player_id'))
               .when(pl.col('event_type') == 'GIVEAWAY').then(pl.col('details.playerId'))
               .when(pl.col('event_type') == 'TAKEAWAY').then(pl.col('details.playerId'))
               .otherwise(pl.lit(None))
             ).alias("event_player_1_id"),
             (pl.when(pl.col('event_type') == 'FACEOFF').then(pl.col('losing_player_id'))
               .when(pl.col('event_type') == 'HIT').then(pl.col('hittee_player_id'))
               .when(pl.col('event_type').is_in(['GOAL','SHOT', 'MISSED_SHOT', 'BLOCKED_SHOT'])).then(pl.col('event_goalie_id'))
               .when(pl.col('event_type') == 'PENALTY').then(pl.col('drawnby_player_id'))
               .otherwise(pl.lit(None))
             ).alias("event_player_2_id"),
             (pl.when((pl.col('event_type') == 'GOAL') & (~pl.col('assist_1_player_id').is_null())).then(pl.col('assist_1_player_id'))
               .when((pl.col('event_type') == 'PENALTY') & (~pl.col('servedby_player_id').is_null())).then(pl.col('servedby_player_id'))
               .when((pl.col('event_type') == 'BLOCKED_SHOT') & (~pl.col('blocking_player_id').is_null())).then(pl.col('blocking_player_id'))
               .otherwise(pl.lit(None))
             ).alias("event_player_3_id"),
             (pl.when((pl.col('event_type') == 'GOAL') & (~pl.col('assist_2_player_id').is_null())).then(pl.col('assist_2_player_id'))
               .otherwise(pl.lit(None))
             ).alias("event_player_4_id"),
             (pl.when(pl.col('event_type') == 'FACEOFF').then(pl.lit('Winner'))
               .when(pl.col('event_type') == 'HIT').then(pl.lit('Hitter'))
               .when(pl.col('event_type') == 'GOAL').then(pl.lit('Scorer'))
               .when(pl.col('event_type').is_in(['SHOT', 'MISSED_SHOT', "BLOCKED_SHOT"])).then(pl.lit('Shooter'))
               .when(pl.col('event_type') == 'PENALTY').then(pl.lit('PenaltyOn'))
               .when(pl.col('event_type') == 'GIVEAWAY').then(pl.lit('PlayerID'))
               .when(pl.col('event_type') == 'TAKEAWAY').then(pl.lit('PlayerID'))
               .otherwise(pl.lit(None))
             ).alias("event_player_1_type"),
             (pl.when(pl.col('event_type') == 'FACEOFF').then(pl.lit('Loser'))
               .when(pl.col('event_type') == 'HIT').then(pl.lit('Hittee'))
               .when((pl.col('event_type') == 'GOAL') & (~pl.col('event_goalie_id').is_null())).then(pl.lit('Goalie'))
               .when((pl.col('event_type') == 'GOAL') & (pl.col('event_goalie_id').is_null())).then(pl.lit('EmptyNet'))
               .when(pl.col('event_type').is_in(['SHOT', 'MISSED_SHOT', 'BLOCKED_SHOT'])).then(pl.lit('Goalie'))
               .when(pl.col('event_type') == 'PENALTY').then(pl.lit('DrewBy'))
               .when(pl.col('event_type') == 'GIVEAWAY').then(pl.lit('PlayerID'))
               .when(pl.col('event_type') == 'TAKEAWAY').then(pl.lit('PlayerID'))
               .otherwise(pl.lit(None))
             ).alias("event_player_2_type"),
             (pl.when((pl.col('event_type') == 'GOAL') & (~pl.col('assist_1_player_id').is_null())).then(pl.lit('Assist'))
               .when((pl.col('event_type') == 'PENALTY') & (~pl.col('servedby_player_id').is_null())).then(pl.lit('ServedBy'))
               .when((pl.col('event_type') == 'BLOCKED_SHOT') & (~pl.col('blocking_player_id').is_null())).then(pl.lit('Blocker'))
               .otherwise(pl.lit(None))
             ).alias("event_player_3_type"),
             (pl.when((pl.col('event_type') == 'GOAL') & (~pl.col('assist_2_player_id').is_null())).then(pl.lit('Assist'))
               .otherwise(pl.lit(None))
             ).alias("event_player_4_type")
        ])
        .drop(remove_ply_ids)
    )
    # Parse Situation Code For Home/Away Skaters/EmptyNet
    data = (
        data
        .sort('season', 'game_id', 'period', 'event_idx')
        .with_columns(
            pl.when(pl.col('situationCode').is_null()).then(pl.col("situationCode").fill_null(strategy="forward")).otherwise(pl.col('situationCode')).alias('situationCode')
        )
        .filter(~pl.col('situationCode').is_in(['0101', '1010']))
        .with_columns([
            pl.col("situationCode").str.slice(0, 1).cast(pl.Int32).alias("away_en"),
            pl.col("situationCode").str.slice(3, 1).cast(pl.Int32).alias("home_en"),
            pl.col("situationCode").str.slice(1, 1).cast(pl.Int32).alias("away_skaters"),
            pl.col("situationCode").str.slice(2, 1).cast(pl.Int32).alias("home_skaters")
        ])
        .with_columns([
            (pl.concat_str([pl.col('home_skaters'), pl.lit('v'), pl.col('away_skaters')])).alias('strength_state'),
            (pl.concat_str([pl.col('home_skaters'), pl.lit('v'), pl.col('away_skaters')])).alias('true_strength_state')
        ])
    )

    # Create x_fixed and y_fixed. These coordinates will be relative to the event team's attacking zone (i.e., x_abs is positive)
    data = (
        data
        .with_columns([
            pl.when((pl.col('event_zone') == 'O') & (pl.col('x').mean() > 0)).then(pl.lit(1)).otherwise(pl.lit(-1)).alias('flipped_coords')
        ])
        .with_columns([
            # Where homeTeamDefendingSide Exists
            (pl.when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'left') &
                     ( pl.col('event_team_type') == 'home'))
                     .then(pl.col('x'))
               .when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'right') &
                     ( pl.col('event_team_type') == 'home'))
                     .then(pl.col('x')*-1)
               .when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'left') &
                     ( pl.col('event_team_type') == 'away'))
                     .then(pl.col('x')*-1)
               .when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'right') &
                     ( pl.col('event_team_type') == 'away'))
                     .then(pl.col('x'))
              # Where homeTeamDefendingSide does not exist
              .when((pl.col('homeTeamDefendingSide').is_null()) &
                    (pl.col('event_zone') == 'O'))
                    .then(pl.col('x').abs())
              .when((pl.col('homeTeamDefendingSide').is_null()) &
                    (pl.col('event_zone') == 'D'))
                    .then((pl.col('x').abs())*-1)
              .when((pl.col('homeTeamDefendingSide').is_null()) &
                    (pl.col('event_zone') == 'N'))
                    .then((pl.col('x')) * (pl.col('flipped_coords').max().over(['season', 'game_id', 'period'])))
              .otherwise(pl.lit(None)).alias('x_abs')
            ),
            # Where homeTeamDefendingSide does exist
            (pl.when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'left') &
                     ( pl.col('event_team_type') == 'home'))
                     .then(pl.col('y'))
               .when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'right') &
                     ( pl.col('event_team_type') == 'home'))
                     .then(pl.col('y')*-1)
               .when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'left') &
                     ( pl.col('event_team_type') == 'away'))
                     .then(pl.col('y')*-1)
               .when((~pl.col('homeTeamDefendingSide').is_null()) &
                     ( pl.col('homeTeamDefendingSide') == 'right') &
                     ( pl.col('event_team_type') == 'away'))
                     .then(pl.col('y'))
              # Where homeTeamDefendingSide does not exist
              .when((pl.col('homeTeamDefendingSide').is_null()) &
                    (pl.col('event_zone') == 'O'))
                    .then(pl.col('y').abs())
              .when((pl.col('homeTeamDefendingSide').is_null()) &
                    (pl.col('event_zone') == 'D'))
                    .then((pl.col('y').abs())*-1)
              .when((pl.col('homeTeamDefendingSide').is_null()) &
                    (pl.col('event_zone') == 'N'))
                    .then((pl.col('y')) * (pl.col('flipped_coords').max().over(['season', 'game_id', 'period'])))
              .otherwise(pl.lit(None)).alias('y_abs')
            )
        ])
        .drop("flipped_coords")
    )

    # Create Event Distance Calculation
    data = data.with_columns(
        pl.when(pl.col('x_abs') >= 0).then(pl.Series.sqrt((89 - pl.Series.abs(data['x_abs']))**2 + data['y_abs']**2))
          .when(pl.col('x_abs') <  0).then(pl.Series.sqrt((pl.Series.abs(data['x_abs']) + 89)**2 + data['y_abs']**2))
          .alias('event_distance')
    )

    # Create Event Angle Calculation
    data = (
        data
        .with_columns(
        pl.when(data['x_abs'] >= 0)
          .then(pl.Series.arctan(data['y_abs'] / (89 - pl.Series.abs(data['x_abs'])))
                .apply(lambda x: abs(x * (180 / pi))))
          .when(data['x_abs'] < 0)
          .then(pl.Series.arctan(data['y_abs'] / (pl.Series.abs(data['x_abs']) + 89))
                .apply(lambda x: abs(x * (180 / pi))))
          .alias('event_angle')
        )
        .with_columns(
            pl.when(pl.col('x_abs') > 89).then((180 - pl.col('event_angle'))).otherwise(pl.col('event_angle')).alias('event_angle')
        )
    )

    return data

In [14]:
# Your master column list
master_columns = ['id', 'gameDate', 'season', 'details.eventOwnerTeamId', 'gameType', 'details.scoringPlayerTotal', 'awayTeam.id', 'situationCode', 'details.goalieInNetId', 'details.blockingPlayerId', 'details.descKey', 'details.drawnByPlayerId', 'homeTeam.id', 'details.servedByPlayerId', 'awayTeam.abbrev', 'periodDescriptor.periodType', 'details.typeCode', 'details.homeScore', 'homeTeam.abbrev', 'period', 'details.committedByPlayerId', 'periodDescriptor.number', 'details.zoneCode', 'details.xCoord', 'periodDescriptor.otPeriods', 'timeRemaining', 'details.assist1PlayerTotal', 'timeInPeriod', 'details.hitteePlayerId', 'details.assist2PlayerId', 'details.shotType', 'details.shootingPlayerId', 'details.awayScore', 'details.reason', 'details.homeSOG', 'typeDescKey', 'details.hittingPlayerId', 'details.scoringPlayerId', 'details.assist1PlayerId', 'details.assist2PlayerTotal', 'details.duration', 'typeCode', 'eventId', 'details.playerId', 'details.awaySOG', 'details.secondaryReason', 'details.winningPlayerId', 'details.losingPlayerId', 'sortOrder', 'homeTeamDefendingSide', 'details.yCoord']

# Initialize an empty DataFrame with the master column list
union_df = pl.DataFrame({}, schema=master_columns)

# Iterate over each dataframe and align columns
for df in df_list:
    # Identify missing columns
    missing_columns = set(master_columns) - set(df.columns)
    
    # Fill missing columns with null values
    for col in missing_columns:
        df = df.with_columns(pl.lit(None).alias(col))

    df = df.select(master_columns)

    # Concatenate aligned dataframe to the union_df
    union_df = pl.concat([union_df, df], how='vertical_relaxed')

# Tweak Raw Play-By-Play Data - Create PBP_RAW Table
PBP_RAW = reconcile_api_data(union_df)

del union_df

: 

In [None]:
# Tweak Raw Play-By-Play Data - Create PBP_RAW Table
#PBP_RAW = reconcile_api_data(union_df)

In [None]:
import pathlib
PBP_Path = "API_RAW_PBP_Data.parquet"
PBP_RAW.write_parquet(
    PBP_Path,
    use_pyarrow=True,
)

In [20]:
PBP_RAW.filter(pl.col('event_type')=='GOAL')['strength_state'].value_counts().sort("counts")

strength_state,counts
str,u32
"""6v3""",5
"""3v6""",9
"""3v4""",177
"""4v6""",228
"""4v3""",231
"""6v4""",248
"""3v5""",386
"""5v3""",460
"""4v4""",1364
"""3v3""",1467


In [191]:
sdv_cols = ['event_type', 'event', 'description', 'period', 'period_seconds', 'period_seconds_remaining', 'game_seconds', 'game_seconds_remaining', 'home_score', 'away_score', 'strength_state', 'event_idx', 'extra_attacker', 'home_skaters', 'away_skaters', 'game_id', 'period_type', 'ordinal_num', 'period_time', 'period_time_remaining', 'date_time', 'home_final', 'away_final', 'season', 'season_type', 'game_date', 'game_start', 'game_end', 'game_length', 'game_state', 'detailed_state', 'venue_name', 'venue_link', 'home_name', 'home_abbreviation', 'home_division_name', 'home_conference_name', 'home_id', 'away_name', 'away_abbreviation', 'away_division_name', 'away_conference_name', 'away_id', 'event_id', 'event_team', 'event_team_type', 'num_on', 'players_on', 'players_off', 'away_on_1', 'away_on_2', 'away_on_3', 'away_on_4', 'away_on_5', 'away_goalie', 'ids_on', 'ids_off', 'secondary_type', 'home_on_1', 'home_on_2', 'home_on_3', 'home_on_4', 'home_on_5', 'home_goalie', 'event_player_1_name', 'event_player_1_type', 'event_player_2_name', 'event_player_2_type', 'strength_code', 'strength', 'x', 'y', 'x_fixed', 'y_fixed', 'event_player_1_id', 'event_player_1_link', 'event_player_2_id', 'event_player_2_link', 'event_team_id', 'event_team_link', 'event_team_abbr', 'num_off', 'penalty_severity', 'penalty_minutes', 'away_on_6', 'shot_distance', 'shot_angle', 'event_goalie_name', 'event_goalie_id', 'event_goalie_link', 'event_goalie_type', 'event_player_3_name', 'event_player_3_type', 'game_winning_goal', 'empty_net', 'event_player_3_id', 'event_player_3_link', 'event_player_4_type', 'event_player_4_id', 'event_player_4_name', 'event_player_4_link', 'home_on_6', 'venue_id']

# Check Similar Columns:
sim_cols = [l for l in sdv_cols if l in PBP_RAW.columns]

# Check SDV Not In API Cols
ignore_cols = ['event', 'description', 'extra_attacker', 'ordinal_num', 'period_time', 'period_time_remaining', 'home_final', 'away_final', 'game_start', 'game_end', 'game_length', 'game_state', 'detailed_state']
sdv_not_api = [l for l in sdv_cols if (l not in PBP_RAW.columns) & (~l.endswith('_link'))]

# Check API Not In SDV Cols
api_not_sdv = [l for l in PBP_RAW.columns if l not in sdv_cols]

print(sim_cols)
print(len(sim_cols))

print(sdv_not_api)
print(len(sdv_not_api))

print(api_not_sdv)
print(len(api_not_sdv))

## Columns To Create:

# Shift Related
    # away_goalie / home_goalie
    # event_player_1_name, event_player_2_name, event_player_3_name, event_player_4_name, event_goalie_name
    # home_on_1, home_on_2, home_on_3, home_on_5, home_on_6
    # away_on_1, away_on_2, away_on_3, away_on_5, away_on_6

    # *num_on
    # *players_on/players_off
    # *ids_on/ids_off

   

# Ignore

    # event
    # description
    # ordinal_num
    # home_final
    # away_final
    # num_off
    # penalty_severity
    # Extra Attacker


['event_type', 'period', 'period_seconds', 'period_seconds_remaining', 'game_seconds', 'game_seconds_remaining', 'home_score', 'away_score', 'strength_state', 'event_idx', 'home_skaters', 'away_skaters', 'game_id', 'period_type', 'season', 'season_type', 'game_date', 'home_abbreviation', 'home_id', 'away_abbreviation', 'away_id', 'event_id', 'event_team_type', 'secondary_type', 'event_player_1_type', 'event_player_2_type', 'x', 'y', 'event_player_1_id', 'event_player_2_id', 'event_team_id', 'event_team_abbr', 'penalty_minutes', 'event_goalie_id', 'event_player_3_type', 'event_player_3_id', 'event_player_4_type', 'event_player_4_id']
38
['event', 'description', 'extra_attacker', 'ordinal_num', 'period_time', 'period_time_remaining', 'date_time', 'home_final', 'away_final', 'game_start', 'game_end', 'game_length', 'game_state', 'detailed_state', 'venue_name', 'home_name', 'home_division_name', 'home_conference_name', 'away_name', 'away_division_name', 'away_conference_name', 'event_team'

## Shift Data

- This will be joined to the PBP_RAW Table and create columns for which players were on the ice at a given event.
- In this cell block, we will create the shift columns needed:
    - *home_on_1, home_on_2, home_on_3, home_on_5, home_on_6, home_goalie*
    - *away_on_1, away_on_2, away_on_3, away_on_5, away_on_6, away_goalie*
- I believe I have two options on the join:
    - 1) Create a table with every seconds from the game and inner join home/away players
        - May create too large of a table for each join (max height = 9600 rows | normal height = 3600 rows)
    - 2) Diagonally Union the table
        - Need to figure out which columns from PBP_RAW will be null (possibly fill_na(method = 'forward')? )
        - Need to figure out join key and create "CHANGE" event_type as well as other types associated (on the fly vs stoppage?)

In [15]:
## shift For More Information
g_shift_id = 2021020001
shift_link = "https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId="+str(g_shift_id)
shift_response = requests.get(shift_link)
shift_raw = pd.json_normalize(shift_response.json())
shift_raw = pd.json_normalize(shift_raw['data'])


# Create an empty DataFrame to store the normalized plays
normalized_shift = pd.DataFrame()

# Iterate over each row in plays_1 and normalize the JSON data
for _, row in shift_raw.iterrows():
    # Normalize the JSON data in the current row
    normalized_row_g = pd.json_normalize(row)

    # Concatenate the normalized row to the result DataFrame
    normalized_shift = pd.concat([normalized_shift, normalized_row_g], ignore_index=True)

# Create Columns From Data (Names and Shift Starts)
normalized_shift['player_name'] = normalized_shift['firstName'] + ' ' + normalized_shift['lastName']
# Period Time
normalized_shift['period_start_seconds'] = pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.second
normalized_shift['period_end_seconds'] = pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.second
# Game Time
normalized_shift['game_start_seconds'] = normalized_shift['period_start_seconds'] + ((normalized_shift['period'] - 1) * 1200)
normalized_shift['period_end_seconds'] = normalized_shift['period_end_seconds'] + ((normalized_shift['period'] - 1) * 1200)

# Rename
normalized_shift = normalized_shift.rename(columns = {
    'gameId': 'game_id',
    'id': 'shift_id',
    'playerId': 'player_id',
    'teamId': 'team_id',
    'shiftNumber': 'shift_number',
    'teamName': 'team_name'
})

# Keep
shift_keep_cols = ['game_id', 'shift_id', 'team_id', 'player_id', 'player_name', 'period',
                   'period_start_seconds', 'period_end_seconds', 'game_start_seconds', 'period_end_seconds',
                   'eventNumber', 'team_name', 'shift_number', 'typeCode']
normalized_shift = normalized_shift[shift_keep_cols]
normalized_shift = pl.DataFrame(normalized_shift)



print(normalized_shift.sort_values(['game_start_seconds', 'team_id']).head(12))

        game_id  shift_id  team_id  player_id         player_name  period  \
0    2021020001  11053104        5    8470604         Jeff Carter       1   
63   2021020001  11053109        5    8471724         Kris Letang       1   
225  2021020001  11053129        5    8475208      Brian Dumoulin       1   
271  2021020001  11053135        5    8475810          Bryan Rust       1   
418  2021020001  11053480        5    8477465       Tristan Jarry       1   
541  2021020001  11053162        5    8478046       Danton Heinen       1   
132  2021020001  11053117       14    8474151       Ryan McDonagh       1   
299  2021020001  11053138       14    8476292        Ondrej Palat       1   
319  2021020001  11053140       14    8476453     Nikita Kucherov       1   
342  2021020001  11053430       14    8476883  Andrei Vasilevskiy       1   
479  2021020001  11053156       14    8478010       Brayden Point       1   
569  2021020001  11053165       14    8478416         Erik Cernak       1   

## Roster Data

- Here I have a couple options to go about how I want to connect roster data for joins to event_player_id (and all other event_player_id columns)
    - **Use LoadRosters.py**
        - Pros:
            - Already created
            - Join structure exists
            - Has all columns I would need
            - Can be joined to shift data?
        - Cons:
            - May not have the most current players (Some mismatched joins here)
    - **Hit Rosters from beginning of PBP API**
        - Pros:
            - Each game is unique and all players in game will be matched (all players from this source are in the game)
        - Cons:
            - Load time
            - Build query time
            - Still need to join rosters some 

### Option 1: Load Rosters From LoadRosters.py Script

In [22]:
# Path
roster_file = 'NHL_Rosters_2014_2024.csv'

# All Players - Connect To event_player_1_id, event_player_2_id, event_player_3_id, event_player_4_id, event_goalie_id, home_goalie, away_goalie
ROSTER_DF_RAW = pl.read_csv(roster_file)

ROSTER_DF = (
    ROSTER_DF_RAW
    .with_columns([
        pl.col("player_id").cast(pl.Int32),
        (pl.col("first_name").str.to_uppercase() + '.' + pl.col("last_name").str.to_uppercase()).alias('player_name'),
        pl.when((pl.col('pos_G') == 1) & (pl.col('hand_R') == 1)).then(pl.lit(1)).otherwise(pl.lit(0)).alias('G_hand_R'),
        pl.when((pl.col('pos_G') == 1) & (pl.col('hand_L') == 1)).then(pl.lit(1)).otherwise(pl.lit(0)).alias('G_hand_L')
        ])
    .select(['player_id', 'player_name', 'hand_R', 'hand_L', 'pos_F', 'pos_D', 'pos_G', 'G_hand_R', 'G_hand_L'])
    .unique()
)

### Option 2: Load Rosters From Start of Game

In [None]:
df_list = []
for i in [2021020001]:
    pbp_link = 'https://api-web.nhle.com/v1/gamecenter/'+str(i)+'/play-by-play'

    pbp_response = requests.get(pbp_link)
    pbp_data = pd.json_normalize(pbp_response.json())
    pbp_data = pbp_data[pbp_data['gameType'] != 1]

    ## GAME DATA
    game_data = pbp_data[['id', 'season', 'gameDate', 'gameType', 'awayTeam.id', 'awayTeam.abbrev', 'homeTeam.id', 'homeTeam.abbrev']]

    ## PLAYS DATA
    roster_spots = pd.json_normalize(pbp_data['rosterSpots'])
#
    ## Create an empty DataFrame to store the normalized plays
    #normalized_plays = pd.DataFrame()
#
    ## Iterate over each row in plays_1 and normalize the JSON data
    #for _, row in plays_1.iterrows():
    #    # Normalize the JSON data in the current row
    #    normalized_row = pd.json_normalize(row)
#
    #    # Concatenate the normalized row to the result DataFrame
    #    normalized_plays = pd.concat([normalized_plays, normalized_row], ignore_index=True)
#
    #result_df = pd.merge(game_data.assign(key=1), normalized_plays.assign(key=1), on='key').drop('key', axis=1)
    #result_df = pl.DataFrame(result_df)
#
    #df_list.append(result_df)
    
print('Loading Complete -- Begin Diagonal Union of', len(df_list),'Games')

#### *Check Options by Join Rate*

In [39]:
id_check = 'event_player_1_id'
dist_ids = pl.DataFrame({id_check: PBP_RAW.filter(~pl.col(id_check).is_null())[id_check].cast(pl.Int32).unique()})

## Option 1 ##
test_roster_join_1 = dist_ids.join(ROSTER_DF, left_on=id_check, right_on='player_id', how = 'left')

print("inner_join", dist_ids.join(ROSTER_DF, left_on=id_check, right_on='player_id', how = 'inner').height)
print("left_join", dist_ids.join(ROSTER_DF, left_on=id_check, right_on='player_id', how = 'left').height)
print("outer_join", dist_ids.join(ROSTER_DF, left_on=id_check, right_on='player_id', how = 'outer').height)

good_ids = dist_ids.join(ROSTER_DF, left_on=id_check, right_on='player_id', how = 'inner').select(id_check).unique()
good_ids = good_ids[id_check].to_list()

null_ids_outer = dist_ids.join(ROSTER_DF, left_on=id_check, right_on='player_id', how = 'left').filter(~pl.col(id_check).is_in(good_ids))
print(null_ids_outer.height)
print(null_ids_outer.head(10))

inner_join 2456
left_join 2482
outer_join 2524
26
shape: (10, 9)
┌───────────────────┬─────────────┬────────┬────────┬───────┬───────┬───────┬──────────┬──────────┐
│ event_player_1_id ┆ player_name ┆ hand_R ┆ hand_L ┆ pos_F ┆ pos_D ┆ pos_G ┆ G_hand_R ┆ G_hand_L │
│ ---               ┆ ---         ┆ ---    ┆ ---    ┆ ---   ┆ ---   ┆ ---   ┆ ---      ┆ ---      │
│ i32               ┆ str         ┆ i64    ┆ i64    ┆ i64   ┆ i64   ┆ i64   ┆ i32      ┆ i32      │
╞═══════════════════╪═════════════╪════════╪════════╪═══════╪═══════╪═══════╪══════════╪══════════╡
│ 8471279           ┆ null        ┆ null   ┆ null   ┆ null  ┆ null  ┆ null  ┆ null     ┆ null     │
│ 8471328           ┆ null        ┆ null   ┆ null   ┆ null  ┆ null  ┆ null  ┆ null     ┆ null     │
│ 8475828           ┆ null        ┆ null   ┆ null   ┆ null  ┆ null  ┆ null  ┆ null     ┆ null     │
│ 8476864           ┆ null        ┆ null   ┆ null   ┆ null  ┆ null  ┆ null  ┆ null     ┆ null     │
│ 8477035           ┆ null        ┆

In [40]:
PBP_RAW.filter(pl.col('event_player_1_id') == 8471279).head()

game_id,game_date,season,event_team_id,season_type,away_id,situationCode,event_goalie_id,details.descKey,home_id,servedby_player_id,away_abbreviation,period_type,details.typeCode,home_score,home_abbreviation,period,event_zone,x,timeRemaining,timeInPeriod,secondary_type,away_score,reason,event_type,penalty_minutes,typeCode,event_id,details.secondaryReason,event_idx,homeTeamDefendingSide,y,event_team_type,event_team_abbr,period_seconds,period_seconds_remaining,game_seconds,game_seconds_remaining,event_player_1_id,event_player_2_id,event_player_3_id,event_player_4_id,event_player_1_type,event_player_2_type,event_player_3_type,event_player_4_type,away_en,home_en,away_skaters,home_skaters,strength_state,true_strength_state,x_abs,y_abs,event_distance,event_angle
f64,str,f64,f64,str,f64,str,f64,str,f64,f64,str,str,str,f64,str,f64,str,f64,str,str,str,f64,str,str,f64,f64,f64,str,f64,str,f64,str,str,i64,i64,f64,f64,f64,f64,f64,f64,str,str,str,str,i32,i32,i32,i32,str,str,f64,f64,f64,f64
2016100000.0,"""2016-09-18""",20162017.0,7404.0,"""6.0""",7405.0,"""1551""",8476899.0,,7404.0,,"""NAT""","""REG""",,,"""FIN""",1.0,"""O""",35.0,"""18:29""","""01:31""","""Slap""",,,"""SHOT""",,506.0,9.0,,18.0,,16.0,"""home""","""home_abbreviat…",91,1109,91.0,1109.0,8471279.0,8476899.0,,,"""Shooter""","""Goalie""",,,1,1,5,5,"""5v5""","""5v5""",35.0,16.0,56.320511,16.504361
2016100000.0,"""2016-09-18""",20162017.0,7404.0,"""6.0""",7405.0,"""1551""",,,7404.0,,"""NAT""","""REG""",,,"""FIN""",2.0,"""D""",90.0,"""08:23""","""11:37""",,,,"""TAKEAWAY""",,525.0,330.0,,361.0,,21.0,"""home""","""home_abbreviat…",697,503,1897.0,1703.0,8471279.0,,,,"""PlayerID""","""PlayerID""",,,1,1,5,5,"""5v5""","""5v5""",-90.0,-21.0,180.227634,6.691264
2016100000.0,"""2016-09-18""",20162017.0,7404.0,"""6.0""",7405.0,"""1551""",,,7404.0,,"""NAT""","""REG""",,,"""FIN""",2.0,"""D""",63.0,"""08:19""","""11:41""",,,,"""GIVEAWAY""",,504.0,331.0,,362.0,,35.0,"""home""","""home_abbreviat…",701,499,1901.0,1699.0,8471279.0,,,,"""PlayerID""","""PlayerID""",,,1,1,5,5,"""5v5""","""5v5""",-63.0,-35.0,155.977562,12.967084
2016100000.0,"""2016-09-20""",20162017.0,7404.0,"""6.0""",7404.0,"""1551""",,,7407.0,,"""FIN""","""REG""",,,"""SWE""",1.0,"""N""",-21.0,"""09:56""","""10:04""",,,,"""HIT""",,503.0,31.0,,110.0,,35.0,"""away""","""away_abbreviat…",604,596,604.0,596.0,8471279.0,8476887.0,,,"""Hitter""","""Hittee""",,,1,1,5,5,"""5v5""","""5v5""",-21.0,35.0,115.433964,17.650124
2016100000.0,"""2016-09-20""",20162017.0,7404.0,"""6.0""",7404.0,"""1551""",,,7407.0,,"""FIN""","""REG""",,,"""SWE""",1.0,"""D""",26.0,"""03:30""","""16:30""",,,,"""GIVEAWAY""",,504.0,50.0,,181.0,,38.0,"""away""","""away_abbreviat…",990,210,990.0,210.0,8471279.0,,,,"""PlayerID""","""PlayerID""",,,1,1,5,5,"""5v5""","""5v5""",-26.0,-38.0,121.115647,18.285352


In [42]:
PBP_RAW['season_type'].value_counts().sort('counts', descending=True)

season_type,counts
str,u32
"""R""",4071782
"""P""",358948
"""6.0""",3992
"""8.0""",3700
"""I""",2246
"""7.0""",1412
"""12.0""",3


In [50]:
with pl.Config(set_fmt_float="full"):
    print(PBP_RAW.filter(pl.col('season_type') == "I").select('game_id').unique().head())

shape: (5, 1)
┌────────────┐
│ game_id    │
│ ---        │
│ f64        │
╞════════════╡
│ 2016040621 │
│ 2018040642 │
│ 2017040633 │
│ 2016040623 │
│ 2021040663 │
└────────────┘
