In [1]:
# Pandas
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np

# Polars (Arrow)
from pyarrow.dataset import dataset
import polars as pl
pl.Config.set_tbl_rows(n=-1)
pl.Config.set_tbl_cols(n=-1)

# Hit API
import requests

# Tools
from itertools import chain
from datetime import datetime, timedelta
from math import pi

# Save
import pickle
import os
import sys
sys.argv.extend(["-Xfrozen_modules=off"])
import pathlib
import psutil

In [2]:
def get_mem_stats(n=5):
    print(f"Total Memory: {psutil.virtual_memory().total / (1024 ** 3):.2f} GB")
    print(f"Used Memory: {psutil.virtual_memory().used / (1024 ** 3):.2f} GB")
    print(f"Free Memory: {psutil.virtual_memory().free / (1024 ** 3):.2f} GB")

    # Get a list of running processes
    processes = list(psutil.process_iter(['pid', 'name', 'memory_info']))

    # Sort processes by memory usage
    processes.sort(key=lambda x: x.info['memory_info'].rss, reverse=True)

    # Print information about the top processes
    for process in processes[:n]:  # Adjust the number to show more or fewer processes
        print(f"PID: {process.info['pid']}, Name: {process.info['name']}, Memory Usage: {process.info['memory_info'].rss / (1024 ** 2):.2f} MB")

get_mem_stats()

Total Memory: 15.73 GB
Used Memory: 7.42 GB
Free Memory: 8.31 GB
PID: 4652, Name: MemCompression, Memory Usage: 556.34 MB
PID: 21428, Name: msedgewebview2.exe, Memory Usage: 292.66 MB
PID: 2364, Name: Code.exe, Memory Usage: 275.82 MB
PID: 12700, Name: Code.exe, Memory Usage: 242.60 MB
PID: 328, Name: Code.exe, Memory Usage: 232.32 MB


In [305]:
model_path = "Data/PBP/API_RAW_PBP_Data.parquet"
PBP_RAW = pl.read_parquet(model_path).filter((pl.col('season_type').is_in(['R', 'P'])) & (pl.col('season').is_in([20232024, 20222023, 20212022])))
PBP_SLIM = PBP_RAW.filter(pl.col('game_id').is_in(PBP_RAW['game_id'].unique()[0:200])) #

In [306]:
print(len(PBP_SLIM['game_id'].unique()))
PBP_SLIM.head()

200


game_id,game_date,season,event_idx,season_type,period,period_type,timeRemaining,timeInPeriod,situationCode,homeTeamDefendingSide,event_team_id,away_id,away_abbreviation,away_score,home_id,home_abbreviation,home_score,event_id,typeCode,details.typeCode,event_type,details.descKey,reason,details.secondaryReason,secondary_type,event_zone,x,y,event_goalie_id,servedby_player_id,penalty_minutes,event_team_type,event_team_abbr,period_seconds,period_seconds_remaining,game_seconds,game_seconds_remaining,event_player_1_id,event_player_2_id,event_player_3_id,event_player_4_id,event_player_1_type,event_player_2_type,event_player_3_type,event_player_4_type,away_en,home_en,away_skaters,home_skaters,strength_state,true_strength_state,x_abs,y_abs,event_distance,event_angle
i32,str,i32,i32,str,i32,str,str,str,str,str,str,str,str,f32,str,str,f32,i32,i32,str,str,str,str,str,str,str,f32,f32,str,str,str,str,str,i64,i64,i64,i64,str,str,str,str,str,str,str,str,i32,i32,i32,i32,str,str,f64,f64,f64,f64
2021020001,"""2021-10-12""",20212022,8,"""R""",1,"""REG""","""20:00""","""00:00""","""1551""","""left""",,"""5""","""PIT""",,"""14""","""TBL""",,51,520,,"""PERIOD_START""",,,,,,,,,,,"""away""","""away_abbreviat…",0,1200,0,1200,,,,,,,,,1,1,5,5,"""5v5""","""5v5""",,,,
2021020001,"""2021-10-12""",20212022,9,"""R""",1,"""REG""","""20:00""","""00:00""","""1551""","""left""","""5.0""","""5""","""PIT""",,"""14""","""TBL""",,52,502,,"""FACEOFF""",,,,,"""N""",0.0,0.0,,,,"""away""","""away_abbreviat…",0,1200,0,1200,"""8470604.0""","""8478010.0""",,,"""Winner""","""Loser""",,,1,1,5,5,"""5v5""","""5v5""",-0.0,-0.0,89.0,0.0
2021020001,"""2021-10-12""",20212022,10,"""R""",1,"""REG""","""19:42""","""00:18""","""1551""","""left""","""14.0""","""5""","""PIT""",,"""14""","""TBL""",,8,503,,"""HIT""",,,,,"""O""",46.0,40.0,,,,"""away""","""away_abbreviat…",18,1182,18,1182,"""8476292.0""","""8470604.0""",,,"""Hitter""","""Hittee""",,,1,1,5,5,"""5v5""","""5v5""",-46.0,-40.0,140.801278,16.504361
2021020001,"""2021-10-12""",20212022,14,"""R""",1,"""REG""","""19:22""","""00:38""","""1551""","""left""",,"""5""","""PIT""",,"""14""","""TBL""",,9,516,,"""STOPPAGE""",,"""puck-in-nettin…",,,,,,,,,"""away""","""away_abbreviat…",38,1162,38,1162,,,,,,,,,1,1,5,5,"""5v5""","""5v5""",,,,
2021020001,"""2021-10-12""",20212022,15,"""R""",1,"""REG""","""19:22""","""00:38""","""1551""","""left""","""14.0""","""5""","""PIT""",,"""14""","""TBL""",,53,502,,"""FACEOFF""",,,,,"""N""",0.0,0.0,,,,"""away""","""away_abbreviat…",38,1162,38,1162,"""8478519.0""","""8478542.0""",,,"""Winner""","""Loser""",,,1,1,5,5,"""5v5""","""5v5""",-0.0,-0.0,89.0,0.0


In [307]:
# Path
roster_file = 'Data/NHL_Rosters_2014_2024.csv'

# All Players - Connect To event_player_1_id, event_player_2_id, event_player_3_id, event_player_4_id, event_goalie_id, home_goalie, away_goalie
ROSTER_DF_RAW = pl.read_csv(roster_file)

ROSTER_DF = (
    ROSTER_DF_RAW
    .with_columns([
        pl.col("player_id").cast(pl.Int32),
        (pl.col("first_name").str.to_uppercase() + '.' + pl.col("last_name").str.to_uppercase()).alias('player_name'),
        pl.when((pl.col('pos_G') == 1) & (pl.col('hand_R') == 1)).then(pl.lit(1)).otherwise(pl.lit(0)).alias('G_hand_R'),
        pl.when((pl.col('pos_G') == 1) & (pl.col('hand_L') == 1)).then(pl.lit(1)).otherwise(pl.lit(0)).alias('G_hand_L')
        ])
    .select(['player_id', 'player_name', 'hand_R', 'hand_L', 'pos_F', 'pos_D', 'pos_G', 'G_hand_R', 'G_hand_L'])
    .unique()
)

ROSTER_DF_RAW.head()

player_id,first_name,last_name,pos_F,pos_D,pos_G,hand_R,hand_L
i64,str,str,i64,i64,i64,i64,i64
8473492,"""Matt""","""Beleskey""",1,0,0,0,1
8474009,"""Nick""","""Bonino""",1,0,0,0,1
8471699,"""Andrew""","""Cogliano""",1,0,0,0,1
8462041,"""Radek""","""Dvorak""",1,0,0,1,0
8475770,"""Emerson""","""Etem""",1,0,0,0,1


## Define Load Functions

### *1) Load Raw Shift Data From NHL API Using Loop*

In [308]:
def loop_shift_load(pbp_data):
    """ INPUT: NHL GAME ID (Can be from Play By Play Data or Game_ID_List) 
        OUTPUT: A Polars DataFrame Containing Data Stored in the NHL's API Shift Chart Endpoint"""
    
    # Load Game ID and Home/Away Ids
    shift_df_list = []
    current_game_ids = pbp_data['game_id'].unique()

    game_info_slim = (
        pbp_data
        .filter(pl.col('game_id').is_in(current_game_ids))
        .select('game_id', 'home_id', 'away_id', 'period', 'game_seconds', 'period_seconds', 'event_id', 'event_idx', 'event_type')
        .unique()
    )


    for i in current_game_ids:
        print("Now Loading Game ID: ", i)
        shift_link = "https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId="+str(i)
        shift_response = requests.get(shift_link).json()

        # Assuming "data" is the key containing nested data
        data_list = shift_response.get('data', [])
        keep_keys = ['id', 'endTime', 'firstName', 'gameId', 'lastName', 'period', 'playerId', 'startTime', 'teamAbbrev', 'teamId', 'eventNumber']
        filtered_data = [{key: item[key] for key in keep_keys} for item in data_list]
        shift_raw = pl.DataFrame(filtered_data)

        shift_raw = (
            shift_raw
            .with_columns([
                (pl.col('firstName') + ' ' + pl.col('lastName')).alias('player_name'),
                ((pl.col('startTime').str.slice(0, 2).cast(pl.Int32) * 60) + (pl.col('startTime').str.slice(3, 5).cast(pl.Int32))).alias('period_start_seconds'),
                ((pl.col('endTime').str.slice(0, 2).cast(pl.Int32) * 60) + (pl.col('endTime').str.slice(3, 5).cast(pl.Int32))).alias('period_end_seconds')
            ])
            .with_columns([
                (pl.col('period_start_seconds') + ((pl.col('period') - 1) * 1200)).alias('game_start_seconds'),
                (pl.col('period_end_seconds') + ((pl.col('period') - 1) * 1200)).alias('game_end_seconds'),
            ])
            .rename({
                    'gameId': 'game_id',
                    'id': 'shift_id',
                    'playerId': 'player_id',
                    'teamId': 'team_id',
                    'teamAbbrev': 'team_abbr'
                })
            .select([pl.col('game_id').cast(pl.Int32),
                     pl.col('team_id').cast(pl.Utf8),
                     pl.col('player_id').cast(pl.Utf8),
                     pl.col('player_name').str.to_uppercase().cast(pl.Utf8),
                     pl.col('team_abbr').cast(pl.Utf8),
                     pl.col('period').cast(pl.Int32),
                     pl.col('period_start_seconds').cast(pl.Int64),
                     pl.col('period_end_seconds').cast(pl.Int64),
                     pl.col('game_start_seconds').cast(pl.Int64),
                     pl.col('game_end_seconds').cast(pl.Int64),
                     pl.col('eventNumber').cast(pl.Int32)
                     ]) #'shift_id', 'typeCode', 'shift_number'
        )

        shift_raw = (
        # Join and Create team_type
        shift_raw
        .join(game_info_slim.select('game_id', 'home_id', 'away_id').unique(), on='game_id', how='left')
        .filter((pl.col('home_id') == pl.col('team_id')) | (pl.col('away_id') == pl.col('team_id')))
        .filter(pl.col('game_start_seconds') != pl.col('game_end_seconds') )
        .with_columns(pl.when(pl.col('home_id') == pl.col('team_id')).then(pl.lit('home'))
                        .when(pl.col('away_id') == pl.col('team_id')).then(pl.lit('away')).otherwise(pl.lit(None)).alias('team_type'))
        .drop('home_id', 'away_id')
        .unique()
        )

        # Combine Consecutive Shifts
        gb_cols = [col for col in shift_raw.columns if col not in ['period_start_seconds', 'game_start_seconds', 'eventNumber']]
        shift_raw = (
            shift_raw
            .with_columns([
                pl.col('period_start_seconds').max().over(gb_cols).alias('period_start_seconds'),
                pl.col('game_start_seconds').max().over(gb_cols).alias('game_start_seconds'),
                pl.col('eventNumber').max().over(gb_cols).alias('eventNumber')
            ])
            .unique()
            # Separate Goalies
            .join(ROSTER_DF.with_columns([
                (pl.col('player_id').cast(pl.Utf8).alias('player_id')),
                (pl.col('pos_G').cast(pl.Int32).alias('pos_G'))
            ])
            .select('player_id', 'pos_G'), on='player_id', how='left')
            .unique()
            .sort('game_id', 'period', 'period_start_seconds', 'period_end_seconds')
        )

        # Concat Player IDs into lists for each group (i.e. event and seconds)
        result_df = (
            shift_raw
            .groupby(['game_id', 'period', 'period_start_seconds', 'period_end_seconds', 'team_type', 'pos_G'])
            .agg(
                pl.concat_list('player_id').flatten().unique().alias('player_id_list'),
                pl.concat_list('player_name').flatten().unique().alias('player_name_list')
                )
            .sort('game_id', 'period', 'period_start_seconds', 'period_end_seconds')
        )

        # Separate and Create Player On Columns
        game_data = (
             game_info_slim
             .filter(pl.col('game_id') == i)
             .sort('game_id', 'period', 'period_seconds', 'event_idx')
        )

        def apply_player_lists_pl(x, ty, pos, shift, output):
            return get_player_lists_pl((x['game_id'], x['period'], x['period_seconds'], ty, pos, shift, output))

        def get_player_lists_pl(x):
            # Outline Variables
            g_id, per, p_secs, ty, pos, shift, output = x

            # Adjust conditions as needed
            conditions = (
                (result_df['game_id'] == g_id) &
                (result_df['period'] == per) &
                (result_df['team_type'] == ty) &
                (result_df['pos_G'] == pos)
            )

            if shift == 'current':
                conditions &= (
                    (result_df['period_start_seconds'] < p_secs) &
                    (result_df['period_end_seconds'] > p_secs)
                )
            elif shift == 'on':
                conditions &= (result_df['period_start_seconds'] == p_secs)
            elif shift == 'off':
                conditions &= (result_df['period_end_seconds'] == p_secs)

            filtered_rows = result_df.filter(conditions)

            if output == 'id':
                result_list = set(filtered_rows['player_id_list'].explode().to_list())
            elif output == 'name':
                result_list = set(filtered_rows['player_name_list'].explode().to_list())
            
            return ','.join(str(item) for item in result_list)
        
        # List of columns to generate
        columns_to_generate = [
            ('home', 0, 'current', 'id'),
            ('home', 0, 'current', 'name'),
            ('home', 0, 'on', 'id'),
            ('home', 0, 'on', 'name'),
            ('home', 0, 'off', 'id'),
            ('home', 0, 'off', 'name'),
            ('away', 0, 'current', 'id'),
            ('away', 0, 'current', 'name'),
            ('away', 0, 'on', 'id'),
            ('away', 0, 'on', 'name'),
            ('away', 0, 'off', 'id'),
            ('away', 0, 'off', 'name'),
            ('home', 1, 'current', 'id'),
            ('home', 1, 'current', 'name'),
            ('home', 1, 'on', 'id'),
            ('home', 1, 'on', 'name'),
            ('home', 1, 'off', 'id'),
            ('home', 1, 'off', 'name'),
            ('away', 1, 'current', 'id'),
            ('away', 1, 'current', 'name'),
            ('away', 1, 'on', 'id'),
            ('away', 1, 'on', 'name'),
            ('away', 1, 'off', 'id'),
            ('away', 1, 'off', 'name')
        ]

        # Generate columns dynamically
        for prefix, pos, shift, output in columns_to_generate:
            if pos == 1:
                pos_lab = 'goalie'
            elif pos == 0:
                pos_lab = 'skater'

            col_name = f"{prefix}_{pos_lab}_{shift}_{output}"
            game_data = game_data.with_columns([
                pl.struct(["game_id", "period", "period_seconds"]).apply(lambda x: apply_player_lists_pl(x, prefix, pos, shift, output)).alias(col_name)
            ])

        game_start_end = ['GAME_START', 'PERIOD_START', 'GAME_END', 'PERIOD_END']

        game_data =(
             game_data
             .sort('game_id', 'period', 'period_seconds', 'event_idx')
             .filter(~pl.col('event_type').is_in(game_start_end))
            .with_columns([
                pl.col('event_idx').max().over(['game_id', 'period', 'period_seconds']).alias('max_event_idx')
            ])
            .with_columns([
                (pl.col('game_id').cast(pl.Utf8) + '-' + pl.col('period').cast(pl.Utf8) + '-' + pl.col('period_seconds').cast(pl.Utf8)).alias('event_seconds_id'),
                pl.when(pl.col('event_idx') == pl.col('max_event_idx')).then(pl.col('event_type')).otherwise(pl.lit(None)).alias('max_event_type')
            ])
            .with_columns([
                pl.col('event_seconds_id').count().over(['game_id', 'period', 'period_seconds']).alias('count_event_seconds_id')
            ])
        )

        teams = ['home', 'away']
        positions = ['skater', 'goalie']
        outputvals = ['id', 'name']

        for team in teams:
            for position in positions:
                for outputval in outputvals:

                    cur_cols = f"{team}_{position}_current_{outputval}"
                    off_cols = f"{team}_{position}_off_{outputval}"
                    on_cols = f"{team}_{position}_on_{outputval}"

                    label1 = f"{team}_{position}_on_{outputval}"
                    if position == 'goalie':
                        label2 = f"_goalie_{outputval}"
                    else:
                        label2 = f"on_{outputval}"

                    game_data = (
                        game_data
                        .with_columns([
                            pl.when((pl.col(cur_cols) != "") & (pl.col(on_cols)== "") & (pl.col(off_cols) == "")).then(pl.col(cur_cols))
                              .when((pl.col(cur_cols) == "") & (pl.col(on_cols)!= "") & (pl.col(off_cols) == "")).then(pl.col(on_cols))
                              .when((pl.col(cur_cols) == "") & (pl.col(on_cols)== "") & (pl.col(off_cols) != "")).then(pl.col(off_cols))
                              .when((pl.col(cur_cols) == "") & (pl.col(on_cols)!= "") & (pl.col(off_cols) != "") & (pl.col('event_idx') == pl.col('max_event_idx'))).then(pl.col(on_cols))
                              .when((pl.col(cur_cols) == "") & (pl.col(on_cols)!= "") & (pl.col(off_cols) != "") & (pl.col('event_idx') != pl.col('max_event_idx'))).then(pl.col(off_cols))
                              .when((pl.col(cur_cols) != "") & (pl.col(on_cols)!= "") & (pl.col('event_idx') == pl.col('max_event_idx'))).then(pl.concat_str([pl.col(cur_cols),pl.lit(","),pl.col(on_cols)]))
                              .when((pl.col(cur_cols) != "") & (pl.col(off_cols)!= "") & (pl.col('event_idx') != pl.col('max_event_idx'))).then(pl.concat_str([pl.col(cur_cols),pl.lit(","),pl.col(off_cols)]))
                              .when((pl.col(cur_cols) != "") & (pl.col(off_cols) != "") & (pl.col('event_idx') == pl.col('max_event_idx'))).then(pl.col(cur_cols))
                              .otherwise(pl.lit(None))
                              .alias(label1)
                        ])
                        .with_columns([pl.col(label1).str.split_exact(',', 7)])
                        .unnest(label1)
                        .rename({
                            "field_0" : f"{team}_1_{label2}",
                            "field_1" : f"{team}_2_{label2}",
                            "field_2" : f"{team}_3_{label2}",
                            "field_3" : f"{team}_4_{label2}",
                            "field_4" : f"{team}_5_{label2}",
                            "field_5" : f"{team}_6_{label2}",
                            "field_6" : f"{team}_7_{label2}",
                            "field_7" : f"{team}_8_{label2}"
                        })
                    )
        keep_cols = ['game_id', 'period', 'game_seconds', 'period_seconds', 'event_idx',
                     'home_1__goalie_id', 'home_1__goalie_name',
                     'home_1_on_id', 'home_2_on_id', 'home_3_on_id', 'home_4_on_id', 'home_5_on_id', 'home_6_on_id',
                     'home_1_on_name', 'home_2_on_name', 'home_3_on_name', 'home_4_on_name', 'home_5_on_name', 'home_6_on_name',
                     'away_1_on_id', 'away_2_on_id', 'away_3_on_id', 'away_4_on_id', 'away_5_on_id', 'away_6_on_id',
                     'away_1_on_name', 'away_2_on_name', 'away_3_on_name', 'away_4_on_name', 'away_5_on_name', 'away_6_on_name',
                     'away_1__goalie_id', 'away_1__goalie_name']
        game_data = (
            game_data
            .select(keep_cols)
            .sort('game_id', 'period', 'period_seconds', 'event_idx')
            .rename({
                'away_1__goalie_id': 'away_goalie',
                'away_1__goalie_name': 'away_goalie_name',
                'home_1__goalie_id': 'home_goalie',
                'home_1__goalie_name': 'home_goalie_name'
            })
        )

        # Append To List For Concat
        shift_df_list.append(game_data)

    # Combine DataFrames
    result_df = shift_df_list[0]
    for df in shift_df_list[1:]:
            result_df.extend(df)

    return result_df

In [309]:
raw_shift_df = loop_shift_load(pbp_data = PBP_SLIM)


Now Loading Game ID:  2021020001
Now Loading Game ID:  2021020002
Now Loading Game ID:  2021020003
Now Loading Game ID:  2021020004
Now Loading Game ID:  2021020005
Now Loading Game ID:  2021020006
Now Loading Game ID:  2021020007
Now Loading Game ID:  2021020008
Now Loading Game ID:  2021020009
Now Loading Game ID:  2021020010
Now Loading Game ID:  2021020011
Now Loading Game ID:  2021020012
Now Loading Game ID:  2021020013
Now Loading Game ID:  2021020014
Now Loading Game ID:  2021020015
Now Loading Game ID:  2021020016
Now Loading Game ID:  2021020017
Now Loading Game ID:  2021020018
Now Loading Game ID:  2021020019
Now Loading Game ID:  2021020020
Now Loading Game ID:  2021020021
Now Loading Game ID:  2021020022
Now Loading Game ID:  2021020023
Now Loading Game ID:  2021020024
Now Loading Game ID:  2021020025
Now Loading Game ID:  2021020026
Now Loading Game ID:  2021020027
Now Loading Game ID:  2021020028
Now Loading Game ID:  2021020029
Now Loading Game ID:  2021020030
Now Loadin

In [311]:
# Find All Nulls
PBP_SLIM.join(raw_shift_df, on = ['game_id', 'period', 'game_seconds', 'period_seconds', 'event_idx'], how = "left").head()

game_id,game_date,season,event_idx,season_type,period,period_type,timeRemaining,timeInPeriod,situationCode,homeTeamDefendingSide,event_team_id,away_id,away_abbreviation,away_score,home_id,home_abbreviation,home_score,event_id,typeCode,details.typeCode,event_type,details.descKey,reason,details.secondaryReason,secondary_type,event_zone,x,y,event_goalie_id,servedby_player_id,penalty_minutes,event_team_type,event_team_abbr,period_seconds,period_seconds_remaining,game_seconds,game_seconds_remaining,event_player_1_id,event_player_2_id,event_player_3_id,event_player_4_id,event_player_1_type,event_player_2_type,event_player_3_type,event_player_4_type,away_en,home_en,away_skaters,home_skaters,strength_state,true_strength_state,x_abs,y_abs,event_distance,event_angle,home_goalie,home_goalie_name,home_1_on_id,home_2_on_id,home_3_on_id,home_4_on_id,home_5_on_id,home_6_on_id,home_1_on_name,home_2_on_name,home_3_on_name,home_4_on_name,home_5_on_name,home_6_on_name,away_1_on_id,away_2_on_id,away_3_on_id,away_4_on_id,away_5_on_id,away_6_on_id,away_1_on_name,away_2_on_name,away_3_on_name,away_4_on_name,away_5_on_name,away_6_on_name,away_goalie,away_goalie_name
i32,str,i32,i32,str,i32,str,str,str,str,str,str,str,str,f32,str,str,f32,i32,i32,str,str,str,str,str,str,str,f32,f32,str,str,str,str,str,i64,i64,i64,i64,str,str,str,str,str,str,str,str,i32,i32,i32,i32,str,str,f64,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
2021020001,"""2021-10-12""",20212022,8,"""R""",1,"""REG""","""20:00""","""00:00""","""1551""","""left""",,"""5""","""PIT""",,"""14""","""TBL""",,51,520,,"""PERIOD_START""",,,,,,,,,,,"""away""","""away_abbreviat…",0,1200,0,1200,,,,,,,,,1,1,5,5,"""5v5""","""5v5""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2021020001,"""2021-10-12""",20212022,9,"""R""",1,"""REG""","""20:00""","""00:00""","""1551""","""left""","""5.0""","""5""","""PIT""",,"""14""","""TBL""",,52,502,,"""FACEOFF""",,,,,"""N""",0.0,0.0,,,,"""away""","""away_abbreviat…",0,1200,0,1200,"""8470604.0""","""8478010.0""",,,"""Winner""","""Loser""",,,1,1,5,5,"""5v5""","""5v5""",-0.0,-0.0,89.0,0.0,"""8476883""","""ANDREI VASILEV…","""8476453""","""8478010""","""8474151""","""8478416""","""8476292""",,"""NIKITA KUCHERO…","""ONDREJ PALAT""","""BRAYDEN POINT""","""ERIK CERNAK""","""RYAN MCDONAGH""",,"""8470604""","""8478046""","""8475810""","""8475208""","""8471724""",,"""BRIAN DUMOULIN…","""DANTON HEINEN""","""JEFF CARTER""","""BRYAN RUST""","""KRIS LETANG""",,"""8477465""","""TRISTAN JARRY"""
2021020001,"""2021-10-12""",20212022,10,"""R""",1,"""REG""","""19:42""","""00:18""","""1551""","""left""","""14.0""","""5""","""PIT""",,"""14""","""TBL""",,8,503,,"""HIT""",,,,,"""O""",46.0,40.0,,,,"""away""","""away_abbreviat…",18,1182,18,1182,"""8476292.0""","""8470604.0""",,,"""Hitter""","""Hittee""",,,1,1,5,5,"""5v5""","""5v5""",-46.0,-40.0,140.801278,16.504361,"""8476883""","""ANDREI VASILEV…","""8476453""","""8478010""","""8474151""","""8478416""","""8476292""",,"""NIKITA KUCHERO…","""ONDREJ PALAT""","""BRAYDEN POINT""","""ERIK CERNAK""","""RYAN MCDONAGH""",,"""8470604""","""8478046""","""8475810""","""8475208""","""8471724""",,"""BRIAN DUMOULIN…","""DANTON HEINEN""","""JEFF CARTER""","""BRYAN RUST""","""KRIS LETANG""",,"""8477465""","""TRISTAN JARRY"""
2021020001,"""2021-10-12""",20212022,14,"""R""",1,"""REG""","""19:22""","""00:38""","""1551""","""left""",,"""5""","""PIT""",,"""14""","""TBL""",,9,516,,"""STOPPAGE""",,"""puck-in-nettin…",,,,,,,,,"""away""","""away_abbreviat…",38,1162,38,1162,,,,,,,,,1,1,5,5,"""5v5""","""5v5""",,,,,"""8476883""","""ANDREI VASILEV…","""8478519""","""8474564""","""8475167""","""8480172""","""8473986""",,"""VICTOR HEDMAN""","""ANTHONY CIRELL…","""JAN RUTTA""","""STEVEN STAMKOS…","""ALEX KILLORN""",,"""8477969""","""8478542""","""8477953""","""8478507""","""8475722""",,"""EVAN RODRIGUES…","""MARCUS PETTERS…","""KASPERI KAPANE…","""JOHN MARINO""","""JASON ZUCKER""",,"""8477465""","""TRISTAN JARRY"""
2021020001,"""2021-10-12""",20212022,15,"""R""",1,"""REG""","""19:22""","""00:38""","""1551""","""left""","""14.0""","""5""","""PIT""",,"""14""","""TBL""",,53,502,,"""FACEOFF""",,,,,"""N""",0.0,0.0,,,,"""away""","""away_abbreviat…",38,1162,38,1162,"""8478519.0""","""8478542.0""",,,"""Winner""","""Loser""",,,1,1,5,5,"""5v5""","""5v5""",-0.0,-0.0,89.0,0.0,"""8476883""","""ANDREI VASILEV…","""8478519""","""8474564""","""8475167""","""8480172""","""8473986""",,"""VICTOR HEDMAN""","""ANTHONY CIRELL…","""JAN RUTTA""","""STEVEN STAMKOS…","""ALEX KILLORN""",,"""8477969""","""8478542""","""8477953""","""8478507""","""8475722""",,"""EVAN RODRIGUES…","""MARCUS PETTERS…","""KASPERI KAPANE…","""JOHN MARINO""","""JASON ZUCKER""",,"""8477465""","""TRISTAN JARRY"""


In [255]:
raw_shift_df.head()#.filter(pl.col('home_skater_on_id').arr.len == 0)
#raw_shift_df.filter((pl.col('game_id') == 2021020014) & (pl.col('period') == 1) & (pl.col('period_seconds') == 1070))

game_id,home_id,away_id,period,game_seconds,period_seconds,event_id,event_idx,event_type,home_skater_current_id,home_skater_current_name,home_skater_on_id,home_skater_on_name,home_skater_off_id,home_skater_off_name,away_skater_current_id,away_skater_current_name,away_skater_on_id,away_skater_on_name,away_skater_off_id,away_skater_off_name,home_goalie_current_id,home_goalie_current_name,home_goalie_on_id,home_goalie_on_name,home_goalie_off_id,home_goalie_off_name,away_goalie_current_id,away_goalie_current_name,away_goalie_on_id,away_goalie_on_name,away_goalie_off_id,away_goalie_off_name,max_event_idx,event_seconds_id,max_event_type,count_event_seconds_id
i32,str,str,i32,i64,i64,i32,i32,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i32,str,str,u32
2021020001,"""14""","""5""",1,0,0,52,9,"""FACEOFF""","""""","""""","""8476453, 84780…","""NIKITA KUCHERO…","""""","""""","""""","""""","""8470604, 84780…","""BRIAN DUMOULIN…","""""","""""","""""","""""","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""",9,"""2021020001-1-0…","""FACEOFF""",1
2021020001,"""14""","""5""",1,18,18,8,10,"""HIT""","""8476453, 84780…","""NIKITA KUCHERO…","""""","""""","""""","""""","""8470604, 84780…","""BRIAN DUMOULIN…","""""","""""","""""","""""","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",10,"""2021020001-1-1…","""HIT""",1
2021020001,"""14""","""5""",1,38,38,9,14,"""STOPPAGE""","""8478519, 84745…","""VICTOR HEDMAN,…","""""","""""","""""","""""","""8477969, 84785…","""EVAN RODRIGUES…","""""","""""","""""","""""","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",15,"""2021020001-1-3…",,2
2021020001,"""14""","""5""",1,38,38,53,15,"""FACEOFF""","""8478519, 84745…","""VICTOR HEDMAN,…","""""","""""","""""","""""","""8477969, 84785…","""EVAN RODRIGUES…","""""","""""","""""","""""","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",15,"""2021020001-1-3…","""FACEOFF""",2
2021020001,"""14""","""5""",1,53,53,10,16,"""HIT""","""8478519, 84745…","""VICTOR HEDMAN,…","""""","""""","""""","""""","""8477969, 84785…","""EVAN RODRIGUES…","""""","""""","""""","""""","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",16,"""2021020001-1-5…","""HIT""",1


In [133]:
cond = ((pl.col('count_event_seconds_id') > 1) ) 
print(raw_shift_df.filter(cond & pl.col('home_skaters_on').is_null()).height, "of", raw_shift_df.filter(cond).height, "condition rows are null")
print(raw_shift_df.filter(cond).height, "of", raw_shift_df.height, "total rows")
raw_shift_df.filter(cond).head(10)

# Cond 1: (pl.col('home_skater_current_id') != "") & (pl.col('home_skater_on_id') == "") & (pl.col('home_skater_off_id') == "") & (pl.col('count_event_seconds_id') == 1)

2014 of 2677 condition rows are null
2677 of 6306 total rows


game_id,home_id,away_id,period,game_seconds,period_seconds,event_id,event_idx,event_type,home_skater_current_id,home_skater_current_name,home_skater_on_id,home_skater_on_name,home_skater_off_id,home_skater_off_name,away_skater_current_id,away_skater_current_name,away_skater_on_id,away_skater_on_name,away_skater_off_id,away_skater_off_name,home_goalie_current_id,home_goalie_current_name,home_goalie_on_id,home_goalie_on_name,home_goalie_off_id,home_goalie_off_name,away_goalie_current_id,away_goalie_current_name,away_goalie_on_id,away_goalie_on_name,away_goalie_off_id,away_goalie_off_name,max_event_idx,event_seconds_id,max_event_type,count_event_seconds_id,home_skaters_on
i32,str,str,i32,i64,i64,i32,i32,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i32,str,str,u32,str
2021020001,"""14""","""5""",1,38,38,9,14,"""STOPPAGE""","""8478519, 84745…","""VICTOR HEDMAN,…","""""","""""","""""","""""","""8477969, 84785…","""EVAN RODRIGUES…","""""","""""","""""","""""","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",15,"""2021020001-1-3…",,2,"""8478519, 84745…"
2021020001,"""14""","""5""",1,38,38,53,15,"""FACEOFF""","""8478519, 84745…","""VICTOR HEDMAN,…","""""","""""","""""","""""","""8477969, 84785…","""EVAN RODRIGUES…","""""","""""","""""","""""","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",15,"""2021020001-1-3…","""FACEOFF""",2,"""8478519, 84745…"
2021020001,"""14""","""5""",1,63,63,54,18,"""SHOT""","""8479410, 84745…","""MIKHAIL SERGAC…","""8478472, 84706…","""COREY PERRY, M…","""8478519, 84745…","""STEVEN STAMKOS…","""""","""""","""8476934, 84788…","""BRIAN DUMOULIN…","""8477969, 84785…","""EVAN RODRIGUES…","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",22,"""2021020001-1-6…",,3,
2021020001,"""14""","""5""",1,63,63,11,20,"""STOPPAGE""","""8479410, 84745…","""MIKHAIL SERGAC…","""8478472, 84706…","""COREY PERRY, M…","""8478519, 84745…","""STEVEN STAMKOS…","""""","""""","""8476934, 84788…","""BRIAN DUMOULIN…","""8477969, 84785…","""EVAN RODRIGUES…","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",22,"""2021020001-1-6…",,3,
2021020001,"""14""","""5""",1,63,63,55,22,"""FACEOFF""","""8479410, 84745…","""MIKHAIL SERGAC…","""8478472, 84706…","""COREY PERRY, M…","""8478519, 84745…","""STEVEN STAMKOS…","""""","""""","""8476934, 84788…","""BRIAN DUMOULIN…","""8477969, 84785…","""EVAN RODRIGUES…","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",22,"""2021020001-1-6…","""FACEOFF""",3,
2021020001,"""14""","""5""",1,152,152,14,36,"""STOPPAGE""","""8474151, 84779…","""PIERRE-EDOUARD…","""8476292, 84784…","""ONDREJ PALAT, …","""8479390, 84801…","""PAT MAROON, JA…","""8470604, 84780…","""BRIAN DUMOULIN…","""""","""""","""""","""""","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",37,"""2021020001-1-1…",,2,
2021020001,"""14""","""5""",1,152,152,59,37,"""FACEOFF""","""8474151, 84779…","""PIERRE-EDOUARD…","""8476292, 84784…","""ONDREJ PALAT, …","""8479390, 84801…","""PAT MAROON, JA…","""8470604, 84780…","""BRIAN DUMOULIN…","""""","""""","""""","""""","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",37,"""2021020001-1-1…","""FACEOFF""",2,
2021020001,"""14""","""5""",1,168,168,16,40,"""STOPPAGE""","""8478010, 84779…","""PIERRE-EDOUARD…","""""","""""","""""","""""","""8470604, 84780…","""BRIAN DUMOULIN…","""""","""""","""""","""""","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",41,"""2021020001-1-1…",,2,"""8478010, 84779…"
2021020001,"""14""","""5""",1,168,168,61,41,"""FACEOFF""","""8478010, 84779…","""PIERRE-EDOUARD…","""""","""""","""""","""""","""8470604, 84780…","""BRIAN DUMOULIN…","""""","""""","""""","""""","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",41,"""2021020001-1-1…","""FACEOFF""",2,"""8478010, 84779…"
2021020001,"""14""","""5""",1,198,198,18,45,"""STOPPAGE""","""8478010, 84779…","""PIERRE-EDOUARD…","""""","""""","""""","""""","""8477969, 84785…","""EVAN RODRIGUES…","""""","""""","""""","""""","""8476883""","""ANDREI VASILEV…","""""","""""","""""","""""","""8477465""","""TRISTAN JARRY""","""""","""""","""""","""""",46,"""2021020001-1-1…",,2,"""8478010, 84779…"


In [None]:
# Prep Seconds Comparisons via Join and Filter
def combine_shifts():
    shift_start = (pl.col('period_seconds') == pl.col('period_start_seconds'))
    shift_end = (pl.col('period_seconds') == pl.col('period_end_seconds'))
    on_cond = ((pl.col('period_seconds') > pl.col('period_start_seconds')) & (pl.col('period_seconds') < pl.col('period_end_seconds')))

    pivot_df_list = []

    for i in shift_ids:
        pbp_slice = game_info_slim[game_info_slim['game_id'] == i]
        shift_slice = raw_shift_data[raw_shift_data['game_id'] == i]

        df = (
            pbp_slice
            .join(shift_slice, left_on = ['game_id', 'period'], right_on = ['game_id', 'period'], how='left')
            .filter(shift_start|shift_end|on_cond)
            .with_columns([
                pl.when(pl.col('period_seconds') == pl.col('period_start_seconds')).then(pl.lit('shift_on'))
                  .when(pl.col('period_seconds') == pl.col('period_end_seconds')).then(pl.lit('shift_off'))
                  .otherwise(pl.lit('on')).alias('shift_type')
            ])
        )

        pivot_df_list.append(df)

    slim_df = pivot_df_list[0]
    for df in pivot_df_list[1:]:
        slim_df.extend(df)

In [16]:
# Shift Data
# Load Game ID and Home/Away Ids
shift_df_list = []
current_game_ids = PBP_RAW['game_id'].unique()[0:10]


for i in current_game_ids:
    shift_link = "https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId="+str(i)
    shift_response = requests.get(shift_link)
    shift_raw = pd.json_normalize(shift_response.json())['data']
    normalized_shift = pd.concat([pd.json_normalize(item) for sublist in shift_raw for item in sublist], ignore_index=True)

    # Create Columns From Data (Names and Shift Starts)
    normalized_shift['player_name'] = normalized_shift['firstName'] + ' ' + normalized_shift['lastName']
    # Period Time
    normalized_shift['period_start_seconds'] = pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.second
    normalized_shift['period_end_seconds'] = pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.second
    # Game Time
    normalized_shift['game_start_seconds'] = ( pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.second) + ((normalized_shift['period'] - 1) * 1200)
    normalized_shift['game_end_seconds'] = ( pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.second) + ((normalized_shift['period'] - 1) * 1200)

    # Rename
    normalized_shift = normalized_shift.rename(columns = {
        'gameId': 'game_id',
        'id': 'shift_id',
        'playerId': 'player_id',
        'teamId': 'team_id',
        'shiftNumber': 'shift_number',
        'teamAbbrev': 'team_abbr'
    })

    # Keep
    shift_keep_cols = ['game_id', 'shift_id', 'team_id', 'player_id', 'player_name', 'period',
                       'period_start_seconds', 'period_end_seconds', 'game_start_seconds', 'game_end_seconds',
                       'eventNumber', 'team_abbr', 'shift_number', 'typeCode']
    normalized_shift = normalized_shift[shift_keep_cols]
    normalized_shift = pl.DataFrame(normalized_shift)

    # Join To Get Home Team
    game_info_slim = (
        PBP_RAW
        .filter(pl.col('game_id') == i)
        .select('game_id', 'home_id', 'away_id', 'period', 'game_seconds', 'period_seconds', 'event_id', 'event_idx', 'event_type')
        .with_columns([
            (pl.col('game_id').cast(pl.Int64).alias('game_id')),
            (pl.col('home_id').cast(pl.Int32).alias('home_id')),
            (pl.col('away_id').cast(pl.Int32).alias('away_id')),
            (pl.col('game_seconds').cast(pl.Int64).alias('game_seconds')),
            (pl.col('event_id').cast(pl.Int64).alias('event_id')),
            (pl.col('event_idx').cast(pl.Int32).alias('event_idx'))
        ])
    )

    # Clean Joined Table To PBP: 1) Get Home/Away Team, 2) Filter For Only Teams in Game, 3) Filter Shifts of 0 Seconds, 4) Keep only Distinct Values
    normalized_shift = (
        normalized_shift
        .join(game_info_slim.select('game_id', 'home_id', 'away_id').unique(), on='game_id', how='left')
        .filter((pl.col('home_id') == pl.col('team_id')) | (pl.col('away_id') == pl.col('team_id')))
        .filter(pl.col('game_start_seconds') != pl.col('game_end_seconds') )
        .with_columns(pl.when(pl.col('home_id') == pl.col('team_id')).then(pl.lit('home'))
                        .when(pl.col('away_id') == pl.col('team_id')).then(pl.lit('away')).otherwise(pl.lit(None)).alias('team_type'))
        .drop('home_id', 'away_id')
        .unique()
    )

    # Combine Consecutive Shifts
    gb_cols = [col for col in normalized_shift.columns if col not in ['period_start_seconds', 'game_start_seconds', 'eventNumber']]
    normalized_shift = (
        normalized_shift
        .with_columns([
            pl.col('period_start_seconds').max().over(gb_cols).alias('period_start_seconds'),
            pl.col('game_start_seconds').max().over(gb_cols).alias('game_start_seconds'),
            pl.col('eventNumber').max().over(gb_cols).alias('eventNumber')
        ])
        .unique()
    )

    # Join To Separate Goalies
    normalized_shift = (
        normalized_shift
        .join(ROSTER_DF.with_columns([
            (pl.col('player_id').cast(pl.Int64).alias('player_id')),
            (pl.col('pos_G').cast(pl.Int64).alias('pos_G'))
        ])
        .select('player_id', 'pos_G'), on='player_id', how='left')
        .unique()
    )


    # Group by 'player_id'
    grouped_shifts = normalized_shift.select('player_id', 'game_id', 'period', 'team_type', 'pos_G', 'period_start_seconds', 'period_end_seconds').to_pandas()
    grouped_shifts = grouped_shifts.groupby(['game_id', 'period', 'period_start_seconds', 'period_end_seconds', 'team_type', 'pos_G'])

    # Aggregate using the agg method
    result_df = grouped_shifts.agg(
        player_id_list=('player_id', list),
    ).reset_index()

    seconds_df = game_info_slim.select(pl.col('game_id').cast(pl.Int64),pl.col('period').cast(pl.Int64), pl.col('period_seconds').cast(pl.Int64), 'event_idx', 'event_type').to_pandas()

    # Function to get player lists based on conditions
    def get_player_lists(row, type, pos):
        p_secs = row['period_seconds']
        per = row['period']
        # Adjust conditions as needed
        on_ice_condition = (
            (result_df['period'] == per) &
            (result_df['period_start_seconds'] < p_secs) &
            (result_df['period_end_seconds'] > p_secs) &
            (result_df['team_type'] == type) &
            (result_df['pos_G'] == pos)
        )
        shift_on_condition = (
            (result_df['period'] == per) &
            (result_df['period_start_seconds'] == p_secs) &
            (result_df['team_type'] == type) &
            (result_df['pos_G'] == pos)
        )
        shift_off_condition = (
            (result_df['period'] == per) &
            (result_df['period_end_seconds'] == p_secs) &
            (result_df['team_type'] == type) &
            (result_df['pos_G'] == pos)
        )
        on_ice_rows = result_df[on_ice_condition]
        shift_on_rows = result_df[shift_on_condition]
        shift_off_rows = result_df[shift_off_condition]
    # Combine player lists using sets
        final_on_ice_players = set()
        final_shift_on_players = set()
        final_shift_off_players = set()
        if not on_ice_rows.empty:
            on_ice_players = [player_id for player_list in on_ice_rows['player_id_list'] for player_id in player_list]
            final_on_ice_players.update(on_ice_players)
        if not shift_on_rows.empty:
            shift_on_players = [player_id for player_list in shift_on_rows['player_id_list'] for player_id in player_list]
            final_shift_on_players.update(shift_on_players)
        if not shift_off_rows.empty:
            shift_off_players = [player_id for player_list in shift_off_rows['player_id_list'] for player_id in player_list]
            final_shift_off_players.update(shift_off_players)
        return list(final_on_ice_players), list(final_shift_on_players), list(final_shift_off_players)


    # Apply the function to each row of seconds_df
    ## Home
    home_result = seconds_df.apply(get_player_lists, type='home', pos=0, axis=1)
    seconds_df[['home', 'home_shift_on', 'home_shift_off']] = pd.DataFrame(home_result.tolist(), index=seconds_df.index)

    ## Away
    away_result = seconds_df.apply(get_player_lists, type='away', pos=0, axis=1)
    seconds_df[['away', 'away_shift_on', 'away_shift_off']] = pd.DataFrame(away_result.tolist(), index=seconds_df.index)

    ## Home Goalie
    home_g_result = seconds_df.apply(get_player_lists, type='home', pos=1, axis=1)
    seconds_df[['home_goalie', 'home_goalie_on', 'home_goalie_off']] = pd.DataFrame(home_g_result.tolist(), index=seconds_df.index)

    ## Away Goalie
    away_g_result = seconds_df.apply(get_player_lists, type='away', pos=1, axis=1)
    seconds_df[['away_goalie', 'away_goalie_on', 'away_goalie_off']] = pd.DataFrame(away_g_result.tolist(), index=seconds_df.index)

    # Columns with lists of player IDs
    list_columns = ["home", "home_shift_on", "home_shift_off",
                    "away", "away_shift_on", "away_shift_off",
                    "home_goalie", "home_goalie_on", "home_goalie_off",
                    "away_goalie", "away_goalie_on", "away_goalie_off"]

    # Iterate over each list column
    for column in list_columns:
        # Iterate over each row
        for index, row in seconds_df.iterrows():
            # Extract the list of player IDs
            player_ids = row[column]

            # Iterate over the player IDs in the list
            for i, player_id in enumerate(player_ids):
                # Create a new column name
                new_column_name = f"{column}_{i+1}_on"

                # Create a new column in the DataFrame
                seconds_df.at[index, new_column_name] = player_id

    # Drop unnecessary Columns
    seconds_df = seconds_df.drop(["home", "home_shift_on", "home_shift_off", "away", "away_shift_on", "away_shift_off", "home_goalie", "home_goalie_on", "home_goalie_off", "away_goalie", "away_goalie_on", "away_goalie_off"], axis=1)

    # Add Null Columns if Needed For Append
    columns_to_add = ['home_6_on', 'home_shift_on_6_on', 'home_shift_off_6_on',
                      'away_6_on', 'away_shift_on_6_on', 'away_shift_off_6_on',]
    for column in columns_to_add:
        if column not in seconds_df.columns:
            seconds_df[column] = pd.Series([float('nan')] * len(seconds_df))

    # Append To List For Concat
    shift_df_list.append(seconds_df)

# Combine DataFrames
final_shift_df = pl.DataFrame(pd.concat(shift_df_list))

# Combine To Create home_1_on, away_1_on, etc.
game_start_end = ['GAME_START', 'PERIOD_START', 'GAME_END', 'PERIOD_END']
stoppages = ['STOPPAGE', 'PENALTY', 'GOAL'] # DELAYED.PENALTY
#stoppage_cond = (pl.col('event_type') == 'FACEOFF') & (pl.col(''))
## Build Max Event_IDX Col for each game, period, second
final_shift_df = (
    final_shift_df
    .filter(~pl.col('event_type').is_in(game_start_end))
    .with_columns([
        pl.col('event_idx').max().over(['game_id', 'period', 'period_seconds']).alias('max_event_idx')
    ])
    .with_columns([
        (pl.col('game_id').cast(pl.Utf8) + '-' + pl.col('period').cast(pl.Utf8) + '-' + pl.col('period_seconds').cast(pl.Utf8)).alias('event_seconds_id'),
        pl.when(pl.col('event_idx') == pl.col('max_event_idx')).then(pl.col('event_type')).otherwise(pl.lit(None)).alias('max_event_type')
    ])
    .with_columns([
        pl.col('event_seconds_id').count().over(['game_id', 'period', 'period_seconds']).alias('count_event_seconds_id')
    ])
    # Fill In Player On Columns
    .with_columns([
        # Home Skaters
        (pl.when((pl.col('home_1_on').is_null()) & (~pl.col('home_shift_on_1_on').is_null())).then(pl.col('home_shift_on_1_on'))
           .when((pl.col('home_1_on').is_null()) & (~pl.col('home_shift_off_1_on').is_null())).then(pl.col('home_shift_off_1_on'))).otherwise(pl.col('home_1_on')).alias('home_1_on'),
        (pl.when((pl.col('home_2_on').is_null()) & (~pl.col('home_shift_on_2_on').is_null())).then(pl.col('home_shift_on_2_on'))
           .when((pl.col('home_2_on').is_null()) & (~pl.col('home_shift_off_2_on').is_null())).then(pl.col('home_shift_off_2_on'))).otherwise(pl.col('home_2_on')).alias('home_2_on'),
        (pl.when((pl.col('home_3_on').is_null()) & (~pl.col('home_shift_on_3_on').is_null())).then(pl.col('home_shift_on_3_on'))
           .when((pl.col('home_3_on').is_null()) & (~pl.col('home_shift_off_3_on').is_null())).then(pl.col('home_shift_off_3_on'))).otherwise(pl.col('home_3_on')).alias('home_3_on'),
        (pl.when((pl.col('home_4_on').is_null()) & (~pl.col('home_shift_on_4_on').is_null())).then(pl.col('home_shift_on_4_on'))
           .when((pl.col('home_4_on').is_null()) & (~pl.col('home_shift_off_4_on').is_null())).then(pl.col('home_shift_off_4_on'))).otherwise(pl.col('home_4_on')).alias('home_4_on'),
        (pl.when((pl.col('home_5_on').is_null()) & (~pl.col('home_shift_on_5_on').is_null())).then(pl.col('home_shift_on_5_on'))
           .when((pl.col('home_5_on').is_null()) & (~pl.col('home_shift_off_5_on').is_null())).then(pl.col('home_shift_off_5_on'))).otherwise(pl.col('home_5_on')).alias('home_5_on'),
        (pl.when((pl.col('home_6_on').is_null()) & (~pl.col('home_shift_on_6_on').is_null())).then(pl.col('home_shift_on_6_on'))
           .when((pl.col('home_6_on').is_null()) & (~pl.col('home_shift_off_6_on').is_null())).then(pl.col('home_shift_off_6_on'))).otherwise(pl.col('home_6_on')).alias('home_6_on'),
        # Away Skaters
        (pl.when((pl.col('away_1_on').is_null()) & (~pl.col('away_shift_on_1_on').is_null())).then(pl.col('away_shift_on_1_on'))
           .when((pl.col('away_1_on').is_null()) & (~pl.col('away_shift_off_1_on').is_null())).then(pl.col('away_shift_off_1_on'))).otherwise(pl.col('away_1_on')).alias('away_1_on'),
        (pl.when((pl.col('away_2_on').is_null()) & (~pl.col('away_shift_on_2_on').is_null())).then(pl.col('away_shift_on_2_on'))
           .when((pl.col('away_2_on').is_null()) & (~pl.col('away_shift_off_2_on').is_null())).then(pl.col('away_shift_off_2_on'))).otherwise(pl.col('away_2_on')).alias('away_2_on'),
        (pl.when((pl.col('away_3_on').is_null()) & (~pl.col('away_shift_on_3_on').is_null())).then(pl.col('away_shift_on_3_on'))
           .when((pl.col('away_3_on').is_null()) & (~pl.col('away_shift_off_3_on').is_null())).then(pl.col('away_shift_off_3_on'))).otherwise(pl.col('away_3_on')).alias('away_3_on'),
        (pl.when((pl.col('away_4_on').is_null()) & (~pl.col('away_shift_on_4_on').is_null())).then(pl.col('away_shift_on_4_on'))
           .when((pl.col('away_4_on').is_null()) & (~pl.col('away_shift_off_4_on').is_null())).then(pl.col('away_shift_off_4_on'))).otherwise(pl.col('away_4_on')).alias('away_4_on'),
        (pl.when((pl.col('away_5_on').is_null()) & (~pl.col('away_shift_on_5_on').is_null())).then(pl.col('away_shift_on_5_on'))
           .when((pl.col('away_5_on').is_null()) & (~pl.col('away_shift_off_5_on').is_null())).then(pl.col('away_shift_off_5_on'))).otherwise(pl.col('away_5_on')).alias('away_5_on'),
        (pl.when((pl.col('away_6_on').is_null()) & (~pl.col('away_shift_on_6_on').is_null())).then(pl.col('away_shift_on_6_on'))
           .when((pl.col('away_6_on').is_null()) & (~pl.col('away_shift_off_6_on').is_null())).then(pl.col('away_shift_off_6_on'))).otherwise(pl.col('away_6_on')).alias('away_6_on'),
        # Goalies
        (pl.when((pl.col('home_goalie_1_on').is_null()) & (~pl.col('home_goalie_on_1_on').is_null())).then(pl.col('home_goalie_on_1_on'))
           .when((pl.col('home_goalie_1_on').is_null()) & (~pl.col('home_goalie_off_1_on').is_null())).then(pl.col('home_goalie_off_1_on'))
           .otherwise(pl.col('home_goalie_1_on'))).alias('home_goalie'),
        (pl.when((pl.col('away_goalie_1_on').is_null()) & (~pl.col('away_goalie_on_1_on').is_null())).then(pl.col('away_goalie_on_1_on'))
           .when((pl.col('away_goalie_1_on').is_null()) & (~pl.col('away_goalie_off_1_on').is_null())).then(pl.col('away_goalie_off_1_on'))
           .otherwise(pl.col('away_goalie_1_on'))).alias('away_goalie')
    ])
    .select([
        'game_id', 'period', 'period_seconds', 'event_idx', 'event_type',
        'home_1_on', 'home_2_on', 'home_3_on', 'home_4_on', 'home_5_on', 'home_6_on', 'home_goalie',
        'away_1_on', 'away_2_on', 'away_3_on', 'away_4_on', 'away_5_on', 'away_6_on', 'away_goalie'
    ])
)

# Display the updated DataFrame
final_shift_df.head()

game_id,period,period_seconds,event_idx,event_type,home_1_on,home_2_on,home_3_on,home_4_on,home_5_on,home_6_on,home_goalie,away_1_on,away_2_on,away_3_on,away_4_on,away_5_on,away_6_on,away_goalie
i64,i64,i64,i32,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2021020001,1,0,9,"""FACEOFF""",8476292.0,8476453.0,8474151.0,8478416.0,8478010.0,,8476883.0,8475810.0,8475208.0,8471724.0,8470604.0,8478046.0,,8477465.0
2021020001,1,18,10,"""HIT""",8476292.0,8476453.0,8474151.0,8478416.0,8478010.0,,8476883.0,8475810.0,8475208.0,8471724.0,8470604.0,8478046.0,,8477465.0
2021020001,1,38,14,"""STOPPAGE""",8473986.0,8474564.0,8480172.0,8478519.0,8475167.0,,8476883.0,8477953.0,8475722.0,8478507.0,8478542.0,8477969.0,,8477465.0
2021020001,1,38,15,"""FACEOFF""",8473986.0,8474564.0,8480172.0,8478519.0,8475167.0,,8476883.0,8477953.0,8475722.0,8478507.0,8478542.0,8477969.0,,8477465.0
2021020001,1,53,16,"""HIT""",8473986.0,8474564.0,8480172.0,8478519.0,8475167.0,,8476883.0,8477953.0,8475722.0,8478507.0,8478542.0,8477969.0,,8477465.0


In [221]:
pl.DataFrame(final_shift_df).groupby(['game_id', 'period', 'period_seconds']).agg([
    pl.col('event_idx').n_unique().alias('dist_events'),
    pl.col('event_type').n_unique().alias('dist_event_types'),
    (pl.when(pl.col('event_type')=='PENALTY').then(pl.lit(0)).otherwise(pl.lit(1))).sum().alias('Non-Penalty-Events')
    ]).sort("dist_events", descending=True).head(5)

#[2022021031, 2021030412, 2022020906, 2022030214, 2023020037]

game_id,period,period_seconds,dist_events,dist_event_types,Non-Penalty-Events
i64,i64,i64,u32,u32,i32
2022021031,3,736,23,3,23
2021030412,3,938,19,3,19
2022020906,2,1035,16,3,16
2022030214,3,1200,15,4,15
2023020037,3,1200,15,3,15


In [224]:
PBP_RAW.filter((pl.col('game_id') == 2022030214) & (pl.col('period') == 3) & (pl.col('period_seconds') == 1200) & (pl.col('event_type') != 'PENALTY'))

game_id,game_date,season,event_idx,season_type,period,period_type,timeRemaining,timeInPeriod,situationCode,homeTeamDefendingSide,event_team_id,away_id,away_abbreviation,away_score,home_id,home_abbreviation,home_score,event_id,typeCode,details.typeCode,event_type,details.descKey,reason,details.secondaryReason,secondary_type,event_zone,x,y,event_goalie_id,servedby_player_id,penalty_minutes,event_team_type,event_team_abbr,period_seconds,period_seconds_remaining,game_seconds,game_seconds_remaining,event_player_1_id,event_player_2_id,event_player_3_id,event_player_4_id,event_player_1_type,event_player_2_type,event_player_3_type,event_player_4_type,away_en,home_en,away_skaters,home_skaters,strength_state,true_strength_state,x_abs,y_abs,event_distance,event_angle
i32,str,i32,i32,str,i32,str,str,str,str,str,str,str,str,f32,str,str,f32,i32,i32,str,str,str,str,str,str,str,f32,f32,str,str,str,str,str,i64,i64,i64,i64,str,str,str,str,str,str,str,str,i32,i32,i32,i32,str,str,f64,f64,f64,f64
2022030214,"""2023-05-10""",20222023,817,"""P""",3,"""REG""","""00:00""","""20:00""","""1560""","""right""","""10.0""","""10""","""TOR""",,"""13""","""FLA""",,758,503,,"""HIT""",,,,,"""D""",-99.0,-10.0,,,,"""away""","""away_abbreviat…",1200,0,3600,2400,"""8476931.0""","""8477933.0""",,,"""Hitter""","""Hittee""",,,1,0,5,6,"""6v5""","""6v5""",-99.0,-10.0,188.26577,3.044778
2022030214,"""2023-05-10""",20222023,842,"""P""",3,"""REG""","""00:00""","""20:00""","""1550""","""right""",,"""10""","""TOR""",,"""13""","""FLA""",,759,521,,"""PERIOD_END""",,,,,,,,,,,"""away""","""away_abbreviat…",1200,0,3600,2400,,,,,,,,,1,0,5,5,"""5v5""","""5v5""",,,,
2022030214,"""2023-05-10""",20222023,858,"""P""",3,"""REG""","""00:00""","""20:00""","""1550""","""right""",,"""10""","""TOR""",,"""13""","""FLA""",,763,524,,"""GAME_END""",,,,,,,,,,,"""away""","""away_abbreviat…",1200,0,3600,2400,,,,,,,,,1,0,5,5,"""5v5""","""5v5""",,,,


In [247]:
final_shift_df.filter(pl.col('event_seconds_id').is_in(final_shift_df.filter((pl.col('count_event_seconds_id') > 1) & (pl.col('max_event_type') != 'FACEOFF'))['event_seconds_id']))

game_id,period,period_seconds,event_idx,event_type,home_stay_on_1_on,home_stay_on_2_on,home_stay_on_3_on,home_stay_on_4_on,home_stay_on_5_on,home_stay_on_6_on,home_shift_on_1_on,home_shift_on_2_on,home_shift_on_3_on,home_shift_on_4_on,home_shift_on_5_on,home_shift_off_1_on,home_shift_off_2_on,home_shift_off_3_on,home_shift_off_4_on,home_shift_off_5_on,away_stay_on_1_on,away_stay_on_2_on,away_stay_on_3_on,away_stay_on_4_on,away_stay_on_5_on,away_stay_on_6_on,away_shift_on_1_on,away_shift_on_2_on,away_shift_on_3_on,away_shift_on_4_on,away_shift_on_5_on,away_shift_on_6_on,away_shift_off_1_on,away_shift_off_2_on,away_shift_off_3_on,away_shift_off_4_on,away_shift_off_5_on,away_shift_off_6_on,home_goalie_1_on,home_goalie_on_1_on,home_goalie_off_1_on,away_goalie_1_on,away_goalie_on_1_on,away_goalie_off_1_on,home_shift_off_6_on,max_event_idx,event_seconds_id,max_event_type,count_event_seconds_id
i64,i64,i64,i32,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32,str,str,u32
2021030412,3,825,791,"""BLOCKED_SHOT""",8480069.0,8476455.0,8471794.0,8478038.0,8477501.0,,,,,,,,,,,,8476292.0,8476453.0,8480172.0,8477426.0,8475167.0,,,,,,,,,,,,,,8475311.0,,,8476883.0,,,,792,"""2021030412-3-8…",,2
2021030412,3,825,792,"""HIT""",8480069.0,8476455.0,8471794.0,8478038.0,8477501.0,,,,,,,,,,,,8476292.0,8476453.0,8480172.0,8477426.0,8475167.0,,,,,,,,,,,,,,8475311.0,,,8476883.0,,,,792,"""2021030412-3-8…","""HIT""",2
2022020906,2,397,393,"""SHOT""",8476483.0,8475208.0,8471724.0,8471675.0,8477404.0,,,,,,,,,,,,8477506.0,8475314.0,8476917.0,8474709.0,8477500.0,,,,,,,,,,,,,,8477465.0,,,8478009.0,,,,394,"""2022020906-2-3…",,2
2022020906,2,397,394,"""HIT""",8476483.0,8475208.0,8471724.0,8471675.0,8477404.0,,,,,,,,,,,,8477506.0,8475314.0,8476917.0,8474709.0,8477500.0,,,,,,,,,,,,,,8477465.0,,,8478009.0,,,,394,"""2022020906-2-3…","""HIT""",2
2022021031,1,41,17,"""HIT""",8479525.0,8479661.0,8477426.0,8479410.0,8480246.0,,,,,,,,,,,,8476448.0,8478020.0,8477447.0,8481032.0,8475188.0,,,,,,,,,,,,,,8476883.0,,,8471734.0,,,,18,"""2022021031-1-4…",,2
2022021031,1,41,18,"""BLOCKED_SHOT""",8479525.0,8479661.0,8477426.0,8479410.0,8480246.0,,,,,,,,,,,,8476448.0,8478020.0,8477447.0,8481032.0,8475188.0,,,,,,,,,,,,,,8476883.0,,,8471734.0,,,,18,"""2022021031-1-4…","""BLOCKED_SHOT""",2
2022030214,2,469,398,"""GIVEAWAY""",8477409.0,8478055.0,8477932.0,8477493.0,8477407.0,,,,,,,,,,,,8475714.0,8475718.0,8474673.0,8478483.0,8479318.0,,,,,,,,,,,,,,8475683.0,,,8479361.0,,,,399,"""2022030214-2-4…",,2
2022030214,2,469,399,"""HIT""",8477409.0,8478055.0,8477932.0,8477493.0,8477407.0,,,,,,,,,,,,8475714.0,8475718.0,8474673.0,8478483.0,8479318.0,,,,,,,,,,,,,,8475683.0,,,8479361.0,,,,399,"""2022030214-2-4…","""HIT""",2
2022030214,2,769,451,"""DELAYED_PENALT…",8482113.0,8475462.0,8479372.0,8477933.0,8480185.0,,,,,,,,,,,,8475718.0,8478569.0,8480144.0,8474673.0,8478043.0,,,,,,,,,,,,,,8475683.0,,,8479361.0,,,,452,"""2022030214-2-7…",,2
2022030214,2,769,452,"""SHOT""",8482113.0,8475462.0,8479372.0,8477933.0,8480185.0,,,,,,,,,,,,8475718.0,8478569.0,8480144.0,8474673.0,8478043.0,,,,,,,,,,,,,,8475683.0,,,8479361.0,,,,452,"""2022030214-2-7…","""SHOT""",2


In [None]:
save_final = final_shift_df

In [46]:
#final_shift_df = pl.DataFrame(final_shift_df)
player_id_cols = final_shift_df.columns[-39:]
final_shift_df = (
    final_shift_df
    .with_columns(
    pl.col('game_id').cast(pl.Int32),
    pl.col('period').cast(pl.Int32),
    pl.col('event_idx').cast(pl.Int32)
    )
    .with_columns(
        *[pl.col(column).cast(pl.Utf8) if column in player_id_cols else pl.col(column) for column in final_shift_df.columns]
    )
)

FINAL_PBP = PBP_RAW.join(final_shift_df, on = ["game_id", "period", "period_seconds", "event_idx"], how = 'left').sort('game_id', 'period', 'event_idx')

In [47]:
FINAL_PBP.head()

game_id,game_date,season,event_idx,season_type,period,period_type,timeRemaining,timeInPeriod,situationCode,homeTeamDefendingSide,event_team_id,away_id,away_abbreviation,away_score,home_id,home_abbreviation,home_score,event_id,typeCode,details.typeCode,event_type,details.descKey,reason,details.secondaryReason,secondary_type,event_zone,x,y,event_goalie_id,servedby_player_id,penalty_minutes,event_team_type,event_team_abbr,period_seconds,period_seconds_remaining,game_seconds,game_seconds_remaining,event_player_1_id,event_player_2_id,event_player_3_id,event_player_4_id,event_player_1_type,event_player_2_type,event_player_3_type,event_player_4_type,away_en,home_en,away_skaters,home_skaters,strength_state,true_strength_state,x_abs,y_abs,event_distance,event_angle,home_1_on,home_2_on,home_3_on,home_4_on,home_5_on,home_6_on,away_1_on,away_2_on,away_3_on,away_4_on,away_5_on,home_goalie_1_on,away_goalie_1_on,away_6_on,away_7_on,away_8_on,home_7_on,home_8_on,home_9_on,home_10_on,away_9_on,away_10_on,away_11_on,away_12_on,home_goalie_2_on,away_goalie_2_on,home_goalie_3_on,away_13_on,away_14_on,away_15_on,away_16_on,away_17_on,away_18_on,away_19_on,away_20_on,away_goalie_3_on,away_goalie_4_on,home_11_on,home_12_on
i32,str,i32,i32,str,i32,str,str,str,str,str,str,str,str,f32,str,str,f32,i32,i32,str,str,str,str,str,str,str,f32,f32,str,str,str,str,str,i64,i64,i64,i64,str,str,str,str,str,str,str,str,i32,i32,i32,i32,str,str,f64,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
2021020001,"""2021-10-12""",20212022,8,"""R""",1,"""REG""","""20:00""","""00:00""","""1551""","""left""",,"""5""","""PIT""",,"""14""","""TBL""",,51,520,,"""PERIOD_START""",,,,,,,,,,,"""away""","""away_abbreviat…",0,1200,0,1200,,,,,,,,,1,1,5,5,"""5v5""","""5v5""",,,,,"""8478416.0""","""8476453.0""","""8476292.0""","""8478010.0""","""8474151.0""",,"""8475208.0""","""8471724.0""","""8475810.0""","""8470604.0""","""8478046.0""","""8476883.0""","""8477465.0""",,,,,,,,,,,,,,,,,,,,,,,,,,
2021020001,"""2021-10-12""",20212022,9,"""R""",1,"""REG""","""20:00""","""00:00""","""1551""","""left""","""5.0""","""5""","""PIT""",,"""14""","""TBL""",,52,502,,"""FACEOFF""",,,,,"""N""",0.0,0.0,,,,"""away""","""away_abbreviat…",0,1200,0,1200,"""8470604.0""","""8478010.0""",,,"""Winner""","""Loser""",,,1,1,5,5,"""5v5""","""5v5""",-0.0,-0.0,89.0,0.0,"""8478416.0""","""8476453.0""","""8476292.0""","""8478010.0""","""8474151.0""",,"""8475208.0""","""8471724.0""","""8475810.0""","""8470604.0""","""8478046.0""","""8476883.0""","""8477465.0""",,,,,,,,,,,,,,,,,,,,,,,,,,
2021020001,"""2021-10-12""",20212022,10,"""R""",1,"""REG""","""19:42""","""00:18""","""1551""","""left""","""14.0""","""5""","""PIT""",,"""14""","""TBL""",,8,503,,"""HIT""",,,,,"""O""",46.0,40.0,,,,"""away""","""away_abbreviat…",18,1182,18,1182,"""8476292.0""","""8470604.0""",,,"""Hitter""","""Hittee""",,,1,1,5,5,"""5v5""","""5v5""",-46.0,-40.0,140.801278,16.504361,"""8478416.0""","""8476453.0""","""8476292.0""","""8478010.0""","""8474151.0""",,"""8475208.0""","""8471724.0""","""8475810.0""","""8470604.0""","""8478046.0""","""8476883.0""","""8477465.0""",,,,,,,,,,,,,,,,,,,,,,,,,,
2021020001,"""2021-10-12""",20212022,14,"""R""",1,"""REG""","""19:22""","""00:38""","""1551""","""left""",,"""5""","""PIT""",,"""14""","""TBL""",,9,516,,"""STOPPAGE""",,"""puck-in-nettin…",,,,,,,,,"""away""","""away_abbreviat…",38,1162,38,1162,,,,,,,,,1,1,5,5,"""5v5""","""5v5""",,,,,"""8480172.0""","""8475167.0""","""8473986.0""","""8474564.0""","""8478519.0""",,"""8477969.0""","""8478507.0""","""8475722.0""","""8478542.0""","""8477953.0""","""8476883.0""","""8477465.0""",,,,,,,,,,,,,,,,,,,,,,,,,,
2021020001,"""2021-10-12""",20212022,15,"""R""",1,"""REG""","""19:22""","""00:38""","""1551""","""left""","""14.0""","""5""","""PIT""",,"""14""","""TBL""",,53,502,,"""FACEOFF""",,,,,"""N""",0.0,0.0,,,,"""away""","""away_abbreviat…",38,1162,38,1162,"""8478519.0""","""8478542.0""",,,"""Winner""","""Loser""",,,1,1,5,5,"""5v5""","""5v5""",-0.0,-0.0,89.0,0.0,"""8480172.0""","""8475167.0""","""8473986.0""","""8474564.0""","""8478519.0""",,"""8477969.0""","""8478507.0""","""8475722.0""","""8478542.0""","""8477953.0""","""8476883.0""","""8477465.0""",,,,,,,,,,,,,,,,,,,,,,,,,,


In [179]:
def peek_shift_raw(id, period, secs):
    shift_link = "https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId="+str(id)
    shift_response = requests.get(shift_link)
    shift_raw = pd.json_normalize(shift_response.json())['data']
    normalized_shift = pd.concat([pd.json_normalize(item) for sublist in shift_raw for item in sublist], ignore_index=True)

    # Create Columns From Data (Names and Shift Starts)
    normalized_shift['player_name'] = normalized_shift['firstName'] + ' ' + normalized_shift['lastName']
    # Period Time
    normalized_shift['period_start_seconds'] = pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.second
    normalized_shift['period_end_seconds'] = pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.second
    # Game Time
    normalized_shift['game_start_seconds'] = ( pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.second) + ((normalized_shift['period'] - 1) * 1200)
    normalized_shift['game_end_seconds'] = ( pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.second) + ((normalized_shift['period'] - 1) * 1200)

    # Rename
    normalized_shift = normalized_shift.rename(columns = {
        'gameId': 'game_id',
        'id': 'shift_id',
        'playerId': 'player_id',
        'teamId': 'team_id',
        'shiftNumber': 'shift_number',
        'teamAbbrev': 'team_abbr'
    })

    # Keep
    shift_keep_cols = ['game_id', 'team_id', 'player_id', 'player_name', 'period',
                       'period_start_seconds', 'period_end_seconds', 'game_start_seconds', 'game_end_seconds',
                       'eventNumber', 'team_abbr', 'typeCode']
    normalized_shift = normalized_shift[shift_keep_cols]
    normalized_shift = pl.DataFrame(normalized_shift)

    # Join To Get Home Team
    game_info_slim = (
        PBP_RAW
        .select('game_id', 'home_id', 'away_id', 'game_seconds', 'period_seconds', 'event_id', 'event_idx', 'event_type')
        .with_columns([
            (pl.col('game_id').cast(pl.Int64).alias('game_id')),
            (pl.col('home_id').cast(pl.Int32).alias('home_id')),
            (pl.col('away_id').cast(pl.Int32).alias('away_id')),
            (pl.col('game_seconds').cast(pl.Int64).alias('game_seconds')),
            (pl.col('event_id').cast(pl.Int64).alias('event_id')),
            (pl.col('event_idx').cast(pl.Int32).alias('event_idx'))
        ])
    )

    # Clean Joined Table To PBP: 1) Get Home/Away Team, 2) Filter For Only Teams in Game, 3) Filter Shifts of 0 Seconds, 4) Keep only Distinct Values
    normalized_shift = (
        normalized_shift
        .join(game_info_slim.select('game_id', 'home_id', 'away_id').unique(), on='game_id', how='left')
        .filter((pl.col('home_id') == pl.col('team_id')) | (pl.col('away_id') == pl.col('team_id')))
        .filter(pl.col('game_start_seconds') != pl.col('game_end_seconds') )
        .with_columns(pl.when(pl.col('home_id') == pl.col('team_id')).then(pl.lit('home'))
                        .when(pl.col('away_id') == pl.col('team_id')).then(pl.lit('away')).otherwise(pl.lit(None)).alias('team_type'))
        .drop('home_id', 'away_id')
        .unique()
    )

    # Combine Consecutive Shifts
    gb_cols = [col for col in normalized_shift.columns if col not in ['period_start_seconds', 'game_start_seconds', 'eventNumber']]
    normalized_shift = (
        normalized_shift
        .with_columns([
            pl.col('period_start_seconds').max().over(gb_cols).alias('period_start_seconds'),
            pl.col('game_start_seconds').max().over(gb_cols).alias('game_start_seconds'),
            pl.col('eventNumber').max().over(gb_cols).alias('eventNumber')
        ])
        .unique()
    )

    # Filter By Seconds
    return_df = (
        normalized_shift
        .filter(
            (pl.col('period') == period) &
            (((secs == pl.col('period_start_seconds')) | (secs == pl.col('period_end_seconds'))) | ((pl.col('period_start_seconds') < secs) & (pl.col('period_end_seconds') > secs)))
        )
        .with_columns(
            (pl.col('period_end_seconds') - pl.col('period_start_seconds')).alias('shift_length'),
            pl.when(pl.col('period_start_seconds') == secs).then(pl.lit('SHIFT_ON'))
              .when(pl.col('period_end_seconds') == secs).then(pl.lit('SHIFT_OFF'))
              .when((pl.col('period_start_seconds') < secs) & (pl.col('period_end_seconds') > secs)).then(pl.lit('ON_ICE'))
              .otherwise(pl.lit('OTHER')).alias('on_off')
        )
    )

    return normalized_shift

In [181]:
peek_cols = ['game_id', 'season', 'event_idx', 'period', 'period_seconds', 'timeInPeriod', 'situationCode', 'event_type']
player_id_cols = FINAL_PBP.columns[-39:]
peek_df = FINAL_PBP.select(peek_cols + player_id_cols).filter(~pl.col('away_goalie_4_on').is_null())

print(peek_df.height)
peek_df['game_id'].value_counts()

286


game_id,counts
i32,u32
2021020513,280
2023020327,6


In [185]:
#peek_dupe_shift_df = peek_shift_raw(2021020012, period = 1, secs = 837)
#
#peek_dupe_shift_df = (
#    peek_dupe_shift_df
#    .sort(['player_id', 'period', 'period_start_seconds'])
#    .filter((pl.col('game_end_seconds') == pl.col('game_end_seconds').shift()) | (pl.col('game_end_seconds') == pl.col('game_end_seconds').shift(-1)))
#)
#peek_dupe_shift_df

#peek_dupe_shift_df.filter(pl.col('eventNumber').is_in([303, 304])).sort('eventNumber')


peek_shift_raw(2023020327, period = 1, secs=0).filter(pl.col('player_id') == 8478009).sort('period_start_seconds')
#peek_shift_raw(2021020012, period = 1, secs = 840).filter((pl.col('team_id') == 2)).sort(['period', 'period_start_seconds'], descending=False).drop('typeCode')
#.sort(['player_id', 'period', 'period_start_seconds']).filter(pl.col('game_end_seconds') == pl.col('game_end_seconds').shift()).sort(['period', 'period_start_seconds'], descending=False)

game_id,team_id,player_id,player_name,period,period_start_seconds,period_end_seconds,game_start_seconds,game_end_seconds,eventNumber,team_abbr,typeCode,team_type
i64,i64,i64,str,i64,i32,i32,i64,i64,i64,str,i64,str
2023020327,2,8478009,"""Ilya Sorokin""",1,0,999,0,999,7,"""NYI""",517,"""away"""
2023020327,2,8478009,"""Ilya Sorokin""",3,0,1177,2400,3577,856,"""NYI""",517,"""away"""
2023020327,2,8478009,"""Ilya Sorokin""",2,0,1200,1200,2400,557,"""NYI""",517,"""away"""
2023020327,2,8478009,"""Ilya Sorokin""",1,259,421,259,421,207,"""NYI""",517,"""away"""
2023020327,2,8478009,"""Ilya Sorokin""",1,455,530,455,530,244,"""NYI""",517,"""away"""
2023020327,2,8478009,"""Ilya Sorokin""",1,551,616,551,616,260,"""NYI""",517,"""away"""
2023020327,2,8478009,"""Ilya Sorokin""",1,617,1200,617,1200,374,"""NYI""",517,"""away"""


In [24]:
(((PBP_RAW['game_id'].n_unique())*1.4)/60)/60

5.489944444444444

In [56]:
# Load Game ID and Home/Away Ids
def append_shift_data(data):
    """This function will take the game ID from the RAW Play by Play Data and create columns for players on ice during given events."""
    shift_df_list = []
    for i in data['game_id'].unique():
        shift_link = "https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId="+str(i)
        shift_response = requests.get(shift_link)
        shift_raw = pd.json_normalize(shift_response.json())['data']
        normalized_shift = pd.concat([pd.json_normalize(item) for sublist in shift_raw for item in sublist], ignore_index=True)

        # Create Columns From Data (Names and Shift Starts)
        normalized_shift['player_name'] = normalized_shift['firstName'] + ' ' + normalized_shift['lastName']
        # Period Time
        normalized_shift['period_start_seconds'] = pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.second
        normalized_shift['period_end_seconds'] = pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.second
        # Game Time
        normalized_shift['game_start_seconds'] = ( pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.second) + ((normalized_shift['period'] - 1) * 1200)
        normalized_shift['game_end_seconds'] = ( pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.second) + ((normalized_shift['period'] - 1) * 1200)

        # Rename
        normalized_shift = normalized_shift.rename(columns = {
            'gameId': 'game_id',
            'id': 'shift_id',
            'playerId': 'player_id',
            'teamId': 'team_id',
            'shiftNumber': 'shift_number',
            'teamAbbrev': 'team_abbr'
        })

        # Keep
        shift_keep_cols = ['game_id', 'shift_id', 'team_id', 'player_id', 'player_name', 'period',
                           'period_start_seconds', 'period_end_seconds', 'game_start_seconds', 'game_end_seconds',
                           'eventNumber', 'team_abbr', 'shift_number', 'typeCode']
        
        normalized_shift = normalized_shift[shift_keep_cols]
        #normalized_shift = pl.DataFrame(normalized_shift)

        # Join To Get Home Team + Drop Duplicates
        game_info_slim = (
            data
            .select('game_id', 'home_id', 'away_id', 'game_seconds', 'period_seconds', 'event_id', 'event_idx', 'event_type')
            .with_columns([
                (pl.col('game_id').cast(pl.Int64).alias('game_id')),
                (pl.col('home_id').cast(pl.Int64).alias('home_id')),
                (pl.col('away_id').cast(pl.Int64).alias('away_id'))
            ])
        ).unique().to_pandas()


        normalized_shift = pd.merge(normalized_shift, game_info_slim, on='game_id', how='left')
        normalized_shift['team_type'] = np.where(normalized_shift['home_id'] == normalized_shift['team_id'], 'home', 'away')
        normalized_shift = normalized_shift.drop(['home_id', 'away_id'], axis=1).drop_duplicates()

        # Join To Separate Goalies
        roster = ROSTER_DF.to_pandas()
        roster['player_id'] = roster['player_id'].astype(int)
        roster['pos_G'] = roster['pos_G'].astype(int)

        normalized_shift = pd.merge(normalized_shift, roster[['player_id', 'pos_G']].drop_duplicates(), on='player_id', how='left')
        normalized_shift = normalized_shift.drop_duplicates()


        # Group by 'player_id'
        grouped_shifts = normalized_shift[['player_id', 'game_id', 'period', 'team_type', 'pos_G', 'period_start_seconds', 'period_end_seconds']].groupby(['game_id', 'period', 'period_start_seconds', 'period_end_seconds', 'team_type', 'pos_G'])

        # Aggregate using the agg method
        result_df = grouped_shifts.agg(
            player_id_list=('player_id', list),
        ).reset_index()

        seconds_df = data.select(
            pl.col('game_id').cast(pl.Int64),
            pl.col('period').cast(pl.Int64),
            pl.col('period_seconds').cast(pl.Int64),
            'event_idx'
            ).filter(pl.col('game_id') == i).to_pandas()


        # Function to get player lists based on conditions
        def get_player_lists(row, type, pos):
            p_secs = row['period_seconds']
            per = row['period']

            # Adjust conditions as needed
            condition = (
                (result_df['period'] == per) &
                (result_df['period_start_seconds'] <= p_secs) &
                (result_df['period_end_seconds'] > p_secs) &
                (result_df['team_type'] == type) &
                (result_df['pos_G'] == pos)
            )

            selected_rows = result_df[condition]

            if not selected_rows.empty:
                # Combine player lists
                combined_players = [player_id for player_list in selected_rows['player_id_list'] for player_id in player_list]
                return combined_players
            else:
                return []

        # Apply the function to each row of seconds_df
        seconds_df['home'] = seconds_df.apply(get_player_lists,type='home', pos=0, axis=1)
        seconds_df['away'] = seconds_df.apply(get_player_lists,type='away', pos=0, axis=1)
        seconds_df['home_goalie'] = seconds_df.apply(get_player_lists, type='home', pos=1, axis=1)
        seconds_df['away_goalie'] = seconds_df.apply(get_player_lists, type='away', pos=1, axis=1)

        # Columns with lists of player IDs
        list_columns = ["home", "away", "home_goalie", "away_goalie"]

        # Iterate over each list column
        for column in list_columns:
            # Create a DataFrame with all new columns
            new_columns_df = pd.DataFrame(seconds_df[column].tolist(), index=seconds_df.index)
            # Rename the new columns
            new_columns_df.columns = [f"{column}_{i+1}_on" for i in range(new_columns_df.shape[1])]
            # Concatenate the new columns to the original DataFrame
            seconds_df = pd.concat([seconds_df, new_columns_df], axis=1)

        # Drop the original list columns
        seconds_df = seconds_df.drop(list_columns, axis=1)

        # Append to DF List
        shift_df_list.append(seconds_df)

    # Combine DataFrames
    final_shift_df = pd.concat(shift_df_list)
    return final_shift_df


In [None]:
SHIFT_23 = append_shift_data(PBP_RAW.filter(pl.col('season') == 20232024))

In [54]:
# Load Game ID and Home/Away Ids
def load_raw_shift_data(data):
    """This function will take the game ID from the RAW Play by Play Data and create columns for players on ice during given events."""
    raw_shift_df_list = []
    for i in data['game_id'].unique():
        shift_link = "https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId="+str(i)
        shift_response = requests.get(shift_link)
        shift_raw = pd.json_normalize(shift_response.json())['data']
        normalized_shift = pd.concat([pd.json_normalize(item) for sublist in shift_raw for item in sublist], ignore_index=True)

        # Create Columns From Data (Names and Shift Starts)
        normalized_shift['player_name'] = normalized_shift['firstName'] + ' ' + normalized_shift['lastName']
        # Period Time
        normalized_shift['period_start_seconds'] = pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.second
        normalized_shift['period_end_seconds'] = pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.second
        # Game Time
        normalized_shift['game_start_seconds'] = ( pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.second) + ((normalized_shift['period'] - 1) * 1200)
        normalized_shift['game_end_seconds'] = ( pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.second) + ((normalized_shift['period'] - 1) * 1200)

        # Rename
        normalized_shift = normalized_shift.rename(columns = {
            'gameId': 'game_id',
            'id': 'shift_id',
            'playerId': 'player_id',
            'teamId': 'team_id',
            'shiftNumber': 'shift_number',
            'teamAbbrev': 'team_abbr'
        })

        # Append to DF List
        raw_shift_df_list.append(normalized_shift)

    # Combine DataFrames
    final_raw_shift_df = pd.concat(raw_shift_df_list)

    return final_raw_shift_df

In [55]:
SHIFT_23 = load_raw_shift_data(PBP_RAW.filter(pl.col('season') == 20232024))

In [77]:
SHIFT_23.head()

Unnamed: 0,shift_id,detailCode,duration,endTime,eventDescription,eventDetails,eventNumber,firstName,game_id,hexValue,lastName,period,player_id,shift_number,startTime,team_abbr,team_id,teamName,typeCode,player_name,period_start_seconds,period_end_seconds,game_start_seconds,game_end_seconds
0,13244643,0,00:30,00:30,,,7,Ryan,2023020001,#FFB81C,McDonagh,1,8474151,1,00:00,NSH,18,Nashville Predators,517,Ryan McDonagh,0,30,0,30
1,13244644,0,00:51,03:02,,,74,Ryan,2023020001,#FFB81C,McDonagh,1,8474151,2,02:11,NSH,18,Nashville Predators,517,Ryan McDonagh,131,182,131,182
2,13244645,0,00:44,04:34,,,92,Ryan,2023020001,#FFB81C,McDonagh,1,8474151,3,03:50,NSH,18,Nashville Predators,517,Ryan McDonagh,230,274,230,274
3,13244646,0,00:47,06:45,,,222,Ryan,2023020001,#FFB81C,McDonagh,1,8474151,4,05:58,NSH,18,Nashville Predators,517,Ryan McDonagh,358,405,358,405
4,13244647,0,00:41,09:00,,,250,Ryan,2023020001,#FFB81C,McDonagh,1,8474151,5,08:19,NSH,18,Nashville Predators,517,Ryan McDonagh,499,540,499,540


In [127]:
def clean_shift_data(data):
    """ This function will take the raw shift data loaded using load_raw_shift_data and clean the columns to create everything we need to append to our PBP_RAW Table"""

    df = data
    # Create Static Table For Games (add Home+Away) and Rosters
    game_info_slim = (
        PBP_RAW
        .filter(pl.col('season') == 20232024)
        .select((pl.col('game_id').cast(pl.Int64)),
            (pl.col('home_id').cast(pl.Int64)),
            (pl.col('away_id').cast(pl.Int64)),
            (pl.col('period').cast(pl.Int64)),
            (pl.col('period_seconds').cast(pl.Int64).alias('period_seconds')),
            'event_idx'
            )
    ).to_pandas()

    # Join To Separate Goalies
    roster = ROSTER_DF.to_pandas().drop_duplicates()
    roster['player_id'] = roster['player_id'].astype(int)
    roster['pos_G'] = roster['pos_G'].astype(int)
    roster = roster[['player_id', 'pos_G']]

    # Join Static Tables To Shift Data (df)
    df = pd.merge(df, game_info_slim.drop(['period', 'period_seconds', 'event_idx'], axis = 1).drop_duplicates(), on='game_id', how='left')
    df['team_type'] = np.where(df['home_id'] == df['team_id'], 'home', 'away')
    df = df.drop(['home_id', 'away_id'], axis=1).drop_duplicates()
    
    
    df = pd.merge(df, roster, on='player_id', how='left')
    df = df.drop_duplicates()

    # Group by 'player_id'
    grouped_shifts = df[['player_id', 'game_id', 'period', 'team_type', 'pos_G', 'period_start_seconds', 'period_end_seconds']].groupby(['game_id', 'period', 'period_start_seconds', 'period_end_seconds', 'team_type', 'pos_G'])
    # Aggregate using the agg method
    result_df = grouped_shifts.agg(
        player_id_list=('player_id', list),
    ).reset_index()

    seconds_df = game_info_slim[['game_id', 'period', 'period_seconds', 'event_idx']]
    seconds_df = seconds_df[seconds_df['game_id'].isin(result_df['game_id'].unique())]

    print("Static Tables Created - Moving on To Row Conditions")

    
    
    # Function to get player lists based on conditions
    def get_player_lists(row, type, pos, gm_id):
        p_secs = row['period_seconds']
        per = row['period']

        # Adjust conditions as needed
        result_slim = result_df[result_df['game_id'] == gm_id]
        condition = (
            (result_slim['period'] == per) &
            (result_slim['period_start_seconds'] <= p_secs) &
            (result_slim['period_end_seconds'] > p_secs) &
            (result_slim['team_type'] == type) &
            (result_slim['pos_G'] == pos)
        )

        selected_rows = result_slim[condition]

        if not selected_rows.empty:
            # Combine player lists
            combined_players = [player_id for player_list in selected_rows['player_id_list'] for player_id in player_list]
            return combined_players
        else:
            return []
        
    concat_list = []
    for i in seconds_df['game_id'].unique():
        df = seconds_df[seconds_df['game_id'] == i].copy()
        print(i)
        

        # Apply the function to each row of seconds_df
        df['home'] = df.apply(lambda row: get_player_lists(row, type='home', pos=0, gm_id=i), axis=1)
        #print("Home Skaters Complete")
        df['away'] = df.apply(lambda row: get_player_lists(row, type='away', pos=0, gm_id=i), axis=1)
        #print("Away Skaters Complete")
        df['home_goalie'] = df.apply(lambda row: get_player_lists(row, type='home', pos=1, gm_id=i), axis=1)
        #print("home Goalies Complete")
        df['away_goalie'] = df.apply(lambda row: get_player_lists(row, type='away', pos=1, gm_id=i), axis=1)
        #print("Away Goalies Complete")

        # Columns with lists of player IDs
        list_columns = ["home", "away", "home_goalie", "away_goalie"]

        # Iterate over each list column
        for column in list_columns:
            # Iterate over each row
            for index, row in df.iterrows():
                # Extract the list of player IDs
                player_ids = row[column]

                # Iterate over the player IDs in the list
                for i, player_id in enumerate(player_ids):
                    # Create a new column name
                    new_column_name = f"{column}_{i+1}_on"

                    # Create a new column in the DataFrame
                    df.at[index, new_column_name] = player_id

        # Drop the original list columns
        df = df.drop(list_columns, axis=1)
        # Append to DF List
        concat_list.append(df)
    # Combine DataFrames
    concat_df = pd.concat(concat_list)
    return concat_df


In [131]:
CLEAN_SHIFT = clean_shift_data(load_raw_shift_data(PBP_RAW))

ValueError: No objects to concatenate