In [1]:
# Pandas
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np

# Polars (Arrow)
from pyarrow.dataset import dataset
import polars as pl
pl.Config.set_tbl_rows(n=-1)
pl.Config.set_tbl_cols(n=-1)

# Hit API
import requests

# Tools
from itertools import chain
from datetime import datetime, timedelta
from math import pi

# Save
import pickle
import json
import os
import pathlib

# System Stats
import psutil

In [2]:
def get_mem_stats(n=5):
    """Print system-wide memory usage followed by the `n` largest processes by resident memory.

    Parameters
    ----------
    n : int, default 5
        Number of processes to list, largest resident set size (RSS) first.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    bytes_per_gb = 1024 ** 3
    bytes_per_mb = 1024 ** 2

    # Take one snapshot so the three figures describe the same instant
    vm = psutil.virtual_memory()
    print(f"Total Memory: {vm.total / bytes_per_gb:.2f} GB")
    print(f"Used Memory: {vm.used / bytes_per_gb:.2f} GB")
    print(f"Free Memory: {vm.free / bytes_per_gb:.2f} GB")

    # psutil fills `info` entries with None for processes it may not inspect
    # (e.g. protected system processes), so skip those before reading `.rss`.
    processes = [
        proc
        for proc in psutil.process_iter(['pid', 'name', 'memory_info'])
        if proc.info['memory_info'] is not None
    ]

    # Sort processes by memory usage, largest first
    processes.sort(key=lambda proc: proc.info['memory_info'].rss, reverse=True)

    # Print information about the top processes
    for proc in processes[:n]:
        rss_mb = proc.info['memory_info'].rss / bytes_per_mb
        print(f"PID: {proc.info['pid']}, Name: {proc.info['name']}, Memory Usage: {rss_mb:.2f} MB")

get_mem_stats()


Total Memory: 15.73 GB
Used Memory: 8.93 GB
Free Memory: 6.80 GB
PID: 4652, Name: MemCompression, Memory Usage: 653.34 MB
PID: 24288, Name: Code.exe, Memory Usage: 302.16 MB
PID: 4044, Name: explorer.exe, Memory Usage: 287.41 MB
PID: 6368, Name: SearchHost.exe, Memory Usage: 279.33 MB
PID: 21616, Name: Code.exe, Memory Usage: 265.54 MB


## Load Most Current Rosters

In [3]:
# Path to the roster extract (relative to the notebook's working directory)
roster_file = 'Data/NHL_Rosters_2014_2024.csv'

# All Players - Connect To event_player_1_id, event_player_2_id, event_player_3_id, event_player_4_id, event_goalie_id, home_goalie, away_goalie
ROSTER_DF_RAW = pl.read_csv(roster_file)

# Slim lookup table: one row per distinct combination of the selected columns
ROSTER_DF = (
    ROSTER_DF_RAW
    .with_columns([
        # Narrow the id to 32-bit
        pl.col("player_id").cast(pl.Int32),
        # player_name is built as FIRST.LAST, fully upper-cased
        (pl.col("first_name").str.to_uppercase() + '.' + pl.col("last_name").str.to_uppercase()).alias('player_name'),
        # Goalie-only handedness flags: 1 only when the row is a goalie AND has that hand flag set, else 0
        pl.when((pl.col('pos_G') == 1) & (pl.col('hand_R') == 1)).then(pl.lit(1)).otherwise(pl.lit(0)).alias('G_hand_R'),
        pl.when((pl.col('pos_G') == 1) & (pl.col('hand_L') == 1)).then(pl.lit(1)).otherwise(pl.lit(0)).alias('G_hand_L')
        ])
    .select(['player_id', 'player_name', 'hand_R', 'hand_L', 'pos_F', 'pos_D', 'pos_G', 'G_hand_R', 'G_hand_L'])
    # Drop exact duplicate rows; a player_id can still appear more than once if any selected column differs
    .unique()
)

# Preview the raw file (not the slimmed table)
ROSTER_DF_RAW.head()

player_id,first_name,last_name,pos_F,pos_D,pos_G,hand_R,hand_L
i64,str,str,i64,i64,i64,i64,i64
8473492,"""Matt""","""Beleskey""",1,0,0,0,1
8474009,"""Nick""","""Bonino""",1,0,0,0,1
8471699,"""Andrew""","""Cogliano""",1,0,0,0,1
8462041,"""Radek""","""Dvorak""",1,0,0,1,0
8475770,"""Emerson""","""Etem""",1,0,0,0,1


# Get Game IDs From Previous Seasons

- Saves two lists (Current Game IDs and All Game IDs)

In [4]:
def align_and_cast_columns(data, sch):
    """Force `data` onto the column set, order and dtypes described by `sch`.

    Columns not named in `sch` are dropped, columns named in `sch` but absent
    from `data` are added as all-null columns, and every column with a
    recognised type code is cast to the matching polars dtype.

    Parameters
    ----------
    data : polars frame
        Frame to align (only `.columns`, `.drop`, `.with_columns` and `.select` are used).
    sch : dict[str, str]
        Maps column name -> type code. Recognised codes are 'str' (Utf8),
        'i32' (Int32) and 'f32' (Float32). The dict's key order becomes the
        output column order.

    Returns
    -------
    polars frame
        Frame holding exactly the columns of `sch`, in that order.

    Raises
    ------
    ValueError
        If a column is missing from `data` and its type code is not recognised,
        because there is no dtype to create the placeholder column with.
    """
    dtype_by_code = {
        'str': pl.Utf8,
        'i32': pl.Int32,
        'f32': pl.Float32,
    }

    # Drop anything the schema does not know about
    extra_cols = [col for col in data.columns if col not in sch]
    if extra_cols:
        data = data.drop(extra_cols)

    present = set(data.columns)
    exprs = []
    for col, col_type in sch.items():
        dtype = dtype_by_code.get(col_type)

        if col in present:
            # Existing column: cast when the code is known, otherwise leave it untouched
            if dtype is not None:
                exprs.append(pl.col(col).cast(dtype).alias(col))
        elif dtype is not None:
            # Missing column: add a typed all-null placeholder (covers 'i32' as well as 'str'/'f32')
            exprs.append(pl.lit(None).cast(dtype).alias(col))
        else:
            raise ValueError(
                f"Column '{col}' is missing and its schema type '{col_type}' is not one of {sorted(dtype_by_code)}"
            )

    if exprs:
        data = data.with_columns(exprs)

    # Select columns in schema order
    return data.select(list(sch.keys()))

# Master Schema For Union
# Maps each raw play-by-play column name to the type code understood by
# align_and_cast_columns ('str', 'i32' or 'f32'). Key order is the column order
# that function emits.
# Team and player id columns are declared as 'str' here, not as integers.
# NOTE(review): the dotted names look like flattened nested JSON keys from the API
# response — confirm against the loader that builds the raw frame.
raw_schema = {
    'id': 'i32',
    'gameDate': 'str',
    'season': 'i32',
    'sortOrder': 'i32',
    'gameType': 'i32',
    'period': 'i32',
    'periodDescriptor.periodType': 'str',
    'timeRemaining': 'str',
    'timeInPeriod': 'str',
    'situationCode': 'str',
    'homeTeamDefendingSide': 'str',
    'details.eventOwnerTeamId': 'str',
    'awayTeam.id': 'str',
    'awayTeam.abbrev': 'str',
    'details.awayScore': 'f32',
    'homeTeam.id': 'str',
    'homeTeam.abbrev': 'str',
    'details.homeScore': 'f32',
    'eventId': 'i32',
    'typeCode': 'i32',
    'details.typeCode': 'str',
    'typeDescKey': 'str',
    'details.descKey': 'str',
    'details.reason': 'str',
    'details.secondaryReason': 'str',
    'details.shotType': 'str',
    'details.zoneCode': 'str',
    'details.xCoord': 'f32',
    'details.yCoord': 'f32',
    'details.scoringPlayerId': 'str',
    'details.shootingPlayerId': 'str',
    'details.goalieInNetId': 'str',
    'details.blockingPlayerId': 'str',
    'details.committedByPlayerId': 'str',
    'details.drawnByPlayerId': 'str',
    'details.servedByPlayerId': 'str',
    'details.duration': 'str',
    'details.hittingPlayerId': 'str',
    'details.hitteePlayerId': 'str',
    'details.winningPlayerId': 'str',
    'details.losingPlayerId': 'str',
    'details.assist1PlayerId': 'str',
    'details.assist2PlayerId': 'str',
    'details.playerId': 'str'
}

### Defining Load Functions: Cleaning

In [5]:
def min_to_sec(time_str):
    """Convert a clock string formatted as MM:SS into a whole number of seconds.

    Returns None when `time_str` is None. Raises ValueError when the string
    does not split into exactly two integer parts around ':'.
    """
    if time_str is not None:
        mins, secs = (int(part) for part in time_str.split(':'))
        return 60 * mins + secs

    return None

In [6]:
def _attack_oriented_coord(axis):
    """Build the expression that re-orients coordinate column `axis` ('x' or 'y') relative to the event team.

    Expects the frame to already carry 'homeTeamDefendingSide', 'event_team_type',
    'event_zone', 'season', 'game_id', 'period' and the temporary 'flipped_coords' column.
    """
    side = pl.col('homeTeamDefendingSide')
    team = pl.col('event_team_type')
    zone = pl.col('event_zone')
    coord = pl.col(axis)
    side_known = ~side.is_null()
    side_missing = side.is_null()
    # Neutral-zone rows carry no direction hint of their own, so they reuse the sign picked for their period
    period_flip = pl.col('flipped_coords').max().over(['season', 'game_id', 'period'])

    return (
        # Where homeTeamDefendingSide exists: keep or mirror depending on who owns the event
        pl.when(side_known & (side == 'left') & (team == 'home')).then(coord)
          .when(side_known & (side == 'right') & (team == 'home')).then(coord * -1)
          .when(side_known & (side == 'left') & (team == 'away')).then(coord * -1)
          .when(side_known & (side == 'right') & (team == 'away')).then(coord)
          # Where homeTeamDefendingSide does not exist: fall back to the zone code
          .when(side_missing & (zone == 'O')).then(coord.abs())
          .when(side_missing & (zone == 'D')).then(coord.abs() * -1)
          .when(side_missing & (zone == 'N')).then(coord * period_flip)
          .otherwise(pl.lit(None))
    )


def reconcile_api_data(data):
    """Reconcile raw NHL API play-by-play rows to the column names, values and types the SDV cleaning functions expect.

    Steps, in order: rename columns and keep regular-season/playoff non-shootout rows;
    recode season/event/shot types and tag the event team; derive period/game clocks;
    collapse the per-event player id columns into event_player_1..4 (+ role labels);
    parse `situationCode` into skater / empty-net counts; orient coordinates to the
    event team; compute event distance and angle.

    Parameters
    ----------
    data : pl.DataFrame
        Raw play-by-play frame using the raw API column names (the keys of `rename_dict`
        plus 'season', 'period', 'timeInPeriod', 'situationCode', 'homeTeamDefendingSide'
        and 'details.playerId').

    Returns
    -------
    pl.DataFrame
        Reconciled frame sorted by season, game_id, period, event_idx.
    """

    # Create Dictionaries For Column Name/Value Rename
    rename_dict = {
        "id": "game_id",
        "gameDate": "game_date",
        "awayTeam.id": "away_id",
        "awayTeam.abbrev": "away_abbreviation",
        "homeTeam.id": "home_id",
        "homeTeam.abbrev": "home_abbreviation",
        "gameType": "season_type",
        "eventId": "event_id",
        "typeDescKey": "event_type",
        "sortOrder": "event_idx",
        "periodDescriptor.periodType": "period_type",
        "details.eventOwnerTeamId": "event_team_id",
        "details.xCoord": "x",
        "details.yCoord": "y",
        "details.zoneCode": "event_zone",
        "details.shotType": "secondary_type",
        "details.awayScore": "away_score",
        "details.homeScore": "home_score",
        "details.goalieInNetId": "event_goalie_id",
        "details.blockingPlayerId": "blocking_player_id",
        "details.drawnByPlayerId": "drawnby_player_id",
        "details.servedByPlayerId": "servedby_player_id",
        "details.committedByPlayerId": "committedby_player_id",
        "details.hittingPlayerId": "hitting_player_id",
        "details.hitteePlayerId": "hittee_player_id",
        "details.assist1PlayerId": "assist_1_player_id",
        "details.assist2PlayerId": "assist_2_player_id",
        "details.shootingPlayerId": "shooting_player_id",
        "details.reason": "reason",
        "details.scoringPlayerId": "scoring_player_id",
        "details.duration": "penalty_minutes",
        "details.winningPlayerId": "winning_player_id",
        "details.losingPlayerId": "losing_player_id"
    }

    # Event Type
    event_type_dict = {
        "faceoff": "FACEOFF",
        "shot-on-goal": "SHOT",
        "stoppage": "STOPPAGE",
        "hit": "HIT",
        "blocked-shot": "BLOCKED_SHOT",
        "missed-shot": "MISSED_SHOT",
        "giveaway": "GIVEAWAY",
        "takeaway": "TAKEAWAY",
        "penalty": "PENALTY",
        "goal": "GOAL",
        "period-start": "PERIOD_START",
        "period-end": "PERIOD_END",
        "delayed-penalty": "DELAYED_PENALTY",
        "game-end": "GAME_END",
        "shootout-complete": "SHOOTOUT_COMPLETE",
        "failed-shot-attempt": "FAILED_SHOT",
        None: None
    }

    # Season Type
    season_type_dict = {
        2: "R",
        3: "P",
        None: None
    }

    # Shot Type
    shot_type_dict = {
        "snap": "Snap",
        "between-legs": "Between Legs",
        "wrap-around": "Wrap-Around",
        "tip-in": "Tip-In",
        "cradle": "Wrap-Around",
        "poke": 'Poked',
        "bat": 'Batted',
        "deflected": "Deflected",
        "wrist": "Wrist",
        "slap": "Slap",
        "backhand": "Backhand",
        None: None
    }

    # Rename Columns, then keep only non-shootout rows from regular season (2) / playoffs (3)
    data = data.rename(rename_dict).filter((pl.col('period_type') != 'SO') & (pl.col('season_type').is_in([2, 3])))

    # Recode Values AND Add Event Team Helpers
    # Rows whose event_team_id does not equal home_id (including rows with no owning team) are labelled 'away'.
    is_home_event = pl.col('event_team_id') == pl.col('home_id')
    data = (
        data
        .with_columns([
            (pl.col('season_type').map_dict(season_type_dict, default = pl.col('season_type'))).alias('season_type'),
            (pl.col('event_type').map_dict(event_type_dict, default = pl.col('event_type'))).alias('event_type'),
            (pl.col('secondary_type').map_dict(shot_type_dict, default = pl.col('secondary_type'))).alias('secondary_type'),
            pl.when(is_home_event).then(pl.lit('home')).otherwise(pl.lit('away')).alias('event_team_type'),
            # Take the abbreviation from the matching column (not the column's name as a literal)
            pl.when(is_home_event).then(pl.col('home_abbreviation')).otherwise(pl.col('away_abbreviation')).alias('event_team_abbr')
            ])
        # NOTE(review): these values look like event types rather than situation codes — confirm whether
        # this filter was meant to target 'event_type'. Left as-is to avoid dropping rows unexpectedly.
        .filter(~pl.col('situationCode').is_in(["PERIOD_START", "PERIOD_END", "GAME_START", "GAME_END"]))
    )

    # Create Game and Period clocks from timeInPeriod: 'period_seconds', 'period_seconds_remaining', 'game_seconds', 'game_seconds_remaining'
    # 1200 = seconds in a 20-minute period, 3600 = seconds in three such periods.
    game_seconds = pl.col('period_seconds') + ((pl.col('period') - 1) * 1200)
    data = (
        data
        .with_columns(pl.when(pl.col('timeInPeriod').is_null()).then(pl.lit(None)).otherwise(pl.col('timeInPeriod').apply(min_to_sec)).alias('period_seconds'))
        .with_columns([
            (1200 - pl.col('period_seconds')).alias('period_seconds_remaining'),
            game_seconds.alias('game_seconds'),
            # Counts down from 3600 at the opening faceoff to 0 at the end of period 3 (negative beyond that)
            (3600 - game_seconds).alias('game_seconds_remaining')
        ])
    )

    # Create event_player_N_id / event_player_N_type columns based on event_type and the matching source columns
    remove_ply_ids = ['winning_player_id', 'hitting_player_id', 'scoring_player_id', 'shooting_player_id', 'committedby_player_id',
                      'details.playerId', 'losing_player_id', 'hittee_player_id', 'drawnby_player_id', 'assist_1_player_id', 'assist_2_player_id',
                      'blocking_player_id']
    event_type = pl.col('event_type')
    shot_attempt = event_type.is_in(['SHOT', 'MISSED_SHOT', 'BLOCKED_SHOT'])
    goal_with_assist_1 = (event_type == 'GOAL') & (~pl.col('assist_1_player_id').is_null())
    goal_with_assist_2 = (event_type == 'GOAL') & (~pl.col('assist_2_player_id').is_null())
    penalty_served = (event_type == 'PENALTY') & (~pl.col('servedby_player_id').is_null())
    block_with_blocker = (event_type == 'BLOCKED_SHOT') & (~pl.col('blocking_player_id').is_null())
    data = (
        data
        .with_columns([
            # Player 1: the player who performed the event
            (pl.when(event_type == 'FACEOFF').then(pl.col('winning_player_id'))
               .when(event_type == 'HIT').then(pl.col('hitting_player_id'))
               .when(event_type == 'GOAL').then(pl.col('scoring_player_id'))
               .when(shot_attempt).then(pl.col('shooting_player_id'))
               .when(event_type == 'PENALTY').then(pl.col('committedby_player_id'))
               .when(event_type == 'GIVEAWAY').then(pl.col('details.playerId'))
               .when(event_type == 'TAKEAWAY').then(pl.col('details.playerId'))
               .otherwise(pl.lit(None))
             ).alias("event_player_1_id"),
            # Player 2: the opposing / receiving player (goalie for any shot attempt or goal)
            (pl.when(event_type == 'FACEOFF').then(pl.col('losing_player_id'))
               .when(event_type == 'HIT').then(pl.col('hittee_player_id'))
               .when(event_type.is_in(['GOAL', 'SHOT', 'MISSED_SHOT', 'BLOCKED_SHOT'])).then(pl.col('event_goalie_id'))
               .when(event_type == 'PENALTY').then(pl.col('drawnby_player_id'))
               .otherwise(pl.lit(None))
             ).alias("event_player_2_id"),
            # Player 3: first assist, penalty server, or shot blocker
            (pl.when(goal_with_assist_1).then(pl.col('assist_1_player_id'))
               .when(penalty_served).then(pl.col('servedby_player_id'))
               .when(block_with_blocker).then(pl.col('blocking_player_id'))
               .otherwise(pl.lit(None))
             ).alias("event_player_3_id"),
            # Player 4: second assist
            (pl.when(goal_with_assist_2).then(pl.col('assist_2_player_id'))
               .otherwise(pl.lit(None))
             ).alias("event_player_4_id"),
            (pl.when(event_type == 'FACEOFF').then(pl.lit('Winner'))
               .when(event_type == 'HIT').then(pl.lit('Hitter'))
               .when(event_type == 'GOAL').then(pl.lit('Scorer'))
               .when(shot_attempt).then(pl.lit('Shooter'))
               .when(event_type == 'PENALTY').then(pl.lit('PenaltyOn'))
               .when(event_type == 'GIVEAWAY').then(pl.lit('PlayerID'))
               .when(event_type == 'TAKEAWAY').then(pl.lit('PlayerID'))
               .otherwise(pl.lit(None))
             ).alias("event_player_1_type"),
            # NOTE(review): GIVEAWAY/TAKEAWAY get a 'PlayerID' label here although event_player_2_id is null for them
            (pl.when(event_type == 'FACEOFF').then(pl.lit('Loser'))
               .when(event_type == 'HIT').then(pl.lit('Hittee'))
               .when((event_type == 'GOAL') & (~pl.col('event_goalie_id').is_null())).then(pl.lit('Goalie'))
               .when((event_type == 'GOAL') & (pl.col('event_goalie_id').is_null())).then(pl.lit('EmptyNet'))
               .when(shot_attempt).then(pl.lit('Goalie'))
               .when(event_type == 'PENALTY').then(pl.lit('DrewBy'))
               .when(event_type == 'GIVEAWAY').then(pl.lit('PlayerID'))
               .when(event_type == 'TAKEAWAY').then(pl.lit('PlayerID'))
               .otherwise(pl.lit(None))
             ).alias("event_player_2_type"),
            (pl.when(goal_with_assist_1).then(pl.lit('Assist'))
               .when(penalty_served).then(pl.lit('ServedBy'))
               .when(block_with_blocker).then(pl.lit('Blocker'))
               .otherwise(pl.lit(None))
             ).alias("event_player_3_type"),
            (pl.when(goal_with_assist_2).then(pl.lit('Assist'))
               .otherwise(pl.lit(None))
             ).alias("event_player_4_type")
        ])
        .drop(remove_ply_ids)
    )

    # Parse Situation Code For Home/Away Skaters/EmptyNet
    # The four characters are read as: [0] away_en, [1] away_skaters, [2] home_skaters, [3] home_en.
    data = (
        data
        .sort('season', 'game_id', 'period', 'event_idx')
        .with_columns(
            # Missing codes inherit the previous row's code.
            # NOTE(review): the fill is not partitioned by game, so a null on a game's first row
            # would inherit from the preceding game — confirm whether that can occur.
            pl.col('situationCode').fill_null(strategy="forward").alias('situationCode')
        )
        .filter(~pl.col('situationCode').is_in(['0101', '1010']))
        .with_columns([
            pl.col("situationCode").str.slice(0, 1).cast(pl.Int32).alias("away_en"),
            pl.col("situationCode").str.slice(3, 1).cast(pl.Int32).alias("home_en"),
            pl.col("situationCode").str.slice(1, 1).cast(pl.Int32).alias("away_skaters"),
            pl.col("situationCode").str.slice(2, 1).cast(pl.Int32).alias("home_skaters")
        ])
        .with_columns([
            (pl.concat_str([pl.col('home_skaters'), pl.lit('v'), pl.col('away_skaters')])).alias('strength_state'),
            (pl.concat_str([pl.col('home_skaters'), pl.lit('v'), pl.col('away_skaters')])).alias('true_strength_state')
        ])
    )

    # Create x_abs and y_abs. These coordinates are relative to the event team's attacking zone (i.e., x_abs is positive)
    data = (
        data
        .with_columns([
            # NOTE(review): x.mean() is taken over the whole frame, not per game/period — confirm this is intended
            pl.when((pl.col('event_zone') == 'O') & (pl.col('x').mean() > 0)).then(pl.lit(1)).otherwise(pl.lit(-1)).alias('flipped_coords')
        ])
        .with_columns([
            _attack_oriented_coord('x').alias('x_abs'),
            _attack_oriented_coord('y').alias('y_abs')
        ])
        .drop("flipped_coords")
    )

    # Distance / angle are measured from the point (89, 0) on the attacking side
    # NOTE(review): 89 is presumably the goal-line x coordinate in feet — confirm
    x_abs = pl.col('x_abs')
    y_abs = pl.col('y_abs')
    depth_attacking = 89 - x_abs.abs()   # used when x_abs >= 0
    depth_defending = x_abs.abs() + 89   # used when x_abs < 0

    # Create Event Distance Calculation (null when x_abs is null)
    data = data.with_columns(
        pl.when(x_abs >= 0).then((depth_attacking ** 2 + y_abs ** 2).sqrt())
          .when(x_abs < 0).then((depth_defending ** 2 + y_abs ** 2).sqrt())
          .alias('event_distance')
    )

    # Create Event Angle Calculation: absolute angle in degrees, as Float64
    data = (
        data
        .with_columns(
            pl.when(x_abs >= 0).then(((y_abs / depth_attacking).arctan().cast(pl.Float64) * (180 / pi)).abs())
              .when(x_abs < 0).then(((y_abs / depth_defending).arctan().cast(pl.Float64) * (180 / pi)).abs())
              .alias('event_angle')
        )
        .with_columns(
            # Past x = 89 the depth term is negative, so take the supplementary angle
            pl.when(pl.col('x_abs') > 89).then((180 - pl.col('event_angle'))).otherwise(pl.col('event_angle')).alias('event_angle')
        )
    )

    return data

In [20]:
# Ad-hoc preview of the shift-chart endpoint for a single hard-coded game id (2012020001).
# This performs a live HTTP request every time the cell runs.
shift_link = "https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId="+str(2012020001)
shift_response = requests.get(shift_link).json()
# Assuming "data" is the key containing nested data; falls back to an empty list when it is absent
data_list = shift_response.get('data', [])
# Only these fields are kept from each shift record; a record missing any of them raises KeyError
keep_keys = ['id', 'endTime', 'firstName', 'gameId', 'lastName', 'period', 'playerId', 'startTime', 'teamAbbrev', 'teamId']
filtered_data = [{key: item[key] for key in keep_keys} for item in data_list]
pl.DataFrame(filtered_data).head()

id,endTime,firstName,gameId,lastName,period,playerId,startTime,teamAbbrev,teamId
i64,str,str,i64,str,i64,i64,str,str,i64
2066098,"""00:29""","""Kimmo""",2012020001,"""Timonen""",1,8459670,"""00:00""","""PHI""",4
2066099,"""02:12""","""Kimmo""",2012020001,"""Timonen""",1,8459670,"""01:33""","""PHI""",4
2066100,"""03:46""","""Kimmo""",2012020001,"""Timonen""",1,8459670,"""02:47""","""PHI""",4
2066101,"""04:40""","""Kimmo""",2012020001,"""Timonen""",1,8459670,"""04:36""","""PHI""",4
2066102,"""06:09""","""Kimmo""",2012020001,"""Timonen""",1,8459670,"""04:50""","""PHI""",4


In [25]:
def append_shift_data(data):
    """Attach on-ice skaters and goalies to every play-by-play row of one game.

    The game id is read from the first row of `data`. Shift records for that game are
    downloaded from the NHL shift-chart endpoint, converted to start/end seconds and
    matched to each event's (period, period_seconds). Goalies are told apart from
    skaters through `pos_G` in the module-level `ROSTER_DF`.

    Parameters
    ----------
    data : pl.DataFrame
        Reconciled play-by-play rows. Must contain 'game_id', 'home_id', 'away_id',
        'period', 'game_seconds', 'period_seconds', 'event_id', 'event_idx' and 'event_type'.

    Returns
    -------
    pl.DataFrame
        `data` left-joined with 'home_goalie', 'away_goalie' (+ '_name') and the
        '{home,away}_{1..6}_on_{id,name}' columns.

    Raises
    ------
    requests.HTTPError
        If the shift-chart endpoint answers with an error status.
    requests.Timeout
        If the endpoint does not answer within 30 seconds.

    Notes
    -----
    Id lists and name lists are each sorted as text, independently of one another, so
    slot k of an '_on_id' column is not guaranteed to be the same player as slot k of
    the matching '_on_name' column.
    """
    # Load Game ID and Home/Away Ids
    game_id = data['game_id'][0]

    game_info_slim = (
        data
        .filter(pl.col('game_id') == game_id)
        .select('game_id', 'home_id', 'away_id', 'period', 'game_seconds', 'period_seconds', 'event_id', 'event_idx', 'event_type')
        .unique()
    )

    shift_link = "https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId=" + str(game_id)
    # Bound the wait and fail loudly on HTTP errors instead of parsing an error payload
    shift_http = requests.get(shift_link, timeout=30)
    shift_http.raise_for_status()
    shift_response = shift_http.json()

    # Assuming "data" is the key containing nested data
    data_list = shift_response.get('data', [])
    keep_keys = ['id', 'endTime', 'firstName', 'gameId', 'lastName', 'period', 'playerId', 'startTime', 'teamAbbrev', 'teamId']
    filtered_data = [{key: item[key] for key in keep_keys} for item in data_list]
    shift_raw = pl.DataFrame(filtered_data)
    shift_raw = (
        shift_raw
        .with_columns([
            (pl.col('firstName') + ' ' + pl.col('lastName')).alias('player_name'),
            # "MM:SS" -> seconds. str.slice takes (offset, length), so the seconds part is slice(3, 2).
            ((pl.col('startTime').str.slice(0, 2).cast(pl.Int32) * 60) + (pl.col('startTime').str.slice(3, 2).cast(pl.Int32))).alias('period_start_seconds'),
            ((pl.col('endTime').str.slice(0, 2).cast(pl.Int32) * 60) + (pl.col('endTime').str.slice(3, 2).cast(pl.Int32))).alias('period_end_seconds')
        ])
        .with_columns([
            # 1200 seconds per earlier period
            (pl.col('period_start_seconds') + ((pl.col('period') - 1) * 1200)).alias('game_start_seconds'),
            (pl.col('period_end_seconds') + ((pl.col('period') - 1) * 1200)).alias('game_end_seconds'),
        ])
        .rename({
                'gameId': 'game_id',
                'id': 'shift_id',
                'playerId': 'player_id',
                'teamId': 'team_id',
                'teamAbbrev': 'team_abbr'
            })
        .select([pl.col('game_id').cast(pl.Int32),
                 pl.col('team_id').cast(pl.Utf8),
                 pl.col('player_id').cast(pl.Utf8),
                 pl.col('player_name').str.to_uppercase().cast(pl.Utf8),
                 pl.col('team_abbr').cast(pl.Utf8),
                 pl.col('period').cast(pl.Int32),
                 pl.col('period_start_seconds').cast(pl.Int64),
                 pl.col('period_end_seconds').cast(pl.Int64),
                 pl.col('game_start_seconds').cast(pl.Int64),
                 pl.col('game_end_seconds').cast(pl.Int64)
                 ])
    )
    shift_raw = (
        # Join and Create team_type
        shift_raw
        .join(game_info_slim.select('game_id', 'home_id', 'away_id').unique(), on='game_id', how='left')
        .filter((pl.col('home_id') == pl.col('team_id')) | (pl.col('away_id') == pl.col('team_id')))
        # Zero-length records carry no ice time
        .filter(pl.col('game_start_seconds') != pl.col('game_end_seconds'))
        .with_columns(pl.when(pl.col('home_id') == pl.col('team_id')).then(pl.lit('home'))
                        .when(pl.col('away_id') == pl.col('team_id')).then(pl.lit('away')).otherwise(pl.lit(None)).alias('team_type'))
        .drop('home_id', 'away_id')
        .unique()
    )

    # Combine Consecutive Shifts
    # NOTE(review): rows that agree on everything except their start keep the LATEST start (max) —
    # confirm this is the intended way to merge them.
    gb_cols = [col for col in shift_raw.columns if col not in ['period_start_seconds', 'game_start_seconds']]
    shift_raw = (
        shift_raw
        .with_columns([
            pl.col('period_start_seconds').max().over(gb_cols).alias('period_start_seconds'),
            pl.col('game_start_seconds').max().over(gb_cols).alias('game_start_seconds')
        ])
        .unique()
        # Separate Goalies
        .join(ROSTER_DF.with_columns([
            (pl.col('player_id').cast(pl.Utf8).alias('player_id')),
            (pl.col('pos_G').cast(pl.Int32).alias('pos_G'))
        ])
        .select('player_id', 'pos_G'), on='player_id', how='left')
        .unique()
        .sort('game_id', 'period', 'period_start_seconds', 'period_end_seconds')
    )

    # Concat Player IDs into lists for each group (i.e. event and seconds)
    stints = (
        shift_raw
        .groupby(['game_id', 'period', 'period_start_seconds', 'period_end_seconds', 'team_type', 'pos_G'])
        .agg(
            pl.concat_list('player_id').flatten().unique().alias('player_id_list'),
            pl.concat_list('player_name').flatten().unique().alias('player_name_list')
            )
        .sort('game_id', 'period', 'period_start_seconds', 'period_end_seconds')
    )

    # Separate and Create Player On Columns
    game_data = (
         game_info_slim
         .filter(pl.col('game_id') == game_id)
         .sort('game_id', 'period', 'period_seconds', 'event_idx')
    )

    def get_player_lists_pl(x):
        """Return the comma-joined ids or names on the ice for one (game, period, second, side, position) query."""
        # Outline Variables
        g_id, per, p_secs, ty, pos, shift, output = x
        # Players without a roster match have a null pos_G and therefore never match either position
        conditions = (
            (stints['game_id'] == g_id) &
            (stints['period'] == per) &
            (stints['team_type'] == ty) &
            (stints['pos_G'] == pos)
        )
        if shift == 'current':
            # Strictly inside the shift
            conditions &= (
                (stints['period_start_seconds'] < p_secs) &
                (stints['period_end_seconds'] > p_secs)
            )
        elif shift == 'on':
            # Shift starts exactly at this second
            conditions &= (stints['period_start_seconds'] == p_secs)
        elif shift == 'off':
            # Shift ends exactly at this second
            conditions &= (stints['period_end_seconds'] == p_secs)
        filtered_rows = stints.filter(conditions)

        if output == 'id':
            result_list = set(filtered_rows['player_id_list'].explode().to_list())
        elif output == 'name':
            result_list = set(filtered_rows['player_name_list'].explode().to_list())

        # Sort the de-duplicated values so slot assignment is reproducible between runs
        return ','.join(sorted(str(item) for item in result_list))

    def apply_player_lists_pl(x, ty, pos, shift, output):
        """Adapt a struct row (dict) to the tuple expected by get_player_lists_pl."""
        return get_player_lists_pl((x['game_id'], x['period'], x['period_seconds'], ty, pos, shift, output))

    # List of columns to generate: (team side, pos_G flag, shift relation, output kind)
    columns_to_generate = [
        (side, pos_flag, relation, kind)
        for pos_flag in (0, 1)
        for side in ('home', 'away')
        for relation in ('current', 'on', 'off')
        for kind in ('id', 'name')
    ]

    # Generate columns dynamically
    for prefix, pos, shift, output in columns_to_generate:
        pos_lab = 'goalie' if pos == 1 else 'skater'
        col_name = f"{prefix}_{pos_lab}_{shift}_{output}"
        game_data = game_data.with_columns([
            # Bind the loop variables as defaults so the lambda never sees a later iteration's values
            pl.struct(["game_id", "period", "period_seconds"])
              .apply(lambda x, prefix=prefix, pos=pos, shift=shift, output=output: apply_player_lists_pl(x, prefix, pos, shift, output))
              .alias(col_name)
        ])

    game_start_end = ['GAME_START', 'PERIOD_START', 'GAME_END', 'PERIOD_END']
    game_data = (
        game_data
        .sort('game_id', 'period', 'period_seconds', 'event_idx')
        .filter(~pl.col('event_type').is_in(game_start_end))
        .with_columns([
            # Last event recorded at this second; used below to decide between players going on vs. coming off
            pl.col('event_idx').max().over(['game_id', 'period', 'period_seconds']).alias('max_event_idx')
        ])
        .with_columns([
            (pl.col('game_id').cast(pl.Utf8) + '-' + pl.col('period').cast(pl.Utf8) + '-' + pl.col('period_seconds').cast(pl.Utf8)).alias('event_seconds_id'),
            pl.when(pl.col('event_idx') == pl.col('max_event_idx')).then(pl.col('event_type')).otherwise(pl.lit(None)).alias('max_event_type')
        ])
        .with_columns([
            pl.col('event_seconds_id').count().over(['game_id', 'period', 'period_seconds']).alias('count_event_seconds_id')
        ])
    )

    teams = ['home', 'away']
    positions = ['skater', 'goalie']
    outputvals = ['id', 'name']
    for team in teams:
        for position in positions:
            for outputval in outputvals:
                cur_cols = f"{team}_{position}_current_{outputval}"
                off_cols = f"{team}_{position}_off_{outputval}"
                on_cols = f"{team}_{position}_on_{outputval}"
                # The combined list overwrites the "on" column before being split into slots
                label1 = f"{team}_{position}_on_{outputval}"
                if position == 'goalie':
                    label2 = f"_goalie_{outputval}"
                else:
                    label2 = f"on_{outputval}"

                has_cur = pl.col(cur_cols) != ""
                no_cur = pl.col(cur_cols) == ""
                has_on = pl.col(on_cols) != ""
                no_on = pl.col(on_cols) == ""
                has_off = pl.col(off_cols) != ""
                no_off = pl.col(off_cols) == ""
                is_last_event = pl.col('event_idx') == pl.col('max_event_idx')
                not_last_event = pl.col('event_idx') != pl.col('max_event_idx')

                game_data = (
                    game_data
                    .with_columns([
                        # Exactly one source populated: use it
                        pl.when(has_cur & no_on & no_off).then(pl.col(cur_cols))
                          .when(no_cur & has_on & no_off).then(pl.col(on_cols))
                          .when(no_cur & no_on & has_off).then(pl.col(off_cols))
                          # Change at this second: the last event sees the players coming on, earlier events see those going off
                          .when(no_cur & has_on & has_off & is_last_event).then(pl.col(on_cols))
                          .when(no_cur & has_on & has_off & not_last_event).then(pl.col(off_cols))
                          .when(has_cur & has_on & is_last_event).then(pl.concat_str([pl.col(cur_cols), pl.lit(","), pl.col(on_cols)]))
                          .when(has_cur & has_off & not_last_event).then(pl.concat_str([pl.col(cur_cols), pl.lit(","), pl.col(off_cols)]))
                          .when(has_cur & has_off & is_last_event).then(pl.col(cur_cols))
                          .otherwise(pl.lit(None))
                          .alias(label1)
                    ])
                    # Split into 8 slots (field_0..field_7)
                    .with_columns([pl.col(label1).str.split_exact(',', 7)])
                    .unnest(label1)
                    .rename({
                        "field_0" : f"{team}_1_{label2}",
                        "field_1" : f"{team}_2_{label2}",
                        "field_2" : f"{team}_3_{label2}",
                        "field_3" : f"{team}_4_{label2}",
                        "field_4" : f"{team}_5_{label2}",
                        "field_5" : f"{team}_6_{label2}",
                        "field_6" : f"{team}_7_{label2}",
                        "field_7" : f"{team}_8_{label2}"
                    })
                )

    # Only the first goalie slot and the first six skater slots per side are kept
    keep_cols = ['game_id', 'period', 'game_seconds', 'period_seconds', 'event_idx',
                 'home_1__goalie_id', 'home_1__goalie_name',
                 'home_1_on_id', 'home_2_on_id', 'home_3_on_id', 'home_4_on_id', 'home_5_on_id', 'home_6_on_id',
                 'home_1_on_name', 'home_2_on_name', 'home_3_on_name', 'home_4_on_name', 'home_5_on_name', 'home_6_on_name',
                 'away_1_on_id', 'away_2_on_id', 'away_3_on_id', 'away_4_on_id', 'away_5_on_id', 'away_6_on_id',
                 'away_1_on_name', 'away_2_on_name', 'away_3_on_name', 'away_4_on_name', 'away_5_on_name', 'away_6_on_name',
                 'away_1__goalie_id', 'away_1__goalie_name']
    game_data = (
        game_data
        .select(keep_cols)
        .sort('game_id', 'period', 'period_seconds', 'event_idx')
        .rename({
            'away_1__goalie_id': 'away_goalie',
            'away_1__goalie_name': 'away_goalie_name',
            'home_1__goalie_id': 'home_goalie',
            'home_1__goalie_name': 'home_goalie_name'
        })
    )

    # Combine DataFrames
    result_df = data.join(game_data, on = ['game_id', 'period', 'game_seconds', 'period_seconds', 'event_idx'], how = "left")

    return result_df

In [9]:
import time
def _fetch_schedule_game_ids(dates):
    """Return the game IDs listed by the NHL schedule API for each date in ``dates``.

    Only games whose ``gameType`` is 2 or 3 are kept (presumably regular season
    and playoffs -- TODO confirm against the API).

    Args:
        dates: Iterable of timestamps (e.g. a ``pd.date_range``); one request is made per date.

    Returns:
        Flat list of game IDs in the order the schedule returned them.
    """
    game_ids = []
    for day in dates:
        sched_link = "https://api-web.nhle.com/v1/schedule/" + day.strftime('%Y-%m-%d')
        response = requests.get(sched_link)

        # Parse the JSON content of the response; only the first entry of 'gameWeek' is read
        raw_data = pd.json_normalize(response.json())
        sched_data = pd.json_normalize(raw_data['gameWeek'][0])
        sched_data = pd.json_normalize(sched_data['games'][0])

        if len(sched_data) > 0:
            sched_data = sched_data[sched_data['gameType'].isin([2, 3])]
            game_ids.extend(sched_data['id'].tolist())

    return game_ids


def _extract_plays(pbp_data):
    """Return the flat list of play records in a normalized play-by-play response ([] if none)."""
    if 'plays' not in pbp_data.columns:
        return []
    return [item for sublist in pbp_data['plays'] for item in sublist]


def _build_game_frame(pbp_data, plays):
    """Combine game-level fields with the normalized plays, then clean and attach shift data."""
    ## GAME DATA
    game_data = pbp_data[['id', 'season', 'gameDate', 'gameType', 'awayTeam.id', 'awayTeam.abbrev', 'homeTeam.id', 'homeTeam.abbrev']]

    # PLAYS DATA - cross join so every play row carries the game-level columns
    normalized_plays = pd.concat([pd.json_normalize(item) for item in plays], ignore_index=True)
    result_df = pd.merge(game_data.assign(key=1), normalized_plays.assign(key=1), on='key').drop('key', axis=1)
    result_df = pl.DataFrame(result_df)

    # Slim Game DataFrame
    result_df = align_and_cast_columns(data = result_df, sch = raw_schema)

    # Clean Game Data for Model Application
    result_df = reconcile_api_data(result_df)

    # Append Shift Data To Clean PBP
    result_df = append_shift_data(result_df)

    return result_df


def load_games(load_path = 'Data/PBP/API_RAW_PBP_Data.parquet', season_start = 2012 , existing=False):
    """Load NHL play-by-play data from the NHL API and clean it with the helper functions above.

    Args:
        load_path: Parquet file holding previously loaded play-by-play data. Only read when
            ``existing`` is True.
        season_start: First season year to scan for game IDs when no ``game_ids.pkl`` cache exists.
        existing: If True, read ``load_path`` and append only the games scheduled since the date
            stored in ``last_load_date.json`` that are not already in the file. If False, load
            every game (IDs come from ``game_ids.pkl`` or, failing that, the schedule API).

    Returns:
        Polars DataFrame sorted by season, game_id and sort_order. ``None`` (after printing a
        message) when ``existing`` is neither True nor False.

    Side effects:
        Writes ``last_load_date.json``; writes ``game_ids.pkl`` (incremental load, or when the
        ID cache has to be built); writes ``bad_ids.pkl`` (full load).
    """
    # Get Dates - yesterday is the last day queried
    yday = datetime.today() - timedelta(days=1)
    end_date = yday.strftime('%Y%m%d')

    if (existing==True):
        # Print Information
        print("Now Loading Most Recent Play By Play Data From Existing File Path", load_path)
        start_time = time.time()

        # The last-load marker is only needed here; the full load below creates it
        with open('last_load_date.json', 'r') as max_date_file:
            max_date = json.load(max_date_file)['max_date']
        last_load = datetime.strptime(max_date, "%Y-%m-%d").strftime('%Y%m%d')

        # Get Dates For Load
        load_dates = pd.date_range(start=last_load, end=end_date, freq='D')

        # Load Current PBP
        data = pl.read_parquet(load_path).sort('season', 'game_id', 'sort_order')

        # The date range starts ON the last load date, so drop games already stored to
        # avoid appending them a second time
        known_ids = set(data['game_id'].unique().to_list())
        f_g_id = [g for g in _fetch_schedule_game_ids(load_dates) if g not in known_ids]

        n_games = len(f_g_id)
        start_date = min(load_dates)
        end_date = max(load_dates)

        # Initialize Data Frame List To Store Loaded Data Frames
        df_list = []

        for i in f_g_id:
            pbp_link = 'https://api-web.nhle.com/v1/gamecenter/'+str(i)+'/play-by-play'
            try:
                pbp_response = requests.get(pbp_link)
                pbp_data = pd.json_normalize(pbp_response.json())

                plays = _extract_plays(pbp_data)
                if not plays:
                    raise ValueError("No play data returned")

                # Same pipeline as the full load (including shift data) so the new rows
                # match the schema of a parquet written from a full load
                df_list.append(_build_game_frame(pbp_data, plays))
            except ValueError as e:
                print(f"Bad Link for GameID {i} | Error: {e}")
                continue

        for df in df_list:
            data.extend(df)

        data = data.sort('season', 'game_id', 'sort_order')

        g_ids = data['game_id'].unique().to_list()
        max_date_new = data['game_date'].max()

        # Print Eval Statements
        end_time = time.time()
        elap_time = round(((end_time - start_time)/60),2)
        rows_loaded = sum(df.height for df in df_list)
        print("Successfully Loaded",str(rows_loaded),"Rows from", str(n_games), "played between", str(start_date), "to", str(end_date), "in", str(elap_time), "Minutes")

        # Save
        with open('game_ids.pkl', 'wb') as file:
            pickle.dump(g_ids, file)

        with open('last_load_date.json', 'w') as max_date_file:
            json.dump({"max_date": max_date_new}, max_date_file)

        return data

    elif(existing==False):
        # Get Game IDs If List of Games Do Not Exist
        g_id_file_name  = 'game_ids.pkl'

        if os.path.exists(g_id_file_name):
            # NOTE: pickle executes arbitrary code on load; only use a cache this notebook wrote
            with open(g_id_file_name, "rb") as file:
                game_ids = pickle.load(file)
        else:
            print("Collecting and Aggregating All Game ID's")
            id_start = time.time()
            st_date = str(season_start)+'1001'
            game_ids = _fetch_schedule_game_ids(pd.date_range(start=st_date, end=end_date, freq='D'))

            # Save
            with open(g_id_file_name, 'wb') as file:
                pickle.dump(game_ids, file)

            id_end = time.time()
            id_elap = round((id_end - id_start)/60, 2)

            print("Successfully Loaded", str(len(game_ids)), "Game ID's From NHL Schedule in", str(id_elap), 'minutes')

        print("Now Loading ALL Play By Play Data From NHL API (2012-2024 Seasons)", str(len(game_ids)), "Games")

        start_time = time.time()
        n_games = len(game_ids)

        # Season codes look like 20122013; used to detect when the loop crosses into a new season
        prev_season = int(str(season_start)+str(season_start+1))
        prev_season_time = start_time

        # Initialize Data Frame List To Store Loaded Data Frames
        df_list = []
        bad_ids = []

        for i in game_ids:
            pbp_link = 'https://api-web.nhle.com/v1/gamecenter/'+str(i)+'/play-by-play'
            # Create Try For Bad Links
            try:
                pbp_response = requests.get(pbp_link)
                pbp_data = pd.json_normalize(pbp_response.json())

                # A response that parses but carries no plays is treated like a bad link
                plays = _extract_plays(pbp_data)
                if not plays:
                    raise ValueError("No play data returned")

                game_season = pbp_data['season'][0]
                id_year = int(str(i)[:4])

                # Check if the current season is different from the previous one
                if (game_season != prev_season):
                    done_prefix = str(id_year-1)
                    season_lab = done_prefix +"-"+str(id_year)
                    season_end_time = time.time()
                    season_elapsed_time = round((season_end_time - prev_season_time)/60,2)
                    games_loaded = sum(1 for x in game_ids if str(x).startswith(done_prefix)) - sum(1 for x in bad_ids if str(x).startswith(done_prefix))
                    all_games_loaded = len(df_list)
                    games_remaining = len(game_ids) - all_games_loaded
                    gpm = ((all_games_loaded)/(season_end_time - start_time)*60)
                    szn_gpm = ((games_loaded)/(season_end_time - prev_season_time)*60)
                    # Nothing loaded yet (very first game is already a new season): no rate to extrapolate
                    est_time_remaining = games_remaining / gpm if gpm else float('inf')
                    print(f"Successfully Loaded {games_loaded} Games From {season_lab} Season in {season_elapsed_time} Minutes ({round(szn_gpm, 2)} GPM) | {games_remaining} Games To Load -- Est. Load Time: {round(est_time_remaining,2)} Minutes ({round(gpm, 2)} GPM) | Completed at {season_end_time}")
                    prev_season = game_season
                    prev_season_time = season_end_time
                elif (i == game_ids[-1]):
                    cur_prefix = str(id_year)
                    season_lab = cur_prefix +"-"+str(id_year+1)
                    season_end_time = time.time()
                    season_elapsed_time = round((season_end_time - prev_season_time)/60,2)
                    games_loaded = sum(1 for x in game_ids if str(x).startswith(cur_prefix)) - sum(1 for x in bad_ids if str(x).startswith(cur_prefix))
                    szn_gpm = ((games_loaded)/(season_end_time - prev_season_time)*60)
                    print(f"Successfully Loaded {games_loaded} Games From {season_lab} Season in {season_elapsed_time} Minutes ({round(szn_gpm, 2)} GPM) | Completed at {season_end_time}")

                # Build, clean and append single game data to list for union
                df_list.append(_build_game_frame(pbp_data, plays))

            except ValueError as e:
                print(f"Bad Link for GameID {i} | Error: {e}")
                bad_ids.append(i)
                continue

        print("Begin Union of Final DataFrame")

        data = df_list[0]
        for df in df_list[1:]:
            data.extend(df)
        data = data.sort('season', 'game_id', 'sort_order')

        # Print Eval Statements
        max_date_new = data['game_date'].max()
        min_date = data['game_date'].min()
        rows_loaded = data.height

        end_time = time.time()
        elap_time = round(((end_time - start_time)/3600),2)

        print("Successfully Loaded",str(rows_loaded),"Rows from", str(n_games), "played between", str(min_date), "to", str(max_date_new), "in", str(elap_time), "Hours")
        print("Rows By Season:")
        print(data['season'].value_counts().sort('season', descending=True))
        # bad_ids is already a flat list of game IDs
        print(len(bad_ids), "Bad IDs - Failed To Load - No Data")

        # Save
        with open('last_load_date.json', 'w') as max_date_file:
            json.dump({"max_date": max_date_new}, max_date_file)

        with open('bad_ids.pkl', 'wb') as file:
            pickle.dump(bad_ids, file)

        return data
    else:
        print("Wrong Inputs - Please Try Again")

In [26]:
# Full rebuild: existing=False loads every game ID instead of appending to the parquet at load_path
PBP_RAW = load_games(load_path='Data/PBP/API_RAW_PBP_Data.parquet', existing=False)

Now Loading ALL Play By Play Data From NHL API (2012-2024 Seasons) 14168 Games


In [None]:
# Persist the loaded play-by-play frame; this is the same path load_games uses as its default load_path
PBP_Path = "Data/PBP/API_RAW_PBP_Data.parquet"
PBP_RAW.write_parquet(
    PBP_Path,
    use_pyarrow=True,
)

In [None]:
# Preview the first rows of the loaded play-by-play frame
PBP_RAW.head()

# Load Roster File

In [None]:
# Location of the roster extract
roster_file = 'Data/NHL_Rosters_2014_2024.csv'

# One row per roster entry; player_id is the key used by the PBP player columns:
# event_player_1_id, event_player_2_id, event_player_3_id, event_player_4_id, event_goalie_id, home_goalie, away_goalie
ROSTER_DF_RAW = pl.read_csv(roster_file)

# Reusable pieces of the roster transformation
is_goalie = pl.col('pos_G') == 1
upper_full_name = pl.col("first_name").str.to_uppercase() + '.' + pl.col("last_name").str.to_uppercase()
roster_keep_cols = ['player_id', 'player_name', 'hand_R', 'hand_L', 'pos_F', 'pos_D', 'pos_G', 'G_hand_R', 'G_hand_L']

ROSTER_DF = (
    ROSTER_DF_RAW
    .with_columns([
        pl.col("player_id").cast(pl.Int32),
        # FIRST.LAST in upper case
        upper_full_name.alias('player_name'),
        # 1/0 flags for right- and left-handed goalies
        pl.when(is_goalie & (pl.col('hand_R') == 1)).then(pl.lit(1)).otherwise(pl.lit(0)).alias('G_hand_R'),
        pl.when(is_goalie & (pl.col('hand_L') == 1)).then(pl.lit(1)).otherwise(pl.lit(0)).alias('G_hand_L')
        ])
    .select(roster_keep_cols)
    .unique()
)


In [None]:
# How often each strength_state occurs on GOAL events, ordered by the "counts" column
PBP_RAW.filter(pl.col('event_type')=='GOAL')['strength_state'].value_counts().sort("counts")

In [None]:
# Reference column list that the API-derived PBP_RAW columns are compared against
sdv_cols = ['event_type', 'event', 'description', 'period', 'period_seconds', 'period_seconds_remaining', 'game_seconds', 'game_seconds_remaining', 'home_score', 'away_score', 'strength_state', 'event_idx', 'extra_attacker', 'home_skaters', 'away_skaters', 'game_id', 'period_type', 'ordinal_num', 'period_time', 'period_time_remaining', 'date_time', 'home_final', 'away_final', 'season', 'season_type', 'game_date', 'game_start', 'game_end', 'game_length', 'game_state', 'detailed_state', 'venue_name', 'venue_link', 'home_name', 'home_abbreviation', 'home_division_name', 'home_conference_name', 'home_id', 'away_name', 'away_abbreviation', 'away_division_name', 'away_conference_name', 'away_id', 'event_id', 'event_team', 'event_team_type', 'num_on', 'players_on', 'players_off', 'away_on_1', 'away_on_2', 'away_on_3', 'away_on_4', 'away_on_5', 'away_goalie', 'ids_on', 'ids_off', 'secondary_type', 'home_on_1', 'home_on_2', 'home_on_3', 'home_on_4', 'home_on_5', 'home_goalie', 'event_player_1_name', 'event_player_1_type', 'event_player_2_name', 'event_player_2_type', 'strength_code', 'strength', 'x', 'y', 'x_fixed', 'y_fixed', 'event_player_1_id', 'event_player_1_link', 'event_player_2_id', 'event_player_2_link', 'event_team_id', 'event_team_link', 'event_team_abbr', 'num_off', 'penalty_severity', 'penalty_minutes', 'away_on_6', 'shot_distance', 'shot_angle', 'event_goalie_name', 'event_goalie_id', 'event_goalie_link', 'event_goalie_type', 'event_player_3_name', 'event_player_3_type', 'game_winning_goal', 'empty_net', 'event_player_3_id', 'event_player_3_link', 'event_player_4_type', 'event_player_4_id', 'event_player_4_name', 'event_player_4_link', 'home_on_6', 'venue_id']

# Check Similar Columns:
sim_cols = [l for l in sdv_cols if l in PBP_RAW.columns]

# Check SDV Not In API Cols
# NOTE(review): ignore_cols is defined but never applied to any of the lists below -- confirm whether
# it was meant to be excluded from sdv_not_api.
ignore_cols = ['event', 'description', 'extra_attacker', 'ordinal_num', 'period_time', 'period_time_remaining', 'home_final', 'away_final', 'game_start', 'game_end', 'game_length', 'game_state', 'detailed_state']
# Plain boolean logic: `&` with `~` on Python bools only worked through integer bit patterns
# (~True == -2, ~False == -1) and bitwise inversion of bool is deprecated.
sdv_not_api = [l for l in sdv_cols if l not in PBP_RAW.columns and not l.endswith('_link')]

# Check API Not In SDV Cols
api_not_sdv = [l for l in PBP_RAW.columns if l not in sdv_cols]

print(sim_cols)
print(len(sim_cols))

print(sdv_not_api)
print(len(sdv_not_api))

print(api_not_sdv)
print(len(api_not_sdv))

## Columns To Create:

# Shift Related
    # away_goalie / home_goalie
    # event_player_1_name, event_player_2_name, event_player_3_name, event_player_4_name, event_goalie_name
    # home_on_1, home_on_2, home_on_3, home_on_4, home_on_5, home_on_6
    # away_on_1, away_on_2, away_on_3, away_on_4, away_on_5, away_on_6

    # *num_on
    # *players_on/players_off
    # *ids_on/ids_off

   

# Ignore

    # event
    # description
    # ordinal_num
    # home_final
    # away_final
    # num_off
    # penalty_severity
    # Extra Attacker


## Shift Data

- This will be joined to the PBP_RAW Table and create columns for which players were on the ice at a given event.
- In this cell block, we will create the shift columns needed:
    - *home_on_1, home_on_2, home_on_3, home_on_4, home_on_5, home_on_6, home_goalie*
    - *away_on_1, away_on_2, away_on_3, away_on_4, away_on_5, away_on_6, away_goalie*
- I believe I have two options on the join:
    - 1) Create a table with every second from the game and inner join home/away players
        - May create too large of a table for each join (max height = 9600 rows | normal height = 3600 rows)
    - 2) Diagonally Union the table
        - Need to figure out which columns from PBP_RAW will be null (possibly fill_na(method = 'forward')? )
        - Need to figure out join key and create "CHANGE" event_type as well as other types associated (on the fly vs stoppage?)

In [None]:
# Path - same 'Data/' location used by the other roster-loading cells in this notebook
roster_file = 'Data/NHL_Rosters_2014_2024.csv'

# All Players - Connect To event_player_1_id, event_player_2_id, event_player_3_id, event_player_4_id, event_goalie_id, home_goalie, away_goalie
ROSTER_DF_RAW = pl.read_csv(roster_file)

ROSTER_DF = (
    ROSTER_DF_RAW
    .with_columns([
        pl.col("player_id").cast(pl.Int32),
        # FIRST.LAST in upper case
        (pl.col("first_name").str.to_uppercase() + '.' + pl.col("last_name").str.to_uppercase()).alias('player_name'),
        # 1/0 flags for right- and left-handed goalies
        pl.when((pl.col('pos_G') == 1) & (pl.col('hand_R') == 1)).then(pl.lit(1)).otherwise(pl.lit(0)).alias('G_hand_R'),
        pl.when((pl.col('pos_G') == 1) & (pl.col('hand_L') == 1)).then(pl.lit(1)).otherwise(pl.lit(0)).alias('G_hand_L')
        ])
    .select(['player_id', 'player_name', 'hand_R', 'hand_L', 'pos_F', 'pos_D', 'pos_G', 'G_hand_R', 'G_hand_L'])
    .unique()
)

In [None]:
# Shift Data
# Load Game ID and Home/Away Ids
def append_shift_data(data):
    """Draft: build per-event on-ice player columns from the NHL shift-chart API.

    For every game ID in the global ``PBP_RAW`` the shift chart is downloaded, each shift is
    tagged home/away (via ``data``) and goalie/skater (via ``ROSTER_DF``), and for every event
    in ``PBP_RAW`` the players whose shift covers the event second are written to
    ``<side>_<n>_on`` columns of a per-game ``seconds_df``.

    NOTE(review): as written this function has no ``return`` statement and never stores
    ``seconds_df``, so it returns None and each game's result is discarded. ``load_games``
    assigns the return value of ``append_shift_data`` to its result frame -- confirm which
    definition of this name is meant to be live before running this cell.
    NOTE(review): the loop and ``seconds_df`` read the global ``PBP_RAW`` rather than the
    ``data`` argument; ``data`` is only used for the home/away join.

    Args:
        data: Polars play-by-play frame; must contain ``game_id``, ``home_id``, ``away_id`` and
            the columns dropped in the join below.
    """

    for i in PBP_RAW['game_id'].unique():
        shift_link = "https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId="+str(i)
        shift_response = requests.get(shift_link)
        shift_raw = pd.json_normalize(shift_response.json())
        shift_raw = pd.json_normalize(shift_raw['data'])


        # Create an empty DataFrame to store the normalized plays
        normalized_shift = pd.DataFrame()

        # Iterate over each row in plays_1 and normalize the JSON data
        # (row-by-row concat; grows the frame one normalized row at a time)
        for _, row in shift_raw.iterrows():
            # Normalize the JSON data in the current row
            normalized_row_g = pd.json_normalize(row)

            # Concatenate the normalized row to the result DataFrame
            normalized_shift = pd.concat([normalized_shift, normalized_row_g], ignore_index=True)

        # Create Columns From Data (Names and Shift Starts)
        normalized_shift['player_name'] = normalized_shift['firstName'] + ' ' + normalized_shift['lastName']
        # Period Time - 'MM:SS' strings converted to seconds elapsed within the period
        normalized_shift['period_start_seconds'] = pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.second
        normalized_shift['period_end_seconds'] = pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.second
        # Game Time - offsets each period by 1200 seconds (i.e. treats every period as 20 minutes)
        normalized_shift['game_start_seconds'] = ( pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['startTime'], format='%M:%S').dt.second) + ((normalized_shift['period'] - 1) * 1200)
        normalized_shift['game_end_seconds'] = ( pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.minute * 60 + pd.to_datetime(normalized_shift['endTime'], format='%M:%S').dt.second) + ((normalized_shift['period'] - 1) * 1200)

        # Rename
        normalized_shift = normalized_shift.rename(columns = {
            'gameId': 'game_id',
            'id': 'shift_id',
            'playerId': 'player_id',
            'teamId': 'team_id',
            'shiftNumber': 'shift_number',
            'teamAbbrev': 'team_abbr'
        })

        # Keep
        shift_keep_cols = ['game_id', 'shift_id', 'team_id', 'player_id', 'player_name', 'period',
                           'period_start_seconds', 'period_end_seconds', 'game_start_seconds', 'game_end_seconds',
                           'eventNumber', 'team_abbr', 'shift_number', 'typeCode']
        normalized_shift = normalized_shift[shift_keep_cols]
        normalized_shift = pl.DataFrame(normalized_shift)

        # Join To Get Home Team
        # Any team_id that does not equal home_id (including unmatched rows) is labelled 'away'
        normalized_shift = (
            normalized_shift
            .join(data.drop('game_seconds', 'period_seconds', 'event_id', 'event_idx', 'event_type').unique(), on='game_id', how='left')
            .with_columns(pl.when(pl.col('home_id') == pl.col('team_id')).then(pl.lit('home')).otherwise(pl.lit('away')).alias('team_type'))
            .drop('home_id', 'away_id')
            .unique()
        )

        # Join To Separate Goalies
        # Keys are cast to Int64 on the roster side before joining on player_id
        normalized_shift = (
            normalized_shift
            .join(ROSTER_DF.with_columns([
                (pl.col('player_id').cast(pl.Int64).alias('player_id')),
                (pl.col('pos_G').cast(pl.Int64).alias('pos_G'))
            ])
            .select('player_id', 'pos_G'), on='player_id', how='left')
            .unique()
        )


        # Group by 'player_id'
        # (the grouping keys are the shift window, side and goalie flag; player_id is what gets collected)
        grouped_shifts = normalized_shift.select('player_id', 'game_id', 'period', 'team_type', 'pos_G', 'period_start_seconds', 'period_end_seconds').to_pandas()
        grouped_shifts = grouped_shifts.groupby(['game_id', 'period', 'period_start_seconds', 'period_end_seconds', 'team_type', 'pos_G'])

        # Aggregate using the agg method
        result_df = grouped_shifts.agg(
            player_id_list=('player_id', list),
        ).reset_index()

        # One row per event of the current game, taken from the global PBP_RAW
        seconds_df = PBP_RAW.select(pl.col('game_id').cast(pl.Int64),pl.col('period').cast(pl.Int64), pl.col('period_seconds').cast(pl.Int64), 'event_idx').filter(pl.col('game_id') == i).to_pandas()


        # Function to get player lists based on conditions
        def get_player_lists(row, type, pos):
            """Return the player IDs of side `type` with goalie flag `pos` on the ice at the row's event time."""
            p_secs = row['period_seconds']
            per = row['period']

            # Adjust conditions as needed
            # A shift covers the event when start <= event second < end
            condition = (
                (result_df['period'] == per) &
                (result_df['period_start_seconds'] <= p_secs) &
                (result_df['period_end_seconds'] > p_secs) &
                (result_df['team_type'] == type) &
                (result_df['pos_G'] == pos)
            )

            selected_rows = result_df[condition]

            if not selected_rows.empty:
                # Combine player lists
                combined_players = [player_id for player_list in selected_rows['player_id_list'] for player_id in player_list]
                return combined_players
            else:
                return []

        # Apply the function to each row of seconds_df
        seconds_df['home'] = seconds_df.apply(get_player_lists,type='home', pos=0, axis=1)
        seconds_df['away'] = seconds_df.apply(get_player_lists,type='away', pos=0, axis=1)
        seconds_df['home_goalie'] = seconds_df.apply(get_player_lists, type='home', pos=1, axis=1)
        seconds_df['away_goalie'] = seconds_df.apply(get_player_lists, type='away', pos=1, axis=1)

        # Columns with lists of player IDs
        list_columns = ["home", "away", "home_goalie", "away_goalie"]

        # Iterate over each list column
        for column in list_columns:
            # Iterate over each row
            for index, row in seconds_df.iterrows():
                # Extract the list of player IDs
                player_ids = row[column]

                # Iterate over the player IDs in the list
                # NOTE(review): this rebinds `i`, the game-id variable of the outer loop
                for i, player_id in enumerate(player_ids):
                    # Create a new column name
                    new_column_name = f"{column}_{i+1}_on"

                    # Create a new column in the DataFrame
                    seconds_df.at[index, new_column_name] = player_id

        # Drop unnecessary Columns
        seconds_df = seconds_df.drop(['home', 'away', 'home_goalie', 'away_goalie'], axis=1)

# Combine DataFrames
# NOTE(review): shift_df_list is not defined anywhere in the visible notebook; the function
# above never appends its per-game seconds_df to a list -- confirm where this should come from.
final_shift_df = pd.concat(shift_df_list)
# Display the updated DataFrame
print(final_shift_df.head())


## Roster Data

- Here I have a couple options to go about how I want to connect roster data for joins to event_player_id (and all other event_player_id columns)
    - **Use LoadRosters.py**
        - Pros:
            - Already created
            - Join structure exists
            - Has all columns I would need
            - Can be joined to shift data?
        - Cons:
            - May not have the most current players (Some mismatched joins here)
    - **Hit Rosters from beginning of PBP API**
        - Pros:
            - Each game is unique and all players in game will be matched (all players from this source are in the game)
        - Cons:
            - Load time
            - Build query time
            - Still need to join rosters to some extent

### Option 1: Load Rosters From LoadRosters.py Script

In [None]:
# Path - same 'Data/' location used by the other roster-loading cells in this notebook
roster_file = 'Data/NHL_Rosters_2014_2024.csv'

# All Players - Connect To event_player_1_id, event_player_2_id, event_player_3_id, event_player_4_id, event_goalie_id, home_goalie, away_goalie
ROSTER_DF_RAW = pl.read_csv(roster_file)

ROSTER_DF = (
    ROSTER_DF_RAW
    .with_columns([
        pl.col("player_id").cast(pl.Int32),
        # FIRST.LAST in upper case
        (pl.col("first_name").str.to_uppercase() + '.' + pl.col("last_name").str.to_uppercase()).alias('player_name'),
        # 1/0 flags for right- and left-handed goalies
        pl.when((pl.col('pos_G') == 1) & (pl.col('hand_R') == 1)).then(pl.lit(1)).otherwise(pl.lit(0)).alias('G_hand_R'),
        pl.when((pl.col('pos_G') == 1) & (pl.col('hand_L') == 1)).then(pl.lit(1)).otherwise(pl.lit(0)).alias('G_hand_L')
        ])
    .select(['player_id', 'player_name', 'hand_R', 'hand_L', 'pos_F', 'pos_D', 'pos_G', 'G_hand_R', 'G_hand_L'])
    .unique()
)

### Option 2: Load Rosters From Start of Game

In [None]:
# Exploratory: pull the per-game roster ('rosterSpots') from the play-by-play endpoint for one sample game.
# NOTE(review): nothing is appended to df_list while the block below stays commented out, so the
# final print reports 0 games; roster_spots is computed but not used afterwards.
df_list = []
for i in [2021020001]:
    pbp_link = 'https://api-web.nhle.com/v1/gamecenter/'+str(i)+'/play-by-play'

    pbp_response = requests.get(pbp_link)
    pbp_data = pd.json_normalize(pbp_response.json())
    # Drop rows whose gameType is 1
    pbp_data = pbp_data[pbp_data['gameType'] != 1]

    ## GAME DATA
    game_data = pbp_data[['id', 'season', 'gameDate', 'gameType', 'awayTeam.id', 'awayTeam.abbrev', 'homeTeam.id', 'homeTeam.abbrev']]

    ## PLAYS DATA
    roster_spots = pd.json_normalize(pbp_data['rosterSpots'])
#
    ## Create an empty DataFrame to store the normalized plays
    #normalized_plays = pd.DataFrame()
#
    ## Iterate over each row in plays_1 and normalize the JSON data
    #for _, row in plays_1.iterrows():
    #    # Normalize the JSON data in the current row
    #    normalized_row = pd.json_normalize(row)
#
    #    # Concatenate the normalized row to the result DataFrame
    #    normalized_plays = pd.concat([normalized_plays, normalized_row], ignore_index=True)
#
    #result_df = pd.merge(game_data.assign(key=1), normalized_plays.assign(key=1), on='key').drop('key', axis=1)
    #result_df = pl.DataFrame(result_df)
#
    #df_list.append(result_df)
    
print('Loading Complete -- Begin Diagonal Union of', len(df_list),'Games')

#### *Check Options by Join Rate*

In [None]:
# Player-ID column whose distinct values are matched against the roster
id_check = 'event_player_1_id'
non_null_ids = PBP_RAW.filter(~pl.col(id_check).is_null())[id_check]
dist_ids = pl.DataFrame({id_check: non_null_ids.cast(pl.Int32).unique()})

## Option 1 ##
test_roster_join_1 = dist_ids.join(ROSTER_DF, left_on=id_check, right_on='player_id', how = 'left')

# Row count per join type shows how many distinct IDs find a roster match
for join_label, join_how in (("inner_join", 'inner'), ("left_join", 'left'), ("outer_join", 'outer')):
    print(join_label, dist_ids.join(ROSTER_DF, left_on=id_check, right_on='player_id', how = join_how).height)

# IDs that matched at least one roster row
good_ids = (
    dist_ids
    .join(ROSTER_DF, left_on=id_check, right_on='player_id', how = 'inner')
    .select(id_check)
    .unique()
    [id_check]
    .to_list()
)

# Left-join rows whose ID never matched the roster
null_ids_outer = test_roster_join_1.filter(~pl.col(id_check).is_in(good_ids))
print(null_ids_outer.height)
print(null_ids_outer.head(10))

In [None]:
# Spot check: first events where player ID 8471279 is event_player_1
PBP_RAW.filter(pl.col('event_player_1_id') == 8471279).head()

In [None]:
# Row counts per season_type, most frequent first
PBP_RAW['season_type'].value_counts().sort('counts', descending=True)

In [None]:
# Show a few distinct game IDs whose season_type is "I", with the float format set to "full" for this print only
with pl.Config(set_fmt_float="full"):
    print(PBP_RAW.filter(pl.col('season_type') == "I").select('game_id').unique().head())