In [1]:
# Pandas
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np

# Polars (Arrow)
from pyarrow.dataset import dataset
import polars as pl
pl.Config.set_tbl_rows(n=-1)
pl.Config.set_tbl_cols(n=-1)

# Hit API
import requests

# Tools
from itertools import chain
from datetime import datetime, timedelta

# Save
import pickle
import os

# Get Game IDs From Previous Seasons

- Builds the current-season and previous-season game ID lists, and saves two lists to pickle files (all game IDs and the dates already loaded)

In [2]:
# Get Dates
# The load window ends at yesterday's date.
yday = datetime.today() - timedelta(days=1)
end_date = yday.strftime('%Y%m%d')


def _load_pickle_or_empty(path):
    """Return the object pickled at `path`, or an empty list when the file does not exist."""
    if not os.path.exists(path):
        return []
    # NOTE(security): pickle.load can execute arbitrary code; only point this at
    # cache files written by this notebook.
    with open(path, "rb") as file:
        return pickle.load(file)


# Check and Load Dates
date_file_name = 'dates_loaded.pkl'
season_dates = _load_pickle_or_empty(date_file_name)

# Keep only the dates that a previous run has not already loaded.
# A set makes each membership check O(1) instead of scanning the whole list.
already_loaded = set(season_dates)
load_dates = [
    d for d in pd.date_range(start='20121001', end=end_date, freq='D')
    if d not in already_loaded
]

# Check and Load Game IDs
g_id_file_name = 'game_ids.pkl'
game_ids = _load_pickle_or_empty(g_id_file_name)

In [3]:
# Loop For Loading Games By Date
SCHEDULE_URL = "https://api-web.nhle.com/v1/schedule/"
# Game IDs start with the season's starting year (e.g. 2012020001 belongs to season 20122013).
CURRENT_SEASON_PREFIX = "2023"

f_g_id = []
for i in load_dates:
    i_str = i.strftime('%Y-%m-%d')
    # Fail loudly on a hung connection or HTTP error so that a date is never
    # recorded as loaded when its schedule was not actually retrieved.
    response = requests.get(SCHEDULE_URL + i_str, timeout=30)
    response.raise_for_status()

    # Only the first entry of 'gameWeek' is read
    # (presumably the requested date itself — TODO confirm against the API).
    game_week = response.json().get('gameWeek') or []
    games = game_week[0].get('games', []) if game_week else []

    # gameType 1 is excluded (presumably preseason — TODO confirm).
    f_g_id.extend(game['id'] for game in games if game['gameType'] != 1)

# Create Lists (Game ID and Dates Loaded):
# dict.fromkeys de-duplicates while preserving order, so re-running this cell
# in the same kernel does not append the same IDs twice.
game_ids = list(dict.fromkeys(game_ids + f_g_id))
current_game_ids = [game_id for game_id in game_ids if str(game_id).startswith(CURRENT_SEASON_PREFIX)]
current_lookup = set(current_game_ids)
previous_game_ids = [game_id for game_id in game_ids if game_id not in current_lookup]

loaded_dates = sorted(set(load_dates) | set(season_dates))

# Save
with open('game_ids.pkl', 'wb') as file:
    pickle.dump(game_ids, file)

with open('dates_loaded.pkl', 'wb') as file:
    pickle.dump(loaded_dates, file)

In [4]:
# Game-level columns repeated on every play row.
GAME_COLS = ['id', 'season', 'gameDate', 'gameType', 'awayTeam.id', 'awayTeam.abbrev', 'homeTeam.id', 'homeTeam.abbrev']

df_list = []   # one polars DataFrame per game (kept even when a game has no plays)
no_data = []   # game IDs that could not be fetched, had no plays, or could not be aligned

# ---- Load: one request per game ------------------------------------------------
with requests.Session() as session:
    for i in game_ids:
        pbp_link = 'https://api-web.nhle.com/v1/gamecenter/'+str(i)+'/play-by-play'

        # A single failed request should not throw away every game already fetched.
        try:
            pbp_response = session.get(pbp_link, timeout=30)
            pbp_response.raise_for_status()
            pbp_json = pbp_response.json()
        except (requests.RequestException, ValueError) as e:
            no_data.append(i)
            print(f"ERROR (Request Failed): Game ID: {i}: {e}")
            continue

        pbp_data = pd.json_normalize(pbp_json)
        pbp_data = pbp_data[pbp_data['gameType'] != 1]

        ## GAME DATA
        game_data = pbp_data[GAME_COLS]

        # PLAYS DATA
        # Flatten the list of play dicts in one call (nested keys become 'details.xCoord', ...).
        plays = [] if pbp_data.empty else (pbp_json.get('plays') or [])
        normalized_plays = pd.json_normalize(plays)

        if normalized_plays.empty:
            no_data.append(i)
            bad_away = (pbp_json.get('awayTeam') or {}).get('abbrev')
            bad_home = (pbp_json.get('homeTeam') or {}).get('abbrev')
            print(f"ERROR (No Data): Game ID: {i} ({bad_away}@{bad_home}) Game Date: {pbp_json.get('gameDate')}")

        # Cross join: attach the single game row to every play row.
        result_df = pd.merge(game_data.assign(key=1), normalized_plays.assign(key=1), on='key').drop('key', axis=1)
        df_list.append(pl.DataFrame(result_df))

print('Loading Complete -- Begin Diagonal Union of', len(df_list),'Games')

# ---- Union: align dtypes, then concatenate once --------------------------------
# Quick Clean of Unnecessary Columns, and skip games that produced no rows.
frames = []
for df in df_list:
    if "periodDescriptor.otPeriods" in df.columns:
        df = df.drop("periodDescriptor.otPeriods")
    if df.height > 0:
        frames.append(df)

# Target dtype per column = the first non-Null dtype seen across all games.
target_schema = {}
for df in frames:
    for col, dtype in df.schema.items():
        if target_schema.get(col, pl.Null) == pl.Null:
            target_schema[col] = dtype

# Cast each game to the target schema; a game whose values cannot be cast is
# reported and left out instead of being silently unioned with the wrong dtype.
aligned = []
for df in frames:
    casts = [
        pl.col(col).cast(target_schema[col])
        for col, dtype in df.schema.items()
        if dtype != target_schema[col]
    ]
    try:
        aligned.append(df.with_columns(casts) if casts else df)
    except Exception as e:
        bad_id = df['id'][0]
        no_data.append(bad_id)
        print(f"ERROR (Schema Mismatch): Game ID: {bad_id} ({df['awayTeam.abbrev'][0]}@{df['homeTeam.abbrev'][0]}) Game Date: {df['gameDate'][0]}: {e}")

# A single diagonal concat appends every game exactly once and fills columns
# missing from a game with nulls.
PBP_RAW = pl.concat(aligned, how='diagonal') if aligned else pl.DataFrame()

print("Number of Columns in PBP_RAW:", PBP_RAW.shape[1])
print("Number of Rows in PBP_RAW:", PBP_RAW.height)

In [93]:
# Column count of every per-game DataFrame
widths = [frame.shape[1] for frame in df_list]

# One-column frame holding those counts
summary_df = pl.DataFrame({'Width': widths})

# Tally how many games share each column count
width_counts = (
    summary_df
    .groupby('Width')
    .agg(pl.col('Width').count().alias('Count'))
)

# Order by column count and show the table
sorted_summary_df = width_counts.sort('Width')
print(sorted_summary_df)

shape: (14, 2)
┌───────┬───────┐
│ Width ┆ Count │
│ ---   ┆ ---   │
│ i64   ┆ u32   │
╞═══════╪═══════╡
│ 8     ┆ 30    │
│ 28    ┆ 1     │
│ 39    ┆ 1     │
│ 41    ┆ 5     │
│ 42    ┆ 9     │
│ 43    ┆ 16    │
│ 44    ┆ 27    │
│ 45    ┆ 21    │
│ 46    ┆ 61    │
│ 47    ┆ 52    │
│ 48    ┆ 5774  │
│ 49    ┆ 6357  │
│ 50    ┆ 1822  │
│ 51    ┆ 5     │
└───────┴───────┘


In [94]:
# Base
base_cols = PBP_RAW.columns

# Bucket the per-game DataFrames by column count (widest bucket first).
# Buckets are derived from the data, so no width is missed and no bucket is ever empty.
frames_by_width = {}
for df in df_list:
    frames_by_width.setdefault(df.shape[1], []).append(df)
split_df_list = [frames_by_width[w] for w in sorted(frames_by_width, reverse=True)]

for i in split_df_list:
    width = i[0].shape[1]
    column_sets = [set(df.columns) for df in i]

    # Columns present in every DataFrame of this bucket (kept in the first frame's order).
    common = set.intersection(*column_sets)
    common_columns = [l for l in i[0].columns if l in common]

    # Columns of the combined frame that are not shared by the whole bucket.
    missing_columns = [l for l in base_cols if l not in common]
    missing_lookup = set(missing_columns)

    # Columns seen in some (not all) frames of the bucket that are also absent from base_cols.
    extras_per_frame = [(cols - common) - missing_lookup for cols in column_sets]
    extra_columns = sorted(set(chain(*extras_per_frame)))
    frames_with_extra = sum(1 for extras in extras_per_frame if extras)

    print("Number of Common Columns in DataFrames with", width,"Columns:", len(common_columns))
    print(common_columns)
    print("Number of Missing Columns in DataFrames with", width,"Columns:", len(missing_columns))
    print(missing_columns)
    print("Number of Extra Columns in", frames_with_extra,"DataFrames with", width,"Columns:", len(extra_columns))
    print(extra_columns)
    print("="*500)

Number of Common Columns in DataFrames with 51 Columns: 51
['details.eventOwnerTeamId', 'gameType', 'details.scoringPlayerTotal', 'awayTeam.id', 'situationCode', 'details.goalieInNetId', 'details.blockingPlayerId', 'details.descKey', 'details.drawnByPlayerId', 'homeTeam.id', 'details.servedByPlayerId', 'awayTeam.abbrev', 'periodDescriptor.periodType', 'details.typeCode', 'details.homeScore', 'homeTeam.abbrev', 'period', 'details.committedByPlayerId', 'periodDescriptor.number', 'details.zoneCode', 'details.xCoord', 'periodDescriptor.otPeriods', 'timeRemaining', 'details.assist1PlayerTotal', 'timeInPeriod', 'details.hitteePlayerId', 'details.assist2PlayerId', 'details.shotType', 'details.shootingPlayerId', 'details.awayScore', 'gameDate', 'id', 'details.reason', 'details.homeSOG', 'typeDescKey', 'season', 'details.hittingPlayerId', 'details.scoringPlayerId', 'details.assist1PlayerId', 'details.assist2PlayerTotal', 'details.duration', 'typeCode', 'eventId', 'details.playerId', 'details.aw

# Cleaning Functions For PBP Data

In [104]:
def min_to_sec(time_str):
    """Convert a clock string formatted as MM:SS into a whole number of seconds."""
    mins, secs = (int(field) for field in time_str.split(':'))
    return secs + 60 * mins

In [110]:
def _clock_to_seconds(column):
    """Build a polars expression converting an "MM:SS" string column to whole seconds."""
    clock = pl.col(column)
    pattern = r"^(\d+):(\d+)$"
    minutes = clock.str.extract(pattern, 1).cast(pl.Int64)
    seconds = clock.str.extract(pattern, 2).cast(pl.Int64)
    return minutes * 60 + seconds


def _remap_values(column, mapping, fallback):
    """Build an expression replacing values of `column` per `mapping`; unmapped values take `fallback`."""
    remapped = fallback
    for old_value, new_value in mapping.items():
        remapped = pl.when(pl.col(column) == old_value).then(pl.lit(new_value)).otherwise(remapped)
    return remapped.alias(column)


def reconcile_api_data(data):
    """Reconcile column names, values, and derived time columns of the raw play-by-play frame.

    Renames API columns to the names expected by the SDV cleaning functions, drops
    shootout rows, recodes event and season types, and adds event/time helper columns.

    Parameters
    ----------
    data : pl.DataFrame
        Raw play-by-play frame with the API's original column names
        (e.g. 'id', 'typeDescKey', 'timeInPeriod', 'period').

    Returns
    -------
    pl.DataFrame
        A new frame with renamed columns plus 'event_team_type', 'period_seconds',
        'period_seconds_remaining', 'game_seconds' and 'game_seconds_remaining'.
    """
    # Length of a period in seconds, used for all derived clock columns.
    period_length = 1200

    # Create Dictionaries For Column Name/Value Rename
    # Column Names
    rename_dict = {
        "id": "game_id",
        "gameDate": "game_date",
        "awayTeam.id": "away_id",
        "awayTeam.abbrev": "away_abbreviation",
        "homeTeam.id": "home_id",
        "homeTeam.abbrev": "home_abbreviation",
        "gameType": "season_type",
        "eventId": "event_id",
        "typeDescKey": "event_type",
        "sortOrder": "event_idx",
        "periodDescriptor.periodType": "period_type",
        "details.eventOwnerTeamId": "event_team_id",
        "details.xCoord": "x",
        "details.yCoord": "y",
        "details.zoneCode": "event_zone",
        "details.shotType": "secondary_type",
        "details.awayScore": "away_score",
        "details.homeScore": "home_score",
    }

    # Event Type
    event_type_dict = {
        "faceoff": "FACEOFF",
        "shot-on-goal": "SHOT",
        "stoppage": "STOPPAGE",
        "hit": "HIT",
        "blocked-shot": "BLOCKED_SHOT",
        "missed-shot": "MISSED_SHOT",
        "giveaway": "GIVEAWAY",
        "takeaway": "TAKEAWAY",
        "penalty": "PENALTY",
        "goal": "GOAL",
        "period-start": "PERIOD_START",
        "period-end": "PERIOD_END",
        "delayed-penalty": "DELAYED_PENALTY",
        "game-end": "GAME_END",
        "shootout-complete": "SHOOTOUT_COMPLETE",
        "failed-shot-attempt": "FAILED_SHOT"
    }

    # Season Type
    season_type_dict = {
        2: "R",
        3: "P",
        4: "I"
    }

    # Rename Columns + Values AND Add Event Helpers
    # Only rename columns that are actually present so a missing optional column
    # does not abort the whole rename.
    present_renames = {old: new for old, new in rename_dict.items() if old in data.columns}
    data = data.rename(present_renames).filter(pl.col('period_type') != 'SO')

    data = data.with_columns([
        # Recode only the target column; unmapped event types keep their original value.
        _remap_values('event_type', event_type_dict, pl.col('event_type')),
        # Season type becomes a string column; unmapped codes keep their number as text.
        _remap_values('season_type', season_type_dict, pl.col('season_type').cast(pl.Utf8)),
        # Events with no owning team (event_team_id is null) stay null instead of 'away'.
        pl.when(pl.col('event_team_id') == pl.col('home_id')).then(pl.lit('home'))
        .when(pl.col('event_team_id') == pl.col('away_id')).then(pl.lit('away'))
        .otherwise(pl.lit(None).cast(pl.Utf8))
        .alias('event_team_type'),
    ])

    # Create Game and Period Seconds from timeInPeriod: 'period_seconds',
    # 'period_seconds_remaining', 'game_seconds', 'game_seconds_remaining'
    data = (
        data
        .with_columns(_clock_to_seconds('timeInPeriod').alias('period_seconds'))
        .with_columns([
            # NOTE(review): assumes every period is 1200 seconds long; the raw
            # 'timeRemaining' column may be more accurate for overtime — confirm.
            (period_length - pl.col('period_seconds')).alias('period_seconds_remaining'),
            (pl.col('period_seconds') + ((pl.col('period') - 1) * period_length)).alias('game_seconds'),
        ])
        # Time left in regulation (3 periods); goes negative once 'period' exceeds 3.
        .with_columns((3 * period_length - pl.col('game_seconds')).alias('game_seconds_remaining'))
    )

    # TODO: create x_fixed and y_fixed, i.e. coordinates relative to the event
    # team's attacking zone (so that x_abs is positive).

    return data


In [107]:
# Preview the first rows of the combined play-by-play frame
PBP_RAW.head()

id,season,gameDate,gameType,awayTeam.id,awayTeam.abbrev,homeTeam.id,homeTeam.abbrev,eventId,period,timeInPeriod,timeRemaining,typeCode,typeDescKey,sortOrder,periodDescriptor.number,periodDescriptor.periodType,situationCode,details.eventOwnerTeamId,details.losingPlayerId,details.winningPlayerId,details.xCoord,details.yCoord,details.zoneCode,details.hittingPlayerId,details.hitteePlayerId,details.shotType,details.shootingPlayerId,details.goalieInNetId,details.awaySOG,details.homeSOG,details.reason,details.typeCode,details.descKey,details.duration,details.committedByPlayerId,details.drawnByPlayerId,details.playerId,details.blockingPlayerId,details.scoringPlayerId,details.scoringPlayerTotal,details.assist1PlayerId,details.assist1PlayerTotal,details.assist2PlayerId,details.assist2PlayerTotal,details.awayScore,details.homeScore,details.secondaryReason,details.servedByPlayerId,homeTeamDefendingSide
i64,i64,str,i64,i64,str,i64,str,i64,i64,str,str,i64,str,i64,i64,str,str,f64,f64,f64,f64,f64,str,f64,f64,str,f64,f64,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,str
2012020001,20122013,"""2013-01-19""",2,5,"""PIT""",4,"""PHI""",51,1,"""00:00""","""20:00""",520,"""period-start""",5,1,"""REG""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2012020001,20122013,"""2013-01-19""",2,5,"""PIT""",4,"""PHI""",52,1,"""00:00""","""20:00""",502,"""faceoff""",6,1,"""REG""","""1551""",5.0,8473512.0,8471675.0,1.0,0.0,"""N""",,,,,,,,,,,,,,,,,,,,,,,,,,
2012020001,20122013,"""2013-01-19""",2,5,"""PIT""",4,"""PHI""",5,1,"""00:12""","""19:48""",503,"""hit""",7,1,"""REG""","""1551""",4.0,,,-13.0,33.0,"""N""",8474568.0,8470543.0,,,,,,,,,,,,,,,,,,,,,,,,
2012020001,20122013,"""2013-01-19""",2,5,"""PIT""",4,"""PHI""",53,1,"""00:29""","""19:31""",506,"""shot-on-goal""",8,1,"""REG""","""1551""",5.0,,,43.0,24.0,"""O""",,,"""snap""",8468498.0,8468524.0,1.0,0.0,,,,,,,,,,,,,,,,,,,
2012020001,20122013,"""2013-01-19""",2,5,"""PIT""",4,"""PHI""",6,1,"""00:29""","""19:31""",516,"""stoppage""",9,1,"""REG""","""1551""",,,,,,,,,,,,,,"""goalie-stopped…",,,,,,,,,,,,,,,,,,


In [111]:
# Run the reconciliation step. This rebinds PBP_RAW to the function's result, so the
# raw frame is no longer reachable under this name after the cell runs.
PBP_RAW = reconcile_api_data(PBP_RAW)

: 

In [None]:
# Look into situationCode, typeCode, typeDescKey (see if they can do anything for us on strength)

# Convert homeTeamDefendingSide and x + y to create x_abs and y_abs. (Use code from model creation for logic)

# Create event_player_1_XXX col from details.winningPlayerId, details.shootingPlayerId, details.playerId, details.hittingPlayerId

# Create event_player_2_XXX cols from details.goalieInNetId, details.losingPlayerId, details.blockingPlayerId, details.hitteePlayerId