In [1]:
import pandas as pd
from collections import defaultdict, Counter
from tqdm import tqdm
import random

In [2]:
pbp_file = "./hockeyR_data/play_by_play_2021_22_lite.csv"
pbp = pd.read_csv(pbp_file, encoding='latin-1')


  pbp = pd.read_csv(pbp_file, encoding='latin-1')


In [10]:
pbp_18_19 = pd.read_csv("./hockeyR_data/play_by_play_2017_18_lite.csv", encoding='latin-1')

  pbp_18_19 = pd.read_csv("./hockeyR_data/play_by_play_2017_18_lite.csv", encoding='latin-1')


In [3]:
pbp.columns.values

array(['xg', 'event_id', 'event_type', 'event', 'secondary_type',
       'event_team', 'event_team_type', 'description', 'period',
       'period_seconds', 'period_seconds_remaining', 'game_seconds',
       'game_seconds_remaining', 'home_score', 'away_score',
       'event_player_1_name', 'event_player_1_type',
       'event_player_2_name', 'event_player_2_type',
       'event_player_3_name', 'event_player_3_type', 'event_goalie_name',
       'strength_state', 'strength_code', 'strength', 'game_winning_goal',
       'empty_net', 'penalty_severity', 'penalty_minutes', 'event_idx',
       'num_on', 'players_on', 'num_off', 'players_off', 'extra_attacker',
       'x', 'y', 'x_fixed', 'y_fixed', 'shot_distance', 'shot_angle',
       'home_skaters', 'away_skaters', 'home_on_1', 'home_on_2',
       'home_on_3', 'home_on_4', 'home_on_5', 'home_on_6', 'home_on_7',
       'away_on_1', 'away_on_2', 'away_on_3', 'away_on_4', 'away_on_5',
       'away_on_6', 'away_on_7', 'home_goalie', 'away_goal

In [4]:
pbp.loc[:, 'event_type'].unique()

array(['GAME_SCHEDULED', 'FACEOFF', 'HIT', 'STOP', 'SHOT', 'TAKEAWAY',
       'BLOCKED_SHOT', 'MISSED_SHOT', 'GIVEAWAY', 'PERIOD_END', 'GOAL',
       'PENALTY', 'GAME_END', 'CHALLENGE', 'SHOOTOUT_COMPLETE',
       'EARLY_INT_START', 'EARLY_INT_END', 'EMERGENCY_GOALTENDER'],
      dtype=object)

In [5]:
ALL_TEAMS = list(pbp.loc[:, 'away_name'].unique())
ALL_TEAMS

['Pittsburgh Penguins',
 'Seattle Kraken',
 'Montréal Canadiens',
 'New York Rangers',
 'Chicago Blackhawks',
 'Vancouver Canucks',
 'Toronto Maple Leafs',
 'Tampa Bay Lightning',
 'Dallas Stars',
 'New York Islanders',
 'Arizona Coyotes',
 'Winnipeg Jets',
 'Vegas Golden Knights',
 'Minnesota Wild',
 'Ottawa Senators',
 'Carolina Hurricanes',
 'St. Louis Blues',
 'Calgary Flames',
 'Anaheim Ducks',
 'San Jose Sharks',
 'Columbus Blue Jackets',
 'Florida Panthers',
 'Colorado Avalanche',
 'Los Angeles Kings',
 'Boston Bruins',
 'Washington Capitals',
 'Edmonton Oilers',
 'Detroit Red Wings',
 'Buffalo Sabres',
 'Nashville Predators',
 'Philadelphia Flyers',
 'New Jersey Devils']

In [6]:
# this class is used for preprocessing batch penalties
# once the batch of penalties are resolved, the normal Penalty class should be used for the clock
class PrePenalty:
    def __init__(self, time_length = 120):
        self.time_length = time_length
        
# once we process batch penalties, use these objects to add to the penalty clock
class Penalty:
    def __init__(self, time_start, time_length = 120, severity = 'Minor', 
                 is_4v4 = False, remaining_majors = 0, remaining_minors = 0):
        self.time_start = time_start
        self.time_length = time_length
        self.severity = severity
        # if is_4v4 True, minor is of the 4v4 type. both teams down a man;
        # even though this is on the clock, goals don't expire them
        self.is_4v4 = is_4v4
        # used when a player has multiple penalties i.e. a double minor (2+2) or a 5+2
        self.remaining_majors = remaining_majors
        self.remaining_minors = remaining_minors
        
    def __str__(self):
        period = int(self.time_start / 1200) + 1
        time = self.time_start - (period - 1) * 1200
        return f'{self.severity} Penalty at {time} P{period}'
    
    def __repr__(self):
        period = int(self.time_start / 1200) + 1
        time = self.time_start - (period - 1) * 1200
        return f'{self.severity} Penalty at {time} P{period}'

In [7]:
# in certain situtations, the captain of an offending team can elect to play 2 men SH
#    for 2m or 1 man SH for 4m.
# in 5/6 scenarios the captain opted for a double minor. Only the Avs (vs ARI 1/14/22)
# elected to go down 2 men for 2m.

cache_4_PIMs = {
    ((240,), ()): 'DBL',
    ((120,120), ()): '2SH',
    ((540,), (300,)): 'DBL',
    ((240, 120), (120,)): 'CPT',
    ((240, 120, 120), (120, 120)): 'CPT',
    ((360, 300), (300, 120)): 'DBL',
    ((360,), (120,)): 'DBL',
    ((120, 120, 120), (120,)): 'DBL',
    ((420, 120), (300,)): '2SH',
    ((300, 120, 120), (300,)): '2SH',
    ((360, 120), (120, 120)): 'CPT',
    ((300, 240), (300,)): 'DBL'
}

def cache_4_min_PIMS(away_pens, home_pens, time, date):
    more_penalized, less_penalized = None, None
    away_more_penalized = sum(away_pens) > sum(home_pens)
    if away_more_penalized:
        more_penalized = away_pens
        less_penalized = home_pens
    else:
        more_penalized = home_pens
        less_penalized = away_pens
    result = cache_4_PIMs[(more_penalized, less_penalized)]
    assert result, f'Coincidental penalty batch not accounted for: {away_pens}, {home_pens}'
    
    if result == 'CPT':
        result = '2SH' if date == '2022-01-14' else 'DBL'
    
    if result == 'DBL':
        pen = Penalty(time, remaining_minors = 1)
        return ([pen], []) if away_more_penalized else ([], [pen])
    else:
        pen1 = Penalty(time)
        pen2 = Penalty(time)
        return ([pen1, pen2], []) if away_more_penalized else ([], [pen1, pen2])
        

In [8]:
# 7 minute penalty resolver

# in certain situtations, the captain of an offending team can elect to play 2 men SH
#    (1 for 2m, 1 for 5m), or play 1 man SH for 7m

cache_7_PIMs = {
    ((300, 120), ()): '2SH',
    ((540, 300), (420,)): 'CPT',
    ((420,), ()): '1SH',
}

def cache_7_min_PIMS(away_pens, home_pens, time):
    more_penalized, less_penalized = None, None
    away_more_penalized = sum(away_pens) > sum(home_pens)
    if away_more_penalized:
        more_penalized = away_pens
        less_penalized = home_pens
    else:
        more_penalized = home_pens
        less_penalized = away_pens
    result = cache_7_PIMs[(more_penalized, less_penalized)]
    assert result, f'Coincidental penalty batch not accounted for {away_pens}, {home_pens}'
    if result == 'CPT': # CHI @ NSH 12/1/18
        result = '1SH'
            
    if result == '1SH':
        pen = Penalty(time, 300, 'Major', remaining_minors = 1)
        return ([pen], []) if away_more_penalized else ([], [pen])
    else:
        minor = Penalty(time)
        major = Penalty(time, 300, 'Major')
        # since we call .pop() to add penalties to clock, we must ensure minors are 
        # on the clock first (Rule 26.3)
        return ([major, minor], []) if away_more_penalized else ([], [major, minor])
        

In [9]:
class Game:
    def __init__(self, away_team, home_team, date, season_type):
        self.away_team = away_team
        self.home_team = home_team
        self.date = date
        self.season_type = season_type # ['R', 'P'], Regular Season or Playoffs
        self.pre_penalties = {} # away/home -> {player_name -> [Penalty]}
        self.pre_penalties['away'] = defaultdict(list) 
        self.pre_penalties['home'] = defaultdict(list)
        self.current_time = 0
        self.period = None
        self.strength = (5,5)
        self.strength_to_time = defaultdict(int) # maps strength -> TOI in seconds (tuple -> int)
        # maps team -> {strength -> goals}
        self.strength_to_goals = defaultdict(int)
        self.strength_to_goals[self.away_team] = Counter()
        self.strength_to_goals[self.home_team] = Counter()
        self.strength_to_goals_against = defaultdict(int)
        self.strength_to_goals_against[self.away_team] = Counter()
        self.strength_to_goals_against[self.home_team] = Counter()
        # contains all penalties that are on the clock (affecting on-ice manpower)
        self.penalty_clock = {}
        self.penalty_clock['away'] = []
        self.penalty_clock['home'] = []

        self.away_penalty_clock = []
        self.home_penalty_clock = []

        # penalties that would force a team to have less than 3 skaters on the ice
        # after old ones expire, add queued penalties
        self.queued_penalties = {}
        self.queued_penalties['away'] = []
        self.queued_penalties['home'] = []

        # when penalties expire in OT, teams have an extra skater on the ice until a stoppage
        # i.e. a 4v3 PP ends, play continues as 4v4 until a stoppage
        self.away_OT_extra = 0
        self.home_OT_extra = 0
        # set after stoppages in OT to signify the PP advantage of the home/away team
        # for example if the away team starts OT on the PP, away_OT_PP = 1
        # when the penalty expires, we decrement this and increment both away_OT_extra and home_OT_extra
        self.away_OT_PP = 0
        self.home_OT_PP = 0
        self.regular_season_OT = False
        # for debugging purposes
        self.coincidentals = Counter()
        
    # returns the manpower of both teams.
    # since (regular season) overtime penalties work differently than regulation, we need to adjust
    #   the math for OT
    def get_strengths(self):
        if self.period == 4 and self.season_type == 'R': # regular season only
            away_strength = 3 + self.away_OT_extra + self.away_OT_PP
            home_strength = 3 + self.home_OT_extra + self.home_OT_PP

        else: # all other times should be default 5v5 play
            away_strength = 5 - len(self.away_penalty_clock)
            home_strength = 5 - len(self.home_penalty_clock)
        return (away_strength, home_strength)
    
    # takes the period and time of a play and returns the time elapsed since the start of the game
    def get_game_seconds(self, period, time):
        return (period - 1) * 1200 + time

    # bulky function that looks at both away/home penalty clocks and removes expired ones
    # if penalties expire and there are queued penalties, we should add them to the clock here
    def update_penalty_clock(self, period, period_seconds, debug_mode = False):
        # get time elapsed since last update
        old_time = self.current_time
        current_seconds = self.get_game_seconds(period, period_seconds)
        secs_elapsed = current_seconds - old_time
        
        # update current time
        self.current_time = current_seconds # period_seconds
        self.period = period
        
        old_strengths = self.get_strengths()

        # expired_penalties = defaultdict(list)
        # new_penalty_clock = defaultdict(list)
        # for _team in ['away', 'home']:
        #     pass
        
        # update away penalty clock
        away_expired_penalties = []
        new_away_penalty_clock = []
        for pen in self.away_penalty_clock:
            curr_expiration = pen.time_start + pen.time_length
            if curr_expiration > self.current_time:
                new_away_penalty_clock.append(pen)
            else: # if current penalty expired, see if any are remaining
                if pen.remaining_majors:
                    new_pen_start = min(pen.time_start + 300, self.current_time)
                    rem_majors = pen.remaining_majors - 1
                    new_pen = Penalty(new_pen_start, 500, 'Major', 
                                      remaining_majors = rem_majors, remaining_minors = pen.remaining_minors)
                    new_away_penalty_clock.append(new_pen)
                elif pen.remaining_minors:
                    new_pen_start = min(pen.time_start + 120, self.current_time)
                    rem_minors = pen.remaining_minors - 1
                    new_pen = Penalty(new_pen_start, remaining_minors = rem_minors)
                    new_away_penalty_clock.append(new_pen)
                else: # keep the expired penalties in case there are queued penalties and we can adjust time_start accordingly
                    away_expired_penalties.append(pen)
                    
        away_num_added = 0 # amount of penalties that were queued and have been added to the clock
        # if there is room for another penalty, we should check if there are queued penalties and add them if possible
        if len(new_away_penalty_clock) == 1: 
            if away_expired_penalties and self.queued_penalties['away']: # if one expired and one is on deck, add one
                first_expired = min(away_expired_penalties, key = lambda pen: pen.time_start + pen.time_length)
                new_pen = self.queued_penalties['away'].pop()
                new_pen.time_start = first_expired.time_start + first_expired.time_length
                new_away_penalty_clock.append(new_pen)
                away_num_added = 1
        elif len(new_away_penalty_clock) == 0:
            if away_expired_penalties and self.queued_penalties['away']:
                if len(self.queued_penalties['away']) == 2: # if there are 2 queued penalties, we know that 2 just expired
                    exp_pen_1 = away_expired_penalties[0]
                    new_pen_1 = self.queued_penalties['away'].pop()
                    new_pen_1.time_start = exp_pen_1.time_start + exp_pen_1.time_length
                    new_away_penalty_clock.append(new_pen_1)
                    
                    exp_pen_2 = away_expired_penalties[1]
                    new_pen_2 = self.queued_penalties['away'].pop()
                    new_pen_2.time_start = exp_pen_2.time_start + exp_pen_2.time_length
                    new_away_penalty_clock.append(new_pen_2)
                    away_num_added = 2
                elif len(self.queued_penalties['away']) == 1:
                    first_expired = min(away_expired_penalties, key = lambda pen: pen.time_start + pen.time_length)
                    new_pen = self.queued_penalties['away'].pop()
                    new_pen.time_start = first_expired.time_start + first_expired.time_length
                    new_away_penalty_clock.append(new_pen)
                    away_num_added = 1
        
        num_expired = len(self.away_penalty_clock) - len(new_away_penalty_clock)
        if num_expired and self.regular_season_OT:
            self.away_OT_extra += num_expired
            self.home_OT_extra += num_expired
            self.home_OT_PP -= num_expired
        
        self.away_penalty_clock = new_away_penalty_clock
        away_expired_penalties = away_expired_penalties[away_num_added:] # for the strength metrics
        
        # update home penalty clock
        home_expired_penalties = []
        new_home_penalty_clock = []
        for pen in self.home_penalty_clock:
            # curr_expiration = (pen.period - 1) * 1200 + pen.time_start + pen.time_length
            curr_expiration = pen.time_start + pen.time_length
            if curr_expiration > self.current_time:
                new_home_penalty_clock.append(pen)    
            else: # if current penalty expired, see if any are remaining or queue new penalties
                if pen.remaining_majors:
                    new_pen_start = min(pen.time_start + 300, self.current_time)
                    rem_majors = pen.remaining_majors - 1
                    new_pen = Penalty(new_pen_start, 500, 'Major', 
                                      remaining_majors = rem_majors, remaining_minors = pen.remaining_minors)
                    new_home_penalty_clock.append(new_pen)
                elif pen.remaining_minors:
                    new_pen_start = min(pen.time_start + 120, self.current_time)
                    rem_minors = pen.remaining_minors - 1
                    new_pen = Penalty(new_pen_start, remaining_minors = rem_minors)
                    new_home_penalty_clock.append(new_pen)
                else: # keep the expired penalties in case there are queued penalties and we can adjust time_start accordingly
                    home_expired_penalties.append(pen)
          
        home_num_added = 0 # number of queued penalties that were added to the clock
        # if there is room for another penalty, we should check if there are queued penalties and add them if possible
        if len(new_home_penalty_clock) == 1: 
            if home_expired_penalties and self.queued_penalties['home']: # if one expired and one is on deck, add one
                first_expired = min(home_expired_penalties, key = lambda pen: pen.time_start + pen.time_length)
                new_pen = self.queued_penalties['home'].pop()
                new_pen.time_start = first_expired.time_start + first_expired.time_length
                new_home_penalty_clock.append(new_pen)
                home_num_added = 1
        elif len(new_home_penalty_clock) == 0:
            if home_expired_penalties and self.queued_penalties['home']:
                if len(self.queued_penalties['home']) == 2: # if there are 2 queued penalties, we know that 2 just expired
                    exp_pen_1 = home_expired_penalties[0]
                    new_pen_1 = self.queued_penalties['home'].pop()
                    new_pen_1.time_start = exp_pen_1.time_start + exp_pen_1.time_length
                    new_home_penalty_clock.append(new_pen_1)
                    
                    exp_pen_2 = home_expired_penalties[1]
                    new_pen_2 = self.queued_penalties['home'].pop()
                    new_pen_2.time_start = exp_pen_2.time_start + exp_pen_2.time_length
                    new_home_penalty_clock.append(new_pen_2)
                    home_num_added = 2
                elif len(self.queued_penalties['home']) == 1:
                    first_expired = min(home_expired_penalties, key = lambda pen: pen.time_start + pen.time_length)
                    new_pen = self.queued_penalties['home'].pop()
                    new_pen.time_start = first_expired.time_start + first_expired.time_length
                    new_home_penalty_clock.append(new_pen)
                    home_num_added = 1
                    
        num_expired = len(self.home_penalty_clock) - len(new_home_penalty_clock)
        if num_expired and self.regular_season_OT:
            self.home_OT_extra += num_expired
            self.away_OT_extra += num_expired
            self.away_OT_PP -= num_expired
                    
        self.home_penalty_clock = new_home_penalty_clock
        home_expired_penalties = home_expired_penalties[home_num_added:]
        
        self.strength = self.get_strengths()
        
        if home_num_added or away_num_added:
            print(f'{self.away_team} @ {self.home_team} {self.date}')
        
        # exclude shootouts, which mess up the data
        if not (self.season_type == 'R' and self.period > 4): 
            self.update_strength_metrics(away_expired_penalties, home_expired_penalties, old_strengths, secs_elapsed, debug_mode)
            
        if debug_mode:
            print(f'Updating Penalty Clock at P:{period} {period_seconds}s')
            print(f'Away: {self.away_penalty_clock}')
            print(f'Home: {self.home_penalty_clock}')
            print(self.get_strengths())
            
    # update our strength tracking metrics by looking at expired penalties
    # if pens expired here, we know no goal was scored (unless major/match)
    def update_strength_metrics(self, away_expired_penalties, home_expired_penalties, old_strengths, secs_elapsed, debug_mode):
        away_expired = len(away_expired_penalties)
        home_expired = len(home_expired_penalties)
        if away_expired + home_expired == 1:
            if away_expired:
                pen = away_expired_penalties[0]
            else:
                pen = home_expired_penalties[0]
            old_strength_time = self.current_time - pen.time_length - pen.time_start
            self.strength_to_time[old_strengths] += secs_elapsed - old_strength_time
            self.strength_to_time[self.strength] += old_strength_time
        elif not(away_expired or home_expired):
            self.strength_to_time[old_strengths] += secs_elapsed
        else: # complex case
            # maps expiration_time -> <'away'/'home'> to indicate who the expiring penalty was on
            if len(away_expired_penalties) + len(home_expired_penalties) > 2:
                print(f'multiple expiring penalties: {self.away_team} @ {self.home_team} {self.date}')
            expired_pens = defaultdict(list) 
            expiration_times = set([]) # stores unique penalty expiration times
            
            for away_pen in away_expired_penalties:
                expiration = away_pen.time_start + away_pen.time_length
                expiration_times.add(expiration)
                expired_pens[expiration].append('away')
                
            for home_pen in home_expired_penalties:
                expiration = home_pen.time_start + home_pen.time_length
                expiration_times.add(expiration)
                expired_pens[expiration].append('home')
                
            # sort in ascending order of expiration
            expiration_times = sorted(list(expiration_times))
            
            new_strengths = old_strengths
            # keeps track of time in the past as we are retroactively inferring TOI for each manpower
            checkpoint = self.current_time - secs_elapsed
            for curr_exp_time in expiration_times:
                curr_expired_pens = expired_pens[curr_exp_time]
                time_diff = curr_exp_time - checkpoint # time that given manpower was on ice
                
                self.strength_to_time[new_strengths] += time_diff
                checkpoint += time_diff
                # for each penalty that expires at exactly this time, increment the manpower
                for team in curr_expired_pens:
                    if team == 'away':
                        new_strengths = (new_strengths[0] + 1, new_strengths[1])
                    else:
                        new_strengths = (new_strengths[0], new_strengths[1] + 1)
                        
            # don't forget to increment the current manpower TOI with the remaining time
            self.strength_to_time[new_strengths] += self.current_time - checkpoint
        if debug_mode:
            print(self.strength_to_time)
    
    # sets the current strength of the Game object
    def update_strength(self):
        self.strength = self.get_strengths()     
    
    # this helper function is called when a PPG is scored to remove the first (non-4v4) minor to expire
    def get_expiring_penalty(self, team_penalty_clock):
        first_expiration = float('inf')
        first_expiration_pen = None
        for i, pen in enumerate(team_penalty_clock):
            curr_expiration = pen.time_start + pen.time_length
            if pen.severity == 'Minor' and not pen.is_4v4 and curr_expiration < first_expiration:
                first_expiration_pen = pen
                first_expiration = curr_expiration
        return first_expiration_pen
    
    # - helper function called after a PPG
    # - we look at the SH team's penalty clock and remove the first non-coincidental minor set to expire
    # - additionally, if we remove a minor penalty and there are queued penalties, we should add them here until there
    #     are 2 penalties on the clock, or there are no queued penalties.
    def remove_minor(self, team):
        if team == self.away_team: 
            expiring_penalty = self.get_expiring_penalty(self.away_penalty_clock)
            if expiring_penalty:
                self.away_penalty_clock.remove(expiring_penalty)
                if expiring_penalty.remaining_minors > 0:
                    rem_minors = expiring_penalty.remaining_minors - 1
                    new_pen = Penalty(self.current_time, remaining_minors = rem_minors)
                    self.away_penalty_clock.append(new_pen)
                elif self.queued_penalties['away']: # check for queued penalties
                    new_pen = self.queued_penalties['away'].pop()
                    new_pen.time_start = self.current_time
                    self.away_penalty_clock.append(new_pen)
        else:
            expiring_penalty = self.get_expiring_penalty(self.home_penalty_clock)
            if expiring_penalty:
                self.home_penalty_clock.remove(expiring_penalty)
                if expiring_penalty.remaining_minors > 0:
                    rem_minors = expiring_penalty.remaining_minors - 1
                    new_pen = Penalty(self.current_time, remaining_minors = rem_minors)
                    self.home_penalty_clock.append(new_pen)
                elif self.queued_penalties['home']: # check for queued penalties
                    new_pen = self.queued_penalties['home'].pop()
                    new_pen.time_start = self.current_time
                    self.home_penalty_clock.append(new_pen)
            
    # - this function is called when a goal is scored
    # - we first call update_penalty_clock() to remove penalties that may have expired due to time
    # - we then consider the on-ice manpower and if the scoring team was on the PP, remove a minor 
    #     penalty from the other team if possible.
    def add_goal(self, play, debug_mode = False):
        period_goal_scored = play['period']
        time_goal_scored = play['period_seconds']
        
        # remove penalties that have expired
        self.update_penalty_clock(period_goal_scored, time_goal_scored, debug_mode)
        
        away_strength, home_strength = self.get_strengths()
        if debug_mode:
            print(f'GOAL!! {away_strength}v{home_strength}')
            
        # Rule 16.2: no penalty shall expire due to a penalty shot goal
        if play['secondary_type'] == 'Penalty Shot':
            return
        
        # add goals to the given strength in self.strength_to_goals
        self.update_goal_metric(play)
        
        if away_strength != home_strength: # no penalty shall expire at even strength
            # remove any applicable minor penalties on the clock, iff scoring team was on PP
            team_scored = play['event_team']
            if team_scored == self.away_team and away_strength > home_strength:
                self.remove_minor(self.home_team)
            elif team_scored == self.home_team and home_strength > away_strength:
                self.remove_minor(self.away_team)
            
        self.update_strength()
        
        if debug_mode:
            print('After Goal:')
            print(f'Away: {self.away_penalty_clock}')
            print(f'Home: {self.home_penalty_clock}')
    
    # - used for data scraping purposes
    # - after a goal is scored, we increment the respective teams' GF and GA at the given strength
    def update_goal_metric(self, play):
        scoring_team = play['event_team']
        away_strength, home_strength = self.get_strengths()
        if scoring_team == self.away_team:
            self.strength_to_goals[scoring_team][(away_strength, home_strength)] += 1
            self.strength_to_goals_against[self.home_team][(home_strength, away_strength)] += 1
        elif scoring_team == self.home_team:
            self.strength_to_goals[scoring_team][(home_strength, away_strength)] += 1
            self.strength_to_goals_against[self.away_team][(away_strength, home_strength)] += 1

    # - add a penalty to the current batch of penalties
    # - we use the PrePenalty class as these are being preprocessed
    # - we really only care about the possibility of 5+2s, 2+2s, etc; and the total PIMs
    def add_penalty(self, play):
        penalized_team = play['event_team']
        severity = play['penalty_severity']
            
        penalty = PrePenalty()
        if severity == 'Minor': # check PIMs if 2 or 4
            if play['penalty_minutes'] == 4: # double minor is 2 minors back to back, not a single 4 minute block
                penalty.time_length = 240
            elif play['penalty_minutes'] == 6: # triple minor
                penalty.time_length = 360
        elif severity == 'Major' or severity == 'Match': # 5 minutes
            penalty.time_length = 300

        penalized_player = play['event_player_1_name']
        if penalized_team == self.away_team:
            self.pre_penalties['away'][penalized_player].append(penalty)
        else:
            self.pre_penalties['home'][penalized_player].append(penalty)

    # - once the current batch of penalties is completed, determine the on ice strength
    #    using self.<away/home>_penalty_clock and try to resolve coincidental penalties
    # - once we match coincidental penalties, only add the leftover penalties on the clock
    # - if the penalty clock is full, add the rest of them to self.queued_<away/home>_penalties
    def process_penalties(self, play, debug_mode = False):
        # group penalties by player. if they have multiple, add the PIMs and we can infer them
        away_pens = self.get_team_pens(self.pre_penalties['away'])
        home_pens = self.get_team_pens(self.pre_penalties['home'])
        
        # clear expired penalties and keep self.current_time updated
        self.update_penalty_clock(play['period'], play['period_seconds'])
        
        away_PIMs = sum(away_pens)
        home_PIMs = sum(home_pens)
        PIMs_diff = away_PIMs - home_PIMs
        abs_PIMs_diff = abs(PIMs_diff)
        
        away_team_penalized = PIMs_diff > 0
        penalized_team = self.away_team if away_team_penalized else self.home_team
        
        # we keep temporary lists of the penalties to be added in case we need to queue one or more
        # i.e. if a team has 1 on the clock and takes 2 more penalties, we can only add 1 and need to queue the other
        away_temp_pens = []
        home_temp_pens = []
        
        # we can use some clever logic by taking the difference between the two teams' PIMs to deduce what
        # most penalty combinations will result. 
        # i.e. - 120 will always result in 1 team with a 2m minor
        #      - 180 will always result in 1 team with a 5m major and the other team with a 2m minor
        # note that a few PIMs_diff are ambiguous and that we need to resolve them, like 240 or 420
        match abs_PIMs_diff:
            case 120: # 2m PP
                pen = Penalty(self.current_time)
                away_temp_pens.append(pen) if away_team_penalized else home_temp_pens.append(pen)
            case 300: # 5m PP
                pen = Penalty(self.current_time, 300, 'Major')
                away_temp_pens.append(pen) if away_team_penalized else home_temp_pens.append(pen)
            case 0: # Full strength unless both teams have exactly 1 2m minor
                if away_pens == (120,) and home_pens == (120,) and self.get_strengths() == (5,5):
                    pen1 = Penalty(self.current_time, is_4v4 = True)
                    pen2 = Penalty(self.current_time, is_4v4 = True)
                    away_temp_pens.append(pen1)
                    home_temp_pens.append(pen2)
            case 240:
                away_temp_pens, home_temp_pens = cache_4_min_PIMS(away_pens, home_pens, self.current_time, play['game_date'])
            case 60: # 5 vs 2, 2
                print('PIM difference == 1')
                print('Assuming 1 major vs 2 minors')
                print(away_pens, home_pens)
                minor1 = Penalty(self.current_time)
                minor2 = Penalty(self.current_time)
                major = Penalty(self.current_time, 500, 'Major')
                if self.current_time >= 3300:
                    print(f'L5M/OT: {self.away_pens}, {self.home_pens}')
                    assert False, f"L5M/OT situation {self.away_team} @ {self.home_team} {self.date}"
                else:
                    if away_team_penalized: # assign the major to the away team
                        away_temp_pens.append(major)
                        home_temp_pens = [minor1, minor2]
                    else: # assign the major to the home team
                        away_temp_pens = [minor1, minor2]
                        home_temp_pens.append(major)
            case 180: # 5 vs 2
                # rule 19.4 for last 5 mins and overtime
                if self.current_time >= 3300:
                    print(f'L5M/OT: {away_pens}, {home_pens}')
                    print(f'{self.away_team} @ {self.home_team} {self.date}')
                    major = Penalty(self.current_time, 180, 'Major')
                    away_temp_pens.append(major) if away_team_penalized else home_temp_pens.append(major)
                else:
                    major = Penalty(self.current_time, 300, 'Major')
                    minor = Penalty(self.current_time)
                    if away_team_penalized: # assign the major to the away team
                        away_temp_pens.append(major)
                        home_temp_pens.append(minor)
                    else: # assign the major to the home team
                        away_temp_pens.append(minor)
                        home_temp_pens.append(major)
            case 360: # 2 + 2 + 2
                print('PIM difference == 6')
                print('Assuming a 2+2+2')
                print(f'{self.away_team} @ {self.home_team} {self.date}')
                print(away_pens, home_pens)
                pen = Penalty(self.current_time, remaining_minors = 2)
                away_temp_pens.append(pen) if away_team_penalized else home_temp_pens.append(pen)
            case 420: # 5 + 2
                print('PIM difference == 7')
                print(f'{self.away_team} @ {self.home_team} {self.date}')
                print(away_pens, home_pens)
                away_temp_pens, home_temp_pens = cache_7_min_PIMS(away_pens, home_pens, self.current_time)
            case 540: # 5 + 2 + 2
                print('PIM difference == 9')
                print('Assuming a 5+2+2')
                print(f'{self.away_team} @ {self.home_team} {self.date}')
                print(away_pens, home_pens)
                pen = Penalty(self.current_time, 300, 'Major', remaining_minors = 2)
                away_temp_pens.append(pen) if away_team_penalized else home_temp_pens.append(pen)
            case 600: # 5 + 5, 5 // 5
                print('PIM difference == 9')
                print('Assuming a 5+5')
                print(f'{self.away_team} @ {self.home_team} {self.date}')
                print(away_pens, home_pens)
                pen = Penalty(self.current_time, 300, 'Major', remaining_majors = 1)
                away_temp_pens.append(pen) if away_team_penalized else home_temp_pens.append(pen)
            case default: # disregard for now
                print(abs_PIMs_diff)
                print('Default case for PIM matching')
                print(f'{self.away_team} @ {self.home_team} {self.date}')
                print(away_pens, home_pens)
                assert False, "Current batch of coincidental penalties not accounted for: {away_pens}, {home_pens}"
                
        self.add_penalties_to_clock(away_temp_pens, home_temp_pens)
        
        if self.regular_season_OT:
            self.reset_OT_strengths()
            
        self.update_strength()

        # reset pre-processed penalties
        self.pre_penalties['away'] = defaultdict(list)
        self.pre_penalties['home'] = defaultdict(list)
                
        if debug_mode:
            print('Processed penalties. Resulting Penalty Clock: ')
            print(f'PIMs_diff: {PIMs_diff}')
            print(f'Away: {self.away_penalty_clock}')
            print(f'Home: {self.home_penalty_clock}')
        
    # - helper function that takes all penalties drawn and attempts to add them to the penalty clock
    # - the penalty clock cannot have more than 2 penalties, so we add the rest to the queue 
    # - the queued penalties will begin once earlier ones expire
    def add_penalties_to_clock(self, temp_away_pens, temp_home_pens):
        # see if we can add these penalties to the clock, else, queue them
        if len(self.away_penalty_clock) == 2: # penalty clock full, queue the current batch
            self.queued_penalties['away'] += temp_away_pens
        elif len(self.away_penalty_clock) == 1: # can add 1 penalty to the clock
            if temp_away_pens:
                pen = temp_away_pens.pop()
                self.away_penalty_clock.append(pen)
                self.queued_penalties['away'] += temp_away_pens
        elif len(self.away_penalty_clock) == 0: # penalty clock empty, add up to 2 from the batch
            if len(temp_away_pens) > 2:
                pen1 = temp_away_pens.pop()
                pen2 = temp_away_pens.pop()
                self.away_penalty_clock = [pen1, pen2]
                self.queued_penalties['away'] += temp_away_pens
            else:
                self.away_penalty_clock = temp_away_pens
        else:
            assert "Away Penalty Clock is misconfigured"
            
        # add home_penalties now
        if len(self.home_penalty_clock) == 2:
            self.queued_penalties['home'] += temp_home_pens
        elif len(self.home_penalty_clock) == 1:
            if temp_home_pens:
                pen = temp_home_pens.pop()
                self.home_penalty_clock.append(pen)
                self.queued_penalties['home'] += temp_home_pens
        elif len(self.home_penalty_clock) == 0:
            if len(temp_home_pens) > 2:
                pen1 = temp_home_pens.pop()
                pen2 = temp_home_pens.pop()
                self.home_penalty_clock = [pen1, pen2]
                self.queued_penalties['home'] += temp_home_pens
            else:
                self.home_penalty_clock = temp_home_pens
        else:
            assert "Home Penalty Clock is misconfigured"
    
    # team_penalties: player -> [Penalty]
    # - groups total PIMs per player to account for double minors, 5+2s, etc
    # retval: tuple of each players total PIMs
    def get_team_pens(self, team_penalties):
        team_pens = ()
        for player, pens in team_penalties.items():
            curr_PIMs = sum(pen.time_length for pen in pens)
            team_pens = (*team_pens, curr_PIMs)
        return tuple(sorted(list(team_pens), reverse = True))
    
    # clear 4v4 penalties
    # since regular season OT has different rules with penalties on 3v3, we need a different Game state.
    def start_OT(self):
        self.regular_season_OT = True
        self.away_penalty_clock = [pen for pen in self.away_penalty_clock if not pen.is_4v4]
        assert len(self.queued_penalties['away']) == 0, f"There are queued penalties heading to OT {self.away_team} v {self.home_team} {self.date}"
        self.home_penalty_clock = [pen for pen in self.home_penalty_clock if not pen.is_4v4]
        assert len(self.queued_penalties['home']) == 0, f"There are queued penalties heading to OT {self.away_team} v {self.home_team} {self.date}"
        self.reset_OT_strengths()
        
    # after any stoppage in regular season OT, reset back to the default 3v3 or set PP fields if there is a PP
    def reset_OT_strengths(self):
        self.away_OT_extra = 0
        self.home_OT_extra = 0
        self.away_OT_PP = 0
        self.home_OT_PP = 0
        OT_PP = len(self.away_penalty_clock) - len(self.home_penalty_clock)
        if OT_PP > 0:
            self.home_OT_PP = OT_PP
        elif OT_PP < 0:
            self.away_OT_PP = -OT_PP
    
    # debugging function
    # gets every unique combination of penalties called at a single stoppage
    def increment_coincidentals_debug(self):
        away_pens = self.get_team_pens(self.pre_penalties['away'])
        home_pens = self.get_team_pens(self.pre_penalties['home'])

        self.coincidentals[(away_pens, home_pens)] += 1
        self.coincidentals[(home_pens, away_pens)] += 1
        self.pre_penalties['away'] = defaultdict(list)
        self.pre_penalties['home'] = defaultdict(list)
        
    # designed to be used with the coincidental penalty debugging function
    def add_penalty_debug(self, play):
        penalized_team = play['event_team']
        severity = play['penalty_severity']
        period = play['period']
        time = play['period_seconds']
        self.current_time = time
        self.period = period

        penalty = PrePenalty()
        if severity == 'Minor': # check PIMs if 2 or 4
            PIMs = play['penalty_minutes']
            if PIMs == 4: # double minor is 2 minors back to back, not a single 4 minute block
                penalty.time_length = 240
            elif PIMs == 6: 
                penalty.time_length = 360
                penalty.severity = 'Triple Minor'
        elif severity == 'Major' or severity == 'Match': # 5 minutes
            penalty.time_length = 300

        penalized_player = play['event_player_1_name']
        if penalized_team == self.away_team:
            self.pre_penalties['away'][penalized_player].append(penalty)
        else:
            self.pre_penalties['home'][penalized_player].append(penalty)

In [38]:
def scrape_pbp(playoffs = False):
    overall_strength_to_TOI = {}
    overall_strength_to_goals = {}
    overall_strength_to_goals_against = {}

    for year in range(16, 22):
        season = int(f'20{year}')
        overall_strength_to_TOI[season] = {}
        overall_strength_to_goals[season] = {}
        overall_strength_to_goals_against[season] = {}
        for _team in ALL_TEAMS:
            overall_strength_to_TOI[season][_team] = Counter()
            overall_strength_to_goals[season][_team] = Counter()
            overall_strength_to_goals_against[season][_team] = Counter()
    
    current_game = None
    penalty_just_called = False
    debug = False

    # these are irrelevant to on-ice manpower and PP goals (misconducts usually paired with a minor/major)
    irrelevant_penalties = set(['Penalty Shot', 'Misconduct', 'Game Misconduct'])
    for year in range(16, 22):
        file = f'./play_by_play_20{year}_{year+1}_lite.csv'
        curr_pbp = pd.read_csv(file, encoding='latin-1')
        for _, play in tqdm(curr_pbp.iterrows()):
            if playoffs and play['season_type'] == 'R':
                continue
            else:
                event = play['event_type']

                away_team, home_team, date = play['away_name'], play['home_name'], play['game_date']
                if current_game: # sometimes, games don't have the 'GAME_SCHEDULED' event, need to manually check
                    if away_team != current_game.away_team or home_team != current_game.home_team or date != current_game.date:
                        overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against = update_team_statistics(overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against, current_game.strength_to_time, current_game.strength_to_goals, current_game.strength_to_goals_against, int(f'20{year}'), current_game.away_team, False)
                        overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against = update_team_statistics(overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against, current_game.strength_to_time, current_game.strength_to_goals, current_game.strength_to_goals_against, int(f'20{year}'), current_game.home_team, True)
                        current_game = Game(play['away_name'], play['home_name'], play['game_date'], play['season_type'])

                if penalty_just_called and event != 'PENALTY': # once all penalties are announced, process them and figure out the manpower
                    current_game.process_penalties(play, debug)
                    penalty_just_called = False

                if play['period'] == 4 and current_game.season_type == 'R':
                    if not current_game.regular_season_OT:
                        current_game.start_OT()
                    elif play['event_type'] in set(['STOP', 'CHALLENGE', 'PENALTY']):
                        current_game.reset_OT_strengths()

                match event:
                    case 'GAME_SCHEDULED':
                        if current_game:
                            overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against = update_team_statistics(overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against, current_game.strength_to_time, current_game.strength_to_goals, current_game.strength_to_goals_against, int(f'20{year}'), current_game.away_team, False)
                            overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against = update_team_statistics(overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against, current_game.strength_to_time, current_game.strength_to_goals, current_game.strength_to_goals_against, int(f'20{year}'), current_game.home_team, True)
                        current_game = Game(play['away_name'], play['home_name'], play['game_date'], play['season_type'])
                    case 'PENALTY':
                        if play['penalty_severity'] not in irrelevant_penalties:  # minor/major/misconduct/game misconduct/match
                            current_game.add_penalty(play)
                            penalty_just_called = True
                    case 'GOAL': # exclude penalty shots and shootout goals
                        if play['secondary_type'] != 'Penalty Shot' and play['period'] != 5: # exclude SO
                            current_game.add_goal(play, debug)
                    case default:
                        current_game.update_penalty_clock(play['period'], play['period_seconds'], debug)


        print('Done')
    return overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against

In [47]:
def scrape_pbp_22(playoffs = False):
    overall_strength_to_TOI = {}
    overall_strength_to_goals = {}
    overall_strength_to_goals_against = {}

    for year in range(22, 23):
        season = int(f'20{year}')
        overall_strength_to_TOI[season] = {}
        overall_strength_to_goals[season] = {}
        overall_strength_to_goals_against[season] = {}
        for _team in ALL_TEAMS:
            overall_strength_to_TOI[season][_team] = Counter()
            overall_strength_to_goals[season][_team] = Counter()
            overall_strength_to_goals_against[season][_team] = Counter()
    
    current_game = None
    penalty_just_called = False
    debug = False

    # these are irrelevant to on-ice manpower and PP goals (misconducts usually paired with a minor/major)
    irrelevant_penalties = set(['Penalty Shot', 'Misconduct', 'Game Misconduct'])
    for year in range(22, 23):
        file = f'./play_by_play_20{year}_{year+1}_lite.csv'
        curr_pbp = pd.read_csv(file)
        for _, play in tqdm(curr_pbp.iterrows()):
            if playoffs and play['season_type'] == 'R':
                continue
            else:
                event = play['event_type']

                away_team, home_team, date = play['away_name'], play['home_name'], play['game_date']
                if current_game: # sometimes, games don't have the 'GAME_SCHEDULED' event, need to manually check
                    if away_team != current_game.away_team or home_team != current_game.home_team or date != current_game.date:
                        overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against = update_team_statistics(overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against, current_game.strength_to_time, current_game.strength_to_goals, current_game.strength_to_goals_against, int(f'20{year}'), current_game.away_team, False)
                        overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against = update_team_statistics(overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against, current_game.strength_to_time, current_game.strength_to_goals, current_game.strength_to_goals_against, int(f'20{year}'), current_game.home_team, True)
                        current_game = Game(play['away_name'], play['home_name'], play['game_date'], play['season_type'])

                if penalty_just_called and event != 'PENALTY': # once all penalties are announced, process them and figure out the manpower
                    current_game.process_penalties(play, debug)
                    penalty_just_called = False

                if play['period'] == 4 and current_game.season_type == 'R':
                    if not current_game.regular_season_OT:
                        current_game.start_OT()
                    elif play['event_type'] in set(['STOP', 'CHALLENGE', 'PENALTY']):
                        current_game.reset_OT_strengths()

                match event:
                    case 'GAME_SCHEDULED':
                        if current_game:
                            overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against = update_team_statistics(overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against, current_game.strength_to_time, current_game.strength_to_goals, current_game.strength_to_goals_against, int(f'20{year}'), current_game.away_team, False)
                            overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against = update_team_statistics(overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against, current_game.strength_to_time, current_game.strength_to_goals, current_game.strength_to_goals_against, int(f'20{year}'), current_game.home_team, True)
                        current_game = Game(play['away_name'], play['home_name'], play['game_date'], play['season_type'])
                    case 'PENALTY':
                        if play['penalty_severity'] not in irrelevant_penalties:  # minor/major/misconduct/game misconduct/match
                            current_game.add_penalty(play)
                            penalty_just_called = True
                    case 'GOAL': # exclude penalty shots and shootout goals
                        if play['secondary_type'] != 'Penalty Shot' and play['period'] != 5: # exclude SO
                            current_game.add_goal(play, debug)
                    case default:
                        current_game.update_penalty_clock(play['period'], play['period_seconds'], debug)


        print('Done')
    return overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against

In [39]:
def update_team_statistics(overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against, strength_to_time, strength_to_goals, strength_to_goals_against, year, team, is_home):
    if team in ['Canadian All-Stars', 'American All-Stars']:
        return overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against
    else:
        if is_home:       
            for _strength, _time in strength_to_time.items():
                home_strength = (_strength[1], _strength[0])
                overall_strength_to_TOI[year][team][home_strength] += _time
            overall_strength_to_goals[year][team] += strength_to_goals[team]
            overall_strength_to_goals_against[year][team] += strength_to_goals_against[team]
        else:
            overall_strength_to_TOI[year][team] += strength_to_time
            overall_strength_to_goals[year][team] += strength_to_goals[team]
            overall_strength_to_goals_against[year][team] += strength_to_goals_against[team]
        return overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against

    
    

In [50]:
def get_2022_data():
    # 2022-23 season
    overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against = scrape_pbp_22()

    strengths = [(5,5), (5,4), (4,5), (4,4), (3,3), (4,3), (3,4), (5,3), (3,5)]
    columns = ['season', 'team', 'strength', 'TOI', 'goals', 'goals_against']
    data = []
    for year in range(2022, 2023):
        teams = overall_strength_to_TOI[year].keys()
        for _team in teams:
            curr_row = [year, _team]
            for _strength in strengths:
                curr_toi = overall_strength_to_TOI[year][_team][_strength]
                curr_goals = overall_strength_to_goals[year][_team][_strength]
                curr_goals_against = overall_strength_to_goals_against[year][_team][_strength]
                new_row = curr_row + [_strength, curr_toi, curr_goals, curr_goals_against]
                data.append(new_row)

    df = pd.DataFrame(data, columns=columns)
    df.to_csv("./nhl_strength_data_22.csv")

get_2022_data()


  curr_pbp = pd.read_csv(file)
48323it [00:04, 14602.71it/s]

3 on 5 goal: Ottawa Senators @ Tampa Bay Lightning 2022-11-01


60098it [00:05, 13625.99it/s]

PIM difference == 1
Assuming 1 major vs 2 minors
(120, 120) (300,)


74785it [00:06, 14086.71it/s]

multiple expiring penalties: Pittsburgh Penguins @ Montréal Canadiens 2022-11-12


176992it [00:13, 14475.03it/s]

Carolina Hurricanes @ New Jersey Devils 2023-01-01


222115it [00:16, 14507.28it/s]

multiple expiring penalties: Florida Panthers @ Montréal Canadiens 2023-01-19


310585it [00:22, 13806.71it/s]


Done


In [37]:
overall_strength_to_TOI, overall_strength_to_goals, overall_strength_to_goals_against = scrape_pbp(playoffs=True)

strengths = [(5,5), (5,4), (4,5), (4,4), (3,3), (4,3), (3,4), (5,3), (3,5)]
columns = ['season', 'team', 'strength', 'TOI', 'goals', 'goals_against']
data = []
for year in range(2016, 2022):
    teams = overall_strength_to_TOI[year].keys()
    for _team in teams:
        curr_row = [year, _team]
        for _strength in strengths:
            curr_toi = overall_strength_to_TOI[year][_team][_strength]
            curr_goals = overall_strength_to_goals[year][_team][_strength]
            curr_goals_against = overall_strength_to_goals_against[year][_team][_strength]
            new_row = curr_row + [_strength, curr_toi, curr_goals, curr_goals_against]
            data.append(new_row)
            
df = pd.DataFrame(data, columns=columns)
df.to_csv("./nhl_playoff_strength_data.csv", encoding="latin-1")

  curr_pbp = pd.read_csv(file, encoding='latin-1')
410520it [00:20, 20236.49it/s]


Done


  curr_pbp = pd.read_csv(file, encoding='latin-1')
425270it [00:22, 12876.53it/s]

PIM difference == 7
San Jose Sharks @ Vegas Golden Knights 2018-04-26
(300, 120) ()


429082it [00:22, 12207.45it/s]

multiple expiring penalties: Tampa Bay Lightning @ Washington Capitals 2018-05-15


431856it [00:22, 19047.82it/s]


Done


  curr_pbp = pd.read_csv(file, encoding='latin-1')
430380it [00:22, 19201.67it/s]


multiple expiring penalties: Boston Bruins @ St. Louis Blues 2019-06-01
Done


  curr_pbp = pd.read_csv(file, encoding='latin-1')
377844it [00:20, 18186.01it/s]


Done


  curr_pbp = pd.read_csv(file, encoding='latin-1')
276927it [00:13, 14151.19it/s]

PIM difference == 9
Assuming a 5+2+2
Vegas Golden Knights @ Colorado Avalanche 2021-05-30
(540,) ()


284536it [00:14, 19661.04it/s]


Done


  curr_pbp = pd.read_csv(file, encoding='latin-1')
431914it [00:21, 19715.08it/s]


Done


In [12]:
# - this function has a similar body scrape_pbp() but is used for debugging
# - specficially, when testing new or edited features, you can uncomment the first 2 commented lines to find the 
#   game number (i)
# - once you have the game number, insert this into the next commented 2 lines and the scraper will enter debug mode,
#   printing the game state on a play-by-play basis

# Example: you know that BOS @ CBJ has a triple minor and want to ensure the system is handling this properly
# 1. add the teams to the first if statement (say this returns i=238)
# 2. set game_num to 238 and uncomment the 2nd pair of lines
# 3. the function will print out the relevant features on a pbp basis, which can be cross 
#    referenced (ESPN pbp) and verified
def scrape_pbp_debug():
    team_to_power_plays = {}
    current_game = None
    penalty_just_called = False

    # these are irrelevant to on-ice manpower and PP goals (misconducts usually paired with a minor/major)
    irrelevant_penalties = set(['Penalty Shot', 'Misconduct', 'Game Misconduct'])
    
    # debugging variables
    i = 0
    game_num = 118
    debug = False

    for _, play in tqdm(pbp_18_19.iterrows()):
        event = play['event_type']

        away_team, home_team, date = play['away_name'], play['home_name'], play['game_date']
        if current_game:
            if away_team != current_game.away_team or home_team != current_game.home_team or date != current_game.date:
                current_game = Game(play['away_name'], play['home_name'], play['game_date'], play['season_type'])

        if penalty_just_called and event != 'PENALTY': # once all penalties are announced, process them and figure out the manpower
            current_game.process_penalties(play, debug)
            penalty_just_called = False

        if play['period'] == 4 and current_game.season_type == 'R':
            if not current_game.regular_season_OT:
                current_game.start_OT()
            elif play['event_type'] in set(['STOP', 'CHALLENGE', 'PENALTY']):
                current_game.reset_OT_strengths()

        match event:
            case 'GAME_SCHEDULED':
                current_game = Game(play['away_name'], play['home_name'], play['game_date'], play['season_type'])
#                 if current_game.away_team == 'Columbus Blue Jackets' and current_game.home_team == 'Boston Bruins':
#                     print(f'bos @ cbj i={i}')
                if current_game.away_team == 'Chicago Blackhawks' and current_game.home_team == 'Arizona Coyotes':
                    print(f'chi @ ari i={i}')
#                 if i == game_num: # debugging
#                     print(f'New Game: {current_game.away_team} @ {current_game.home_team}')
            case 'PENALTY':
                if play['penalty_severity'] not in irrelevant_penalties:  # minor/major/misconduct/game misconduct/match
                    current_game.add_penalty(play)
                    penalty_just_called = True
            case 'GOAL': # exclude penalty shots and shootout goals
                if play['secondary_type'] != 'Penalty Shot' and play['period'] != 5: # exclude SO
                    current_game.add_goal(play, debug)
            case 'GAME_END':
                # debugging 
                i += 1
                # if i > game_num - 1:
                #     debug = True
                # if i > game_num:
                #     break # debugging
            case default:
                current_game.update_penalty_clock(play['period'], play['period_seconds'], debug)

    print('Done')
    
scrape_pbp_debug()

26156it [00:03, 12445.72it/s]

multiple expiring penalties: Calgary Flames @ Vancouver Canucks 2017-10-14


39219it [00:05, 13209.92it/s]

chi @ ari i=118
Chicago Blackhawks @ Arizona Coyotes 2017-10-21


133598it [00:12, 12714.40it/s]

L5M/OT: (300,), (120,)
San Jose Sharks @ Washington Capitals 2017-12-04


249468it [00:21, 13738.71it/s]

PIM difference == 9
Assuming a 5+5
Florida Panthers @ Buffalo Sabres 2018-02-01
(300,) (600, 300)


276283it [00:23, 13489.70it/s]

chi @ ari i=861


296373it [00:24, 13226.52it/s]

Dallas Stars @ Anaheim Ducks 2018-02-21


362483it [00:30, 13219.27it/s]

multiple expiring penalties: New York Rangers @ Philadelphia Flyers 2018-03-22


425656it [00:34, 13140.60it/s]

PIM difference == 7
San Jose Sharks @ Vegas Golden Knights 2018-04-26
(300, 120) ()


429606it [00:35, 12413.28it/s]

multiple expiring penalties: Tampa Bay Lightning @ Washington Capitals 2018-05-15


431856it [00:35, 12229.96it/s]

Done





In [22]:
# - this function was used to scrape most pbp years and gather the unique combinations of penalties called
#   at a single stoppage.
# - after gathering and printing the combinations, we use the rulebook to determine what the resulting
#   penalties should be after cancelling coincidental penalties.
# - Note: there are ambiguous situations where the penalized team's captain has the choice to, for example,
#   go SH 2 men for 2 minutes or SH 1 man for 4 minutes.
def get_coincidental_penalties(first_year = 16, last_year = 22):
    current_game = None
    penalty_just_called = False
    coincidental_penalty_cache = defaultdict(set)
    coincidentals = Counter()

    # these are irrelevant to on-ice manpower and PP goals (misconducts usually paired with a minor/major)
    irrelevant_penalties = set(['Penalty Shot', 'Misconduct', 'Game Misconduct'])

    for year in range(first_year, last_year + 1):
        pbp_test = pd.read_csv(f"./hockeyR_data/play_by_play_20{year}_{year+1}_lite.csv", encoding='latin-1')
        for _, play in tqdm(pbp_test.iterrows()):
            event = play['event_type']

            away_team, home_team, date = play['away_name'], play['home_name'], play['game_date']
            if current_game:
                if away_team != current_game.away_team or home_team != current_game.home_team or date != current_game.date:
                    current_game = Game(play['away_name'], play['home_name'], play['game_date'], play['season_type'])

            match event:
                case 'GAME_SCHEDULED':
                    current_game = Game(play['away_name'], play['home_name'], play['game_date'], play['season_type'])
                case 'PENALTY':
                    if play['penalty_severity'] not in irrelevant_penalties:  # minor/major/misconduct/game misconduct/match
                        current_game.add_penalty_debug(play)
                        penalty_just_called = True
                case 'GAME_END':
                    coincidentals += current_game.coincidentals

            if penalty_just_called and event != 'PENALTY': # once all penalties are announced, process them and figure out the manpower
                current_game.increment_coincidentals_debug()
                penalty_just_called = False

        for k, _ in coincidentals.items():
            t1, t2 = k
            abs_PIM_diff = abs(sum(t1) - sum(t2))
            coincidental_penalty_cache[abs_PIM_diff].add((t1, t2))

        coincidentals = Counter()
    return coincidental_penalty_cache

In [24]:
cache = get_coincidental_penalties(16, 19)
print(cache.keys())
for k, v in cache.items():
    if k != 0:
        print(f'PIM difference == {k}')
        for combo in v:
            print(v)

  pbp_test = pd.read_csv(f"./hockeyR_data/play_by_play_20{year}_{year+1}_lite.csv", encoding='latin-1')
410520it [00:23, 17106.01it/s]
  pbp_test = pd.read_csv(f"./hockeyR_data/play_by_play_20{year}_{year+1}_lite.csv", encoding='latin-1')
431856it [00:24, 17607.49it/s]
  pbp_test = pd.read_csv(f"./hockeyR_data/play_by_play_20{year}_{year+1}_lite.csv", encoding='latin-1')
430380it [00:24, 17387.85it/s]
  pbp_test = pd.read_csv(f"./hockeyR_data/play_by_play_20{year}_{year+1}_lite.csv", encoding='latin-1')
377844it [00:22, 16999.40it/s]


dict_keys([0, 120, 300, 240, 180, 600, 420, 360])
PIM difference == 120
{((240, 120, 120, 120, 120), (120, 120, 120, 120, 120)), ((420, 300), (300, 300)), ((660, 300), (420, 300, 120)), ((300, 120, 120), (300, 240, 120)), ((300, 300), (420, 300)), ((420, 300, 120), (660, 300)), ((360,), (240,)), ((360, 300, 120), (300, 240, 120)), ((420, 300, 120), (540, 300, 120)), ((420,), (420, 120)), ((240,), (240, 120)), ((), (120,)), ((120,), (240,)), ((120, 120), (120,)), ((420, 360), (420, 240)), ((240,), (360,)), ((120, 120, 120, 120, 120), (240, 120, 120, 120, 120)), ((300, 120), (540,)), ((420, 120), (420,)), ((420, 120), (300, 120)), ((300, 240, 120), (360, 300, 120)), ((420,), (300,)), ((120,), ()), ((300, 120), (420, 120)), ((240, 120), (120, 120)), ((300, 240, 120), (300, 120, 120)), ((300, 300, 120), (300, 300)), ((120, 120), (120, 120, 120)), ((120, 120, 120), (120, 120)), ((300, 120), (300,)), ((300, 120), (300, 120, 120)), ((300, 240), (300, 120)), ((300, 300), (300, 300, 120)), ((24

In [27]:
pbp = pd.read_csv("./hockeyR_data/play_by_play_2021_22_lite.csv", encoding="latin-1")
pbp.loc[pbp['away_abbreviation'] == 'MTL']

  pbp = pd.read_csv("./hockeyR_data/play_by_play_2021_22_lite.csv", encoding="latin-1")


Unnamed: 0,xg,event_id,event_type,event,secondary_type,event_team,event_team_type,description,period,period_seconds,...,home_division_name,home_division_name_short,home_conference_name,home_id,away_name,away_abbreviation,away_division_name,away_division_name_short,away_conference_name,away_id
626,,2.021020e+13,GAME_SCHEDULED,Game Scheduled,,,,Game Scheduled,1,0,...,Atlantic,ATL,Eastern,10,Montréal Canadiens,MTL,Atlantic,ATL,Eastern,8
627,,2.021020e+13,FACEOFF,Faceoff,,Montréal Canadiens,away,Christian Dvorak faceoff won against John Tavares,1,0,...,Atlantic,ATL,Eastern,10,Montréal Canadiens,MTL,Atlantic,ATL,Eastern,8
628,,2.021020e+13,HIT,Hit,,Toronto Maple Leafs,home,Nick Ritchie hit Christian Dvorak,1,11,...,Atlantic,ATL,Eastern,10,Montréal Canadiens,MTL,Atlantic,ATL,Eastern,8
629,0.190316,2.021020e+13,SHOT,Shot,Slap Shot,Montréal Canadiens,away,Brett Kulak Slap Shot saved by Jack Campbell,1,15,...,Atlantic,ATL,Eastern,10,Montréal Canadiens,MTL,Atlantic,ATL,Eastern,8
630,0.004288,2.021020e+13,MISSED_SHOT,Missed Shot,,Montréal Canadiens,away,Josh Anderson Wide of Net Jack Campbell,1,18,...,Atlantic,ATL,Eastern,10,Montréal Canadiens,MTL,Atlantic,ATL,Eastern,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381275,,2.021021e+13,FACEOFF,Faceoff,,Ottawa Senators,home,Josh Norris faceoff won against Nick Suzuki,3,1185,...,Atlantic,ATL,Eastern,9,Montréal Canadiens,MTL,Atlantic,ATL,Eastern,8
381276,0.027017,2.021021e+13,SHOT,Shot,Backhand,Montréal Canadiens,away,Mike Hoffman Backhand saved by Anton Forsberg,3,1195,...,Atlantic,ATL,Eastern,9,Montréal Canadiens,MTL,Atlantic,ATL,Eastern,8
381277,0.001274,2.021021e+13,MISSED_SHOT,Missed Shot,,Ottawa Senators,home,Tim Stützle Wide of Net,3,1199,...,Atlantic,ATL,Eastern,9,Montréal Canadiens,MTL,Atlantic,ATL,Eastern,8
381278,,2.021021e+13,PERIOD_END,Period End,,,,End of 3rd Period,3,1200,...,Atlantic,ATL,Eastern,9,Montréal Canadiens,MTL,Atlantic,ATL,Eastern,8
