In [3]:
# import packages
from pathlib import Path

import nfl_data_py as nfl
import numpy as np
import pandas as pd
from scipy import integrate

In [4]:
# import data
# Play-by-play rows for the listed seasons.
# NOTE(review): 2020 is absent from the season list — confirm the omission is intentional.
pbp_data = nfl.import_pbp_data([2022, 2021, 2019, 2018, 2017, 2016, 2015])

# Normalize older team abbreviations so each team has one code in home_team/away_team.
# The result is assigned back to the column rather than calling
# `pbp_data[col].replace(..., inplace=True)`: that form mutates a column selection,
# which raises a FutureWarning in pandas 2.x and leaves `pbp_data` unchanged once
# Copy-on-Write is enabled.
team_renames = {'SD': 'LAC', 'OAK': 'LV', 'STL': 'LA'}
for team_col in ('home_team', 'away_team'):
    pbp_data[team_col] = pbp_data[team_col].replace(team_renames)

# Column listing and a 10-row random sample kept for interactive inspection.
cols = list(nfl.see_pbp_cols())
sample = pbp_data.sample(10)

2022 done.
2021 done.
2019 done.
2018 done.
2017 done.
2016 done.
2015 done.
Downcasting floats.


In [5]:
# set up result DataFrame
# Column-oriented accumulator: one independent, initially empty list per output
# column, in the order the columns should appear in the final frame.
_stat_columns = (
    'game_id',
    'home_team',
    'away_team',
    'home_dsr',
    'away_dsr',
    'home_gc',
    'away_gc',
)
ingame_stats = {column_name: [] for column_name in _stat_columns}

In [6]:
# compute DSR and game control for each game
# DSR = 1st Downs / (Drives + 1st Downs - TDs), i.e. successful series over total series,
#       under the author's stated assumption that TDs are already included in 1st Downs
# GC  = trapezoidal integral of win probability over the game's plays (one step per play row)


def _drive_first_downs(drive):
    """Return one drive's first downs net of first downs awarded by penalty.

    `drive_first_downs` is read with a NaN-skipping max; a drive whose values are
    all NaN contributes 0. (A plain truthiness check cannot catch that case,
    because NaN is truthy, and a single NaN would poison the whole game's total.)
    """
    earned = drive['drive_first_downs'].max()  # skipna by default; NaN only if every row is NaN
    if pd.isna(earned):
        earned = 0.0
    penalty_firsts = int((drive['first_down_penalty'] == 1).sum())
    return float(earned) - penalty_firsts


def _drive_success_rate(firsts, drives, tds):
    """Return firsts / (drives + firsts - tds), or NaN when the denominator is zero."""
    series = drives + firsts - tds
    return firsts / series if series else float('nan')


def _game_control(win_prob):
    """Integrate a win-probability column over plays with the trapezoid rule.

    NaN rows are dropped first. Returns NaN when fewer than two samples remain,
    since no interval exists to integrate over.
    """
    samples = win_prob.dropna().to_numpy(dtype=float)
    if samples.size < 2:
        return float('nan')
    return float(integrate.trapezoid(samples))


# Start from empty columns so re-running this cell does not append every game twice.
for column_values in ingame_stats.values():
    column_values.clear()

# sum number of first downs and drives for each team (penalty firsts are subtracted and
# drives containing a play with half_seconds_remaining == 0 are skipped).
# NOTE(review): the original comment also lists QB-kneel drives as removed, but no
# kneel-specific filter is applied here — confirm whether one is needed.
for game_id, game in pbp_data.groupby('game_id'):
    tallies = {
        side: {'firsts': 0.0, 'drives': 0, 'tds': 0}
        for side in ('home', 'away')
    }

    for _, drive in game.groupby('drive'):
        # skip drives that reach the end of a half
        if (drive['half_seconds_remaining'] == 0).any():
            continue

        # 'home' takes precedence if both values appear; drives with neither are ignored
        sides_present = set(drive['posteam_type'])
        if 'home' in sides_present:
            side = 'home'
        elif 'away' in sides_present:
            side = 'away'
        else:
            continue

        tally = tallies[side]
        tally['firsts'] += _drive_first_downs(drive)
        # NOTE(review): `touchdown` is checked without regard to which team scored —
        # confirm that defensive/return TDs on the drive should count for the offense.
        tally['tds'] += int((drive['touchdown'] == 1).any())
        tally['drives'] += 1

    # game ID info
    ingame_stats['game_id'].append(game_id)
    ingame_stats['home_team'].append(game['home_team'].iloc[0])
    ingame_stats['away_team'].append(game['away_team'].iloc[0])

    # DSR and GC stats for both sides
    for side in ('home', 'away'):
        tally = tallies[side]
        ingame_stats[f'{side}_dsr'].append(
            _drive_success_rate(tally['firsts'], tally['drives'], tally['tds'])
        )
        ingame_stats[f'{side}_gc'].append(_game_control(game[f'{side}_wp']))

In [7]:
# save ingame_stats
ingame_stats_df = pd.DataFrame(ingame_stats)

# Create the output folder first: to_csv raises OSError when the target directory
# does not exist, which would discard the results of the per-game loop on a
# fresh checkout.
output_path = Path('data/ingame_stats.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
ingame_stats_df.to_csv(output_path, index=False)