In [47]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import os

# Register tqdm's pandas integration so `progress_apply` (used further down on a
# groupby) is available and shows a progress bar.
tqdm.pandas()


# Load raw data

In [49]:
# Directory holding the raw competition CSV files, relative to this notebook.
DATA_DIR = "../data/nfl-big-data-bowl-2024"
# Weekly tracking files are named tracking_week_1.csv ... tracking_week_9.csv.
TRACKING_WEEKS = range(1, 10)

games = pd.read_csv(os.path.join(DATA_DIR, "games.csv"))
plays = pd.read_csv(os.path.join(DATA_DIR, "plays.csv"))
# displayName is dropped from the player table; the tracking rows carry their own
# displayName column, which is the one used when the two are merged later on.
players = pd.read_csv(os.path.join(DATA_DIR, "players.csv")).drop(columns="displayName")
# All weekly tracking files stacked row-wise (each file's original row index is kept).
tracking_data = pd.concat(
    [pd.read_csv(os.path.join(DATA_DIR, f"tracking_week_{week}.csv")) for week in TRACKING_WEEKS],
    axis=0,
)
tackles = pd.read_csv(os.path.join(DATA_DIR, "tackles.csv"))


In [None]:
# List every column available in the plays table.
plays.columns


Index(['gameId', 'playId', 'ballCarrierId', 'ballCarrierDisplayName',
       'playDescription', 'quarter', 'down', 'yardsToGo', 'possessionTeam',
       'defensiveTeam', 'yardlineSide', 'yardlineNumber', 'gameClock',
       'preSnapHomeScore', 'preSnapVisitorScore', 'passResult', 'passLength',
       'penaltyYards', 'prePenaltyPlayResult', 'playResult',
       'playNullifiedByPenalty', 'absoluteYardlineNumber', 'offenseFormation',
       'defendersInTheBox', 'passProbability', 'preSnapHomeTeamWinProbability',
       'preSnapVisitorTeamWinProbability', 'homeTeamWinProbabilityAdded',
       'visitorTeamWinProbilityAdded', 'expectedPoints', 'expectedPointsAdded',
       'foulName1', 'foulName2', 'foulNFLId1', 'foulNFLId2'],
      dtype='object')

In [None]:
# Columns of `plays` that are carried into the per-play lookup table.
PLAYS_RELEVANT_COLS = [
    'gameId', 'playId', 'ballCarrierId', 'ballCarrierDisplayName',
    'quarter', 'down', 'yardsToGo',
    'possessionTeam', 'defensiveTeam',
    'yardlineSide', 'yardlineNumber', 'gameClock',
    'preSnapHomeScore', 'preSnapVisitorScore',
    'passResult', 'absoluteYardlineNumber',
    'prePenaltyPlayResult', 'playResult',
    'offenseFormation', 'defendersInTheBox', 'passProbability',
]

# Plays not nullified by a penalty, joined with their game-level information
# (minus the final scores) and indexed by (gameId, playId).
non_penalty_plays_raw = (
    plays.loc[plays['playNullifiedByPenalty'] != "Y", PLAYS_RELEVANT_COLS]
    .merge(games, how="inner", on="gameId")
    .drop(columns=["homeFinalScore", "visitorFinalScore"])
    .set_index(["gameId", "playId"])
)
non_penalty_plays_raw.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,ballCarrierId,ballCarrierDisplayName,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,gameClock,...,playResult,offenseFormation,defendersInTheBox,passProbability,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr
gameId,playId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2022100908,3537,48723,Parker Hesse,4,1,10,ATL,TB,ATL,41,7:52,...,9,SHOTGUN,7.0,0.747284,2022,5,10/09/2022,13:00:00,TB,ATL
2022100908,619,54042,Caleb Huntley,1,1,10,ATL,TB,ATL,15,4:52,...,-1,PISTOL,7.0,0.732525,2022,5,10/09/2022,13:00:00,TB,ATL
2022100908,3731,48374,Olamide Zaccheaus,4,1,10,ATL,TB,TB,19,4:43,...,19,SHOTGUN,6.0,0.775222,2022,5,10/09/2022,13:00:00,TB,ATL
2022100908,1821,42345,Marcus Mariota,2,1,10,ATL,TB,ATL,32,1:10,...,8,PISTOL,7.0,0.789536,2022,5,10/09/2022,13:00:00,TB,ATL
2022100908,2392,44816,Leonard Fournette,3,1,10,TB,ATL,TB,26,10:28,...,2,SHOTGUN,6.0,0.459837,2022,5,10/09/2022,13:00:00,TB,ATL


In [None]:
# Week-1 slice of the tracking data: the rows whose gameId belongs to a game that
# the games table lists under week 1. Defined here so the notebook does not depend
# on a `week1` variable left over from an earlier kernel session.
week1_game_ids = games.loc[games["week"] == 1, "gameId"]
week1 = tracking_data[tracking_data["gameId"].isin(week1_game_ids)]

# Number of distinct player ids tracked in each play (nunique() ignores missing nflId values).
n_players_per_play = week1.groupby(["gameId", "playId"])["nflId"].nunique()
n_players_per_play.value_counts() # should be 22


nflId
22    1473
Name: count, dtype: int64

In [57]:
# Look up a single play's record; the table is indexed by (gameId, playId).
non_penalty_plays_raw.loc[(2022090800, 80)]


ballCarrierId                  46076
ballCarrierDisplayName    Josh Allen
quarter                            1
down                               2
yardsToGo                          4
possessionTeam                   BUF
defensiveTeam                     LA
yardlineSide                     BUF
yardlineNumber                    31
gameClock                      14:29
preSnapHomeScore                   0
preSnapVisitorScore                0
passResult                         R
absoluteYardlineNumber            79
prePenaltyPlayResult               7
playResult                         7
offenseFormation               EMPTY
defendersInTheBox                6.0
passProbability             0.699896
season                          2022
week                               1
gameDate                  09/08/2022
gameTimeEastern             20:20:00
homeTeamAbbr                      LA
visitorTeamAbbr                  BUF
Name: (2022090800, 80), dtype: object

In [56]:
# Walk through the per-play labelling logic on one play: gameId 2022090800, playId 80.
test_play = tracking_data.groupby(["gameId", "playId"]).get_group((2022090800, 80))
play_groups = test_play.groupby("nflId")

# Events are read from the first player's rows; the position of "first_contact" in
# that list is used as the frame offset of first contact within the play.
first_group = play_groups.get_group(list(play_groups.groups.keys())[0])
time_to_first_contact = first_group.event.tolist().index("first_contact")

# One block of columns per player (keyed by nflId), one row per frame of the play.
play_timeseries = pd.concat([g.reset_index(drop=True).loc[:, ["x", "y", "s", "a", "o", "dir"]] for _, g in play_groups], axis=1, keys=play_groups.groups.keys())

# .loc slicing is label-based and inclusive, so the first-contact frame itself is kept.
truncated_play_timeseries = play_timeseries.loc[:time_to_first_contact]
event_timeseries = first_group.event.reset_index(drop=True).loc[:time_to_first_contact]

# One row per player: first tracking row's game info, joined with the player table
# and then with any tackle record for this (gameId, playId, nflId).
player_data_for_play = pd.merge(play_groups.first().loc[:, ["gameId", "playId", "displayName", "time", "club"]], players.drop(columns="collegeName"), how="left", on="nflId")
player_data_for_play_with_tackles = pd.merge(player_data_for_play, tackles, how="left", on=["gameId", "playId", "nflId"])

# 1 when the pff_missedTackle values for this play sum to 0, otherwise 0.
tackle_label = int(player_data_for_play_with_tackles.loc[:, ["pff_missedTackle"]].sum(axis=0).max() == 0)
# Explicit tuple key: equivalent to the starred subscript `loc[*row]`, but also
# parses on Python versions older than 3.11.
play_lookup = non_penalty_plays_raw.loc[tuple(test_play[["gameId", "playId"]].iloc[0])]
# +1 when playDirection is "right", -1 otherwise, so the x displacement is signed
# relative to the recorded play direction.
direction = 2 * int(first_group["playDirection"].iloc[0] == "right") - 1

# Signed x displacement of the ball carrier between first contact and the last frame.
ball_carrier_trajectory = play_timeseries.xs(play_lookup.ballCarrierId, axis=1)
yds_at_first_contact = ball_carrier_trajectory.loc[time_to_first_contact, "x"]
yds_final = ball_carrier_trajectory.loc[len(ball_carrier_trajectory) - 1, "x"]
outcome = direction * (yds_final - yds_at_first_contact)
print(tackle_label, outcome)


1 3.200000000000003


In [55]:
# Count the distinct nflId values present in the test play's tracking rows.
test_play["nflId"].nunique()


0

In [64]:
# Event names that get their own 0/1 indicator column in the encoded event series.
RELEVANT_EVENTS = [
    "first_contact",
    "ball_snap",
    "pass_outcome_caught",
    "handoff",
    "pass_arrived",
    "out_of_bounds",
    "run",
    "man_in_motion",
    "play_action",
    "touchdown",
    "fumble",
]

# Per-player columns taken from the first tracking row of each player in a play.
RELEVANT_GAME_INFO = [
    "gameId",
    "playId",
    "displayName",
    "time",
    "club",
]

# Per-frame tracking columns kept for every player.
RELEVANT_PLAYER_PLAY_INFO = [
    "x",
    "y",
    "s",
    "a",
    "o",
    "dir",
]

def collect_play_by_play_information(play_data):
    """Assemble tracking features and outcome labels for a single play.

    Reads the notebook-level frames `players`, `tackles` and
    `non_penalty_plays_raw`, and the RELEVANT_* column/event lists.

    Parameters
    ----------
    play_data : pd.DataFrame
        Tracking rows of one play. Columns read here: nflId, gameId, playId, club,
        event, playDirection, plus those listed in RELEVANT_GAME_INFO and
        RELEVANT_PLAYER_PLAY_INFO.

    Returns
    -------
    dict
        game_id, play_id
            Identifiers taken from the first row of `play_data`.
        player_tracking
            Per-player RELEVANT_PLAYER_PLAY_INFO columns (keyed by nflId) for the
            frames up to and including first contact.
        event_timeseries
            0/1 indicator columns, one per RELEVANT_EVENTS entry, over the same frames.
        players_on_the_field
            One row per player, left-joined with `players` and `tackles`.
        teams
            Unique (nflId, club) pairs found in `play_data`.
        tackle_successful
            1 when the play's pff_missedTackle values sum to 0, otherwise 0.
        yards_after_contact
            Ball carrier's x displacement between the first-contact frame and the
            last frame, signed by play direction.

    Raises
    ------
    ValueError
        If the first player's events do not contain "first_contact".
    """
    by_player = play_data.groupby("nflId")
    player_ids = by_player.groups.keys()
    game_id, play_id = play_data[["gameId", "playId"]].iloc[0]

    # Events are read off the first player's rows; the list position of
    # "first_contact" doubles as the frame label after the index reset below.
    reference_rows = by_player.get_group(list(player_ids)[0])
    contact_frame = reference_rows["event"].tolist().index("first_contact")

    # Full-play tracking: one block of columns per player, one row per frame.
    per_player_frames = []
    for _, player_rows in by_player:
        per_player_frames.append(player_rows.reset_index(drop=True)[RELEVANT_PLAYER_PLAY_INFO])
    tracking_full = pd.concat(per_player_frames, axis=1, keys=player_ids)
    # Label-based slice, so the first-contact frame itself is included.
    tracking_until_contact = tracking_full.loc[:contact_frame]

    # Encode the events up to first contact as indicator columns.
    events_until_contact = reference_rows["event"].reset_index(drop=True).loc[:contact_frame]
    event_matches = events_until_contact.values.reshape(-1, 1) == np.array(RELEVANT_EVENTS)
    event_indicators = pd.DataFrame(event_matches, columns=RELEVANT_EVENTS).astype(int)

    # Roster for the play, enriched with player attributes and any tackle records.
    roster = pd.merge(
        by_player.first().loc[:, RELEVANT_GAME_INFO],
        players.drop(columns="collegeName"),
        how="left",
        on="nflId",
    )
    roster_with_tackles = pd.merge(
        roster,
        tackles,
        how="left",
        on=["gameId", "playId", "nflId"],
    )

    club_by_player = play_data.loc[:, ["nflId", "club"]].drop_duplicates()

    # whether there was a successful tackle/forced fumble or a missed tackle, who tackled, and such
    missed_tackles = roster_with_tackles[["pff_missedTackle"]].sum(axis=0).max()
    tackle_successful = 1 if missed_tackles == 0 else 0

    # yards after first_contact
    play_record = non_penalty_plays_raw.loc[(game_id, play_id)]
    moving_right = reference_rows["playDirection"].iloc[0] == "right"
    sign = 1 if moving_right else -1
    carrier_track = tracking_full.xs(play_record.ballCarrierId, axis=1)
    last_frame = len(carrier_track) - 1
    x_at_contact = carrier_track.loc[contact_frame, "x"]
    x_at_end = carrier_track.loc[last_frame, "x"]
    yards_after_contact = sign * (x_at_end - x_at_contact)

    return {
        "game_id": game_id,
        "play_id": play_id,
        "player_tracking": tracking_until_contact,
        "event_timeseries": event_indicators,
        "players_on_the_field": roster_with_tackles,
        "teams": club_by_player,
        "tackle_successful": tackle_successful,
        "yards_after_contact": yards_after_contact,
    }

# Number of plays to process while iterating on the pipeline; set to None to process
# every eligible play. The limit counts whole plays rather than raw tracking rows, so
# no play has its tracking data cut off part-way through.
MAX_PLAYS = 1

# A play is eligible when it contains a "first_contact" event and has an entry in
# the non-penalty play table.
week1_eligible = week1.groupby(["gameId", "playId"]).filter(
    lambda g: ("first_contact" in g["event"].values) and (g.name in non_penalty_plays_raw.index)
)
if MAX_PLAYS is not None:
    # ngroup() numbers the plays in sorted (gameId, playId) order, starting at 0.
    play_number = week1_eligible.groupby(["gameId", "playId"]).ngroup()
    week1_eligible = week1_eligible[(play_number < MAX_PLAYS).to_numpy()]

# One dict of features/labels per play, indexed by (gameId, playId).
all_play_data_week1 = week1_eligible.groupby(["gameId", "playId"]).progress_apply(collect_play_by_play_information)


  0%|          | 0/1 [00:00<?, ?it/s]

In [65]:
all_play_data_week1.iloc[0]["players_on_the_field"]


Unnamed: 0,nflId,gameId,playId,displayName,time,club,height,weight,birthDate,position,tackle,assist,forcedFumble,pff_missedTackle
0,35472.0,2022090800,80,Rodger Saffold,2022-09-08 20:24:35.700000,BUF,6-5,325,1988-06-06,G,,,,
1,38577.0,2022090800,80,Bobby Wagner,2022-09-08 20:24:35.700000,LA,6-0,242,1990-06-27,ILB,,,,
2,41239.0,2022090800,80,Aaron Donald,2022-09-08 20:24:35.700000,LA,6-1,280,1991-05-23,DT,,,,
3,42392.0,2022090800,80,Mitch Morse,2022-09-08 20:24:35.700000,BUF,6-6,305,1992-04-21,C,,,,
4,42489.0,2022090800,80,Stefon Diggs,2022-09-08 20:24:35.700000,BUF,6-0,191,1993-11-29,WR,,,,
5,42816.0,2022090800,80,Troy Hill,2022-09-08 20:24:35.700000,LA,5-11,184,1991-08-29,CB,,,,
6,43294.0,2022090800,80,Jalen Ramsey,2022-09-08 20:24:35.700000,LA,6-1,208,1994-10-24,CB,,,,
7,43298.0,2022090800,80,Leonard Floyd,2022-09-08 20:24:35.700000,LA,6-5,240,1992-09-08,DE,,,,
8,43335.0,2022090800,80,A'Shawn Robinson,2022-09-08 20:24:35.700000,LA,6-4,330,1995-03-21,DT,,,,
9,44875.0,2022090800,80,Dion Dawkins,2022-09-08 20:24:35.700000,BUF,6-5,320,1994-04-26,T,,,,


In [40]:
all_play_data_week1.tolist()


[{'player_tracking':    35472.0                                    38577.0                     ...  \
           x      y     s     a       o     dir       x      y     s     a  ...   
  0    82.42  26.13  2.43  0.37  196.42  140.67   71.72  23.19  1.48  2.80  ...   
  1    82.58  25.96  2.34  0.93  203.85  138.66   71.66  23.36  1.78  3.19  ...   
  2    82.73  25.79  2.20  1.39  209.50  136.40   71.61  23.56  2.12  3.38  ...   
  3    82.87  25.66  1.96  1.87  214.57  133.71   71.58  23.80  2.51  3.25  ...   
  4    82.99  25.55  1.62  2.42  222.35  130.47   71.56  24.14  3.24  3.76  ...   
  5    83.09  25.48  1.19  2.80  234.72  125.05   71.55  24.50  3.72  3.93  ...   
  6    83.16  25.45  0.83  2.89  246.19  114.56   71.56  24.89  4.15  3.75  ...   
  7    83.21  25.45  0.47  2.88  265.22   91.69   71.57  25.33  4.58  3.42  ...   
  8    83.23  25.47  0.32  2.69  280.09   35.37   71.58  25.81  5.01  3.12  ...   
  9    83.23  25.51  0.51  2.50  284.57  354.43   71.59  26.32  5.34