# Clean Event Data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Root of the project checkout. It is added to the import path here and is
# also used to build data file paths later in this notebook.
DIR = "/workspace/nflbigdatabowl2023"
# Guard the append so that re-running this cell does not keep adding the
# same entry to sys.path.
if DIR not in sys.path:
    sys.path.append(DIR)

In [9]:
from typing import Optional

import pandas as pd

In [11]:
# Load the week 1 tracking CSV from the project's raw data directory.
# NOTE(review): later cells compare the `event` column against the literal
# string "None" — confirm the installed pandas version reads that value as a
# string rather than parsing it as a missing value (NaN).
df_tracking_all = pd.read_csv(f"{DIR}/data/raw/week1.csv")

In [12]:
# Print every distinct value of the `event` column, one per line, sorted.
event_labels = df_tracking_all["event"].unique()
for label in sorted(event_labels):
    print(label)

None
autoevent_ballsnap
autoevent_passforward
autoevent_passinterrupted
ball_snap
first_contact
fumble
fumble_offense_recovered
handoff
huddle_break_offense
lateral
line_set
man_in_motion
pass_arrived
pass_forward
pass_outcome_caught
pass_outcome_incomplete
pass_tipped
play_action
qb_sack
qb_strip_sack
run
shift


In [21]:
def clean_autoevent(raw: str) -> Optional[str]:
    """
    Normalize a raw event label.

    - The literal string `None` becomes a null value.
    - `autoevent_*` labels are mapped to their base event name.
    - Any other label is returned unchanged.
    """
    if raw == "None":
        return None

    # Auto event label -> base event label.
    base_name_by_autoevent = {
        "autoevent_ballsnap": "ball_snap",
        "autoevent_passforward": "pass_forward",
        "autoevent_passinterrupted": "pass_interrupted",
    }
    if raw in base_name_by_autoevent:
        return base_name_by_autoevent[raw]
    # Not an auto event: keep the label as it is.
    return raw

In [51]:
def remove_redundant_event(event: Optional[str], is_first: bool) -> Optional[str]:
    """
    Null out a repeated occurrence of an event that should happen only once.

    Only `ball_snap` and `pass_forward` are treated as non-repeatable. Other
    events (for example `fumble`, which a play could have several of) are
    always returned unchanged, as is the first occurrence of any event.
    """
    once_only_events = {"ball_snap", "pass_forward"}
    is_redundant = event in once_only_events and not is_first
    return None if is_redundant else event

In [52]:
def clean_event_data(df_tracking: pd.DataFrame) -> pd.DataFrame:
    """
    Build a per-frame event table with a cleaned event column.

    The input DataFrame is not modified. The result has one row per unique
    (gameId, playId, frameId, event) combination, a fresh integer index, and
    these columns:
    - gameId, playId, frameId, event: Taken from `df_tracking` unchanged.
    - clean_event: The event after `clean_autoevent` (`None` replaced by a
      null value, autoevents renamed to the base event name).

    Non-repeatable events (as decided by `remove_redundant_event`, e.g.
    `ball_snap`) are kept only on the earliest frame where they occur within
    a play; their later occurrences are set to a null value.
    """
    base_columns = ["gameId", "playId", "frameId", "event"]
    play_event_keys = ["gameId", "playId", "clean_event"]

    # Selecting columns and dropping duplicates yields a new DataFrame, so
    # the caller's data is left untouched when the new column is added.
    df = df_tracking[base_columns].drop_duplicates()
    df["clean_event"] = df["event"].apply(clean_autoevent)

    # Find first frame for each event type within each play. Rows whose
    # clean_event is null are excluded by groupby's default handling of
    # missing keys. The aggregation is named by string ("min") because
    # passing the builtin `min` is deprecated in newer pandas versions.
    df_event_first_frame = (
        df
        .groupby(play_event_keys)
        .agg(first_frame=("frameId", "min"))
        .reset_index()
    )

    # A row gets a non-null `first_frame` only when its own frameId is the
    # first frame for that (play, event) pair.
    df_with_first = df.merge(
        df_event_first_frame,
        left_on=[*play_event_keys, "frameId"],
        right_on=[*play_event_keys, "first_frame"],
        how="left",
    )
    is_first_event_of_type = df_with_first["first_frame"].notna()

    # Set redundant events to a null value. Iterating over the two columns
    # avoids building a Series for every row, which is what a row-wise
    # DataFrame.apply would do.
    df_with_first["clean_event"] = [
        remove_redundant_event(event=event, is_first=is_first)
        for event, is_first in zip(
            df_with_first["clean_event"], is_first_event_of_type
        )
    ]
    return df_with_first.drop(columns=["first_frame"])

In [56]:
# Clean the full tracking data, then preview the first 10 frames whose raw
# event is something other than the "None" placeholder.
df_events = clean_event_data(df_tracking_all)
df_events.loc[df_events["event"].ne("None")].head(10)

Unnamed: 0,gameId,playId,frameId,event,clean_event
5,2021090900,97,6,ball_snap,ball_snap
37,2021090900,97,38,autoevent_passforward,pass_forward
39,2021090900,97,40,pass_forward,
48,2021090900,137,6,autoevent_ballsnap,ball_snap
49,2021090900,137,7,ball_snap,
74,2021090900,137,32,pass_forward,pass_forward
80,2021090900,187,1,line_set,line_set
85,2021090900,187,6,ball_snap,ball_snap
105,2021090900,187,26,autoevent_passforward,pass_forward
107,2021090900,187,28,pass_forward,
