In [1]:
import os, glob
import numpy as np
import pandas as pd


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Game lookup table; _load_games() reads its "gameId" and "week" columns.
GAMES_CSV = "data/games.csv"
# Glob for the weekly tracking files; scanned by _load_week_for_game() as a
# fallback when the expected data/week{N}.csv file does not exist.
WEEK_GLOB = "data/week*.csv"

In [3]:
# === Add move- & release-based features for the success model ===

# ----------------- CONFIG -----------------


# Move detection (dir-based) params; passed to _first_move_frame_dir
ANGLE_DEG_THRESH   = 7.5   # degrees of row-to-row |Δdir| considered a "move"
MIN_SPEED_YDPS     = 1.5   # require min speed at move; set 0.0 to ignore
FRAMES_AFTER_SNAP  = 3     # start checking at ball_snap + N

# Separation sampling offsets from move frame (single frames):
# "Move Separation" = sep(move + SEP_AFTER_OFFSET) - sep(move + SEP_BEFORE_OFFSET)
SEP_BEFORE_OFFSET  = -3
SEP_AFTER_OFFSET   = +3

# Release burst window after snap (at ~10Hz); read by _release_burst_for_wr
EARLY_FRAMES = 10          # snap+1 .. snap+10
SMOOTH_K     = 3           # small moving average for speed (window length given to _ma)

# Early direction-change window (wrap-safe Δdir); read by _early_dir_change
EARLY_DIR_FRAMES = 5       # consider frames snap+1 .. snap+5

# ----------------- HELPERS -----------------
def _load_games():
    """Read GAMES_CSV and return only its ``gameId`` and ``week`` columns.

    Raises:
        FileNotFoundError: if GAMES_CSV does not exist on disk.
    """
    if os.path.exists(GAMES_CSV):
        games = pd.read_csv(GAMES_CSV)
        return games[["gameId","week"]]
    raise FileNotFoundError(f"Missing {GAMES_CSV}")

# Loaded eagerly when this cell runs; raises FileNotFoundError if GAMES_CSV is missing.
_GAMES = _load_games()
# week number -> full tracking DataFrame for that week; filled lazily by
# _load_week_for_game so each weekly CSV is parsed at most once per kernel.
_WEEK_CACHE = {}

In [4]:


def _load_week_for_game(game_id: int) -> pd.DataFrame:
    """Return the full weekly tracking DataFrame that contains ``game_id``.

    The week number is looked up in ``_GAMES``. Loaded frames are memoised in
    ``_WEEK_CACHE`` keyed by that week number, so repeated calls for games of
    the same week return the same DataFrame object.

    Raises:
        ValueError: ``game_id`` has no row in ``_GAMES``.
        FileNotFoundError: neither the expected week file nor any file matching
            ``WEEK_GLOB`` contains the game.
    """
    matches = _GAMES.loc[_GAMES["gameId"] == game_id]
    if matches.empty:
        raise ValueError(f"gameId {game_id} not in games.csv")
    week_no = int(matches.iloc[0]["week"])

    # Cache hit: hand back the already-parsed frame.
    try:
        return _WEEK_CACHE[week_no]
    except KeyError:
        pass

    # Preferred location: the file named after the week number.
    expected_path = f"data/week{week_no}.csv"
    if os.path.exists(expected_path):
        week_df = pd.read_csv(expected_path)
        _WEEK_CACHE[week_no] = week_df
        return week_df

    # Fallback: search week*.csv, peeking only at the gameId column first so a
    # full parse happens just for the file that actually holds the game.
    for candidate in sorted(glob.glob(WEEK_GLOB)):
        try:
            game_ids = pd.read_csv(candidate, usecols=["gameId"])["gameId"]
            if not (pd.to_numeric(game_ids, errors="coerce") == game_id).any():
                continue
            week_df = pd.read_csv(candidate)
        except Exception:
            # Unreadable or malformed candidate: move on to the next file.
            continue
        _WEEK_CACHE[week_no] = week_df
        return week_df

    raise FileNotFoundError(f"Week file for game {game_id} not found.")

def _circ_delta_deg(d_now, d_prev):
    """Wrap-safe angular difference in degrees in (-180, 180]."""
    return ((d_now - d_prev + 180.0) % 360.0) - 180.0

def _ma(x, k=3):
    if k <= 1 or len(x) < k:
        return x
    box = np.ones(k)/k
    return np.convolve(x, box, mode="same")

def _first_move_frame_dir(
    track_df: pd.DataFrame, by: str, val,
    angle_deg_thresh=ANGLE_DEG_THRESH, min_speed=MIN_SPEED_YDPS, frames_after_snap=FRAMES_AFTER_SNAP
):
    """
    Locate the first direction-based "move" of one player within a play.

    Search window:
    - starts at the earliest ``ball_snap`` frame + ``frames_after_snap``
    - ends just before the first ``pass_forward`` frame inside that window
      (no upper bound when the play has no such event)

    Within the rows where ``track_df[by] == val`` (sorted by frame), the move is
    the first row whose wrap-safe |Δdir| from the previous row is at least
    ``angle_deg_thresh`` and whose speed ``s`` is at least ``min_speed``. When
    the frame has no ``s`` column, speed is treated as 0 for every row.

    Returns (frameId, x, y) or (None, None, None).
    """
    no_move = (None, None, None)

    if not {"frameId","x","y","dir","event"}.issubset(track_df.columns):
        return no_move

    work = track_df.copy()
    work["frameId"] = pd.to_numeric(work["frameId"], errors="coerce")
    work["dir"] = pd.to_numeric(work["dir"], errors="coerce")
    if "s" in work.columns:
        work["s"] = pd.to_numeric(work["s"], errors="coerce")
    work["event"] = work["event"].astype(str).str.lower()

    # Lower bound of the search window.
    snap_frames = work.loc[work["event"] == "ball_snap", "frameId"].dropna()
    if snap_frames.empty:
        return no_move
    first_allowed = int(snap_frames.min()) + frames_after_snap
    work = work[work["frameId"] >= first_allowed]

    # Upper bound of the search window (exclusive), when a pass was thrown.
    throw_frames = work.loc[work["event"] == "pass_forward", "frameId"].dropna()
    if not throw_frames.empty:
        work = work[work["frameId"] < int(throw_frames.min())]

    if by not in work.columns:
        return no_move

    player = (
        work[work[by] == val]
        .dropna(subset=["frameId","x","y","dir"])
        .sort_values("frameId")
    )
    if player.empty:
        return no_move

    frame_ids = player["frameId"].to_numpy(dtype=int)
    pos_x = player["x"].to_numpy(dtype=float)
    pos_y = player["y"].to_numpy(dtype=float)
    headings = player["dir"].to_numpy(dtype=float)
    if "s" in player.columns:
        speeds = player["s"].to_numpy(dtype=float)
    else:
        speeds = np.zeros_like(headings)

    if len(headings) < 2:
        return no_move

    # turn[j] is the absolute heading change going from row j to row j + 1.
    turn = np.abs(_circ_delta_deg(headings[1:], headings[:-1]))

    hit = next(
        (
            i for i in range(1, len(frame_ids))
            if np.isfinite(turn[i-1]) and turn[i-1] >= angle_deg_thresh
            and np.isfinite(speeds[i]) and speeds[i] >= min_speed
        ),
        None,
    )
    if hit is None:
        return no_move
    return int(frame_ids[hit]), float(pos_x[hit]), float(pos_y[hit])

def _separation_at_frame(track_play_df: pd.DataFrame, off_id: int, def_id: int, frame_id: int):
    """WR–CB separation (yards) at a specific frame; np.nan if unavailable."""
    if frame_id is None or not np.isfinite(frame_id):
        return np.nan
    f = int(frame_id)
    wr = track_play_df[(track_play_df["nflId"] == off_id) & (pd.to_numeric(track_play_df["frameId"], errors="coerce") == f)]
    cb = track_play_df[(track_play_df["nflId"] == def_id) & (pd.to_numeric(track_play_df["frameId"], errors="coerce") == f)]
    if wr.empty or cb.empty:
        return np.nan
    dx = float(wr["x"].iloc[0]) - float(cb["x"].iloc[0])
    dy = float(wr["y"].iloc[0]) - float(cb["y"].iloc[0])
    return float(np.hypot(dx, dy))

def _release_burst_for_wr(track_play_df: pd.DataFrame, wr_id: int) -> float:
    """max(speed in snap+1..snap+EARLY_FRAMES) - speed at snap+1; 0 if unavailable."""
    if "event" not in track_play_df.columns:
        return 0.0
    tdf = track_play_df.copy()
    for c in ["frameId","nflId","s"]:
        if c in tdf.columns:
            tdf[c] = pd.to_numeric(tdf[c], errors="coerce")
    tdf["event"] = tdf["event"].astype(str).str.lower()

    snap = tdf.loc[tdf["event"] == "ball_snap", "frameId"]
    if snap.empty:
        return 0.0
    snap_frame = int(snap.min())

    wr = tdf.loc[tdf["nflId"] == wr_id, ["frameId","s"]].dropna(subset=["frameId"]).copy()
    if wr.empty:
        return 0.0
    wr = wr.sort_values("frameId")
    win = wr[(wr["frameId"] >= snap_frame + 1) & (wr["frameId"] <= snap_frame + EARLY_FRAMES)]
    if win.empty:
        return 0.0

    spd = pd.to_numeric(win["s"], errors="coerce").fillna(0.0).to_numpy()
    spd = _ma(spd, SMOOTH_K)
    base = spd[0]
    peak = float(np.max(spd))
    burst = peak - base
    return float(max(0.0, burst)) if np.isfinite(burst) else 0.0

def _early_dir_change(track_play_df: pd.DataFrame, wr_id: int, frames_after=slice(1, EARLY_DIR_FRAMES+1)):
    """
    Largest wrap-safe step change in heading right after the snap.

    Considers the player's ``dir`` values on frames snap+1 .. snap+EARLY_DIR_FRAMES
    (sorted by frame) and returns the maximum |Δdir| between consecutive rows,
    in degrees. Returns 0.0 when a required column is missing, the play has no
    ``ball_snap`` frame, or fewer than two usable rows fall in the window.

    NOTE(review): ``frames_after`` is accepted but never read by the body; the
    window bounds come from EARLY_DIR_FRAMES directly.
    """
    required = {"frameId","nflId","dir","event"}
    if not required.issubset(track_play_df.columns):
        return 0.0

    work = track_play_df.copy()
    for col in ("frameId", "dir"):
        work[col] = pd.to_numeric(work[col], errors="coerce")
    work["event"] = work["event"].astype(str).str.lower()

    snap_frames = work.loc[work["event"] == "ball_snap", "frameId"].dropna()
    if snap_frames.empty:
        return 0.0
    snap = int(snap_frames.min())

    player = (
        work[work["nflId"] == wr_id]
        .dropna(subset=["frameId","dir"])
        .sort_values("frameId")
    )

    # Inclusive on both ends: snap+1 through snap+EARLY_DIR_FRAMES. An empty
    # player frame simply produces an empty window and falls out below.
    in_window = player["frameId"].between(snap + 1, snap + EARLY_DIR_FRAMES)
    headings = player.loc[in_window, "dir"].to_numpy(dtype=float)
    if headings.size < 2:
        return 0.0

    turns = np.abs(_circ_delta_deg(headings[1:], headings[:-1]))
    biggest = float(np.max(turns))
    if np.isfinite(biggest):
        return biggest
    return 0.0

# ----------------- MAIN: ADD FEATURES -----------------
def add_move_burst_dir_features(cb_wr_data: pd.DataFrame,
                                game_col="gameId", play_col="playId",
                                off_col="nflIdOff", def_col="nflIdDef") -> pd.DataFrame:
    """
    Adds FIVE columns and returns the updated DataFrame (the input is not modified):
      - Move Separation : sep(move+SEP_AFTER_OFFSET) - sep(move+SEP_BEFORE_OFFSET), else 0
      - Move Occurred   : 1 if a move was detected, else 0
      - Time To Move    : frameId of the move (0 if none)
      - Release Burst   : early acceleration metric after snap (>=0)
      - Early Dir Change: max |Δdir| in first few frames after snap (wrap-safe)

    Args:
        cb_wr_data: one row per (play, offensive player, defender) pairing.
        game_col, play_col: columns identifying the play.
        off_col, def_col: columns holding the offensive / defensive ``nflId``.

    Rows keep the default values above when their game/play ids are not
    integers, the play has no tracking rows, or the player ids are not
    integers. When the tracking data for a game cannot be loaded, a
    ``RuntimeWarning`` is emitted once for that game and its rows keep the
    defaults, so a failed load is visible and not mistaken for "no move".
    """
    import warnings  # local import so this cell does not depend on the import cell changing

    out = cb_wr_data.copy()
    out["Move Separation"]  = 0.0
    out["Move Occurred"]    = 0
    out["Time To Move"]     = 0
    out["Release Burst"]    = 0.0
    out["Early Dir Change"] = 0.0

    failed_games = set()   # games whose tracking could not be loaded (warned once, never retried)
    current_game = None    # game id whose tracking rows are held in game_tracks
    game_tracks = None

    for (g, p), idx in out.groupby([game_col, play_col], sort=False).groups.items():
        try:
            g_int = int(g); p_int = int(p)
        except Exception:
            # Non-integer ids: nothing to look up, defaults remain.
            continue
        if g_int in failed_games:
            continue

        # Scan the (large) week frame once per game rather than once per play.
        if g_int != current_game:
            try:
                wkdf = _load_week_for_game(g_int)
            except Exception as exc:
                failed_games.add(g_int)
                warnings.warn(
                    f"add_move_burst_dir_features: no tracking data for game {g_int} "
                    f"({exc!r}); its rows keep default feature values",
                    RuntimeWarning,
                    stacklevel=2,
                )
                continue
            game_tracks = wkdf[pd.to_numeric(wkdf["gameId"], errors="coerce") == g_int]
            current_game = g_int

        tp = game_tracks[pd.to_numeric(game_tracks["playId"], errors="coerce") == p_int].copy()
        if tp.empty:
            continue

        # Ensure needed columns numeric/string
        for c in ["frameId","nflId","x","y","dir","s"]:
            if c in tp.columns:
                tp[c] = pd.to_numeric(tp[c], errors="coerce")
        if "event" in tp.columns:
            tp["event"] = tp["event"].astype(str)

        # Burst, early direction change and move frame depend only on the
        # offensive player, so compute them once per player within this play.
        per_wr = {}

        # Fill per-row (off/def pairing)
        for row_i in idx:
            try:
                wr_id = int(out.at[row_i, off_col])
                cb_id = int(out.at[row_i, def_col])
            except Exception:
                continue

            if wr_id not in per_wr:
                per_wr[wr_id] = (
                    _release_burst_for_wr(tp, wr_id=wr_id),
                    _early_dir_change(tp, wr_id=wr_id),
                    _first_move_frame_dir(
                        tp, by="nflId", val=wr_id,
                        angle_deg_thresh=ANGLE_DEG_THRESH,
                        min_speed=MIN_SPEED_YDPS,
                        frames_after_snap=FRAMES_AFTER_SNAP
                    ),
                )
            burst, early_dir, (mf, _mx, _my) = per_wr[wr_id]

            out.at[row_i, "Release Burst"] = burst
            out.at[row_i, "Early Dir Change"] = early_dir

            if mf is None or not np.isfinite(mf):
                # No move: defaults remain
                continue

            out.at[row_i, "Move Occurred"] = 1
            out.at[row_i, "Time To Move"]  = int(mf)

            # Separation depends on the defender too, so it is computed per row.
            sep_before = _separation_at_frame(tp, wr_id, cb_id, int(mf + SEP_BEFORE_OFFSET))
            sep_after  = _separation_at_frame(tp, wr_id, cb_id, int(mf + SEP_AFTER_OFFSET))
            if np.isfinite(sep_before) and np.isfinite(sep_after):
                out.at[row_i, "Move Separation"] = float(sep_after - sep_before)
            else:
                out.at[row_i, "Move Separation"] = 0.0  # keep consistent fallback

    return out

# ----------------- RUN ON YOUR DATAFRAME -----------------
# Example:
# cb_wr_data = add_move_burst_dir_features(cb_wr_data)
# cb_wr_data[["gameId","playId","nflIdOff","nflIdDef",
#             "Move Occurred","Time To Move","Move Separation",
#             "Release Burst","Early Dir Change"]].head()


In [5]:
# NOTE(review): the preview output further down shows "Unnamed: 0.1" / "Unnamed: 0"
# columns, which looks like a row index that was written to CSV more than once —
# confirm whether those columns should be dropped after loading.
cb_wr_data = pd.read_csv("custom/CB_WR_data.csv")  # Load your existing dataframe

In [7]:
# Rebinds cb_wr_data to a copy that carries the five move/release feature columns.
cb_wr_data = add_move_burst_dir_features(cb_wr_data)

In [8]:
# Preview the first rows; the new feature columns appear at the right-hand end.
cb_wr_data.head()

Unnamed: 0.1,Unnamed: 0,gameId,playId,successOff,manCoverage,targetOff,nflIdDef,defPlayerName,defenseTeam,defPos,...,TE,RB,CB,FS,WRToCBRatio,Move Separation,Move Occurred,Time To Move,Release Burst,Early Dir Change
0,1,2018090600,75,True,False,True,2555383,Jalen Mills,PHI,CB,...,1,1,2,2,1.0,0.760938,1,35,4.283333,77.69
1,2,2018090600,146,False,False,False,2552689,Ronald Darby,PHI,CB,...,1,1,3,1,1.0,0.0,0,0,3.63,36.97
2,3,2018090600,146,False,False,False,2557958,Sidney Jones,PHI,CB,...,1,1,3,1,1.0,0.0,0,0,3.223333,36.86
3,4,2018090600,146,False,False,True,2555383,Jalen Mills,PHI,CB,...,1,1,3,1,1.0,0.0,0,0,2.75,6.29
4,5,2018090600,168,False,False,False,2552689,Ronald Darby,PHI,CB,...,1,2,3,1,0.666667,0.0,0,0,3.146667,75.28


In [9]:
# Drop "Early Dir Change" for now.
# errors="ignore" keeps this cell re-runnable: without it a second run raises
# KeyError because the column has already been removed from cb_wr_data.
cb_wr_data = cb_wr_data.drop(columns=["Early Dir Change"], errors="ignore")

In [12]:
# Now repeat on the full data set (rows with model predictions).
cb_wr_data_full = pd.read_csv("predictions/full_cb_wr_data_preds.csv")  
# Add the five move/release features. Unlike the sample frame above,
# "Early Dir Change" is not dropped here, so it is part of the saved file.
cb_wr_data_full = add_move_burst_dir_features(cb_wr_data_full)


In [13]:
# Save the enriched full data set; index=False avoids writing the row index
# as an extra unnamed column.
cb_wr_data_full.to_csv("predictions/full_cb_wr_data_with_move_burst_dir_features.csv", index=False)