### Automated Data Pipeline: OL-DL 1v1 Rep Detection + Validation

Runs the detection workflow across configured sessions without visualization, then concatenates and filters reps for curation. Validation is performed at the end of the notebook.


In [1]:
import re
from pathlib import Path

import numpy as np
import polars as pl

BASE_DIR = Path.cwd()
PRACTICE_DATA_DIR = BASE_DIR.parent / "practice_data"

print(f"Base dir: {BASE_DIR}")
print(f"Practice data dir: {PRACTICE_DATA_DIR}")


Base dir: /Users/danfalkenheim/Desktop/ShrineBowlSumerSportsAnalyticsCompetition/RepIdentification
Practice data dir: /Users/danfalkenheim/Desktop/ShrineBowlSumerSportsAnalyticsCompetition/practice_data


In [2]:
# Manual Configurations

practice_data = {
    "2024WestPractice1": {
        "START_TS": "2024-01-27T17:14:06.100",
        "END_TS": "2024-01-27T17:25:47.100",
        "LOS": 105,
        "olinemen": ["75", "55", "72", "60", "54", "69", "71", "78", "77", "70", "73"],
        "dlinemen": ["7", "6", "97", "85", "99", "9", "58", "92", "52", "8", "91"],
        "flip": False,
        "drill_type_filter": "Bigs 1 on 1 - Skill 7 on 7",
    },
    "2024WestPractice2": {
        "START_TS": "2024-01-28T15:01:30.000",
        "END_TS": "2024-01-28T15:11:00.400",
        "LOS": 105,
        "olinemen": ["75", "78", "55", "72", "77", "74", "70", "69", "71", "60", "54"],
        "dlinemen": ["9", "8", "92", "91", "58", "99", "7", "97", "85", "52", "6"],
        "flip": False,
        "drill_type_filter": "Big 1 on 1 - Skill 7 on 7",
    },
    "2024WestPractice3": {
        "START_TS": "2024-01-29T16:57:06.300",
        "END_TS": "2024-01-29T17:06:15.400",
        "LOS": 100,
        "olinemen": ["70", "60", "54", "69", "74", "71", "72", "78", "75", "73", "77"],
        "dlinemen": ["91", "7", "92", "52", "8", "97", "94", "6", "99", "85"],
        "flip": True,
        "drill_type_filter": "Team 1",
    },
    "2024EastPractice1": {
        "START_TS": "2024-01-27T15:27:07.900",
        "END_TS": "2024-01-27T:15:34:51.400",
        "LOS": 110,
        "olinemen": ["75", "68", "72", "73", "54", "74", "78", "64", "70", "53", "52"],
        "dlinemen": ["58", "98", "96", "99", "94", "8", "93", "97", "0", "29", "3", "92"],
        "flip": False,
        "drill_type_filter": "1 on 1",
    },
    "2024EastPractice2": {
        "START_TS": "2024-01-28T17:05:33.400",
        "END_TS": "2024-01-28T:17:12:22.800",
        "LOS": 110,
        "olinemen": ["65", "53", "64", "70", "74", "72", "73", "54", "78", "75", "52"],
        "dlinemen": ["3", "96", "8", "98", "92", "29", "94", "58", "0", "99", "93", "97"],
        "flip": False,
        "drill_type_filter": "1 on 1",
    },
    "2024EastPractice3": {
        "START_TS": "2024-01-29T15:17:57.100",
        "END_TS": "2024-01-29T:15:23:52.000",
        "LOS": 110,
        "olinemen": ["75", "68", "64", "52", "54", "53", "72", "74", "70", "73", "78", "50", "65"],
        "dlinemen": ["58", "93", "8", "99", "97", "94", "29", "98", "92", "3"],
        "flip": False,
        "drill_type_filter": "1 on 1",
    },
}

print(f"Loaded {len(practice_data)} session configs")

def find_practice_file(session_name: str, data_dir: Path) -> Path | None:
    match = re.match(r"([0-9]{4})(East|West)Practice(\d+)", session_name)
    if not match:
        return None
    year, side, number = match.groups()
    candidates = [
        data_dir / f"{year}_{side}_Practice_{number}.snappy.parquet",
        data_dir / f"{year}_{side}_Practice_{number}.parquet",
    ]
    for candidate in candidates:
        if candidate.exists():
            return candidate
    return None


Loaded 6 session configs


In [3]:
X_MIN = -15.0
X_MAX = 15.0
TRACKING_METRICS = ["a", "dir", "sa", "dis", "s", "x", "y", "z"]

def transform_coordinates(df: pl.DataFrame, los: float, flip_orientation: bool) -> pl.DataFrame:
    """Transform coordinates based on LOS and orientation configuration."""
    if flip_orientation:
        df = df.with_columns([
            (pl.lit(los) - pl.col("x")).alias("x"),
            (pl.lit(53.3) - pl.col("y")).alias("y"),
            ((pl.col("dir") + 180) % 360).alias("dir"),
        ])
    else:
        df = df.with_columns([
            (pl.col("x") - pl.lit(los)).alias("x"),
        ])
    return df

def impute_missing_frames(df: pl.DataFrame, frame_map: pl.DataFrame, player_id_col: str) -> pl.DataFrame:
    if player_id_col not in df.columns:
        raise ValueError(f"Expected player id column '{player_id_col}' not found")

    metrics_for_impute = [c for c in TRACKING_METRICS if c in df.columns]
    non_metric_cols = [c for c in df.columns if c not in metrics_for_impute + ["frame_id", "ts", "parsed_ts"]]
    df = df.with_columns([
        pl.col(col).cast(pl.Utf8, strict=False) for col in non_metric_cols
    ])
    frame_ts_map = dict(zip(frame_map["frame_id"].to_list(), frame_map["ts"].to_list()))

    rows_to_add = []
    for _, player_df in df.partition_by(player_id_col, as_dict=True).items():
        player_rows = player_df.sort("frame_id").to_dicts()
        for i in range(len(player_rows) - 1):
            prev_row = player_rows[i]
            next_row = player_rows[i + 1]
            gap = next_row["frame_id"] - prev_row["frame_id"]
            missing = gap - 1
            if missing <= 0 or missing > 5:
                continue

            use_linear_interpolation = True
            total_steps = next_row["frame_id"] - prev_row["frame_id"]

            for fid in range(prev_row["frame_id"] + 1, next_row["frame_id"]):
                ts_val = frame_ts_map.get(fid)
                if ts_val is None:
                    continue

                step = fid - prev_row["frame_id"]
                imputed_metrics = {}
                for m in metrics_for_impute:
                    pv = prev_row.get(m)
                    nv = next_row.get(m)
                    if pv is None or nv is None:
                        imputed_metrics[m] = pv if nv is None else nv if pv is None else None
                    elif use_linear_interpolation:
                        imputed_metrics[m] = pv + (nv - pv) * (step / total_steps)
                    else:
                        imputed_metrics[m] = (pv + nv) / 2

                new_row = {col: prev_row.get(col) for col in non_metric_cols}
                new_row["frame_id"] = fid
                new_row["ts"] = ts_val
                new_row["parsed_ts"] = ts_val
                new_row.update(imputed_metrics)
                rows_to_add.append(new_row)

    if rows_to_add:
        imputed_df = pl.DataFrame(rows_to_add)
        if "parsed_ts" in imputed_df.columns:
            imputed_df = imputed_df.with_columns(pl.col("parsed_ts").str.to_datetime())

        for col, dtype in df.schema.items():
            if col not in imputed_df.columns:
                imputed_df = imputed_df.with_columns(pl.lit(None, dtype=dtype).alias(col))

        imputed_df = imputed_df.with_columns([
            pl.col(col).cast(dtype, strict=False)
            for col, dtype in df.schema.items()
            if col in imputed_df.columns
        ])
        imputed_df = imputed_df.select(df.columns)
        df = pl.concat([df, imputed_df], how="vertical").sort([player_id_col, "frame_id"])

    return df


In [4]:
# Rep Detection Algorithm Constants
TRIGGER_X = 0.5             # DL crossing threshold for rep window trigger (normalized coords)
CROSSING_X = 0.5            # DL crossing threshold for rep start detection (normalized coords)
WINDOW_BEFORE_TRIGGER = 40  # 4.0 seconds
WINDOW_AFTER_TRIGGER = 80   # 8.0 seconds
MIN_REP_DURATION = 15       # 1.5 seconds

# OL proximity filter
OL_MAX_X_AT_TRIGGER = 6.0

def compute_pairwise_distance(x1: float, y1: float, x2: float, y2: float) -> float:
    return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)

def build_pair_timeseries(df_window: pl.DataFrame, ol_jersey: str, dl_jersey: str) -> pl.DataFrame:
    ol_data = (
        df_window
        .filter(pl.col("jersey_number") == ol_jersey)
        .select(["frame_id", "ts", "x", "y", "s", "a"])
        .rename({"x": "ol_x", "y": "ol_y", "s": "ol_s", "a": "ol_a"})
    )

    dl_data = (
        df_window
        .filter(pl.col("jersey_number") == dl_jersey)
        .select(["frame_id", "ts", "x", "y", "s", "a"])
        .rename({"x": "dl_x", "y": "dl_y", "s": "dl_s", "a": "dl_a"})
    )

    pair_df = ol_data.join(dl_data.drop("ts"), on="frame_id", how="inner")

    if pair_df.height == 0:
        return pair_df

    pair_df = pair_df.sort("frame_id")

    pair_df = pair_df.with_columns(
        (((pl.col("ol_x") - pl.col("dl_x"))**2 + (pl.col("ol_y") - pl.col("dl_y"))**2).sqrt())
        .alias("pairwise_distance")
    )

    pair_df = pair_df.with_columns(
        (pl.col("pairwise_distance") - pl.col("pairwise_distance").shift(1))
        .alias("distance_change")
    )

    pair_df = pair_df.with_columns(
        (pl.col("frame_id") - pl.col("frame_id").shift(1))
        .alias("frame_delta")
    )

    return pair_df

def score_pair_engagement(pair_df: pl.DataFrame) -> dict:
    if pair_df.height < 5:
        return {
            "engagement_score": 0,
            "min_distance": 999,
            "closing_frames": 0,
            "sustained_contact_frames": 0,
            "close_duration": 0,
            "activity_score": 0,
            "active_close_frames": 0,
            "min_distance_idx": 0,
        }

    distances = pair_df["pairwise_distance"].to_numpy()
    distance_changes = pair_df["distance_change"].to_numpy()
    frame_deltas = pair_df["frame_delta"].to_numpy()
    ol_accels = pair_df["ol_a"].to_numpy()
    dl_accels = pair_df["dl_a"].to_numpy()
    ol_speeds = pair_df["ol_s"].to_numpy()
    dl_speeds = pair_df["dl_s"].to_numpy()

    min_distance = float(np.nanmin(distances))
    min_distance_idx = int(np.nanargmin(distances))

    max_closing_run = 0
    current_run = 0
    for i in range(1, len(distance_changes)):
        dc = distance_changes[i]
        fd = frame_deltas[i]
        is_consecutive = fd is not None and not np.isnan(fd) and fd == 1
        is_closing = dc is not None and not np.isnan(dc) and dc < -0.01

        if is_consecutive and is_closing:
            current_run += 1
            max_closing_run = max(max_closing_run, current_run)
        else:
            current_run = 0
    closing_frames = max_closing_run

    contact_threshold = 2.0
    sustained_contact_frames = int(np.sum(distances < contact_threshold))
    close_duration = int(np.sum(distances < 2.0))

    active_close_mask = (distances < 2.0) & (ol_speeds >= 1.0) & (dl_speeds >= 1.0)
    active_close_frames = int(np.sum(active_close_mask))

    closing_mask = np.zeros(len(distance_changes) - 1, dtype=bool)
    for i in range(1, len(distance_changes)):
        dc = distance_changes[i]
        fd = frame_deltas[i]
        is_consecutive = fd is not None and not np.isnan(fd) and fd == 1
        is_closing = dc is not None and not np.isnan(dc) and dc < -0.01
        if is_consecutive and is_closing:
            closing_mask[i - 1] = True

    if np.any(closing_mask):
        activity_score = float(np.mean(ol_accels[1:][closing_mask] + dl_accels[1:][closing_mask]))
    else:
        activity_score = 0.0

    engagement_score = float(active_close_frames)

    return {
        "engagement_score": engagement_score,
        "min_distance": min_distance,
        "min_distance_idx": min_distance_idx,
        "closing_frames": closing_frames,
        "sustained_contact_frames": sustained_contact_frames,
        "close_duration": close_duration,
        "activity_score": activity_score,
        "active_close_frames": active_close_frames,
    }

def identify_ol_dl_pair(df_window: pl.DataFrame, excluded_ol_jerseys: list | None = None, trigger_frame: int | None = None) -> tuple:
    if excluded_ol_jerseys is None:
        excluded_ol_jerseys = []

    ol_jerseys = (
        df_window
        .filter(pl.col("is_olineman") == True)
        .select("jersey_number")
        .unique()["jersey_number"]
        .to_list()
    )

    if trigger_frame is not None:
        ol_jerseys_near_los = []
        for ol_j in ol_jerseys:
            if ol_j in excluded_ol_jerseys:
                continue
            ol_at_trigger = df_window.filter(
                (pl.col("jersey_number") == ol_j) &
                (pl.col("frame_id") == trigger_frame)
            )
            if ol_at_trigger.height > 0:
                x_at_trigger = ol_at_trigger.select("x").item()
                if x_at_trigger is not None and x_at_trigger <= OL_MAX_X_AT_TRIGGER:
                    ol_jerseys_near_los.append(ol_j)
        ol_jerseys = ol_jerseys_near_los
    else:
        ol_jerseys = [j for j in ol_jerseys if j not in excluded_ol_jerseys]

    dl_jerseys = (
        df_window
        .filter(pl.col("is_dlineman") == True)
        .select("jersey_number")
        .unique()["jersey_number"]
        .to_list()
    )

    if len(ol_jerseys) == 0 or len(dl_jerseys) == 0:
        raise ValueError("No OL or DL players found in window (or all OL excluded/filtered by position)")

    best_score = -1
    best_pair = (None, None)
    best_pair_df = None
    best_info = None

    tiebreak_score_eps = 0.02
    contact_threshold = 1.5
    active_closing_threshold = -0.005
    min_activity_speed = 0.3
    min_activity_accel = 0.5

    for ol_j in ol_jerseys:
        for dl_j in dl_jerseys:
            pair_df = build_pair_timeseries(df_window, ol_j, dl_j)
            if pair_df.height < 5:
                continue

            info = score_pair_engagement(pair_df)

            frame_ids = pair_df["frame_id"].to_numpy()
            distances = pair_df["pairwise_distance"].to_numpy()
            distance_changes = pair_df["distance_change"].to_numpy()
            frame_deltas = pair_df["frame_delta"].to_numpy()
            ol_speeds = pair_df["ol_s"].to_numpy()
            dl_speeds = pair_df["dl_s"].to_numpy()
            ol_accels = pair_df["ol_a"].to_numpy()
            dl_accels = pair_df["dl_a"].to_numpy()
            min_idx = info.get("min_distance_idx", 0)
            min_frame = int(frame_ids[min_idx]) if len(frame_ids) > min_idx else int(frame_ids[0])
            first_close_frame = None
            if len(distances) > 0:
                close_idx = np.where(distances < contact_threshold)[0]
                if close_idx.size > 0:
                    first_close_frame = int(frame_ids[close_idx[0]])

            first_active_closing_frame = None
            for i in range(1, len(distance_changes)):
                dc = distance_changes[i]
                fd = frame_deltas[i]
                if dc is None or np.isnan(dc) or fd is None or np.isnan(fd) or fd != 1:
                    continue
                if dc < active_closing_threshold:
                    max_speed = max(ol_speeds[i], dl_speeds[i])
                    max_accel = max(ol_accels[i], dl_accels[i])

                    if max_speed >= min_activity_speed or max_accel >= min_activity_accel:
                        first_active_closing_frame = int(frame_ids[i])
                        break

            info["min_distance_frame"] = min_frame
            info["first_close_frame"] = first_close_frame
            info["first_active_closing_frame"] = first_active_closing_frame

            score = info["engagement_score"]
            if score > best_score + tiebreak_score_eps:
                best_score = score
                best_pair = (ol_j, dl_j)
                best_pair_df = pair_df
                best_info = info
            elif best_info is None or abs(score - best_score) <= tiebreak_score_eps:
                cand_active = info.get("first_active_closing_frame")
                best_active = best_info.get("first_active_closing_frame") if best_info else None
                cand_min = info.get("min_distance_frame")
                best_min = best_info.get("min_distance_frame") if best_info else None
                cand_first = info.get("first_close_frame")
                best_first = best_info.get("first_close_frame") if best_info else None

                prefer = False
                if cand_active is not None and best_active is not None and cand_active != best_active:
                    prefer = cand_active < best_active
                elif cand_min is not None and best_min is not None and cand_min != best_min:
                    prefer = cand_min < best_min
                elif cand_first is not None and best_first is not None and cand_first != best_first:
                    prefer = cand_first < best_first

                if best_info is None or prefer:
                    best_score = score
                    best_pair = (ol_j, dl_j)
                    best_pair_df = pair_df
                    best_info = info

    if best_pair[0] is None:
        raise ValueError("Could not identify engaged OL-DL pair")

    return best_pair[0], best_pair[1], best_pair_df, best_info

def detect_rep_start(pair_df: pl.DataFrame) -> int:
    if pair_df.height == 0:
        return 0
    pair_df = pair_df.sort("frame_id")
    frame_ids = pair_df["frame_id"].to_numpy()
    dl_x = pair_df["dl_x"].to_numpy()
    ol_a = pair_df["ol_a"].to_numpy()
    dl_a = pair_df["dl_a"].to_numpy()
    ol_s = pair_df["ol_s"].to_numpy()
    dl_s = pair_df["dl_s"].to_numpy()
    frame_deltas = np.concatenate([[np.nan], np.diff(frame_ids)])
    n = len(frame_ids)

    hold_frames = 10
    lookback_frames = 15
    accel_sum_threshold = 1.5
    speed_sum_threshold = 1.1

    crossing_idx = None
    for i in range(0, n - hold_frames + 1):
        if dl_x[i] <= CROSSING_X:
            continue
        if i > 0 and dl_x[i-1] > CROSSING_X:
            continue
        run_ok = True
        for j in range(hold_frames):
            idx = i + j
            if dl_x[idx] <= CROSSING_X:
                run_ok = False
                break
            if j > 0:
                fd = frame_deltas[idx]
                if fd is None or np.isnan(fd) or fd != 1:
                    run_ok = False
                    break
        if run_ok:
            crossing_idx = i
            break

    if crossing_idx is None:
        return int(frame_ids[0])

    lookback_start = max(0, crossing_idx - lookback_frames)

    for i in range(lookback_start, crossing_idx + 1):
        accel_sum = ol_a[i] + dl_a[i]
        speed_sum = ol_s[i] + dl_s[i]
        if not np.isnan(accel_sum) and ((accel_sum >= accel_sum_threshold) | (speed_sum >= speed_sum_threshold)):
            return int(frame_ids[i])

    return int(frame_ids[crossing_idx])

def detect_rep_end(pair_df: pl.DataFrame, rep_start_frame: int) -> int:
    search_delay_frames = 10
    x_decrease_threshold = -0.05
    consecutive_frames = 10
    stagnation_threshold = 0.01
    stagnation_frames = 3

    search_start_frame = rep_start_frame + search_delay_frames
    pair_after_delay = pair_df.filter(pl.col("frame_id") >= search_start_frame)

    min_frames_needed = min(consecutive_frames, stagnation_frames)
    if pair_after_delay.height < min_frames_needed:
        return int(pair_df["frame_id"].max())

    pair_after_delay = pair_after_delay.sort("frame_id")
    pair_after_delay = pair_after_delay.with_columns([
        (pl.col("frame_id") - pl.col("frame_id").shift(1)).alias("frame_delta"),
        (pl.col("ol_x") - pl.col("ol_x").shift(1)).alias("ol_x_delta"),
        (pl.col("ol_y") - pl.col("ol_y").shift(1)).alias("ol_y_delta"),
        (pl.col("dl_x") - pl.col("dl_x").shift(1)).alias("dl_x_delta"),
        (pl.col("dl_y") - pl.col("dl_y").shift(1)).alias("dl_y_delta"),
    ])

    frame_ids = pair_after_delay["frame_id"].to_numpy()
    frame_deltas = pair_after_delay["frame_delta"].to_numpy()
    ol_x_deltas = pair_after_delay["ol_x_delta"].to_numpy()
    ol_y_deltas = pair_after_delay["ol_y_delta"].to_numpy()
    dl_x_deltas = pair_after_delay["dl_x_delta"].to_numpy()
    dl_y_deltas = pair_after_delay["dl_y_delta"].to_numpy()
    n = len(frame_ids)

    for i in range(1, n):
        if i <= n - consecutive_frames:
            ol_retreat_run = True
            for j in range(consecutive_frames):
                idx = i + j
                if idx >= n:
                    ol_retreat_run = False
                    break
                fd = frame_deltas[idx]
                is_consecutive = fd is not None and not np.isnan(fd) and fd == 1
                ol_xd = ol_x_deltas[idx]
                ol_retreating = ol_xd is not None and not np.isnan(ol_xd) and ol_xd < x_decrease_threshold
                if not (is_consecutive and ol_retreating):
                    ol_retreat_run = False
                    break
            if ol_retreat_run:
                return int(frame_ids[i])

            dl_retreat_run = True
            for j in range(consecutive_frames):
                idx = i + j
                if idx >= n:
                    dl_retreat_run = False
                    break
                fd = frame_deltas[idx]
                is_consecutive = fd is not None and not np.isnan(fd) and fd == 1
                dl_xd = dl_x_deltas[idx]
                dl_retreating = dl_xd is not None and not np.isnan(dl_xd) and dl_xd < x_decrease_threshold
                if not (is_consecutive and dl_retreating):
                    dl_retreat_run = False
                    break
            if dl_retreat_run:
                return int(frame_ids[i])

        if i <= n - stagnation_frames:
            ol_stagnant_run = True
            for j in range(stagnation_frames):
                idx = i + j
                if idx >= n:
                    ol_stagnant_run = False
                    break
                fd = frame_deltas[idx]
                is_consecutive = fd is not None and not np.isnan(fd) and fd == 1
                ol_xd = ol_x_deltas[idx]
                ol_yd = ol_y_deltas[idx]
                ol_x_stagnant = ol_xd is not None and not np.isnan(ol_xd) and abs(ol_xd) < stagnation_threshold
                ol_y_stagnant = ol_yd is not None and not np.isnan(ol_yd) and abs(ol_yd) < stagnation_threshold
                ol_is_stagnant = ol_x_stagnant and ol_y_stagnant
                if not (is_consecutive and ol_is_stagnant):
                    ol_stagnant_run = False
                    break
            if ol_stagnant_run:
                return int(frame_ids[i])

            dl_stagnant_run = True
            for j in range(stagnation_frames):
                idx = i + j
                if idx >= n:
                    dl_stagnant_run = False
                    break
                fd = frame_deltas[idx]
                is_consecutive = fd is not None and not np.isnan(fd) and fd == 1
                dl_xd = dl_x_deltas[idx]
                dl_yd = dl_y_deltas[idx]
                dl_x_stagnant = dl_xd is not None and not np.isnan(dl_xd) and abs(dl_xd) < stagnation_threshold
                dl_y_stagnant = dl_yd is not None and not np.isnan(dl_yd) and abs(dl_yd) < stagnation_threshold
                dl_is_stagnant = dl_x_stagnant and dl_y_stagnant
                if not (is_consecutive and dl_is_stagnant):
                    dl_stagnant_run = False
                    break
            if dl_stagnant_run:
                return int(frame_ids[i])

    return int(frame_ids[-1])

def detect_rep(df_window: pl.DataFrame, window_start: int, window_end: int, rep_number: int = 0, trigger_frame: int | None = None) -> dict:
    min_rep_duration_local = 0
    max_retries = 3

    excluded_ol_jerseys = []
    best_result = None
    best_rep_duration = 0

    for attempt in range(max_retries + 1):
        try:
            ol_jersey, dl_jersey, pair_df, engagement_info = identify_ol_dl_pair(
                df_window, excluded_ol_jerseys=excluded_ol_jerseys, trigger_frame=trigger_frame
            )

            rep_start_frame = detect_rep_start(pair_df)
            rep_end_frame = detect_rep_end(pair_df, rep_start_frame)

            rep_duration = rep_end_frame - rep_start_frame

            start_ts_row = pair_df.filter(pl.col("frame_id") == rep_start_frame)
            end_ts_row = pair_df.filter(pl.col("frame_id") == rep_end_frame)
            start_ts = start_ts_row["ts"][0] if start_ts_row.height > 0 else None
            end_ts = end_ts_row["ts"][0] if end_ts_row.height > 0 else None

            result = {
                "window_start": window_start,
                "window_end": window_end,
                "ol_jersey": ol_jersey,
                "dl_jersey": dl_jersey,
                "rep_start_frame": rep_start_frame,
                "rep_end_frame": rep_end_frame,
                "start_ts": start_ts,
                "end_ts": end_ts,
                "engagement_info": engagement_info,
                "pair_timeseries": pair_df,
                "rep_number": rep_number,
                "retry_attempt": attempt,
                "excluded_ol_jerseys": list(excluded_ol_jerseys),
            }

            if rep_duration > best_rep_duration:
                best_rep_duration = rep_duration
                best_result = result

            if rep_duration >= min_rep_duration_local:
                return result

            if attempt < max_retries:
                excluded_ol_jerseys.append(ol_jersey)

        except ValueError as e:
            if best_result is not None:
                return best_result
            raise e

    return best_result

def find_next_dl_trigger(df: pl.DataFrame, start_frame: int, end_frame: int | None = None) -> int | None:
    dl_df = df.filter(pl.col("is_dlineman") == True)

    if end_frame is not None:
        dl_df = dl_df.filter(
            (pl.col("frame_id") >= start_frame) &
            (pl.col("frame_id") <= end_frame)
        )
    else:
        dl_df = dl_df.filter(pl.col("frame_id") >= start_frame)

    if dl_df.height == 0:
        return None

    dl_with_prev = (
        dl_df
        .sort(["jersey_number", "frame_id"])
        .with_columns(
            pl.col("x").shift(1).over("jersey_number").alias("x_prev")
        )
    )

    crossings = dl_with_prev.filter(
        (pl.col("x_prev").is_not_null()) &
        (pl.col("x_prev") < TRIGGER_X) &
        (pl.col("x") >= TRIGGER_X)
    )

    if crossings.height == 0:
        return None

    earliest_frame = crossings["frame_id"].min()
    return int(earliest_frame)

def run_supra_algorithm(df: pl.DataFrame, start_frame: int, end_frame: int, verbose: bool = True) -> list:
    results = []
    rep_number = 1
    current_scan_position = start_frame

    if verbose:
        print(f"Starting supra-algorithm scan from frame {start_frame} to {end_frame}")
        print(f"Trigger X: {TRIGGER_X}, Window: [-{WINDOW_BEFORE_TRIGGER}, +{WINDOW_AFTER_TRIGGER}]")
        print(f"OL position filter: x <= {OL_MAX_X_AT_TRIGGER} at trigger frame")
        print("=" * 70)

    while current_scan_position < end_frame:
        trigger_frame = find_next_dl_trigger(df, current_scan_position, end_frame)

        if trigger_frame is None:
            if verbose:
                print(f"No more triggers found after frame {current_scan_position}")
            break

        if verbose:
            print(f"\nRep {rep_number}: Trigger at frame {trigger_frame}")

        window_start = max(start_frame, trigger_frame - WINDOW_BEFORE_TRIGGER)
        window_end = min(end_frame, trigger_frame + WINDOW_AFTER_TRIGGER)

        if verbose:
            print(f"  Window: [{window_start}, {window_end}]")

        df_window = df.filter(
            (pl.col("frame_id") >= window_start) &
            (pl.col("frame_id") <= window_end)
        )

        if df_window.height == 0:
            if verbose:
                print("  Empty window, skipping")
            current_scan_position = trigger_frame + 10
            continue

        try:
            result = detect_rep(df_window, window_start, window_end, rep_number, trigger_frame=trigger_frame)

            if result is not None:
                rep_duration = result['rep_end_frame'] - result['rep_start_frame']

                if rep_duration >= MIN_REP_DURATION:
                    results.append(result)
                    if verbose:
                        print(f"  Detected: OL {result['ol_jersey']} vs DL {result['dl_jersey']}")
                        print(f"  Rep frames: {result['rep_start_frame']} - {result['rep_end_frame']} ({rep_duration} frames)")

                    rep_number += 1
                    current_scan_position = result['rep_start_frame'] + WINDOW_AFTER_TRIGGER
                else:
                    if verbose:
                        print(f"  Rep too short ({rep_duration} frames < {MIN_REP_DURATION}), skipping")
                    current_scan_position = trigger_frame + 10
            else:
                if verbose:
                    print("  No valid rep detected, skipping")
                current_scan_position = trigger_frame + 10

        except ValueError as e:
            if verbose:
                print(f"  Error: {e}, skipping")
            current_scan_position = trigger_frame + 10

    if verbose:
        print("\n" + "=" * 70)
        print(f"Supra-algorithm complete. Detected {len(results)} reps.")

    return results

def build_output_dataframe(results: list, jersey_to_zebra: dict) -> pl.DataFrame:
    if not results:
        return pl.DataFrame({
            'rep_number': [], 'rep_start_frame': [], 'rep_end_frame': [],
            'ol_jersey': [], 'dl_jersey': [], 'ol_zebra_id': [], 'dl_zebra_id': [],
            'start_ts': [], 'end_ts': [], 'duration_frames': [], 'duration_seconds': [],
        })

    rows = []
    for r in results:
        duration_frames = r['rep_end_frame'] - r['rep_start_frame'] + 1
        rows.append({
            'rep_number': r['rep_number'],
            'rep_start_frame': r['rep_start_frame'],
            'rep_end_frame': r['rep_end_frame'],
            'ol_jersey': r['ol_jersey'],
            'dl_jersey': r['dl_jersey'],
            'ol_zebra_id': jersey_to_zebra.get(r['ol_jersey']),
            'dl_zebra_id': jersey_to_zebra.get(r['dl_jersey']),
            'start_ts': r['start_ts'],
            'end_ts': r['end_ts'],
            'duration_frames': duration_frames,
            'duration_seconds': duration_frames * 0.1,
        })

    return pl.DataFrame(rows)


In [5]:
def build_wide_rep_data(df_reps: pl.DataFrame, result: dict, session_name: str, player_id_col: str) -> pl.DataFrame | None:
    rep_start = result['rep_start_frame']
    rep_end = result['rep_end_frame']
    ol_jersey = result['ol_jersey']
    dl_jersey = result['dl_jersey']
    rep_number = result['rep_number']

    rep_df = df_reps.filter(
        (pl.col("frame_id") >= rep_start) &
        (pl.col("frame_id") <= rep_end)
    )

    base_cols = ["frame_id", "ts"]
    metric_cols = ["x", "y", "s", "a", "dir", "z", "sa", "dis"]
    id_cols = ["jersey_number", "gsis_id", player_id_col]
    all_player_cols = metric_cols + id_cols
    available_cols = [c for c in all_player_cols if c in rep_df.columns]
    select_cols = base_cols + available_cols

    ol_data = (
        rep_df
        .filter(pl.col("jersey_number") == ol_jersey)
        .select([c for c in select_cols if c in rep_df.columns])
        .unique(subset=["frame_id"])
        .sort("frame_id")
    )

    ol_rename = {c: f"ol_{c}" for c in available_cols}
    ol_data = ol_data.rename(ol_rename)

    dl_data = (
        rep_df
        .filter(pl.col("jersey_number") == dl_jersey)
        .select([c for c in select_cols if c in rep_df.columns])
        .unique(subset=["frame_id"])
        .sort("frame_id")
    )

    dl_rename = {c: f"dl_{c}" for c in available_cols}
    dl_data = dl_data.rename(dl_rename)

    wide_df = ol_data.join(dl_data.drop("ts"), on="frame_id", how="inner")

    if wide_df.height == 0:
        return None

    wide_df = wide_df.with_columns([
        pl.lit(rep_number).alias("rep_number"),
        pl.lit(session_name).alias("session_name"),
    ])

    wide_df = wide_df.with_columns(
        (((pl.col("ol_x") - pl.col("dl_x"))**2 + (pl.col("ol_y") - pl.col("dl_y"))**2).sqrt())
        .alias("pairwise_distance")
    )

    wide_df = wide_df.with_columns(
        (pl.col("pairwise_distance") - pl.col("pairwise_distance").shift(1))
        .alias("distance_change")
    )

    wide_df = wide_df.with_columns(
        (pl.col("frame_id") - pl.col("frame_id").shift(1))
        .alias("frame_delta")
    )

    return wide_df


In [6]:
VERBOSE = True

summary_dfs = {}
wide_rep_dfs = {}
results_by_session = {}

for session_name, cfg in practice_data.items():
    practice_file = find_practice_file(session_name, PRACTICE_DATA_DIR)

    if practice_file is None:
        print(f"Skipping {session_name}: practice file not found")
        continue

    drill_type = cfg["drill_type_filter"]

    print(f"\n=== Processing {session_name} ===")
    print(f"File: {practice_file.name}")
    print(f"Drill filter: {drill_type}")

    df_raw = (
        pl.scan_parquet(practice_file)
        .filter(pl.col("entity_type") == "player")
        .with_columns([pl.col("ts").cast(pl.Utf8)])
        .collect()
    )

    df_filtered = df_raw.filter(pl.col("drill_type") == drill_type)
    if df_filtered.height == 0:
        print(f"No rows match drill type '{drill_type}' for {session_name}")
        summary_dfs[session_name] = build_output_dataframe([], {})
        wide_rep_dfs[session_name] = None
        results_by_session[session_name] = []
        continue

    frame_map = (
        df_filtered
        .select("ts")
        .unique()
        .sort("ts")
        .with_row_index(name="frame_id", offset=0)
    )

    df_with_frames = df_filtered.join(frame_map, on="ts", how="left")
    df_full_raw = df_with_frames.with_columns(pl.col("ts").str.to_datetime().alias("parsed_ts"))

    los = float(cfg["LOS"])
    flip = bool(cfg["flip"])
    df_full_field = transform_coordinates(df_full_raw, los, flip)
    df = df_full_field.filter((pl.col("x") >= X_MIN) & (pl.col("x") <= X_MAX))

    player_id_col = "zebra_id" if "zebra_id" in df.columns else "id"
    df = impute_missing_frames(df, frame_map, player_id_col)

    rep_start_ts = cfg["START_TS"]
    rep_end_ts = cfg["END_TS"]
    df_reps = df.filter(
        (pl.col("ts") >= rep_start_ts) &
        (pl.col("ts") <= rep_end_ts)
    )

    df_reps = df_reps.with_columns([
        pl.col("jersey_number").is_in(cfg["olinemen"]).alias("is_olineman"),
        pl.col("jersey_number").is_in(cfg["dlinemen"]).alias("is_dlineman"),
    ])

    if df_reps.height == 0:
        print(f"No rep data in configured window for {session_name}")
        summary_dfs[session_name] = build_output_dataframe([], {})
        wide_rep_dfs[session_name] = None
        results_by_session[session_name] = []
        continue

    jersey_to_player = dict(
        df_reps.select(["jersey_number", player_id_col]).unique().iter_rows()
    )

    start_frame = int(df_reps["frame_id"].min())
    end_frame = int(df_reps["frame_id"].max())

    results = run_supra_algorithm(df_reps, start_frame, end_frame, verbose=VERBOSE)
    output_df = build_output_dataframe(results, jersey_to_player)
    summary_dfs[session_name] = output_df.with_columns(pl.lit(session_name).alias("session_name"))
    results_by_session[session_name] = results

    all_wide_reps = []
    for result in results:
        wide_rep = build_wide_rep_data(df_reps, result, session_name, player_id_col)
        if wide_rep is not None:
            all_wide_reps.append(wide_rep)

    if all_wide_reps:
        combined_wide_df = pl.concat(all_wide_reps, how="vertical")
        id_cols_order = ["session_name", "rep_number", "frame_id", "ts"]
        ol_cols = [c for c in combined_wide_df.columns if c.startswith("ol_")]
        dl_cols = [c for c in combined_wide_df.columns if c.startswith("dl_")]
        derived_cols = ["pairwise_distance", "distance_change", "frame_delta"]
        final_cols = id_cols_order + sorted(ol_cols) + sorted(dl_cols) + derived_cols
        final_cols = [c for c in final_cols if c in combined_wide_df.columns]
        combined_wide_df = combined_wide_df.select(final_cols)
    else:
        combined_wide_df = None

    wide_rep_dfs[session_name] = combined_wide_df

    print(f"Detected {len(results)} reps for {session_name}")
    if combined_wide_df is not None:
        print(f"Wide reps rows: {combined_wide_df.height:,}")
    else:
        print("Wide reps rows: 0")



=== Processing 2024WestPractice1 ===
File: 2024_West_Practice_1.snappy.parquet
Drill filter: Bigs 1 on 1 - Skill 7 on 7
Starting supra-algorithm scan from frame 49734 to 56744
Trigger X: 0.5, Window: [-40, +80]
OL position filter: x <= 6.0 at trigger frame

Rep 1: Trigger at frame 49824
  Window: [49784, 49904]
  Detected: OL 71 vs DL 91
  Rep frames: 49815 - 49842 (27 frames)

Rep 2: Trigger at frame 50039
  Window: [49999, 50119]
  Detected: OL 71 vs DL 91
  Rep frames: 50029 - 50073 (44 frames)

Rep 3: Trigger at frame 50177
  Window: [50137, 50257]
  Detected: OL 69 vs DL 8
  Rep frames: 50163 - 50205 (42 frames)

Rep 4: Trigger at frame 50353
  Window: [50313, 50433]
  Detected: OL 69 vs DL 8
  Rep frames: 50342 - 50375 (33 frames)

Rep 5: Trigger at frame 50479
  Window: [50439, 50559]
  Detected: OL 54 vs DL 52
  Rep frames: 50465 - 50502 (37 frames)

Rep 6: Trigger at frame 50633
  Window: [50593, 50713]
  Detected: OL 54 vs DL 52
  Rep frames: 50621 - 50665 (44 frames)

Rep 7

In [7]:
# Concatenate wide reps (equivalent to data_curation cell 3)
wide_rep_list = [df for df in wide_rep_dfs.values() if df is not None and df.height > 0]
if wide_rep_list:
    all_reps_df = pl.concat(wide_rep_list, how="vertical")
else:
    all_reps_df = pl.DataFrame()

print(f"\n=== Concatenated DataFrame ===")
print(f"Total rows: {all_reps_df.height:,}")
print(f"Columns: {len(all_reps_df.columns)}")
if all_reps_df.height > 0:
    print(f"Unique sessions: {all_reps_df['session_name'].n_unique()}")

    rep_counts = (
        all_reps_df
        .select(["session_name", "rep_number"])
        .unique()
        .group_by("session_name")
        .agg(pl.col("rep_number").count().alias("n_reps"))
        .sort("session_name")
    )
    print(f"\nReps per session:")
    for row in rep_counts.iter_rows(named=True):
        print(f"  {row['session_name']}: {row['n_reps']} reps")

    total_reps = rep_counts["n_reps"].sum()
    print(f"\nTotal reps: {total_reps}")



=== Concatenated DataFrame ===
Total rows: 5,345
Columns: 29
Unique sessions: 6

Reps per session:
  2024EastPractice1: 19 reps
  2024EastPractice2: 21 reps
  2024EastPractice3: 14 reps
  2024WestPractice1: 33 reps
  2024WestPractice2: 24 reps
  2024WestPractice3: 22 reps

Total reps: 133


In [9]:
# Plotly field view: all players in bounds + OL/DL highlight
# pip install plotly anywidget if needed

import numpy as np
import polars as pl
import plotly.graph_objects as go
from ipywidgets import Dropdown, Play, IntSlider, Button, HBox, VBox, link
from IPython.display import display

reps_df = all_reps_df.with_columns(
    pl.cum_count("rep_number")
    .over(["session_name", "rep_number"])
    .alias("frame_number")
)

frame_col = "frame_number"

rep_keys = (
    reps_df
    .select(["session_name", "rep_number"])
    .unique()
    .sort(["session_name", "rep_number"])
)

rep_options = [
    (f"{session} | rep {rep}", (session, int(rep)))
    for session, rep in rep_keys.iter_rows()
]

# ====== Load full player tracking per session ======
if "practice_data" not in globals():
    raise ValueError("practice_data not found; run earlier cells to define session configs.")
if "PRACTICE_DATA_DIR" not in globals():
    raise ValueError("PRACTICE_DATA_DIR not found; run earlier cells to define data paths.")
if "find_practice_file" not in globals() or "transform_coordinates" not in globals():
    raise ValueError("Missing helper functions; run earlier cells that define find_practice_file and transform_coordinates.")

X_MIN = -15.0
X_MAX = 15.0
Y_MIN = 10.0
Y_MAX = 40.0

print("Loading full-session player data...")
session_cache = {}
for session in reps_df["session_name"].unique().to_list():
    cfg = practice_data.get(session)
    if cfg is None:
        print(f"No config for session {session}, skipping")
        continue

    practice_file = find_practice_file(session, PRACTICE_DATA_DIR)
    if practice_file is None:
        print(f"Practice file not found for session {session}, skipping")
        continue

    drill_type = cfg["drill_type_filter"]
    df_raw = (
        pl.scan_parquet(practice_file)
        .filter(pl.col("entity_type") == "player")
        .with_columns([pl.col("ts").cast(pl.Utf8)])
        .collect()
    )

    df_filtered = df_raw.filter(pl.col("drill_type") == drill_type)
    if df_filtered.height == 0:
        print(f"No rows match drill type '{drill_type}' for {session}")
        continue

    frame_map = (
        df_filtered
        .select("ts")
        .unique()
        .sort("ts")
        .with_row_index(name="frame_id", offset=0)
    )

    df_with_frames = df_filtered.join(frame_map, on="ts", how="left")
    df_full_raw = df_with_frames.with_columns(pl.col("ts").str.to_datetime().alias("parsed_ts"))

    los = float(cfg["LOS"])
    flip = bool(cfg["flip"])
    df_full_field = transform_coordinates(df_full_raw, los, flip)
    df = df_full_field.filter((pl.col("x") >= X_MIN) & (pl.col("x") <= X_MAX))

    rep_start_ts = cfg["START_TS"]
    rep_end_ts = cfg["END_TS"]
    df_reps = df.filter((pl.col("ts") >= rep_start_ts) & (pl.col("ts") <= rep_end_ts))

    # Keep only players in field view bounds for plotting
    df_reps = df_reps.filter((pl.col("y") >= Y_MIN) & (pl.col("y") <= Y_MAX))

    session_cache[session] = df_reps

print(f"Loaded {len(session_cache)} sessions")

# ====== Helper to choose matching id column ======

def pick_id_base(reps_cols, session_cols):
    if "zebra_id" in session_cols and "ol_zebra_id" in reps_cols:
        return "zebra_id"
    if "id" in session_cols and "ol_id" in reps_cols:
        return "id"
    if "gsis_id" in session_cols and "ol_gsis_id" in reps_cols:
        return "gsis_id"
    return "jersey_number"

# ====== Pre-cache rep data ======
print("Pre-caching rep frames...")
rep_cache = {}
for _, rep_key in rep_options:
    session, rep_number = rep_key
    rep_df = (
        reps_df
        .filter((pl.col("session_name") == session) & (pl.col("rep_number") == rep_number))
        .sort(frame_col)
    )
    if rep_df.height == 0:
        continue

    session_df = session_cache.get(session)
    if session_df is None or session_df.height == 0:
        continue

    id_base = pick_id_base(reps_df.columns, session_df.columns)
    ol_id_col = f"ol_{id_base}"
    dl_id_col = f"dl_{id_base}"

    if ol_id_col not in rep_df.columns or dl_id_col not in rep_df.columns:
        id_base = "jersey_number"
        ol_id_col = "ol_jersey_number"
        dl_id_col = "dl_jersey_number"

    ol_id = rep_df[ol_id_col][0]
    dl_id = rep_df[dl_id_col][0]

    frame_numbers = rep_df[frame_col].to_list()
    frame_ids = rep_df["frame_id"].to_list()
    frame_min = min(frame_ids)
    frame_max = max(frame_ids)

    players_df = session_df.filter((pl.col("frame_id") >= frame_min) & (pl.col("frame_id") <= frame_max))

    frame_groups = (
        players_df
        .group_by("frame_id")
        .agg(
            pl.col("x").implode().alias("x"),
            pl.col("y").implode().alias("y"),
            pl.col(id_base).implode().alias("pid"),
        )
        .sort("frame_id")
    )

    others_by_frame = {}
    for row in frame_groups.iter_rows(named=True):
        xs = row["x"]
        ys = row["y"]
        pids = row["pid"]
        other_xs = [x for x, p in zip(xs, pids) if p not in {ol_id, dl_id}]
        other_ys = [y for y, p in zip(ys, pids) if p not in {ol_id, dl_id}]
        others_by_frame[row["frame_id"]] = (other_xs, other_ys)

    rep_cache[rep_key] = {
        "session": session,
        "rep_number": rep_number,
        "frame_numbers": frame_numbers,
        "frame_ids": frame_ids,
        "ol_x": rep_df["ol_x"].to_list(),
        "ol_y": rep_df["ol_y"].to_list(),
        "dl_x": rep_df["dl_x"].to_list(),
        "dl_y": rep_df["dl_y"].to_list(),
        "others_by_frame": others_by_frame,
    }

print(f"Loaded {len(rep_cache)} reps")

# ====== Figure ======
fig = go.FigureWidget()

# Add field yard lines
for x_val in range(-15, 16, 5):
    color = "yellow" if x_val == 0 else "rgba(255,255,255,0.5)"
    width = 2 if x_val == 0 else 1
    fig.add_shape(
        type="line",
        x0=x_val, x1=x_val,
        y0=Y_MIN, y1=Y_MAX,
        line=dict(color=color, width=width),
        layer="below",
    )

# Other players (faded)
fig.add_trace(go.Scatter(
    x=[], y=[], mode="markers", name="Other Players",
    marker=dict(size=8, color="rgba(220,220,220,0.5)", line=dict(color="rgba(255,255,255,0.4)", width=1)),
    showlegend=True
))

# OL highlight
fig.add_trace(go.Scatter(
    x=[], y=[], mode="markers", name="OL",
    marker=dict(size=18, color="dodgerblue", line=dict(color="white", width=2)),
    showlegend=True
))

# DL highlight
fig.add_trace(go.Scatter(
    x=[], y=[], mode="markers", name="DL",
    marker=dict(size=18, color="red", line=dict(color="white", width=2)),
    showlegend=True
))

fig.update_layout(
    width=900, height=500,
    title="",
    showlegend=True,
    legend=dict(x=0.02, y=0.98),
    plot_bgcolor="#2e7d32",
    paper_bgcolor="white",
)

fig.update_xaxes(range=[X_MIN, X_MAX], title="X (yards)", showgrid=False)
fig.update_yaxes(range=[Y_MIN, Y_MAX], title="Y (yards)", showgrid=False, scaleanchor="x", scaleratio=1)

# ====== Widgets ======
rep_dropdown = Dropdown(options=rep_options, description="Rep")
play = Play(interval=100, min=0, max=1, step=1, value=0)
frame_slider = IntSlider(min=0, max=1, step=1, value=0, description="Frame")
back_button = Button(description="◀")
forward_button = Button(description="▶")

current_data = {"ref": None}

def update_plot(frame_idx):
    data = current_data["ref"]
    if data is None:
        return
    idx = max(0, min(frame_idx, len(data["frame_numbers"]) - 1))
    frame_id = data["frame_ids"][idx]
    frame_number = data["frame_numbers"][idx]
    other_xy = data["others_by_frame"].get(frame_id, ([], []))

    with fig.batch_update():
        # Other players
        fig.data[0].x = other_xy[0]
        fig.data[0].y = other_xy[1]

        # OL/DL
        fig.data[1].x = [data["ol_x"][idx]]
        fig.data[1].y = [data["ol_y"][idx]]
        fig.data[2].x = [data["dl_x"][idx]]
        fig.data[2].y = [data["dl_y"][idx]]

        fig.layout.title = f"{data['session']} | rep {data['rep_number']} | {frame_col} {frame_number}"


def load_rep(rep_key):
    if rep_key not in rep_cache:
        return
    current_data["ref"] = rep_cache[rep_key]
    data = current_data["ref"]
    max_idx = len(data["frame_numbers"]) - 1

    frame_slider.max = max_idx
    play.max = max_idx
    frame_slider.value = 0
    update_plot(0)


def on_rep_change(change):
    if change["name"] == "value":
        load_rep(change["new"])


def on_frame_change(change):
    if change["name"] == "value":
        update_plot(change["new"])


link((play, "value"), (frame_slider, "value"))
rep_dropdown.observe(on_rep_change, names="value")
frame_slider.observe(on_frame_change, names="value")
back_button.on_click(lambda _: setattr(frame_slider, "value", max(0, frame_slider.value - 1)))
forward_button.on_click(lambda _: setattr(frame_slider, "value", min(frame_slider.max, frame_slider.value + 1)))

controls = HBox([play, back_button, forward_button, frame_slider])
ui = VBox([rep_dropdown, controls, fig])
display(ui)

load_rep(rep_dropdown.value)


Loading full-session player data...
Loaded 6 sessions
Pre-caching rep frames...
Loaded 133 reps


VBox(children=(Dropdown(description='Rep', options=(('2024EastPractice1 | rep 1', ('2024EastPractice1', 1)), (…

In [10]:
# Filter Faulty Reps
EXCLUDED_REPS = [
    # 2024EastPractice1
    ("2024EastPractice1", 5), # False Positive
    ("2024EastPractice1", 10), # False Positive
    ("2024EastPractice1", 15), # Imputation Issue
    ("2024EastPractice1", 17), # Imputation Issue
    ("2024EastPractice1", 19), # False Positive
    # 2024EastPractice2
    ("2024EastPractice2", 6), # Imputation Issue
    ("2024EastPractice2", 8), # False Positive
    ("2024EastPractice2", 14), # Imputation Issue / Wonky Start
    ("2024EastPractice2", 20), # False Positive
    ("2024EastPractice2", 21), # False Positive
    # 2024WestPractice1
    ("2024WestPractice1", 14), # Wrong Pair
    # ("2024WestPractice1", 15), # THE Logan Lee Rep. He hits a swim move that makes this look like a FP. Clip is here: https://www.youtube.com/shorts/bKbsMJZmPkw
    ("2024WestPractice1", 18), # False Positive
    ("2024WestPractice1", 19), # Missing OL 
    ("2024WestPractice1", 22), # Imputation Issue
    ("2024WestPractice1", 29), # Imputation Issue
    # 2024WestPractice2
    ("2024WestPractice2", 3), # Imputation Issue
    ("2024WestPractice2", 8), # False Positive
    # 2024WestPractice3
    ("2024WestPractice3", 11) # False Positive
]

all_bad_reps = EXCLUDED_REPS

if all_reps_df.height == 0:
    all_reps_df_filtered = all_reps_df
    print("\nNo reps to filter.")
else:
    filter_conditions = [
        (pl.col("session_name") == session) & (pl.col("rep_number") == rep)
        for session, rep in EXCLUDED_REPS
    ]

    exclude_condition = filter_conditions[0]
    for cond in filter_conditions[1:]:
        exclude_condition = exclude_condition | cond

    all_reps_df_filtered = all_reps_df.filter(~exclude_condition)

    print(f"\nOriginal rows: {all_reps_df.height:,}")
    print(f"Filtered rows: {all_reps_df_filtered.height:,}")
    print(f"Removed rows: {all_reps_df.height - all_reps_df_filtered.height:,}")
    print(f"Removed {len(all_bad_reps)} bad reps total")



Original rows: 5,345
Filtered rows: 4,641
Removed rows: 704
Removed 18 bad reps total


#### VALIDATION
Through manual inspection, the start and end frames from 2024 West Practice 1 and 2024 West Practice 2 marked. 2024 West Practice 1 was used to tune the algorithms, and 2024 West Practice 2 was used to validate the results. For manual validation, the following heuristics were used:

1. Rep Start: The first frame where the sum of the identified OL-DL pair's speed was >= 1.1 or acceleration was >= 1.5. **Note:** These thresholds were based on looking at the data and examining the tracking metrics, and often coincided with the moment an offensive lineman or defensive lineman began to move as part of the rep. They came to inform the algorithm itself, so it's not validation in the truest sense. (The "validation" in the title is a bit of a misnomer for rep start only. The error is very low because the heuristic is used in the algorithm.)
2. Rep End: The first frame where either 1) the identified OL-DL pair is separating beyond a small threshold or 2) either the OL or DL is moving back toward the line of scrimmage for a sustained number of frames. (Compared to rep start, no tracking metrics were used to manually mark the rep end. The nature of how the reps appeared to end informed the algorithm later on.)

To be marked as a rep, there must be at least one OL and one DL engaged in the rep. 

In [11]:
# Split Data
west_practice_2 = all_reps_df.filter(pl.col("session_name") == "2024WestPractice2").filter(pl.col("rep_number") != 8) # Filter out the false positive for matching purposes
west_practice_2_gt = pl.read_csv("~/Desktop/ShrineBowlSumerSportsAnalyticsCompetition/ManualTrackingValidation/2024WestPractice2_rep_ground_truth.csv")
west_practice_2_gt = west_practice_2_gt.with_columns(pl.col("gt_ol_jersey").cast(pl.Utf8), pl.col("gt_dl_jersey").cast(pl.Utf8))

west_practice_2_comparison = (
    west_practice_2
    .group_by("rep_number")
    .agg(
        ol_jersey = pl.col("ol_jersey_number").first(),
        dl_jersey = pl.col("dl_jersey_number").first(),
        first_frame = pl.col("frame_id").min(),
        last_frame = pl.col("frame_id").max(),
    )
    .sort("rep_number")
    .drop("rep_number")
    .with_row_index(name="rep_number", offset=1)
    .with_columns(pl.col("rep_number").cast(pl.Int64)) 
    .join(west_practice_2_gt, on=["rep_number"], how="left")
    .with_columns(
        start_diff = pl.col("first_frame") - pl.col("gt_start_frame"),
        end_diff = pl.col("last_frame") - pl.col("gt_end_frame"),
        ol_match = (pl.col("ol_jersey") == pl.col("gt_ol_jersey")),
        dl_match = (pl.col("dl_jersey") == pl.col("gt_dl_jersey"))
    )
)

mean_start_diff = round(west_practice_2_comparison.select(pl.col("start_diff").mean()).item(), 2)
mean_end_diff = round(west_practice_2_comparison.select(pl.col("end_diff").mean()).item(), 2)

print("Comparison for 2024WestPractice2 (Training Data)")
print(f"The automated system detected {west_practice_2.unique("rep_number").height} reps. There were {west_practice_2_gt.height} ground truth reps.")
print(f"The mean difference between the first frame and the ground truth start frame is {mean_start_diff}")
print(f"The mean difference between the last frame and the ground truth end frame is {mean_end_diff}")
print(f"The automated system correctly matched {west_practice_2_comparison.filter(pl.col("ol_match") & pl.col("dl_match")).height} reps.")

Comparison for 2024WestPractice2 (Training Data)
The automated system detected 23 reps. There were 23 ground truth reps.
The mean difference between the first frame and the ground truth start frame is 1.61
The mean difference between the last frame and the ground truth end frame is 1.87
The automated system correctly matched 23 reps.


##### Save Data

In [12]:
all_reps_df.write_csv("~/Desktop/ShrineBowlSumerSportsAnalyticsCompetition/Output/all_reps_df.csv")
all_reps_df_filtered.write_csv("~/Desktop/ShrineBowlSumerSportsAnalyticsCompetition/Output/all_reps_df_filtered.csv")