In [1]:
import os
os.environ["ONNXRUNTIME_EXECUTION_PROVIDERS"] = "[CUDAExecutionProvider]"

In [2]:
import os
HOME = os.getcwd()
print(HOME)

/home/ubuntu/projects/sure-football-analysis


In [3]:
# from inference import get_model
from ultralytics import YOLO

# ROBOFLOW_API_KEY = os.environ.get("ROBOFLOW_API_KEY")
# PLAYER_DETECTION_MODEL_ID = "football-players-detection-3zvbc/12"
# PLAYER_DETECTION_MODEL = get_model(PLAYER_DETECTION_MODEL_ID, api_key=ROBOFLOW_API_KEY)
PLAYER_DETECTION_MODEL = YOLO("app/models/yolo11_football_v2/weights/best.pt")

In [4]:
import torch
from transformers import AutoProcessor, SiglipVisionModel

SIGLIP_MODEL_PATH = 'google/siglip-base-patch16-224'

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
EMBEDDINGS_MODEL = SiglipVisionModel.from_pretrained(SIGLIP_MODEL_PATH).to(DEVICE)
EMBEDDINGS_PROCESSOR = AutoProcessor.from_pretrained(SIGLIP_MODEL_PATH)

In [5]:
import supervision as sv
import numpy as np
from more_itertools import chunked
from tqdm import tqdm

SOURCE_VIDEO_PATH = "app/test_data/raw/121364_0.mp4"
BATCH_SIZE = 32
PLAYER_ID = 2
STRIDE = 30

frame_generator = sv.get_video_frames_generator(
    source_path=SOURCE_VIDEO_PATH, stride=STRIDE)

crops = []
for frame in tqdm(frame_generator, desc='collecting crops'):
    # result = PLAYER_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
    result = PLAYER_DETECTION_MODEL.predict(frame, conf=0.3)[0]
    # detections = sv.Detections.from_inference(result)
    detections = sv.Detections.from_ultralytics(result)
    detections = detections.with_nms(threshold=0.5, class_agnostic=True)
    detections = detections[detections.class_id == PLAYER_ID]
    players_crops = [sv.crop_image(frame, xyxy) for xyxy in detections.xyxy]
    crops += players_crops


crops = [sv.cv2_to_pillow(crop) for crop in crops]
batches = chunked(crops, BATCH_SIZE)
data = []
with torch.no_grad():
    for batch in tqdm(batches, desc='embedding extraction'):
        inputs = EMBEDDINGS_PROCESSOR(images=batch, return_tensors="pt").to(DEVICE)
        outputs = EMBEDDINGS_MODEL(**inputs)
        embeddings = torch.mean(outputs.last_hidden_state, dim=1).cpu().numpy()
        data.append(embeddings)

data = np.concatenate(data)

collecting crops: 0it [00:00, ?it/s]


0: 736x1280 1 ball, 2 goalkeepers, 20 players, 2 referees, 77.8ms
Speed: 17.0ms preprocess, 77.8ms inference, 235.5ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 1it [00:02,  2.16s/it]


0: 736x1280 1 goalkeeper, 20 players, 2 referees, 32.3ms
Speed: 10.2ms preprocess, 32.3ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 2it [00:02,  1.03it/s]


0: 736x1280 1 goalkeeper, 21 players, 2 referees, 32.5ms
Speed: 6.9ms preprocess, 32.5ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 3it [00:02,  1.70it/s]


0: 736x1280 20 players, 2 referees, 32.5ms
Speed: 8.8ms preprocess, 32.5ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 4it [00:02,  2.47it/s]


0: 736x1280 1 ball, 20 players, 2 referees, 32.6ms
Speed: 10.8ms preprocess, 32.6ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 5it [00:02,  3.22it/s]


0: 736x1280 1 ball, 20 players, 2 referees, 33.0ms
Speed: 10.5ms preprocess, 33.0ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 6it [00:02,  4.02it/s]


0: 736x1280 20 players, 2 referees, 32.8ms
Speed: 8.1ms preprocess, 32.8ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 7it [00:02,  4.72it/s]


0: 736x1280 1 ball, 19 players, 2 referees, 32.5ms
Speed: 10.0ms preprocess, 32.5ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 8it [00:03,  5.42it/s]


0: 736x1280 1 ball, 1 goalkeeper, 20 players, 2 referees, 32.9ms
Speed: 7.0ms preprocess, 32.9ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 9it [00:03,  5.96it/s]


0: 736x1280 1 ball, 21 players, 2 referees, 33.0ms
Speed: 7.3ms preprocess, 33.0ms inference, 1.3ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 10it [00:03,  6.22it/s]


0: 736x1280 1 ball, 22 players, 2 referees, 33.2ms
Speed: 9.9ms preprocess, 33.2ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 11it [00:03,  6.50it/s]


0: 736x1280 1 ball, 1 goalkeeper, 20 players, 2 referees, 33.0ms
Speed: 7.1ms preprocess, 33.0ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 12it [00:03,  6.68it/s]


0: 736x1280 1 goalkeeper, 21 players, 2 referees, 32.4ms
Speed: 10.3ms preprocess, 32.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 13it [00:03,  6.96it/s]


0: 736x1280 1 ball, 1 goalkeeper, 20 players, 2 referees, 32.4ms
Speed: 7.0ms preprocess, 32.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 14it [00:03,  7.04it/s]


0: 736x1280 1 goalkeeper, 21 players, 2 referees, 32.6ms
Speed: 11.1ms preprocess, 32.6ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 15it [00:04,  7.10it/s]


0: 736x1280 1 goalkeeper, 21 players, 2 referees, 32.4ms
Speed: 11.2ms preprocess, 32.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 16it [00:04,  7.04it/s]


0: 736x1280 20 players, 3 referees, 32.8ms
Speed: 9.8ms preprocess, 32.8ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 17it [00:04,  7.18it/s]


0: 736x1280 22 players, 1 referee, 32.7ms
Speed: 10.4ms preprocess, 32.7ms inference, 1.2ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 18it [00:04,  7.15it/s]


0: 736x1280 22 players, 2 referees, 32.4ms
Speed: 11.4ms preprocess, 32.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 19it [00:04,  7.08it/s]


0: 736x1280 23 players, 2 referees, 33.1ms
Speed: 10.0ms preprocess, 33.1ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 20it [00:04,  7.21it/s]


0: 736x1280 1 ball, 1 goalkeeper, 19 players, 2 referees, 32.4ms
Speed: 11.2ms preprocess, 32.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 21it [00:04,  7.11it/s]


0: 736x1280 1 goalkeeper, 21 players, 2 referees, 32.7ms
Speed: 9.3ms preprocess, 32.7ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 22it [00:05,  7.33it/s]


0: 736x1280 1 ball, 1 goalkeeper, 21 players, 2 referees, 32.4ms
Speed: 6.9ms preprocess, 32.4ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 23it [00:05,  7.29it/s]


0: 736x1280 1 goalkeeper, 22 players, 1 referee, 33.1ms
Speed: 10.3ms preprocess, 33.1ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 24it [00:05,  7.44it/s]


0: 736x1280 1 goalkeeper, 22 players, 2 referees, 32.7ms
Speed: 11.5ms preprocess, 32.7ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 25it [00:05,  4.55it/s]
embedding extraction: 16it [00:03,  5.09it/s]


In [6]:
import umap
from sklearn.cluster import KMeans
from sports.common.team import TeamClassifier


REDUCER = umap.UMAP(n_components=3)
CLUSTERING_MODEL = KMeans(n_clusters=2)

projections = REDUCER.fit_transform(data)
clusters = CLUSTERING_MODEL.fit_predict(projections)

frame_generator = sv.get_video_frames_generator(
    source_path=SOURCE_VIDEO_PATH, stride=STRIDE)

crops = []
for frame in tqdm(frame_generator, desc='collecting crops'):
    # result = PLAYER_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
    result = PLAYER_DETECTION_MODEL.predict(frame, conf=0.3)[0]
    # detections = sv.Detections.from_inference(result)
    detections = sv.Detections.from_ultralytics(result)
    players_detections = detections[detections.class_id == PLAYER_ID]
    players_crops = [sv.crop_image(frame, xyxy) for xyxy in detections.xyxy]
    crops += players_crops

team_classifier = TeamClassifier(device="cuda")
team_classifier.fit(crops)

collecting crops: 0it [00:00, ?it/s]


0: 736x1280 1 ball, 2 goalkeepers, 20 players, 2 referees, 33.4ms
Speed: 13.1ms preprocess, 33.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)

0: 736x1280 1 goalkeeper, 20 players, 2 referees, 32.9ms
Speed: 9.2ms preprocess, 32.9ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 2it [00:00,  8.43it/s]


0: 736x1280 1 goalkeeper, 21 players, 2 referees, 33.1ms
Speed: 12.2ms preprocess, 33.1ms inference, 1.6ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 3it [00:00,  7.38it/s]


0: 736x1280 20 players, 2 referees, 33.1ms
Speed: 11.3ms preprocess, 33.1ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 4it [00:00,  6.94it/s]


0: 736x1280 1 ball, 20 players, 2 referees, 32.7ms
Speed: 10.1ms preprocess, 32.7ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 5it [00:00,  6.88it/s]


0: 736x1280 1 ball, 20 players, 2 referees, 33.1ms
Speed: 11.9ms preprocess, 33.1ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 6it [00:00,  6.92it/s]


0: 736x1280 20 players, 2 referees, 33.5ms
Speed: 7.2ms preprocess, 33.5ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 7it [00:00,  6.88it/s]


0: 736x1280 1 ball, 19 players, 2 referees, 33.2ms
Speed: 11.4ms preprocess, 33.2ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 8it [00:01,  6.92it/s]


0: 736x1280 1 ball, 1 goalkeeper, 20 players, 2 referees, 32.8ms
Speed: 10.5ms preprocess, 32.8ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 9it [00:01,  6.86it/s]


0: 736x1280 1 ball, 21 players, 2 referees, 32.8ms
Speed: 11.4ms preprocess, 32.8ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 10it [00:01,  6.71it/s]


0: 736x1280 1 ball, 22 players, 2 referees, 33.3ms
Speed: 11.4ms preprocess, 33.3ms inference, 1.6ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 11it [00:01,  6.64it/s]


0: 736x1280 1 ball, 1 goalkeeper, 20 players, 2 referees, 32.7ms
Speed: 11.3ms preprocess, 32.7ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 12it [00:01,  6.47it/s]


0: 736x1280 1 goalkeeper, 21 players, 2 referees, 32.7ms
Speed: 10.9ms preprocess, 32.7ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 13it [00:01,  6.63it/s]


0: 736x1280 1 ball, 1 goalkeeper, 20 players, 2 referees, 33.4ms
Speed: 7.4ms preprocess, 33.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 14it [00:02,  6.61it/s]


0: 736x1280 1 goalkeeper, 21 players, 2 referees, 33.0ms
Speed: 11.5ms preprocess, 33.0ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 15it [00:02,  6.72it/s]


0: 736x1280 1 goalkeeper, 21 players, 2 referees, 33.4ms
Speed: 9.8ms preprocess, 33.4ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 16it [00:02,  6.66it/s]


0: 736x1280 20 players, 3 referees, 33.2ms
Speed: 11.4ms preprocess, 33.2ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 17it [00:02,  6.72it/s]


0: 736x1280 22 players, 1 referee, 32.8ms
Speed: 11.2ms preprocess, 32.8ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 18it [00:02,  6.72it/s]


0: 736x1280 22 players, 2 referees, 33.2ms
Speed: 10.5ms preprocess, 33.2ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 19it [00:02,  6.69it/s]


0: 736x1280 23 players, 2 referees, 33.4ms
Speed: 10.2ms preprocess, 33.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 20it [00:02,  6.87it/s]


0: 736x1280 1 ball, 1 goalkeeper, 19 players, 2 referees, 33.3ms
Speed: 7.0ms preprocess, 33.3ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 21it [00:03,  6.99it/s]


0: 736x1280 1 goalkeeper, 21 players, 2 referees, 33.2ms
Speed: 10.1ms preprocess, 33.2ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 22it [00:03,  7.24it/s]


0: 736x1280 1 ball, 1 goalkeeper, 21 players, 2 referees, 33.1ms
Speed: 7.5ms preprocess, 33.1ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 23it [00:03,  7.22it/s]


0: 736x1280 1 goalkeeper, 22 players, 1 referee, 32.7ms
Speed: 10.4ms preprocess, 32.7ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 24it [00:03,  7.41it/s]


0: 736x1280 1 goalkeeper, 22 players, 2 referees, 32.9ms
Speed: 10.6ms preprocess, 32.9ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 25it [00:03,  6.81it/s]
Embedding extraction: 19it [00:03,  4.84it/s]


In [7]:
import numpy as np
import supervision as sv

def resolve_goalkeepers_team_id(
    players: sv.Detections,
    goalkeepers: sv.Detections
) -> np.ndarray:
    goalkeepers_xy = goalkeepers.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
    players_xy = players.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
    team_0_centroid = players_xy[players.class_id == 0].mean(axis=0)
    team_1_centroid = players_xy[players.class_id == 1].mean(axis=0)
    goalkeepers_team_id = []
    for goalkeeper_xy in goalkeepers_xy:
        dist_0 = np.linalg.norm(goalkeeper_xy - team_0_centroid)
        dist_1 = np.linalg.norm(goalkeeper_xy - team_1_centroid)
        goalkeepers_team_id.append(0 if dist_0 < dist_1 else 1)

    return np.array(goalkeepers_team_id)


## Tracking Method 1 wih BotSort from Boxmot

In [12]:
import supervision as sv
from tqdm import tqdm
import numpy as np
from boxmot import BotSort # Import BoTSORT
import cv2
from pathlib import Path
import torch

# ----- Assumed Globals (Make sure these are defined/loaded) -----
# Ensure these models and functions are loaded/defined before use:
# PLAYER_DETECTION_MODEL = ... # Your loaded YOLO model
# team_classifier = ... # Your loaded team classification model
# def resolve_goalkeepers_team_id(players_detections, goalkeepers_detections):
#     # ... implementation ...
#     return goalkeeper_class_ids_array

# ----- Configuration -----
SOURCE_VIDEO_PATH = "app/test_data/raw/0bfacc_0.mp4"
OUTPUT_VIDEO_PATH = "0bfacc_0_botsort_tracked.mp4" # Updated output name
device = torch.device(0) if torch.cuda.is_available() else torch.device('cpu') # Use GPU if available

# Class IDs (as used in your training)
BALL_ID = 0
GOALKEEPER_ID = 1
PLAYER_ID = 2
REFEREE_ID = 3

# ----- Annotators -----
ellipse_annotator = sv.EllipseAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']), # Blue, Pink, Yellow for classes 0, 1, 2
    thickness=2
)
label_annotator = sv.LabelAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']),
    text_color=sv.Color.from_hex('#000000'),
    text_position=sv.Position.BOTTOM_CENTER
)
triangle_annotator = sv.TriangleAnnotator(
    color=sv.Color.from_hex('#FFD700'), # Assuming you want yellow for the ball
    base=25,
    height=21,
    outline_thickness=1
)

# ----- Tracker Initialization -----
# Initialize BoTSORT tracker
# Common BoTSORT args (adjust as needed based on boxmot documentation/your needs):
# - track_high_thresh: High confidence threshold for starting a track.
# - track_low_thresh: Low confidence threshold for linking.
# - new_track_thresh: Threshold for creating a new track from unmatched detections.
# - track_buffer: Number of frames to keep lost tracks.
# - match_thresh: IoU threshold for matching.
# - proximity_thresh: Proximity threshold (for matching by distance)
# - appearance_thresh: Appearance similarity threshold (if using ReID features)
# - cmc_method: Method for camera motion compensation
# Using default parameters here, specify if needed, e.g., BoTSORT(track_high_thresh=0.5, ...)
tracker = BotSort(
    reid_weights=Path('clip_market1501.pt'),
    device=device,
    half=False,
    with_reid=True,
)
# Note: BoTSORT doesn't typically need an explicit reset like ByteTrack for single video processing.

# ----- Video Processing Setup -----
# Use sv.VideoInfo to get properties like width, height, fps
try:
    video_info = sv.VideoInfo.from_video_path(SOURCE_VIDEO_PATH)
    width, height, fps = video_info.width, video_info.height, video_info.fps
    total_frames = video_info.total_frames or int(fps * 20) # Estimate if total_frames is None, process max 20s
    print(f"Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")
except Exception as e:
    print(f"Warning: Could not get video info using supervision. Using defaults. Error: {e}")
    # Fallback if sv.VideoInfo fails or source is not standard video file
    cap = cv2.VideoCapture(SOURCE_VIDEO_PATH)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    if fps == 0: fps = 30 # Provide a default fps if reading failed
    if total_frames == 0: total_frames = int(fps * 20) # Process max 20 seconds
    print(f"Fallback Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")


# Create frame generator
frame_generator = sv.get_video_frames_generator(source_path=SOURCE_VIDEO_PATH, stride=1)

# Initialize video writer
fourcc = cv2.VideoWriter_fourcc(*'mp4v') # or 'avc1', 'XVID'
video_writer = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, fps, (width, height))

# ----- Frame Processing Function -----
def process_frame(frame: np.ndarray, frame_idx: int):
    """
    Processes a single frame: detects objects, classifies teams, tracks people, annotates.
    """
    # 1. Detection
    # Assuming PLAYER_DETECTION_MODEL outputs results compatible with ultralytics
    result = PLAYER_DETECTION_MODEL.predict(frame, conf=0.3, device=device, verbose=False)[0] # Added verbose=False
    detections = sv.Detections.from_ultralytics(result)

    # 2. Pre-processing Detections
    # Separate ball detections and pad their boxes.
    ball_detections = detections[detections.class_id == BALL_ID]
    if len(ball_detections) > 0:
        ball_detections.xyxy = sv.pad_boxes(xyxy=ball_detections.xyxy, px=10)

    # Get all non-ball detections and apply NMS
    people_detections = detections[detections.class_id != BALL_ID]
    if len(people_detections) > 0:
        people_detections = people_detections.with_nms(threshold=0.5, class_agnostic=True)

    # 3. Team/Role Classification
    players_detections = people_detections[people_detections.class_id == PLAYER_ID]
    goalkeepers_detections = people_detections[people_detections.class_id == GOALKEEPER_ID]
    referees_detections = people_detections[people_detections.class_id == REFEREE_ID]

    # Classify players if any exist
    if len(players_detections) > 0:
        players_crops = [sv.crop_image(frame, xyxy) for xyxy in players_detections.xyxy]
        # Assuming team_classifier.predict returns class IDs (e.g., 0 for team A, 1 for team B)
        players_detections.class_id = team_classifier.predict(players_crops)

    # Classify goalkeepers if any exist
    if len(goalkeepers_detections) > 0:
        # Assuming resolve_goalkeepers_team_id returns class IDs consistent with player teams
        goalkeepers_detections.class_id = resolve_goalkeepers_team_id(players_detections, goalkeepers_detections)

    # Adjust referee class ID (e.g., map to 2 if players/GKs are 0 and 1)
    if len(referees_detections) > 0:
        referees_detections.class_id = np.full(len(referees_detections), 2) # Assign class ID 2 (e.g., Yellow)

    # Merge detections *intended for tracking* (players, goalkeepers, referees)
    detections_to_track = sv.Detections.merge([
        players_detections, goalkeepers_detections, referees_detections
    ])

    # 4. Tracking using BoTSORT
    tracked_detections = sv.Detections.empty() # Initialize as empty
    if len(detections_to_track) > 0:
        # Convert sv.Detections to BoxMOT format: numpy array N x [x1, y1, x2, y2, conf, cls_id]
        boxmot_input = np.hstack((
            detections_to_track.xyxy,
            detections_to_track.confidence[:, np.newaxis], # Ensure confidence is (N, 1)
            detections_to_track.class_id[:, np.newaxis]    # Ensure class_id is (N, 1)
        ))

        # Update BoT-SORT tracker - requires the frame for potential appearance features
        # Output format is typically numpy array N x [x1, y1, x2, y2, track_id, conf, cls, *optional idx*]
        tracks = tracker.update(boxmot_input, frame) # Pass the *original frame*

        # Convert tracker output back to sv.Detections if tracks exist
        if tracks.shape[0] > 0:
            tracked_detections = sv.Detections(
                xyxy=tracks[:, 0:4],
                confidence=tracks[:, 5],
                class_id=tracks[:, 6].astype(int), # Ensure class IDs are integers
                tracker_id=tracks[:, 4].astype(int) # Ensure track IDs are integers
            )
    else:
        # If no detections to track, still update tracker with empty array to advance its state
         tracker.update(np.empty((0, 6)), frame)


    # 5. Annotation
    annotated_frame = frame.copy()

    # Annotate tracked objects (players, GKs, referees)
    if len(tracked_detections) > 0:
        # Create labels with tracker IDs
        labels = [f"#{tid} C{cid}" for tid, cid in zip(tracked_detections.tracker_id, tracked_detections.class_id)]
        # Annotate ellipses based on class ID (0, 1, 2 assigned during classification)
        annotated_frame = ellipse_annotator.annotate(
            scene=annotated_frame,
            detections=tracked_detections # Use class_id from tracked_detections
        )
        # Annotate labels with tracker IDs
        annotated_frame = label_annotator.annotate(
            scene=annotated_frame,
            detections=tracked_detections,
            labels=labels
        )

    # Annotate ball (which is not tracked by BoTSORT in this setup)
    if len(ball_detections) > 0:
        annotated_frame = triangle_annotator.annotate(
            scene=annotated_frame,
            detections=ball_detections
        )

    return annotated_frame

# ----- Main Video Processing Loop -----
with tqdm(total=total_frames, desc="Processing video with BoTSORT") as pbar:
    for frame_idx, frame in enumerate(frame_generator):
        annotated_frame = process_frame(frame, frame_idx)
        video_writer.write(annotated_frame)
        pbar.update(1)
        # Optional: Break early for testing
        # if frame_idx >= fps * 10: # Process only 10 seconds
        #    break

# Release the video writer
video_writer.release()
print(f"Finished processing. Annotated video saved to: {OUTPUT_VIDEO_PATH}")

[32m2025-04-13 07:02:38.631[0m | [1mINFO    [0m | [36mboxmot.utils.torch_utils[0m:[36mselect_device[0m:[36m52[0m - [1mYolo Tracking v12.0.5 🚀 Python-3.11.11 torch-2.5.1+cu121
CUDA:0 (NVIDIA L4, 22478MiB)[0m


Resized position embedding: %s to %s torch.Size([197, 768]) torch.Size([129, 768])
Position embedding resize to height:16 width: 8


[32m2025-04-13 07:02:45.022[0m | [32m[1mSUCCESS [0m | [36mboxmot.appearance.reid_model_factory[0m:[36mload_pretrained_weights[0m:[36m183[0m - [32m[1mLoaded pretrained weights from clip_market1501.pt[0m


Video Info: 1920x1080, FPS: 25, Total Frames: 750


Embedding extraction: 1it [00:00,  9.28it/s]  | 0/750 [00:00<?, ?it/s]
Embedding extraction: 1it [00:00,  8.44it/s]  | 1/750 [00:03<48:01,  3.85s/it]
Embedding extraction: 1it [00:00,  8.53it/s]  | 2/750 [00:04<22:06,  1.77s/it]
Embedding extraction: 1it [00:00,  8.48it/s]  | 3/750 [00:04<13:48,  1.11s/it]
Embedding extraction: 1it [00:00,  8.43it/s]  | 4/750 [00:04<09:43,  1.28it/s]
Embedding extraction: 1it [00:00,  8.47it/s]  | 5/750 [00:05<07:38,  1.62it/s]
Embedding extraction: 1it [00:00,  8.35it/s]  | 6/750 [00:05<06:18,  1.97it/s]
Embedding extraction: 1it [00:00,  8.45it/s]  | 7/750 [00:05<05:22,  2.31it/s]
Embedding extraction: 1it [00:00,  8.42it/s]  | 8/750 [00:05<04:55,  2.51it/s]
Embedding extraction: 1it [00:00,  8.48it/s]  | 9/750 [00:06<04:37,  2.67it/s]
Embedding extraction: 1it [00:00,  8.61it/s]  | 10/750 [00:06<04:23,  2.80it/s]
Embedding extraction: 1it [00:00,  9.29it/s]  | 11/750 [00:06<04:13,  2.91it/s]
Embedding extraction: 1it [00:00,  9.33it/s]  | 12/750 [00

Finished processing. Annotated video saved to: 0bfacc_0_botsort_tracked.mp4





## Tracking Method 2 wih BotSort from Ultralytics

In [13]:
import supervision as sv
from tqdm import tqdm
import numpy as np
# from boxmot import BoTSORT # No longer needed directly
import cv2
from pathlib import Path
import torch
from ultralytics import YOLO # Import YOLO from ultralytics

# ----- Assumed Globals (Make sure these are defined/loaded) -----
# Ensure these models and functions are loaded/defined before use:
# team_classifier = ... # Your loaded team classification model
# def resolve_goalkeepers_team_id(players_detections, goalkeepers_detections):
#     # ... implementation returning numpy array of class IDs ...
#     return goalkeeper_class_ids_array

# ----- Configuration -----
SOURCE_VIDEO_PATH = "app/test_data/raw/0bfacc_0.mp4"
OUTPUT_VIDEO_PATH = "0bfacc_0_ultralytics_botsort_tracked_2.mp4" # Updated output name
YOLO_MODEL_PATH = "app/models/yolo11_football_v2/weights/best.pt" # Or your custom trained YOLO model path
TRACKER_CONFIG = "botsort.yaml" # Make sure this config file is available or use default

device = torch.device(0) if torch.cuda.is_available() else torch.device('cpu')

# --- IMPORTANT: Define Class IDs as used by your YOLO model ---
# These MUST match the class IDs your YOLO model was trained with
BALL_ID = 0         # Example: Ball is class 0 in the YOLO model
GOALKEEPER_ID = 1   # Example: Goalkeeper is class 1
PLAYER_ID = 2       # Example: Player is class 2
REFEREE_ID = 3      # Example: Referee is class 3

# --- Define NEW Class IDs for Annotation (after team classification) ---
# These will be used by the annotators AFTER your custom logic.
# Let's say Team A = 0, Team B = 1, Referee = 2
TEAM_A_ID = 0
TEAM_B_ID = 1
ANNOTATION_REFEREE_ID = 2
# We'll map the classified players/GKs to TEAM_A_ID or TEAM_B_ID
# We'll map detected referees to ANNOTATION_REFEREE_ID

# ----- Load YOLO Model -----
model = YOLO(YOLO_MODEL_PATH)
model.to(device)

# ----- Annotators -----
# Use colors corresponding to the NEW annotation IDs (Team A, Team B, Referee)
ellipse_annotator = sv.EllipseAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']), # Team A (Blue), Team B (Pink), Referee (Yellow)
    thickness=2
)
label_annotator = sv.LabelAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']),
    text_color=sv.Color.from_hex('#000000'),
    text_position=sv.Position.BOTTOM_CENTER,
    text_scale=0.6 # Smaller text
)
triangle_annotator = sv.TriangleAnnotator(
    color=sv.Color.from_hex('#32CD32'), # Lime green for ball
    base=25,
    height=21,
    outline_thickness=1
)

# ----- Video Processing Setup -----
try:
    video_info = sv.VideoInfo.from_video_path(SOURCE_VIDEO_PATH)
    width, height, fps = video_info.width, video_info.height, video_info.fps
    total_frames = video_info.total_frames or int(fps * 20) # Estimate if total_frames is None
    print(f"Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")
except Exception as e:
    print(f"Warning: Could not get video info using supervision. Using defaults. Error: {e}")
    # Fallback if sv.VideoInfo fails
    cap = cv2.VideoCapture(SOURCE_VIDEO_PATH)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    if fps == 0: fps = 30
    if total_frames == 0: total_frames = int(fps * 20)
    print(f"Fallback Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")

frame_generator = sv.get_video_frames_generator(source_path=SOURCE_VIDEO_PATH, stride=1)

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, fps, (width, height))

# ----- Main Video Processing Loop -----
with tqdm(total=total_frames, desc=f"Tracking with BoT-SORT ({TRACKER_CONFIG})") as pbar:
    for frame_idx, frame in enumerate(frame_generator):
        # 1. Run Integrated Detection and Tracking (Only for player-like objects)
        # We track only the classes we intend to classify later.
        # 'persist=True' maintains tracker state across frames.
        # 'classes' filters detections BEFORE tracking.
        results = model.track(
            source=frame,
            persist=True,
            tracker=TRACKER_CONFIG,
            classes=[GOALKEEPER_ID, PLAYER_ID, REFEREE_ID], # Track only these initial IDs
            conf=0.3, # Confidence threshold for initial detection
            verbose=False, # Suppress Ultralytics console output per frame
            device=device
        )

        # Separately detect the ball (as it's not classified/tracked the same way)
        # Using predict for ball allows different confidence etc.
        ball_results = model.predict(frame, classes=[BALL_ID], conf=0.1, verbose=False, device=device)
        ball_detections = sv.Detections.from_ultralytics(ball_results[0])
        if len(ball_detections) > 0:
             ball_detections.xyxy = sv.pad_boxes(xyxy=ball_detections.xyxy, px=10)


        # 2. Process Tracking Results
        annotated_frame = frame.copy()
        detections_for_annotation = sv.Detections.empty() # Start with empty detections

        if results[0].boxes.id is not None: # Check if tracking IDs are assigned
            # Convert tracked results to Supervision Detections object
            # This object contains xyxy, confidence, original class_id, and tracker_id
            tracked_detections = sv.Detections.from_ultralytics(results[0])

            # 3. Apply Custom Classification based on TRACKED items
            # Filter detections based on ORIGINAL class ID from YOLO
            players = tracked_detections[tracked_detections.class_id == PLAYER_ID]
            goalkeepers = tracked_detections[tracked_detections.class_id == GOALKEEPER_ID]
            referees = tracked_detections[tracked_detections.class_id == REFEREE_ID]

            final_detections_list = []

            # Process Players
            if len(players) > 0:
                player_crops = [sv.crop_image(frame, xyxy) for xyxy in players.xyxy]
                # Assume team_classifier returns an array of TEAM_A_ID or TEAM_B_ID
                new_player_class_ids = team_classifier.predict(player_crops)
                players.class_id = new_player_class_ids # OVERWRITE class ID for annotation
                final_detections_list.append(players)

            # Process Goalkeepers
            if len(goalkeepers) > 0:
                # Assume resolve_goalkeepers_team_id uses players info and returns TEAM_A_ID/TEAM_B_ID
                new_gk_class_ids = resolve_goalkeepers_team_id(players, goalkeepers) # Pass sv.Detections
                goalkeepers.class_id = new_gk_class_ids # OVERWRITE class ID for annotation
                final_detections_list.append(goalkeepers)

            # Process Referees
            if len(referees) > 0:
                # Assign the predefined annotation ID for referees
                referees.class_id = np.full(len(referees), ANNOTATION_REFEREE_ID)
                final_detections_list.append(referees)

            # Merge all processed detections back together for annotation
            if final_detections_list:
                 detections_for_annotation = sv.Detections.merge(final_detections_list)


        # 4. Annotation using Supervision
        # Annotate tracked+classified objects
        if len(detections_for_annotation) > 0:
            # Create labels using tracker ID and the NEW class ID
            labels = [
                f"T:{tid} C:{cid}"
                for tid, cid
                in zip(detections_for_annotation.tracker_id, detections_for_annotation.class_id)
            ]
            # Annotate ellipses based on the NEW class ID (Team A, Team B, Referee)
            annotated_frame = ellipse_annotator.annotate(
                scene=annotated_frame,
                detections=detections_for_annotation
            )
            # Annotate labels
            annotated_frame = label_annotator.annotate(
                scene=annotated_frame,
                detections=detections_for_annotation,
                labels=labels
            )

        # Annotate the ball separately
        if len(ball_detections) > 0:
            annotated_frame = triangle_annotator.annotate(
                scene=annotated_frame,
                detections=ball_detections
            )

        # 5. Write Frame
        video_writer.write(annotated_frame)
        pbar.update(1)

# Release resources
video_writer.release()
print(f"Finished processing. Annotated video saved to: {OUTPUT_VIDEO_PATH}")
# Optional: Close any CV2 windows if you were displaying frames live
# cv2.destroyAllWindows()

Video Info: 1920x1080, FPS: 25, Total Frames: 750


Embedding extraction: 1it [00:00,  8.45it/s]          | 0/750 [00:00<?, ?it/s]
Embedding extraction: 1it [00:00,  8.43it/s]          | 1/750 [00:00<09:32,  1.31it/s]
Embedding extraction: 1it [00:00,  8.53it/s]          | 2/750 [00:01<05:51,  2.13it/s]
Embedding extraction: 1it [00:00,  8.49it/s]          | 3/750 [00:01<04:41,  2.66it/s]
Embedding extraction: 1it [00:00,  8.52it/s]          | 4/750 [00:01<04:05,  3.03it/s]
Embedding extraction: 1it [00:00,  8.49it/s]          | 5/750 [00:01<03:46,  3.29it/s]
Embedding extraction: 1it [00:00,  8.48it/s]          | 6/750 [00:02<03:39,  3.40it/s]
Embedding extraction: 1it [00:00,  9.31it/s]          | 7/750 [00:02<03:30,  3.53it/s]
Embedding extraction: 1it [00:00,  8.41it/s]          | 8/750 [00:02<03:22,  3.67it/s]
Embedding extraction: 1it [00:00,  8.64it/s]          | 9/750 [00:02<03:19,  3.71it/s]
Embedding extraction: 1it [00:00,  8.64it/s]▏         | 10/750 [00:03<03:17,  3.75it/s]
Embedding extraction: 1it [00:00,  9.40it/s]▏     

Finished processing. Annotated video saved to: 0bfacc_0_ultralytics_botsort_tracked_2.mp4





## Tracking Method 3 wih BotSort + Paddle OCR

In [17]:
import supervision as sv
from tqdm import tqdm
import numpy as np
import cv2
from pathlib import Path
import torch
from ultralytics import YOLO
from paddleocr import PaddleOCR, draw_ocr # Import PaddleOCR
import re # For parsing OCR results
import time # For basic profiling/timing

# ----- Configuration -----
SOURCE_VIDEO_PATH = "app/test_data/raw/0bfacc_0.mp4" # Your input video
OUTPUT_VIDEO_PATH = "0bfacc_0_persistent_tracked.mp4" # Output video path
YOLO_MODEL_PATH = "app/models/yolo11_football_v2/weights/best.pt" # Path to your trained YOLO model or a standard one
TRACKER_CONFIG = "botsort.yaml" # BoT-SORT config (Ultralytics usually finds defaults)

# --- Device Setup ---
device = torch.device(0) if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# --- Class IDs (from your YOLO model's training) ---
BALL_ID = 0
GOALKEEPER_ID = 1
PLAYER_ID = 2
REFEREE_ID = 3

# --- Class IDs for Annotation (after team classification) ---
TEAM_A_ID = 0 # Example ID for Team A
TEAM_B_ID = 1 # Example ID for Team B
ANNOTATION_REFEREE_ID = 2 # Example ID for Referees

# --- OCR Configuration ---
OCR_CONFIDENCE_THRESHOLD = 0.5 # Minimum confidence for accepting OCR number (adjust!)
OCR_USE_GPU = False # torch.cuda.is_available() Use GPU for OCR if available

# --- State Management Configuration ---
MAX_ABSENCE_FRAMES = int(30) # Frames after which a lost player state is considered stale (e.g., 30fps * 1 sec)

# ----- Initialize Models -----

# Load YOLO Model
print(f"Loading YOLO model from: {YOLO_MODEL_PATH}")
model = YOLO(YOLO_MODEL_PATH)
model.to(device)

# Initialize PaddleOCR
# Download models automatically on first run. Specify 'lang='en'' for English numbers.
print("Initializing PaddleOCR...")
# Consider using `rec_model_dir` and `det_model_dir` with specific downloaded models for efficiency/consistency
# Use `use_angle_cls=False` if numbers are mostly upright
ocr_model = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=OCR_USE_GPU, show_log=False)
print("PaddleOCR initialized.")

# ----- Annotators -----
# Colors match annotation IDs: Team A (Blue), Team B (Pink), Referee (Yellow)
palette = sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700'])
ellipse_annotator = sv.EllipseAnnotator(color=palette, thickness=2)
label_annotator = sv.LabelAnnotator(
    color=palette,
    text_color=sv.Color.from_hex('#000000'),
    text_position=sv.Position.BOTTOM_CENTER,
    text_scale=0.6,
    text_thickness=1
)
triangle_annotator = sv.TriangleAnnotator(
    color=sv.Color.from_hex('#32CD32'), # Lime green for ball
    base=25, height=21, outline_thickness=1
)

# ----- Video Processing Setup -----
try:
    video_info = sv.VideoInfo.from_video_path(SOURCE_VIDEO_PATH)
    width, height, fps = video_info.width, video_info.height, video_info.fps
    total_frames = video_info.total_frames or int(fps * 600) # Estimate for long videos if needed
    print(f"Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")
    if fps == 0: fps = 30 # Default fps if reading failed
except Exception as e:
    print(f"Error getting video info: {e}. Exiting.")
    exit()

frame_generator = sv.get_video_frames_generator(source_path=SOURCE_VIDEO_PATH, stride=1)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, fps, (width, height))

# ----- State Management Dictionaries -----
# Stores the state for each confirmed real-world player identity
# Key: canonical_id (e.g., "TeamA-10"), Value: dict
player_states = {}
# Maps the current BoT-SORT tracker_id to a confirmed canonical_id
# Key: tracker_id (int), Value: canonical_id (str)
tracker_id_to_canonical = {}

# ----- Helper Function for OCR -----
def run_ocr_on_crop(crop: np.ndarray, ocr_engine: PaddleOCR):
    """
    Runs PaddleOCR on a cropped image and extracts the most likely number.
    Returns (number_str, confidence) or (None, 0.0) if no number found.
    """
    if crop.shape[0] < 10 or crop.shape[1] < 10: # Skip tiny crops
        return None, 0.0

    try:
        # Perform OCR
        ocr_result = ocr_engine.ocr(crop, cls=False, det=True, rec=True) # Use det+rec

        best_num_str = None
        best_confidence = 0.0

        if ocr_result and ocr_result[0]: # Check if results exist
             for line in ocr_result[0]:
                  text, confidence = line[1] # Get text and confidence
                  # Try to find digits in the recognized text
                  numbers = re.findall(r'\d+', text)
                  if numbers:
                       num_str = numbers[0] # Take the first number found
                       if confidence > best_confidence:
                            best_confidence = confidence
                            best_num_str = num_str
                            # print(f"    OCR Found: '{text}' -> Number: {num_str} (Conf: {confidence:.2f})") # Debug

        return best_num_str, best_confidence

    except Exception as e:
        print(f"    Error during OCR: {e}")
        return None, 0.0

# ----- Main Video Processing Loop -----
frame_count = 0
with tqdm(total=total_frames, desc=f"Tracking+OCR") as pbar:
    for frame in frame_generator:
        frame_time_start = time.time()
        frame_count += 1

        # 1. Run YOLOv8 Tracking (BoT-SORT) for people
        results = model.track(
            source=frame,
            persist=True,
            tracker=TRACKER_CONFIG,
            classes=[GOALKEEPER_ID, PLAYER_ID, REFEREE_ID],
            conf=0.3,
            verbose=False,
            device=device
        )

        # 2. Detect Ball Separately
        ball_results = model.predict(frame, classes=[BALL_ID], conf=0.1, verbose=False, device=device)
        ball_detections = sv.Detections.from_ultralytics(ball_results[0])
        if len(ball_detections) > 0:
             ball_detections.xyxy = sv.pad_boxes(xyxy=ball_detections.xyxy, px=10)

        annotated_frame = frame.copy() # Start with original frame for annotation

        # 3. Process Tracking Results & Apply Custom Classification
        detections_for_processing = sv.Detections.empty()
        if results[0].boxes.id is not None:
            tracked_detections = sv.Detections.from_ultralytics(results[0])

            # Separate based on ORIGINAL YOLO ID for classification
            players = tracked_detections[tracked_detections.class_id == PLAYER_ID]
            goalkeepers = tracked_detections[tracked_detections.class_id == GOALKEEPER_ID]
            referees = tracked_detections[tracked_detections.class_id == REFEREE_ID]

            processed_detections_list = []
            
            # Classify Players
            if len(players) > 0:
                player_crops = [sv.crop_image(frame, xyxy) for xyxy in players.xyxy]
                new_player_class_ids = team_classifier.predict(player_crops)
                players.class_id = new_player_class_ids # Overwrite with TEAM_A/B ID
                processed_detections_list.append(players)

            # Classify Goalkeepers
            if len(goalkeepers) > 0:
                new_gk_class_ids = resolve_goalkeepers_team_id(players, goalkeepers)
                goalkeepers.class_id = new_gk_class_ids # Overwrite with TEAM_A/B ID
                processed_detections_list.append(goalkeepers)

            # Assign Referee Annotation ID
            if len(referees) > 0:
                referees.class_id = np.full(len(referees), ANNOTATION_REFEREE_ID)
                processed_detections_list.append(referees)

            # Merge back for OCR and final annotation steps
            if processed_detections_list:
                 detections_for_processing = sv.Detections.merge(processed_detections_list)

        # 4. Persistent ID Logic (State Management + OCR)
        annotation_labels = []
        current_tracker_ids = set()
        if len(detections_for_processing) > 0:
            current_tracker_ids = set(detections_for_processing.tracker_id.astype(int))

            # Mark tracks that disappeared this frame as 'lost' in player_states
            lost_canonical_this_frame = []
            for canonical_id, state in player_states.items():
                if state['current_tracker_id'] is not None and state['current_tracker_id'] not in current_tracker_ids:
                    # print(f"  Track lost for {canonical_id} (tracker_id: {state['current_tracker_id']})")
                    # Remove the potentially stale reverse mapping
                    if state['current_tracker_id'] in tracker_id_to_canonical:
                         if tracker_id_to_canonical[state['current_tracker_id']] == canonical_id:
                              del tracker_id_to_canonical[state['current_tracker_id']]
                    state['current_tracker_id'] = None # Keep state, but mark tracker_id as None
                    lost_canonical_this_frame.append(canonical_id)

            # Process current detections for OCR and state updates
            ocr_time = 0.0
            for i in range(len(detections_for_processing)):
                detection = detections_for_processing[i] # Get individual detection
                tracker_id = int(detection.tracker_id[0]) # Get scalar tracker ID
                bbox = detection.xyxy[0]
                assigned_annotation_class = detection.class_id[0] # Team A/B or Referee ID

                canonical_id_for_this_track = tracker_id_to_canonical.get(tracker_id)
                display_label = f"T:{tracker_id}" # Default label (Use T: prefix)

                ocr_performed = False
                # --- Attempt OCR only for players/GKs ---
                if assigned_annotation_class != ANNOTATION_REFEREE_ID:
                    ocr_start_time = time.time()
                    player_crop = sv.crop_image(annotated_frame, bbox) # Crop from copy
                    number, ocr_confidence = run_ocr_on_crop(player_crop, ocr_model)
                    ocr_time += (time.time() - ocr_start_time)
                    ocr_performed = True

                    if number is not None and ocr_confidence > OCR_CONFIDENCE_THRESHOLD:
                        team_prefix = "A" if assigned_annotation_class == TEAM_A_ID else "B"
                        canonical_id_ocr = f"{team_prefix}{number}" # Compact ID e.g., A10, B7

                        # --- State Update Logic ---
                        # If this tracker ID was previously mapped differently, remove old mapping
                        if tracker_id in tracker_id_to_canonical and tracker_id_to_canonical[tracker_id] != canonical_id_ocr:
                            # print(f"    Conflict: Tracker {tracker_id} mapping changed from {tracker_id_to_canonical[tracker_id]} to {canonical_id_ocr}")
                            # Mark the OLD canonical ID as potentially lost if it's not associated with another tracker
                            old_canonical = tracker_id_to_canonical[tracker_id]
                            if old_canonical in player_states and player_states[old_canonical]['current_tracker_id'] == tracker_id:
                                 player_states[old_canonical]['current_tracker_id'] = None
                            # Remove old mapping before adding new one
                            del tracker_id_to_canonical[tracker_id]


                        # Update/Create player state
                        if canonical_id_ocr not in player_states:
                            player_states[canonical_id_ocr] = {'current_tracker_id': None, 'last_seen_frame': -1, 'last_bbox': None, 'ocr_confidence': 0.0}

                        player_states[canonical_id_ocr].update({
                            'current_tracker_id': tracker_id,
                            'last_seen_frame': frame_count,
                            'last_bbox': bbox,
                            'ocr_confidence': ocr_confidence
                        })
                        tracker_id_to_canonical[tracker_id] = canonical_id_ocr
                        canonical_id_for_this_track = canonical_id_ocr
                        display_label = canonical_id_ocr # Use the confirmed ID

                    # --- End State Update ---

                # --- Update display label and state if association already exists ---
                if canonical_id_for_this_track:
                    display_label = canonical_id_for_this_track # Use existing if OCR failed/low conf
                    # Update last seen frame/bbox for existing association
                    if canonical_id_for_this_track in player_states:
                         player_states[canonical_id_for_this_track]['last_seen_frame'] = frame_count
                         player_states[canonical_id_for_this_track]['last_bbox'] = bbox
                         # Ensure the current tracker ID is correctly linked, especially if it was lost and found
                         player_states[canonical_id_for_this_track]['current_tracker_id'] = tracker_id
                         # Also update the reverse map just in case
                         tracker_id_to_canonical[tracker_id] = canonical_id_for_this_track
                # else: # No OCR success and no existing association
                     # display_label remains the default "T:{tracker_id}"

                annotation_labels.append(display_label)
            # print(f"  OCR time for frame: {ocr_time:.4f}s") # Optional timing

        # 5. Clean up stale player states (optional, can be done less frequently)
        if frame_count % (fps * 10) == 0: # Every 10 seconds
             stale_ids = [
                  cid for cid, state in player_states.items()
                  if state['current_tracker_id'] is None and (frame_count - state['last_seen_frame']) > MAX_ABSENCE_FRAMES
             ]
             for cid in stale_ids:
                  print(f"  Removing stale state for {cid}")
                  del player_states[cid]
                  # Clean up reverse map if any stale tracker ID points to it
                  stale_tracker_id = None
                  for tid, mapped_cid in tracker_id_to_canonical.items():
                       if mapped_cid == cid:
                            stale_tracker_id = tid
                            break
                  if stale_tracker_id:
                       del tracker_id_to_canonical[stale_tracker_id]


        # 6. Annotation
        # Annotate tracked people (using team/role class for color, canonical ID for label)
        if len(detections_for_processing) > 0:
            annotated_frame = ellipse_annotator.annotate(
                scene=annotated_frame,
                detections=detections_for_processing # Ellipse color based on Team A/B/Ref ID
            )
            annotated_frame = label_annotator.annotate(
                scene=annotated_frame,
                detections=detections_for_processing,
                labels=annotation_labels # Label text is Canonical ID or T:tracker_id
            )

        # Annotate ball
        if len(ball_detections) > 0:
            annotated_frame = triangle_annotator.annotate(
                scene=annotated_frame,
                detections=ball_detections
            )

        # 7. Write Frame
        video_writer.write(annotated_frame)
        pbar.update(1)
        frame_time_end = time.time()
        # print(f"Frame {frame_count} processing time: {frame_time_end - frame_time_start:.4f}s") # Optional timing

# Release resources
video_writer.release()
print(f"Finished processing. Annotated video saved to: {OUTPUT_VIDEO_PATH}")
print("\nFinal Player States:")
for cid, state in player_states.items():
    print(f"  {cid}: Last Seen Frame={state['last_seen_frame']}, Current TrackerID={state['current_tracker_id']}")

Using device: cuda:0
Loading YOLO model from: app/models/yolo11_football_v2/weights/best.pt
Initializing PaddleOCR...
PaddleOCR initialized.
Video Info: 1920x1080, FPS: 25, Total Frames: 750


Embedding extraction: 1it [00:00,  8.49it/s]?, ?it/s]
Embedding extraction: 1it [00:00,  8.44it/s]10:46,  1.16it/s]
Embedding extraction: 1it [00:00,  8.51it/s]06:56,  1.80it/s]
Embedding extraction: 1it [00:00,  8.40it/s]05:45,  2.16it/s]
Embedding extraction: 1it [00:00,  8.51it/s]05:12,  2.39it/s]
Embedding extraction: 1it [00:00,  8.43it/s]04:52,  2.54it/s]
Embedding extraction: 1it [00:00,  8.48it/s]04:45,  2.60it/s]
Embedding extraction: 1it [00:00,  8.55it/s]04:35,  2.69it/s]
Embedding extraction: 1it [00:00,  8.50it/s]04:49,  2.57it/s]
Embedding extraction: 1it [00:00,  8.60it/s]04:38,  2.66it/s]
Embedding extraction: 1it [00:00,  8.63it/s]<04:29,  2.74it/s]
Embedding extraction: 1it [00:00,  9.43it/s]<04:23,  2.80it/s]
Embedding extraction: 1it [00:00,  9.64it/s]<04:15,  2.89it/s]
Embedding extraction: 1it [00:00,  8.58it/s]<04:16,  2.88it/s]
Embedding extraction: 1it [00:00,  9.27it/s]<04:19,  2.83it/s]
Embedding extraction: 1it [00:00,  9.07it/s]<04:14,  2.89it/s]
Embedding 

  Removing stale state for A3


Embedding extraction: 1it [00:00,  7.72it/s]
Embedding extraction: 1it [00:00,  7.76it/s]8<03:25,  2.43it/s]
Embedding extraction: 1it [00:00,  7.71it/s]8<03:20,  2.48it/s]
Embedding extraction: 1it [00:00,  7.79it/s]8<03:16,  2.53it/s]
Embedding extraction: 1it [00:00,  7.75it/s]9<03:13,  2.57it/s]
Embedding extraction: 1it [00:00,  7.72it/s]9<03:10,  2.59it/s]
Embedding extraction: 1it [00:00,  8.35it/s]0<03:09,  2.61it/s]
Embedding extraction: 1it [00:00,  7.71it/s]0<03:05,  2.65it/s]
Embedding extraction: 1it [00:00,  7.78it/s]0<03:05,  2.65it/s]
Embedding extraction: 1it [00:00,  7.67it/s]1<03:05,  2.64it/s]
Embedding extraction: 1it [00:00,  7.65it/s]1<03:05,  2.64it/s]
Embedding extraction: 1it [00:00,  7.68it/s]1<03:05,  2.64it/s]
Embedding extraction: 1it [00:00,  8.48it/s]2<03:07,  2.61it/s]
Embedding extraction: 1it [00:00,  7.74it/s]2<03:04,  2.64it/s]
Embedding extraction: 1it [00:00,  7.80it/s]3<03:01,  2.68it/s]
Embedding extraction: 1it [00:00,  7.72it/s]3<03:01,  2.67i

  Removing stale state for A12
  Removing stale state for A72


Embedding extraction: 1it [00:00,  8.63it/s]
Embedding extraction: 1it [00:00,  8.40it/s]0<01:32,  2.70it/s]
Embedding extraction: 1it [00:00,  8.39it/s]0<01:35,  2.58it/s]
Embedding extraction: 1it [00:00,  8.39it/s]0<01:33,  2.64it/s]
Embedding extraction: 1it [00:00,  8.37it/s]1<01:31,  2.67it/s]
Embedding extraction: 1it [00:00,  8.38it/s]1<01:30,  2.70it/s]
Embedding extraction: 1it [00:00,  8.38it/s]2<01:30,  2.69it/s]
Embedding extraction: 1it [00:00,  8.35it/s]2<01:29,  2.72it/s]
Embedding extraction: 1it [00:00,  8.48it/s]2<01:30,  2.67it/s]
Embedding extraction: 1it [00:00,  8.41it/s]3<01:28,  2.72it/s]
Embedding extraction: 1it [00:00,  8.41it/s]3<01:28,  2.72it/s]
Embedding extraction: 1it [00:00,  8.28it/s]3<01:27,  2.73it/s]
Embedding extraction: 1it [00:00,  8.28it/s]4<01:28,  2.70it/s]
Embedding extraction: 1it [00:00,  8.36it/s]4<01:28,  2.68it/s]
Embedding extraction: 1it [00:00,  8.33it/s]4<01:26,  2.74it/s]
Embedding extraction: 1it [00:00,  8.38it/s]5<01:24,  2.79i

  Removing stale state for A1
Finished processing. Annotated video saved to: 0bfacc_0_persistent_tracked.mp4

Final Player States:
  A7: Last Seen Frame=750, Current TrackerID=9
  A2: Last Seen Frame=750, Current TrackerID=8
  B10: Last Seen Frame=750, Current TrackerID=401



