In [1]:
import os
os.environ["ONNXRUNTIME_EXECUTION_PROVIDERS"] = "[CUDAExecutionProvider]"

In [2]:
import os
HOME = os.getcwd()
print(HOME)

/home/ubuntu/projects/sure-football-analysis


In [3]:
# from inference import get_model
from ultralytics import YOLO

# ROBOFLOW_API_KEY = os.environ.get("ROBOFLOW_API_KEY")
# PLAYER_DETECTION_MODEL_ID = "football-players-detection-3zvbc/12"
# PLAYER_DETECTION_MODEL = get_model(PLAYER_DETECTION_MODEL_ID, api_key=ROBOFLOW_API_KEY)
PLAYER_DETECTION_MODEL = YOLO("app/models/yolo11_football_v2/weights/best.pt")

In [4]:
import torch
from transformers import AutoProcessor, SiglipVisionModel

SIGLIP_MODEL_PATH = 'google/siglip-base-patch16-224'

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
EMBEDDINGS_MODEL = SiglipVisionModel.from_pretrained(SIGLIP_MODEL_PATH).to(DEVICE)
EMBEDDINGS_PROCESSOR = AutoProcessor.from_pretrained(SIGLIP_MODEL_PATH)

In [5]:
import supervision as sv
import numpy as np
from more_itertools import chunked
from tqdm import tqdm

SOURCE_VIDEO_PATH = "app/test_data/raw/0bfacc_0.mp4"
BATCH_SIZE = 64
PLAYER_ID = 2
STRIDE = 30

frame_generator = sv.get_video_frames_generator(
    source_path=SOURCE_VIDEO_PATH, stride=STRIDE)

crops = []
for frame in tqdm(frame_generator, desc='collecting crops'):
    # result = PLAYER_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
    result = PLAYER_DETECTION_MODEL.predict(frame, conf=0.3)[0]
    # detections = sv.Detections.from_inference(result)
    detections = sv.Detections.from_ultralytics(result)
    detections = detections.with_nms(threshold=0.5, class_agnostic=True)
    detections = detections[detections.class_id == PLAYER_ID]
    players_crops = [sv.crop_image(frame, xyxy) for xyxy in detections.xyxy]
    crops += players_crops


crops = [sv.cv2_to_pillow(crop) for crop in crops]
batches = chunked(crops, BATCH_SIZE)
data = []
with torch.no_grad():
    for batch in tqdm(batches, desc='embedding extraction'):
        inputs = EMBEDDINGS_PROCESSOR(images=batch, return_tensors="pt").to(DEVICE)
        outputs = EMBEDDINGS_MODEL(**inputs)
        embeddings = torch.mean(outputs.last_hidden_state, dim=1).cpu().numpy()
        data.append(embeddings)

data = np.concatenate(data)

collecting crops: 0it [00:00, ?it/s]


0: 736x1280 20 players, 3 referees, 72.6ms
Speed: 14.0ms preprocess, 72.6ms inference, 236.4ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 1it [00:02,  2.02s/it]


0: 736x1280 20 players, 3 referees, 33.1ms
Speed: 10.3ms preprocess, 33.1ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 2it [00:02,  1.10it/s]


0: 736x1280 1 ball, 21 players, 3 referees, 32.7ms
Speed: 6.8ms preprocess, 32.7ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 3it [00:02,  1.78it/s]


0: 736x1280 1 ball, 20 players, 3 referees, 33.0ms
Speed: 11.2ms preprocess, 33.0ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 4it [00:02,  2.54it/s]


0: 736x1280 20 players, 3 referees, 32.4ms
Speed: 6.7ms preprocess, 32.4ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 5it [00:02,  3.28it/s]


0: 736x1280 22 players, 3 referees, 32.4ms
Speed: 9.7ms preprocess, 32.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 6it [00:02,  4.03it/s]


0: 736x1280 21 players, 4 referees, 32.5ms
Speed: 6.8ms preprocess, 32.5ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 7it [00:02,  4.71it/s]


0: 736x1280 20 players, 3 referees, 32.4ms
Speed: 9.0ms preprocess, 32.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 8it [00:02,  5.39it/s]


0: 736x1280 1 ball, 21 players, 3 referees, 32.4ms
Speed: 7.0ms preprocess, 32.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 9it [00:03,  5.87it/s]


0: 736x1280 21 players, 3 referees, 33.0ms
Speed: 6.7ms preprocess, 33.0ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 10it [00:03,  6.11it/s]


0: 736x1280 1 ball, 21 players, 3 referees, 32.4ms
Speed: 11.1ms preprocess, 32.4ms inference, 1.4ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 11it [00:03,  6.48it/s]


0: 736x1280 22 players, 3 referees, 32.4ms
Speed: 9.3ms preprocess, 32.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 12it [00:03,  6.21it/s]


0: 736x1280 21 players, 3 referees, 32.4ms
Speed: 9.6ms preprocess, 32.4ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 13it [00:03,  5.79it/s]


0: 736x1280 20 players, 3 referees, 33.0ms
Speed: 11.0ms preprocess, 33.0ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 14it [00:03,  6.01it/s]


0: 736x1280 1 ball, 20 players, 3 referees, 32.7ms
Speed: 9.4ms preprocess, 32.7ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 15it [00:04,  6.38it/s]


0: 736x1280 20 players, 3 referees, 32.8ms
Speed: 7.3ms preprocess, 32.8ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 16it [00:04,  6.57it/s]


0: 736x1280 20 players, 3 referees, 32.7ms
Speed: 11.1ms preprocess, 32.7ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 17it [00:04,  6.91it/s]


0: 736x1280 20 players, 3 referees, 32.4ms
Speed: 11.2ms preprocess, 32.4ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 18it [00:04,  6.93it/s]


0: 736x1280 21 players, 2 referees, 32.4ms
Speed: 7.2ms preprocess, 32.4ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 19it [00:04,  6.94it/s]


0: 736x1280 21 players, 2 referees, 32.6ms
Speed: 11.0ms preprocess, 32.6ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 20it [00:04,  7.09it/s]


0: 736x1280 1 ball, 20 players, 3 referees, 32.8ms
Speed: 6.9ms preprocess, 32.8ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 21it [00:04,  7.13it/s]


0: 736x1280 20 players, 3 referees, 33.0ms
Speed: 9.2ms preprocess, 33.0ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 22it [00:05,  7.21it/s]


0: 736x1280 21 players, 3 referees, 32.9ms
Speed: 6.7ms preprocess, 32.9ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 23it [00:05,  7.20it/s]


0: 736x1280 1 ball, 19 players, 3 referees, 33.0ms
Speed: 10.9ms preprocess, 33.0ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 24it [00:05,  7.15it/s]


0: 736x1280 1 ball, 18 players, 3 referees, 33.0ms
Speed: 6.7ms preprocess, 33.0ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 25it [00:05,  4.52it/s]
embedding extraction: 8it [00:03,  2.54it/s]


In [6]:
import umap
from sklearn.cluster import KMeans
from sports.common.team import TeamClassifier


REDUCER = umap.UMAP(n_components=3)
CLUSTERING_MODEL = KMeans(n_clusters=2)

projections = REDUCER.fit_transform(data)
clusters = CLUSTERING_MODEL.fit_predict(projections)

frame_generator = sv.get_video_frames_generator(
    source_path=SOURCE_VIDEO_PATH, stride=STRIDE)

crops = []
for frame in tqdm(frame_generator, desc='collecting crops'):
    # result = PLAYER_DETECTION_MODEL.infer(frame, confidence=0.3)[0]
    result = PLAYER_DETECTION_MODEL.predict(frame, conf=0.3)[0]
    # detections = sv.Detections.from_inference(result)
    detections = sv.Detections.from_ultralytics(result)
    players_detections = detections[detections.class_id == PLAYER_ID]
    players_crops = [sv.crop_image(frame, xyxy) for xyxy in detections.xyxy]
    crops += players_crops

team_classifier = TeamClassifier(device="cuda")
team_classifier.fit(crops)

collecting crops: 0it [00:00, ?it/s]


0: 736x1280 20 players, 3 referees, 32.3ms
Speed: 5.9ms preprocess, 32.3ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)

0: 736x1280 20 players, 3 referees, 32.7ms
Speed: 7.7ms preprocess, 32.7ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 2it [00:00,  9.47it/s]


0: 736x1280 1 ball, 21 players, 3 referees, 32.4ms
Speed: 5.5ms preprocess, 32.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 3it [00:00,  8.29it/s]


0: 736x1280 1 ball, 20 players, 3 referees, 32.8ms
Speed: 8.1ms preprocess, 32.8ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 4it [00:00,  7.91it/s]


0: 736x1280 20 players, 3 referees, 32.3ms
Speed: 5.5ms preprocess, 32.3ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 5it [00:00,  7.49it/s]


0: 736x1280 22 players, 3 referees, 32.4ms
Speed: 8.2ms preprocess, 32.4ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 6it [00:00,  7.41it/s]


0: 736x1280 21 players, 4 referees, 32.5ms
Speed: 6.3ms preprocess, 32.5ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 7it [00:00,  7.35it/s]


0: 736x1280 20 players, 3 referees, 32.5ms
Speed: 7.7ms preprocess, 32.5ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 8it [00:01,  7.44it/s]


0: 736x1280 1 ball, 21 players, 3 referees, 32.2ms
Speed: 5.4ms preprocess, 32.2ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 9it [00:01,  7.44it/s]


0: 736x1280 21 players, 3 referees, 32.5ms
Speed: 5.6ms preprocess, 32.5ms inference, 1.1ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 10it [00:01,  7.21it/s]


0: 736x1280 1 ball, 21 players, 3 referees, 32.3ms
Speed: 7.5ms preprocess, 32.3ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 11it [00:01,  7.37it/s]


0: 736x1280 22 players, 3 referees, 32.3ms
Speed: 5.6ms preprocess, 32.3ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 12it [00:01,  7.30it/s]


0: 736x1280 21 players, 3 referees, 32.9ms
Speed: 8.3ms preprocess, 32.9ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 13it [00:01,  7.37it/s]


0: 736x1280 20 players, 3 referees, 32.4ms
Speed: 6.0ms preprocess, 32.4ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 14it [00:01,  7.21it/s]


0: 736x1280 1 ball, 20 players, 3 referees, 32.8ms
Speed: 8.5ms preprocess, 32.8ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 15it [00:02,  7.30it/s]


0: 736x1280 20 players, 3 referees, 32.7ms
Speed: 5.4ms preprocess, 32.7ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 16it [00:02,  7.26it/s]


0: 736x1280 20 players, 3 referees, 32.6ms
Speed: 7.4ms preprocess, 32.6ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 17it [00:02,  7.44it/s]


0: 736x1280 20 players, 3 referees, 32.5ms
Speed: 7.5ms preprocess, 32.5ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 18it [00:02,  7.33it/s]


0: 736x1280 21 players, 2 referees, 32.3ms
Speed: 9.1ms preprocess, 32.3ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 19it [00:02,  7.19it/s]


0: 736x1280 21 players, 2 referees, 32.9ms
Speed: 6.9ms preprocess, 32.9ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 20it [00:02,  7.32it/s]


0: 736x1280 1 ball, 20 players, 3 referees, 32.7ms
Speed: 9.2ms preprocess, 32.7ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 21it [00:02,  7.25it/s]


0: 736x1280 20 players, 3 referees, 32.4ms
Speed: 8.7ms preprocess, 32.4ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 22it [00:02,  7.35it/s]


0: 736x1280 21 players, 3 referees, 33.0ms
Speed: 9.2ms preprocess, 33.0ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 23it [00:03,  7.21it/s]


0: 736x1280 1 ball, 19 players, 3 referees, 32.8ms
Speed: 7.5ms preprocess, 32.8ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 24it [00:03,  7.22it/s]


0: 736x1280 1 ball, 18 players, 3 referees, 32.5ms
Speed: 5.7ms preprocess, 32.5ms inference, 0.9ms postprocess per image at shape (1, 3, 736, 1280)


collecting crops: 25it [00:03,  7.25it/s]
Embedding extraction: 19it [00:03,  4.95it/s]


In [7]:
import numpy as np
import supervision as sv

def resolve_goalkeepers_team_id(
    players: sv.Detections,
    goalkeepers: sv.Detections
) -> np.ndarray:
    goalkeepers_xy = goalkeepers.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
    players_xy = players.get_anchors_coordinates(sv.Position.BOTTOM_CENTER)
    team_0_centroid = players_xy[players.class_id == 0].mean(axis=0)
    team_1_centroid = players_xy[players.class_id == 1].mean(axis=0)
    goalkeepers_team_id = []
    for goalkeeper_xy in goalkeepers_xy:
        dist_0 = np.linalg.norm(goalkeeper_xy - team_0_centroid)
        dist_1 = np.linalg.norm(goalkeeper_xy - team_1_centroid)
        goalkeepers_team_id.append(0 if dist_0 < dist_1 else 1)

    return np.array(goalkeepers_team_id)


## Tracking Method 1 wih BotSort from Boxmot

In [12]:
import supervision as sv
from tqdm import tqdm
import numpy as np
from boxmot import BotSort # Import BoTSORT
import cv2
from pathlib import Path
import torch

# ----- Assumed Globals (Make sure these are defined/loaded) -----
# Ensure these models and functions are loaded/defined before use:
# PLAYER_DETECTION_MODEL = ... # Your loaded YOLO model
# team_classifier = ... # Your loaded team classification model
# def resolve_goalkeepers_team_id(players_detections, goalkeepers_detections):
#     # ... implementation ...
#     return goalkeeper_class_ids_array

# ----- Configuration -----
SOURCE_VIDEO_PATH = "app/test_data/raw/0bfacc_0.mp4"
OUTPUT_VIDEO_PATH = "0bfacc_0_botsort_tracked.mp4" # Updated output name
device = torch.device(0) if torch.cuda.is_available() else torch.device('cpu') # Use GPU if available

# Class IDs (as used in your training)
BALL_ID = 0
GOALKEEPER_ID = 1
PLAYER_ID = 2
REFEREE_ID = 3

# ----- Annotators -----
ellipse_annotator = sv.EllipseAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']), # Blue, Pink, Yellow for classes 0, 1, 2
    thickness=2
)
label_annotator = sv.LabelAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']),
    text_color=sv.Color.from_hex('#000000'),
    text_position=sv.Position.BOTTOM_CENTER
)
triangle_annotator = sv.TriangleAnnotator(
    color=sv.Color.from_hex('#FFD700'), # Assuming you want yellow for the ball
    base=25,
    height=21,
    outline_thickness=1
)

# ----- Tracker Initialization -----
# Initialize BoTSORT tracker
# Common BoTSORT args (adjust as needed based on boxmot documentation/your needs):
# - track_high_thresh: High confidence threshold for starting a track.
# - track_low_thresh: Low confidence threshold for linking.
# - new_track_thresh: Threshold for creating a new track from unmatched detections.
# - track_buffer: Number of frames to keep lost tracks.
# - match_thresh: IoU threshold for matching.
# - proximity_thresh: Proximity threshold (for matching by distance)
# - appearance_thresh: Appearance similarity threshold (if using ReID features)
# - cmc_method: Method for camera motion compensation
# Using default parameters here, specify if needed, e.g., BoTSORT(track_high_thresh=0.5, ...)
tracker = BotSort(
    reid_weights=Path('clip_market1501.pt'),
    device=device,
    half=False,
    with_reid=True,
)
# Note: BoTSORT doesn't typically need an explicit reset like ByteTrack for single video processing.

# ----- Video Processing Setup -----
# Use sv.VideoInfo to get properties like width, height, fps
try:
    video_info = sv.VideoInfo.from_video_path(SOURCE_VIDEO_PATH)
    width, height, fps = video_info.width, video_info.height, video_info.fps
    total_frames = video_info.total_frames or int(fps * 20) # Estimate if total_frames is None, process max 20s
    print(f"Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")
except Exception as e:
    print(f"Warning: Could not get video info using supervision. Using defaults. Error: {e}")
    # Fallback if sv.VideoInfo fails or source is not standard video file
    cap = cv2.VideoCapture(SOURCE_VIDEO_PATH)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    if fps == 0: fps = 30 # Provide a default fps if reading failed
    if total_frames == 0: total_frames = int(fps * 20) # Process max 20 seconds
    print(f"Fallback Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")


# Create frame generator
frame_generator = sv.get_video_frames_generator(source_path=SOURCE_VIDEO_PATH, stride=1)

# Initialize video writer
fourcc = cv2.VideoWriter_fourcc(*'mp4v') # or 'avc1', 'XVID'
video_writer = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, fps, (width, height))

# ----- Frame Processing Function -----
def process_frame(frame: np.ndarray, frame_idx: int):
    """
    Processes a single frame: detects objects, classifies teams, tracks people, annotates.
    """
    # 1. Detection
    # Assuming PLAYER_DETECTION_MODEL outputs results compatible with ultralytics
    result = PLAYER_DETECTION_MODEL.predict(frame, conf=0.3, device=device, verbose=False)[0] # Added verbose=False
    detections = sv.Detections.from_ultralytics(result)

    # 2. Pre-processing Detections
    # Separate ball detections and pad their boxes.
    ball_detections = detections[detections.class_id == BALL_ID]
    if len(ball_detections) > 0:
        ball_detections.xyxy = sv.pad_boxes(xyxy=ball_detections.xyxy, px=10)

    # Get all non-ball detections and apply NMS
    people_detections = detections[detections.class_id != BALL_ID]
    if len(people_detections) > 0:
        people_detections = people_detections.with_nms(threshold=0.5, class_agnostic=True)

    # 3. Team/Role Classification
    players_detections = people_detections[people_detections.class_id == PLAYER_ID]
    goalkeepers_detections = people_detections[people_detections.class_id == GOALKEEPER_ID]
    referees_detections = people_detections[people_detections.class_id == REFEREE_ID]

    # Classify players if any exist
    if len(players_detections) > 0:
        players_crops = [sv.crop_image(frame, xyxy) for xyxy in players_detections.xyxy]
        # Assuming team_classifier.predict returns class IDs (e.g., 0 for team A, 1 for team B)
        players_detections.class_id = team_classifier.predict(players_crops)

    # Classify goalkeepers if any exist
    if len(goalkeepers_detections) > 0:
        # Assuming resolve_goalkeepers_team_id returns class IDs consistent with player teams
        goalkeepers_detections.class_id = resolve_goalkeepers_team_id(players_detections, goalkeepers_detections)

    # Adjust referee class ID (e.g., map to 2 if players/GKs are 0 and 1)
    if len(referees_detections) > 0:
        referees_detections.class_id = np.full(len(referees_detections), 2) # Assign class ID 2 (e.g., Yellow)

    # Merge detections *intended for tracking* (players, goalkeepers, referees)
    detections_to_track = sv.Detections.merge([
        players_detections, goalkeepers_detections, referees_detections
    ])

    # 4. Tracking using BoTSORT
    tracked_detections = sv.Detections.empty() # Initialize as empty
    if len(detections_to_track) > 0:
        # Convert sv.Detections to BoxMOT format: numpy array N x [x1, y1, x2, y2, conf, cls_id]
        boxmot_input = np.hstack((
            detections_to_track.xyxy,
            detections_to_track.confidence[:, np.newaxis], # Ensure confidence is (N, 1)
            detections_to_track.class_id[:, np.newaxis]    # Ensure class_id is (N, 1)
        ))

        # Update BoT-SORT tracker - requires the frame for potential appearance features
        # Output format is typically numpy array N x [x1, y1, x2, y2, track_id, conf, cls, *optional idx*]
        tracks = tracker.update(boxmot_input, frame) # Pass the *original frame*

        # Convert tracker output back to sv.Detections if tracks exist
        if tracks.shape[0] > 0:
            tracked_detections = sv.Detections(
                xyxy=tracks[:, 0:4],
                confidence=tracks[:, 5],
                class_id=tracks[:, 6].astype(int), # Ensure class IDs are integers
                tracker_id=tracks[:, 4].astype(int) # Ensure track IDs are integers
            )
    else:
        # If no detections to track, still update tracker with empty array to advance its state
         tracker.update(np.empty((0, 6)), frame)


    # 5. Annotation
    annotated_frame = frame.copy()

    # Annotate tracked objects (players, GKs, referees)
    if len(tracked_detections) > 0:
        # Create labels with tracker IDs
        labels = [f"#{tid} C{cid}" for tid, cid in zip(tracked_detections.tracker_id, tracked_detections.class_id)]
        # Annotate ellipses based on class ID (0, 1, 2 assigned during classification)
        annotated_frame = ellipse_annotator.annotate(
            scene=annotated_frame,
            detections=tracked_detections # Use class_id from tracked_detections
        )
        # Annotate labels with tracker IDs
        annotated_frame = label_annotator.annotate(
            scene=annotated_frame,
            detections=tracked_detections,
            labels=labels
        )

    # Annotate ball (which is not tracked by BoTSORT in this setup)
    if len(ball_detections) > 0:
        annotated_frame = triangle_annotator.annotate(
            scene=annotated_frame,
            detections=ball_detections
        )

    return annotated_frame

# ----- Main Video Processing Loop -----
with tqdm(total=total_frames, desc="Processing video with BoTSORT") as pbar:
    for frame_idx, frame in enumerate(frame_generator):
        annotated_frame = process_frame(frame, frame_idx)
        video_writer.write(annotated_frame)
        pbar.update(1)
        # Optional: Break early for testing
        # if frame_idx >= fps * 10: # Process only 10 seconds
        #    break

# Release the video writer
video_writer.release()
print(f"Finished processing. Annotated video saved to: {OUTPUT_VIDEO_PATH}")

[32m2025-04-13 07:02:38.631[0m | [1mINFO    [0m | [36mboxmot.utils.torch_utils[0m:[36mselect_device[0m:[36m52[0m - [1mYolo Tracking v12.0.5 🚀 Python-3.11.11 torch-2.5.1+cu121
CUDA:0 (NVIDIA L4, 22478MiB)[0m


Resized position embedding: %s to %s torch.Size([197, 768]) torch.Size([129, 768])
Position embedding resize to height:16 width: 8


[32m2025-04-13 07:02:45.022[0m | [32m[1mSUCCESS [0m | [36mboxmot.appearance.reid_model_factory[0m:[36mload_pretrained_weights[0m:[36m183[0m - [32m[1mLoaded pretrained weights from clip_market1501.pt[0m


Video Info: 1920x1080, FPS: 25, Total Frames: 750


Embedding extraction: 1it [00:00,  9.28it/s]  | 0/750 [00:00<?, ?it/s]
Embedding extraction: 1it [00:00,  8.44it/s]  | 1/750 [00:03<48:01,  3.85s/it]
Embedding extraction: 1it [00:00,  8.53it/s]  | 2/750 [00:04<22:06,  1.77s/it]
Embedding extraction: 1it [00:00,  8.48it/s]  | 3/750 [00:04<13:48,  1.11s/it]
Embedding extraction: 1it [00:00,  8.43it/s]  | 4/750 [00:04<09:43,  1.28it/s]
Embedding extraction: 1it [00:00,  8.47it/s]  | 5/750 [00:05<07:38,  1.62it/s]
Embedding extraction: 1it [00:00,  8.35it/s]  | 6/750 [00:05<06:18,  1.97it/s]
Embedding extraction: 1it [00:00,  8.45it/s]  | 7/750 [00:05<05:22,  2.31it/s]
Embedding extraction: 1it [00:00,  8.42it/s]  | 8/750 [00:05<04:55,  2.51it/s]
Embedding extraction: 1it [00:00,  8.48it/s]  | 9/750 [00:06<04:37,  2.67it/s]
Embedding extraction: 1it [00:00,  8.61it/s]  | 10/750 [00:06<04:23,  2.80it/s]
Embedding extraction: 1it [00:00,  9.29it/s]  | 11/750 [00:06<04:13,  2.91it/s]
Embedding extraction: 1it [00:00,  9.33it/s]  | 12/750 [00

Finished processing. Annotated video saved to: 0bfacc_0_botsort_tracked.mp4





## Tracking Method 2 wih BotSort from Ultralytics

In [13]:
import supervision as sv
from tqdm import tqdm
import numpy as np
# from boxmot import BoTSORT # No longer needed directly
import cv2
from pathlib import Path
import torch
from ultralytics import YOLO # Import YOLO from ultralytics

# ----- Assumed Globals (Make sure these are defined/loaded) -----
# Ensure these models and functions are loaded/defined before use:
# team_classifier = ... # Your loaded team classification model
# def resolve_goalkeepers_team_id(players_detections, goalkeepers_detections):
#     # ... implementation returning numpy array of class IDs ...
#     return goalkeeper_class_ids_array

# ----- Configuration -----
SOURCE_VIDEO_PATH = "app/test_data/raw/0bfacc_0.mp4"
OUTPUT_VIDEO_PATH = "0bfacc_0_ultralytics_botsort_tracked_2.mp4" # Updated output name
YOLO_MODEL_PATH = "app/models/yolo11_football_v2/weights/best.pt" # Or your custom trained YOLO model path
TRACKER_CONFIG = "botsort.yaml" # Make sure this config file is available or use default

device = torch.device(0) if torch.cuda.is_available() else torch.device('cpu')

# --- IMPORTANT: Define Class IDs as used by your YOLO model ---
# These MUST match the class IDs your YOLO model was trained with
BALL_ID = 0         # Example: Ball is class 0 in the YOLO model
GOALKEEPER_ID = 1   # Example: Goalkeeper is class 1
PLAYER_ID = 2       # Example: Player is class 2
REFEREE_ID = 3      # Example: Referee is class 3

# --- Define NEW Class IDs for Annotation (after team classification) ---
# These will be used by the annotators AFTER your custom logic.
# Let's say Team A = 0, Team B = 1, Referee = 2
TEAM_A_ID = 0
TEAM_B_ID = 1
ANNOTATION_REFEREE_ID = 2
# We'll map the classified players/GKs to TEAM_A_ID or TEAM_B_ID
# We'll map detected referees to ANNOTATION_REFEREE_ID

# ----- Load YOLO Model -----
model = YOLO(YOLO_MODEL_PATH)
model.to(device)

# ----- Annotators -----
# Use colors corresponding to the NEW annotation IDs (Team A, Team B, Referee)
ellipse_annotator = sv.EllipseAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']), # Team A (Blue), Team B (Pink), Referee (Yellow)
    thickness=2
)
label_annotator = sv.LabelAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']),
    text_color=sv.Color.from_hex('#000000'),
    text_position=sv.Position.BOTTOM_CENTER,
    text_scale=0.6 # Smaller text
)
triangle_annotator = sv.TriangleAnnotator(
    color=sv.Color.from_hex('#32CD32'), # Lime green for ball
    base=25,
    height=21,
    outline_thickness=1
)

# ----- Video Processing Setup -----
try:
    video_info = sv.VideoInfo.from_video_path(SOURCE_VIDEO_PATH)
    width, height, fps = video_info.width, video_info.height, video_info.fps
    total_frames = video_info.total_frames or int(fps * 20) # Estimate if total_frames is None
    print(f"Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")
except Exception as e:
    print(f"Warning: Could not get video info using supervision. Using defaults. Error: {e}")
    # Fallback if sv.VideoInfo fails
    cap = cv2.VideoCapture(SOURCE_VIDEO_PATH)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    if fps == 0: fps = 30
    if total_frames == 0: total_frames = int(fps * 20)
    print(f"Fallback Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")

frame_generator = sv.get_video_frames_generator(source_path=SOURCE_VIDEO_PATH, stride=1)

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, fps, (width, height))

# ----- Main Video Processing Loop -----
with tqdm(total=total_frames, desc=f"Tracking with BoT-SORT ({TRACKER_CONFIG})") as pbar:
    for frame_idx, frame in enumerate(frame_generator):
        # 1. Run Integrated Detection and Tracking (Only for player-like objects)
        # We track only the classes we intend to classify later.
        # 'persist=True' maintains tracker state across frames.
        # 'classes' filters detections BEFORE tracking.
        results = model.track(
            source=frame,
            persist=True,
            tracker=TRACKER_CONFIG,
            classes=[GOALKEEPER_ID, PLAYER_ID, REFEREE_ID], # Track only these initial IDs
            conf=0.3, # Confidence threshold for initial detection
            verbose=False, # Suppress Ultralytics console output per frame
            device=device
        )

        # Separately detect the ball (as it's not classified/tracked the same way)
        # Using predict for ball allows different confidence etc.
        ball_results = model.predict(frame, classes=[BALL_ID], conf=0.1, verbose=False, device=device)
        ball_detections = sv.Detections.from_ultralytics(ball_results[0])
        if len(ball_detections) > 0:
             ball_detections.xyxy = sv.pad_boxes(xyxy=ball_detections.xyxy, px=10)


        # 2. Process Tracking Results
        annotated_frame = frame.copy()
        detections_for_annotation = sv.Detections.empty() # Start with empty detections

        if results[0].boxes.id is not None: # Check if tracking IDs are assigned
            # Convert tracked results to Supervision Detections object
            # This object contains xyxy, confidence, original class_id, and tracker_id
            tracked_detections = sv.Detections.from_ultralytics(results[0])

            # 3. Apply Custom Classification based on TRACKED items
            # Filter detections based on ORIGINAL class ID from YOLO
            players = tracked_detections[tracked_detections.class_id == PLAYER_ID]
            goalkeepers = tracked_detections[tracked_detections.class_id == GOALKEEPER_ID]
            referees = tracked_detections[tracked_detections.class_id == REFEREE_ID]

            final_detections_list = []

            # Process Players
            if len(players) > 0:
                player_crops = [sv.crop_image(frame, xyxy) for xyxy in players.xyxy]
                # Assume team_classifier returns an array of TEAM_A_ID or TEAM_B_ID
                new_player_class_ids = team_classifier.predict(player_crops)
                players.class_id = new_player_class_ids # OVERWRITE class ID for annotation
                final_detections_list.append(players)

            # Process Goalkeepers
            if len(goalkeepers) > 0:
                # Assume resolve_goalkeepers_team_id uses players info and returns TEAM_A_ID/TEAM_B_ID
                new_gk_class_ids = resolve_goalkeepers_team_id(players, goalkeepers) # Pass sv.Detections
                goalkeepers.class_id = new_gk_class_ids # OVERWRITE class ID for annotation
                final_detections_list.append(goalkeepers)

            # Process Referees
            if len(referees) > 0:
                # Assign the predefined annotation ID for referees
                referees.class_id = np.full(len(referees), ANNOTATION_REFEREE_ID)
                final_detections_list.append(referees)

            # Merge all processed detections back together for annotation
            if final_detections_list:
                 detections_for_annotation = sv.Detections.merge(final_detections_list)


        # 4. Annotation using Supervision
        # Annotate tracked+classified objects
        if len(detections_for_annotation) > 0:
            # Create labels using tracker ID and the NEW class ID
            labels = [
                f"T:{tid} C:{cid}"
                for tid, cid
                in zip(detections_for_annotation.tracker_id, detections_for_annotation.class_id)
            ]
            # Annotate ellipses based on the NEW class ID (Team A, Team B, Referee)
            annotated_frame = ellipse_annotator.annotate(
                scene=annotated_frame,
                detections=detections_for_annotation
            )
            # Annotate labels
            annotated_frame = label_annotator.annotate(
                scene=annotated_frame,
                detections=detections_for_annotation,
                labels=labels
            )

        # Annotate the ball separately
        if len(ball_detections) > 0:
            annotated_frame = triangle_annotator.annotate(
                scene=annotated_frame,
                detections=ball_detections
            )

        # 5. Write Frame
        video_writer.write(annotated_frame)
        pbar.update(1)

# Release resources
video_writer.release()
print(f"Finished processing. Annotated video saved to: {OUTPUT_VIDEO_PATH}")
# Optional: Close any CV2 windows if you were displaying frames live
# cv2.destroyAllWindows()

Video Info: 1920x1080, FPS: 25, Total Frames: 750


Embedding extraction: 1it [00:00,  8.45it/s]          | 0/750 [00:00<?, ?it/s]
Embedding extraction: 1it [00:00,  8.43it/s]          | 1/750 [00:00<09:32,  1.31it/s]
Embedding extraction: 1it [00:00,  8.53it/s]          | 2/750 [00:01<05:51,  2.13it/s]
Embedding extraction: 1it [00:00,  8.49it/s]          | 3/750 [00:01<04:41,  2.66it/s]
Embedding extraction: 1it [00:00,  8.52it/s]          | 4/750 [00:01<04:05,  3.03it/s]
Embedding extraction: 1it [00:00,  8.49it/s]          | 5/750 [00:01<03:46,  3.29it/s]
Embedding extraction: 1it [00:00,  8.48it/s]          | 6/750 [00:02<03:39,  3.40it/s]
Embedding extraction: 1it [00:00,  9.31it/s]          | 7/750 [00:02<03:30,  3.53it/s]
Embedding extraction: 1it [00:00,  8.41it/s]          | 8/750 [00:02<03:22,  3.67it/s]
Embedding extraction: 1it [00:00,  8.64it/s]          | 9/750 [00:02<03:19,  3.71it/s]
Embedding extraction: 1it [00:00,  8.64it/s]▏         | 10/750 [00:03<03:17,  3.75it/s]
Embedding extraction: 1it [00:00,  9.40it/s]▏     

Finished processing. Annotated video saved to: 0bfacc_0_ultralytics_botsort_tracked_2.mp4





## Tracking Method 3 wih BotSort + Paddle OCR

### Method 1

In [None]:
import supervision as sv
from tqdm import tqdm
import numpy as np
import cv2
from pathlib import Path
import torch
from ultralytics import YOLO
from paddleocr import PaddleOCR, draw_ocr # Import PaddleOCR
import re # For parsing OCR results
import time # For basic profiling/timing

# ----- Configuration -----
SOURCE_VIDEO_PATH = "app/test_data/raw/0bfacc_0.mp4" # Your input video
OUTPUT_VIDEO_PATH = "0bfacc_0_persistent_tracked.mp4" # Output video path
YOLO_MODEL_PATH = "app/models/yolo11_football_v2/weights/best.pt" # Path to your trained YOLO model or a standard one
TRACKER_CONFIG = "botsort.yaml" # BoT-SORT config (Ultralytics usually finds defaults)

# --- Device Setup ---
device = torch.device(0) if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# --- Class IDs (from your YOLO model's training) ---
BALL_ID = 0
GOALKEEPER_ID = 1
PLAYER_ID = 2
REFEREE_ID = 3

# --- Class IDs for Annotation (after team classification) ---
TEAM_A_ID = 0 # Example ID for Team A
TEAM_B_ID = 1 # Example ID for Team B
ANNOTATION_REFEREE_ID = 2 # Example ID for Referees

# --- OCR Configuration ---
OCR_CONFIDENCE_THRESHOLD = 0.5 # Minimum confidence for accepting OCR number (adjust!)
OCR_USE_GPU = False # torch.cuda.is_available() Use GPU for OCR if available

# --- State Management Configuration ---
MAX_ABSENCE_FRAMES = int(30) # Frames after which a lost player state is considered stale (e.g., 30fps * 1 sec)

# ----- Initialize Models -----

# Load YOLO Model
print(f"Loading YOLO model from: {YOLO_MODEL_PATH}")
model = YOLO(YOLO_MODEL_PATH)
model.to(device)

# Initialize PaddleOCR
# Download models automatically on first run. Specify 'lang='en'' for English numbers.
print("Initializing PaddleOCR...")
# Consider using `rec_model_dir` and `det_model_dir` with specific downloaded models for efficiency/consistency
# Use `use_angle_cls=False` if numbers are mostly upright
ocr_model = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=OCR_USE_GPU, show_log=False)
print("PaddleOCR initialized.")

# ----- Annotators -----
# Colors match annotation IDs: Team A (Blue), Team B (Pink), Referee (Yellow)
palette = sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700'])
ellipse_annotator = sv.EllipseAnnotator(color=palette, thickness=2)
label_annotator = sv.LabelAnnotator(
    color=palette,
    text_color=sv.Color.from_hex('#000000'),
    text_position=sv.Position.BOTTOM_CENTER,
    text_scale=0.6,
    text_thickness=1
)
triangle_annotator = sv.TriangleAnnotator(
    color=sv.Color.from_hex('#32CD32'), # Lime green for ball
    base=25, height=21, outline_thickness=1
)

# ----- Video Processing Setup -----
try:
    video_info = sv.VideoInfo.from_video_path(SOURCE_VIDEO_PATH)
    width, height, fps = video_info.width, video_info.height, video_info.fps
    total_frames = video_info.total_frames or int(fps * 600) # Estimate for long videos if needed
    print(f"Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")
    if fps == 0: fps = 30 # Default fps if reading failed
except Exception as e:
    print(f"Error getting video info: {e}. Exiting.")

frame_generator = sv.get_video_frames_generator(source_path=SOURCE_VIDEO_PATH, stride=1)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, fps, (width, height))

# ----- State Management Dictionaries -----
# Stores the state for each confirmed real-world player identity
# Key: canonical_id (e.g., "TeamA-10"), Value: dict
player_states = {}
# Maps the current BoT-SORT tracker_id to a confirmed canonical_id
# Key: tracker_id (int), Value: canonical_id (str)
tracker_id_to_canonical = {}

# ----- Helper Function for OCR -----
def run_ocr_on_crop(crop: np.ndarray, ocr_engine: PaddleOCR):
    """
    Runs PaddleOCR on a cropped image and extracts the most likely number.
    Returns (number_str, confidence) or (None, 0.0) if no number found.
    """
    if crop.shape[0] < 10 or crop.shape[1] < 10: # Skip tiny crops
        return None, 0.0

    try:
        # Perform OCR
        ocr_result = ocr_engine.ocr(crop, cls=False, det=True, rec=True) # Use det+rec

        best_num_str = None
        best_confidence = 0.0

        if ocr_result and ocr_result[0]: # Check if results exist
             for line in ocr_result[0]:
                  text, confidence = line[1] # Get text and confidence
                  # Try to find digits in the recognized text
                  numbers = re.findall(r'\d+', text)
                  if numbers:
                       num_str = numbers[0] # Take the first number found
                       if confidence > best_confidence:
                            best_confidence = confidence
                            best_num_str = num_str
                            # print(f"    OCR Found: '{text}' -> Number: {num_str} (Conf: {confidence:.2f})") # Debug

        return best_num_str, best_confidence

    except Exception as e:
        print(f"    Error during OCR: {e}")
        return None, 0.0

# ----- Main Video Processing Loop -----
frame_count = 0
with tqdm(total=total_frames, desc=f"Tracking+OCR") as pbar:
    for frame in frame_generator:
        frame_time_start = time.time()
        frame_count += 1

        # 1. Run YOLOv8 Tracking (BoT-SORT) for people
        results = model.track(
            source=frame,
            persist=True,
            tracker=TRACKER_CONFIG,
            classes=[GOALKEEPER_ID, PLAYER_ID, REFEREE_ID],
            conf=0.3,
            verbose=False,
            device=device
        )

        # 2. Detect Ball Separately
        ball_results = model.predict(frame, classes=[BALL_ID], conf=0.1, verbose=False, device=device)
        ball_detections = sv.Detections.from_ultralytics(ball_results[0])
        if len(ball_detections) > 0:
             ball_detections.xyxy = sv.pad_boxes(xyxy=ball_detections.xyxy, px=10)

        annotated_frame = frame.copy() # Start with original frame for annotation

        # 3. Process Tracking Results & Apply Custom Classification
        detections_for_processing = sv.Detections.empty()
        if results[0].boxes.id is not None:
            tracked_detections = sv.Detections.from_ultralytics(results[0])

            # Separate based on ORIGINAL YOLO ID for classification
            players = tracked_detections[tracked_detections.class_id == PLAYER_ID]
            goalkeepers = tracked_detections[tracked_detections.class_id == GOALKEEPER_ID]
            referees = tracked_detections[tracked_detections.class_id == REFEREE_ID]

            processed_detections_list = []
            
            # Classify Players
            if len(players) > 0:
                player_crops = [sv.crop_image(frame, xyxy) for xyxy in players.xyxy]
                new_player_class_ids = team_classifier.predict(player_crops)
                players.class_id = new_player_class_ids # Overwrite with TEAM_A/B ID
                processed_detections_list.append(players)

            # Classify Goalkeepers
            if len(goalkeepers) > 0:
                new_gk_class_ids = resolve_goalkeepers_team_id(players, goalkeepers)
                goalkeepers.class_id = new_gk_class_ids # Overwrite with TEAM_A/B ID
                processed_detections_list.append(goalkeepers)

            # Assign Referee Annotation ID
            if len(referees) > 0:
                referees.class_id = np.full(len(referees), ANNOTATION_REFEREE_ID)
                processed_detections_list.append(referees)

            # Merge back for OCR and final annotation steps
            if processed_detections_list:
                 detections_for_processing = sv.Detections.merge(processed_detections_list)

        # 4. Persistent ID Logic (State Management + OCR)
        annotation_labels = []
        current_tracker_ids = set()
        if len(detections_for_processing) > 0:
            current_tracker_ids = set(detections_for_processing.tracker_id.astype(int))

            # Mark tracks that disappeared this frame as 'lost' in player_states
            lost_canonical_this_frame = []
            for canonical_id, state in player_states.items():
                if state['current_tracker_id'] is not None and state['current_tracker_id'] not in current_tracker_ids:
                    # print(f"  Track lost for {canonical_id} (tracker_id: {state['current_tracker_id']})")
                    # Remove the potentially stale reverse mapping
                    if state['current_tracker_id'] in tracker_id_to_canonical:
                         if tracker_id_to_canonical[state['current_tracker_id']] == canonical_id:
                              del tracker_id_to_canonical[state['current_tracker_id']]
                    state['current_tracker_id'] = None # Keep state, but mark tracker_id as None
                    lost_canonical_this_frame.append(canonical_id)

            # Process current detections for OCR and state updates
            ocr_time = 0.0
            for i in range(len(detections_for_processing)):
                detection = detections_for_processing[i] # Get individual detection
                tracker_id = int(detection.tracker_id[0]) # Get scalar tracker ID
                bbox = detection.xyxy[0]
                assigned_annotation_class = detection.class_id[0] # Team A/B or Referee ID

                canonical_id_for_this_track = tracker_id_to_canonical.get(tracker_id)
                display_label = f"T:{tracker_id}" # Default label (Use T: prefix)

                ocr_performed = False
                # --- Attempt OCR only for players/GKs ---
                if assigned_annotation_class != ANNOTATION_REFEREE_ID:
                    ocr_start_time = time.time()
                    player_crop = sv.crop_image(annotated_frame, bbox) # Crop from copy
                    number, ocr_confidence = run_ocr_on_crop(player_crop, ocr_model)
                    ocr_time += (time.time() - ocr_start_time)
                    ocr_performed = True

                    if number is not None and ocr_confidence > OCR_CONFIDENCE_THRESHOLD:
                        team_prefix = "A" if assigned_annotation_class == TEAM_A_ID else "B"
                        canonical_id_ocr = f"{team_prefix}{number}" # Compact ID e.g., A10, B7

                        # --- State Update Logic ---
                        # If this tracker ID was previously mapped differently, remove old mapping
                        if tracker_id in tracker_id_to_canonical and tracker_id_to_canonical[tracker_id] != canonical_id_ocr:
                            # print(f"    Conflict: Tracker {tracker_id} mapping changed from {tracker_id_to_canonical[tracker_id]} to {canonical_id_ocr}")
                            # Mark the OLD canonical ID as potentially lost if it's not associated with another tracker
                            old_canonical = tracker_id_to_canonical[tracker_id]
                            if old_canonical in player_states and player_states[old_canonical]['current_tracker_id'] == tracker_id:
                                 player_states[old_canonical]['current_tracker_id'] = None
                            # Remove old mapping before adding new one
                            del tracker_id_to_canonical[tracker_id]


                        # Update/Create player state
                        if canonical_id_ocr not in player_states:
                            player_states[canonical_id_ocr] = {'current_tracker_id': None, 'last_seen_frame': -1, 'last_bbox': None, 'ocr_confidence': 0.0}

                        player_states[canonical_id_ocr].update({
                            'current_tracker_id': tracker_id,
                            'last_seen_frame': frame_count,
                            'last_bbox': bbox,
                            'ocr_confidence': ocr_confidence
                        })
                        tracker_id_to_canonical[tracker_id] = canonical_id_ocr
                        canonical_id_for_this_track = canonical_id_ocr
                        display_label = canonical_id_ocr # Use the confirmed ID

                    # --- End State Update ---

                # --- Update display label and state if association already exists ---
                if canonical_id_for_this_track:
                    display_label = canonical_id_for_this_track # Use existing if OCR failed/low conf
                    # Update last seen frame/bbox for existing association
                    if canonical_id_for_this_track in player_states:
                         player_states[canonical_id_for_this_track]['last_seen_frame'] = frame_count
                         player_states[canonical_id_for_this_track]['last_bbox'] = bbox
                         # Ensure the current tracker ID is correctly linked, especially if it was lost and found
                         player_states[canonical_id_for_this_track]['current_tracker_id'] = tracker_id
                         # Also update the reverse map just in case
                         tracker_id_to_canonical[tracker_id] = canonical_id_for_this_track
                # else: # No OCR success and no existing association
                     # display_label remains the default "T:{tracker_id}"

                annotation_labels.append(display_label)
            # print(f"  OCR time for frame: {ocr_time:.4f}s") # Optional timing

        # 5. Clean up stale player states (optional, can be done less frequently)
        if frame_count % (fps * 10) == 0: # Every 10 seconds
             stale_ids = [
                  cid for cid, state in player_states.items()
                  if state['current_tracker_id'] is None and (frame_count - state['last_seen_frame']) > MAX_ABSENCE_FRAMES
             ]
             for cid in stale_ids:
                  print(f"  Removing stale state for {cid}")
                  del player_states[cid]
                  # Clean up reverse map if any stale tracker ID points to it
                  stale_tracker_id = None
                  for tid, mapped_cid in tracker_id_to_canonical.items():
                       if mapped_cid == cid:
                            stale_tracker_id = tid
                            break
                  if stale_tracker_id:
                       del tracker_id_to_canonical[stale_tracker_id]


        # 6. Annotation
        # Annotate tracked people (using team/role class for color, canonical ID for label)
        if len(detections_for_processing) > 0:
            annotated_frame = ellipse_annotator.annotate(
                scene=annotated_frame,
                detections=detections_for_processing # Ellipse color based on Team A/B/Ref ID
            )
            annotated_frame = label_annotator.annotate(
                scene=annotated_frame,
                detections=detections_for_processing,
                labels=annotation_labels # Label text is Canonical ID or T:tracker_id
            )

        # Annotate ball
        if len(ball_detections) > 0:
            annotated_frame = triangle_annotator.annotate(
                scene=annotated_frame,
                detections=ball_detections
            )

        # 7. Write Frame
        video_writer.write(annotated_frame)
        pbar.update(1)
        frame_time_end = time.time()
        # print(f"Frame {frame_count} processing time: {frame_time_end - frame_time_start:.4f}s") # Optional timing

# Release resources
video_writer.release()
print(f"Finished processing. Annotated video saved to: {OUTPUT_VIDEO_PATH}")
print("\nFinal Player States:")
for cid, state in player_states.items():
    print(f"  {cid}: Last Seen Frame={state['last_seen_frame']}, Current TrackerID={state['current_tracker_id']}")

Using device: cuda:0
Loading YOLO model from: app/models/yolo11_football_v2/weights/best.pt
Initializing PaddleOCR...
PaddleOCR initialized.
Video Info: 1920x1080, FPS: 25, Total Frames: 750


Embedding extraction: 1it [00:00,  8.49it/s]?, ?it/s]
Embedding extraction: 1it [00:00,  8.44it/s]10:46,  1.16it/s]
Embedding extraction: 1it [00:00,  8.51it/s]06:56,  1.80it/s]
Embedding extraction: 1it [00:00,  8.40it/s]05:45,  2.16it/s]
Embedding extraction: 1it [00:00,  8.51it/s]05:12,  2.39it/s]
Embedding extraction: 1it [00:00,  8.43it/s]04:52,  2.54it/s]
Embedding extraction: 1it [00:00,  8.48it/s]04:45,  2.60it/s]
Embedding extraction: 1it [00:00,  8.55it/s]04:35,  2.69it/s]
Embedding extraction: 1it [00:00,  8.50it/s]04:49,  2.57it/s]
Embedding extraction: 1it [00:00,  8.60it/s]04:38,  2.66it/s]
Embedding extraction: 1it [00:00,  8.63it/s]<04:29,  2.74it/s]
Embedding extraction: 1it [00:00,  9.43it/s]<04:23,  2.80it/s]
Embedding extraction: 1it [00:00,  9.64it/s]<04:15,  2.89it/s]
Embedding extraction: 1it [00:00,  8.58it/s]<04:16,  2.88it/s]
Embedding extraction: 1it [00:00,  9.27it/s]<04:19,  2.83it/s]
Embedding extraction: 1it [00:00,  9.07it/s]<04:14,  2.89it/s]
Embedding 

  Removing stale state for A3


Embedding extraction: 1it [00:00,  7.72it/s]
Embedding extraction: 1it [00:00,  7.76it/s]8<03:25,  2.43it/s]
Embedding extraction: 1it [00:00,  7.71it/s]8<03:20,  2.48it/s]
Embedding extraction: 1it [00:00,  7.79it/s]8<03:16,  2.53it/s]
Embedding extraction: 1it [00:00,  7.75it/s]9<03:13,  2.57it/s]
Embedding extraction: 1it [00:00,  7.72it/s]9<03:10,  2.59it/s]
Embedding extraction: 1it [00:00,  8.35it/s]0<03:09,  2.61it/s]
Embedding extraction: 1it [00:00,  7.71it/s]0<03:05,  2.65it/s]
Embedding extraction: 1it [00:00,  7.78it/s]0<03:05,  2.65it/s]
Embedding extraction: 1it [00:00,  7.67it/s]1<03:05,  2.64it/s]
Embedding extraction: 1it [00:00,  7.65it/s]1<03:05,  2.64it/s]
Embedding extraction: 1it [00:00,  7.68it/s]1<03:05,  2.64it/s]
Embedding extraction: 1it [00:00,  8.48it/s]2<03:07,  2.61it/s]
Embedding extraction: 1it [00:00,  7.74it/s]2<03:04,  2.64it/s]
Embedding extraction: 1it [00:00,  7.80it/s]3<03:01,  2.68it/s]
Embedding extraction: 1it [00:00,  7.72it/s]3<03:01,  2.67i

  Removing stale state for A12
  Removing stale state for A72


Embedding extraction: 1it [00:00,  8.63it/s]
Embedding extraction: 1it [00:00,  8.40it/s]0<01:32,  2.70it/s]
Embedding extraction: 1it [00:00,  8.39it/s]0<01:35,  2.58it/s]
Embedding extraction: 1it [00:00,  8.39it/s]0<01:33,  2.64it/s]
Embedding extraction: 1it [00:00,  8.37it/s]1<01:31,  2.67it/s]
Embedding extraction: 1it [00:00,  8.38it/s]1<01:30,  2.70it/s]
Embedding extraction: 1it [00:00,  8.38it/s]2<01:30,  2.69it/s]
Embedding extraction: 1it [00:00,  8.35it/s]2<01:29,  2.72it/s]
Embedding extraction: 1it [00:00,  8.48it/s]2<01:30,  2.67it/s]
Embedding extraction: 1it [00:00,  8.41it/s]3<01:28,  2.72it/s]
Embedding extraction: 1it [00:00,  8.41it/s]3<01:28,  2.72it/s]
Embedding extraction: 1it [00:00,  8.28it/s]3<01:27,  2.73it/s]
Embedding extraction: 1it [00:00,  8.28it/s]4<01:28,  2.70it/s]
Embedding extraction: 1it [00:00,  8.36it/s]4<01:28,  2.68it/s]
Embedding extraction: 1it [00:00,  8.33it/s]4<01:26,  2.74it/s]
Embedding extraction: 1it [00:00,  8.38it/s]5<01:24,  2.79i

  Removing stale state for A1
Finished processing. Annotated video saved to: 0bfacc_0_persistent_tracked.mp4

Final Player States:
  A7: Last Seen Frame=750, Current TrackerID=9
  A2: Last Seen Frame=750, Current TrackerID=8
  B10: Last Seen Frame=750, Current TrackerID=401





### Method 2

In [8]:
import supervision as sv
from tqdm import tqdm
import numpy as np
from boxmot import BotSort # Using BoTSORT as requested
import cv2
from pathlib import Path
import torch
from collections import defaultdict, deque
import warnings
import logging
import traceback # Import traceback for detailed error printing

# Suppress PaddleOCR logging noise (optional)
logging.disable(logging.INFO) # Disable INFO messages
warnings.filterwarnings('ignore', category=UserWarning, module='paddle')

# Attempt to import PaddleOCR
try:
    from paddleocr import PaddleOCR
    PADDLEOCR_AVAILABLE = True
except ImportError:
    print("Warning: PaddleOCR not found. Please install it (`pip install paddlepaddle paddleocr`). OCR functionality will be disabled.")
    PADDLEOCR_AVAILABLE = False

# ----- Assumed Globals (Make sure these are defined/loaded) -----
# Ensure these models and functions are loaded/defined before use:
# PLAYER_DETECTION_MODEL = ... # Your loaded YOLO model for detecting players, ball, etc.
# team_classifier = ... # Your loaded team classification model
# def resolve_goalkeepers_team_id(players_detections, goalkeepers_detections):
#     # ... implementation returning numpy array of team class IDs ...
#     # Example placeholder:
#     if len(goalkeepers_detections) > 0:
#         # Replace with actual logic, ensure output matches TEAM IDs
#         return np.random.randint(0, 2, size=len(goalkeepers_detections))
#     return np.array([])

# ----- Configuration -----
SOURCE_VIDEO_PATH = "app/test_data/raw/0bfacc_0.mp4"
OUTPUT_VIDEO_PATH = "0bfacc_0_botsort_paddleocr_reid_debug_tracking.mp4" # Updated output name for debug
DEVICE = torch.device(0) if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {DEVICE}")

# Class IDs (as used in your initial detection model)
BALL_ID = 0
GOALKEEPER_ID = 1
PLAYER_ID = 2
REFEREE_ID = 3

# Team/Role Class IDs (assigned *after* classification)
TEAM_A_ID = 0 # Example
TEAM_B_ID = 1 # Example
REFEREE_TEAM_ID = 2 # Example

# OCR Configuration
OCR_CONFIDENCE_THRESHOLD = 0.6
MIN_JERSEY_DIGITS = 1
MAX_JERSEY_DIGITS = 2

# ID Management Configuration
LOST_TRACK_MEMORY_SECONDS = 20
MISMATCH_CONSISTENCY_FRAMES = 3

# ----- Initialize PaddleOCR -----
# (Initialization remains the same)
ocr_model = None
if PADDLEOCR_AVAILABLE:
    try:
        ocr_model = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=(DEVICE.type == 'cuda'), show_log=False)
        print("PaddleOCR initialized successfully.")
    except Exception as e:
        print(f"Error initializing PaddleOCR: {e}. Disabling OCR.")
        PADDLEOCR_AVAILABLE = False

# ----- OCR Function -----
# (OCR function remains the same)
def perform_ocr_on_crop(crop: np.ndarray) -> tuple[str | None, float | None]:
    if not PADDLEOCR_AVAILABLE or ocr_model is None or crop.size == 0: return None, None
    try:
        result = ocr_model.ocr(crop, cls=False)
        best_num, highest_conf = None, 0.0
        if result and result[0]:
             for res_item in result[0]:
                if len(res_item) == 2 and isinstance(res_item[1], tuple) and len(res_item[1]) == 2:
                    text, confidence = res_item[1]
                    if (isinstance(text, str) and text.isdigit() and
                        MIN_JERSEY_DIGITS <= len(text) <= MAX_JERSEY_DIGITS and
                        confidence > OCR_CONFIDENCE_THRESHOLD):
                        if confidence > highest_conf:
                            highest_conf, best_num = confidence, text
        return best_num, highest_conf if best_num else None
    except Exception as e: print(f"Error during PaddleOCR inference: {e}"); return None, None

# ----- Annotators -----
# (Annotators remain the same - increased visibility)
TEAM_COLORS = ['#00BFFF', '#FF1493', '#FFD700']
ellipse_annotator = sv.EllipseAnnotator(color=sv.ColorPalette.from_hex(TEAM_COLORS), thickness=4)
label_annotator = sv.LabelAnnotator(color=sv.ColorPalette.from_hex(TEAM_COLORS), text_color=sv.Color.from_hex('#000000'), text_position=sv.Position.BOTTOM_CENTER, text_scale=0.7, text_thickness=2)
triangle_annotator = sv.TriangleAnnotator(color=sv.Color.from_hex('#FFFFFF'), base=30, height=25, outline_thickness=2)

# ----- Tracker Initialization -----
# Consider lowering thresholds for debugging if needed
REID_WEIGHTS_PATH = Path('clip_market1501.pt')
if not REID_WEIGHTS_PATH.exists():
     print(f"Warning: ReID weights not found at {REID_WEIGHTS_PATH}. BoTSORT running without ReID features.")
     tracker = BotSort(
         device=DEVICE,
         half=False,
         with_reid=False,
         # track_high_thresh=0.3, # Lower for debug?
         # new_track_thresh=0.4, # Lower for debug?
     )
else:
    tracker = BotSort(
        reid_weights=REID_WEIGHTS_PATH,
        device=DEVICE,
        half=False,
        with_reid=True,
        # track_high_thresh=0.3, # Lower for debug?
        # new_track_thresh=0.4, # Lower for debug?
    )
print(f"Tracker initialized. Using ReID: {tracker.with_reid}")
# Print tracker thresholds if possible (depends on boxmot implementation)
# try:
#     print(f"Tracker thresholds: high={tracker.track_high_thresh}, low={tracker.track_low_thresh}, new={tracker.new_track_thresh}")
# except AttributeError:
#     print("Could not access tracker thresholds directly.")


# ----- Player ID Management State -----
# (State management remains the same)
player_data = {}
recently_lost_jerseys = defaultdict(lambda: deque(maxlen=10))

# ----- Video Processing Setup -----
# (Video setup remains the same)
try:
    video_info = sv.VideoInfo.from_video_path(SOURCE_VIDEO_PATH)
    width, height, fps = video_info.width, video_info.height, video_info.fps
    total_frames = video_info.total_frames if video_info.total_frames else int(fps * 60)
    print(f"Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")
except Exception as e:
    print(f"Warning: Could not get video info using supervision. Using OpenCV. Error: {e}")
    cap = cv2.VideoCapture(SOURCE_VIDEO_PATH)
    if not cap.isOpened(): raise IOError(f"Cannot open video file: {SOURCE_VIDEO_PATH}")
    width, height, fps = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)), cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    if fps == 0: fps = 30
    if total_frames <= 0: total_frames = int(fps * 60)
    print(f"Fallback Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")

LOST_TRACK_MEMORY_FRAMES = int(fps * LOST_TRACK_MEMORY_SECONDS)
print(f"Lost track memory set to {LOST_TRACK_MEMORY_FRAMES} frames ({LOST_TRACK_MEMORY_SECONDS} seconds)")

frame_generator = sv.get_video_frames_generator(source_path=SOURCE_VIDEO_PATH, stride=1)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, fps, (width, height))

# ----- Frame Processing Function -----
def process_frame(frame: np.ndarray, frame_idx: int):
    """
    Processes a single frame: detects, classifies, performs OCR, tracks, manages IDs, annotates.
    """
    global player_data, recently_lost_jerseys
    is_debug_frame = (frame_idx % (fps * 2) == 0) # Print debug info every 2 seconds

    # 1. Detection
    result = PLAYER_DETECTION_MODEL.predict(frame, conf=0.3, device=DEVICE, verbose=False)[0]
    detections = sv.Detections.from_ultralytics(result)
    if is_debug_frame: print(f"\n--- Frame {frame_idx} ---")
    if is_debug_frame: print(f"[Debug] Initial Detections: {len(detections)} (IDs: {detections.class_id if len(detections)>0 else 'None'})")

    # 2. Pre-processing Detections
    ball_detections = detections[detections.class_id == BALL_ID]
    people_detections = detections[detections.class_id != BALL_ID]
    if is_debug_frame: print(f"[Debug] Ball Detections: {len(ball_detections)}, People Detections (Raw): {len(people_detections)}")

    if len(people_detections) > 0:
        people_detections = people_detections.with_nms(threshold=0.5, class_agnostic=True)
        if is_debug_frame: print(f"[Debug] People Detections (After NMS): {len(people_detections)}")

    # 3. Team/Role Classification
    players_detections = people_detections[people_detections.class_id == PLAYER_ID]
    goalkeepers_detections = people_detections[people_detections.class_id == GOALKEEPER_ID]
    referees_detections = people_detections[people_detections.class_id == REFEREE_ID]
    if is_debug_frame: print(f"[Debug] Pre-classification counts: Players={len(players_detections)}, GK={len(goalkeepers_detections)}, Ref={len(referees_detections)}")

    # Assign team IDs based on classification models
    player_team_ids = np.array([], dtype=int)
    if len(players_detections) > 0:
        players_crops = [sv.crop_image(frame, xyxy) for xyxy in players_detections.xyxy]
        player_team_ids = team_classifier.predict(players_crops) # Assume returns numpy array
        players_detections.class_id = player_team_ids
        if is_debug_frame: print(f"[Debug] Player Team IDs assigned: {player_team_ids}")

    gk_team_ids = np.array([], dtype=int)
    if len(goalkeepers_detections) > 0:
        gk_team_ids = resolve_goalkeepers_team_id(players_detections, goalkeepers_detections) # Assume returns numpy array
        goalkeepers_detections.class_id = gk_team_ids
        if is_debug_frame: print(f"[Debug] Goalkeeper Team IDs assigned: {gk_team_ids}")

    ref_team_ids = np.array([], dtype=int)
    if len(referees_detections) > 0:
        ref_team_ids = np.full(len(referees_detections), REFEREE_TEAM_ID)
        referees_detections.class_id = ref_team_ids
        if is_debug_frame: print(f"[Debug] Referee Team IDs assigned: {ref_team_ids}")

    # Merge all classified people detections for tracking input
    detections_to_track = sv.Detections.merge([players_detections, goalkeepers_detections, referees_detections])
    if is_debug_frame: print(f"[Debug] Total Detections Merged for Tracking: {len(detections_to_track)}")

    # 4. Tracking using BoTSORT
    tracked_detections = sv.Detections.empty()
    current_frame_tracker_ids = set()

    if len(detections_to_track) > 0:
        # Prepare input for BoxMOT
        boxmot_input = np.hstack((
            detections_to_track.xyxy,
            detections_to_track.confidence[:, np.newaxis],
            detections_to_track.class_id[:, np.newaxis].astype(float) # Ensure class ID is float for some trackers
        ))

        if is_debug_frame:
            print(f"[Debug] Input to tracker shape: {boxmot_input.shape}, dtype: {boxmot_input.dtype}")
            print(f"[Debug] Input confidences: {boxmot_input[:, 4]}") # Print confidences
            print(f"[Debug] Input class IDs: {boxmot_input[:, 5]}") # Print class IDs

        # Update tracker
        tracks = tracker.update(boxmot_input, frame)

        if tracks.shape[0] > 0:
            tracked_detections = sv.Detections(
                xyxy=tracks[:, 0:4],
                confidence=tracks[:, 5],
                class_id=tracks[:, 6].astype(int),
                tracker_id=tracks[:, 4].astype(int)
            )
            current_frame_tracker_ids = set(tracked_detections.tracker_id)
    else:
        # Update tracker with empty array if no detections
         tracker.update(np.empty((0, 6)), frame)
         if is_debug_frame: print("[Debug] No detections to track, updating tracker with empty array.")

    # --- Previous Debug Print (still useful) ---
    if is_debug_frame: print(f"[Debug] Number of tracked detections OUTPUT: {len(tracked_detections)}")

    # 5. OCR and Player ID Management
    # (Logic remains the same as previous version)
    final_labels = []
    current_player_data = {}
    for i in range(len(tracked_detections)):
        track_id = tracked_detections.tracker_id[i]; team_id = tracked_detections.class_id[i]; bbox = tracked_detections.xyxy[i]
        x1, y1, x2, y2 = map(int, bbox); x1, y1 = max(0, x1), max(0, y1); x2, y2 = min(width, x2), min(height, y2)
        detected_jersey_num, ocr_confidence = None, None
        if x1 < x2 and y1 < y2:
            player_crop = frame[y1:y2, x1:x2]
            detected_jersey_num, ocr_confidence = perform_ocr_on_crop(player_crop)
        assigned_jersey_id, assigned_confidence = None, None
        if track_id in player_data:
            p_data = player_data[track_id]; p_data["last_seen"] = frame_idx; p_data["team_id"] = team_id
            current_jersey_id = p_data["jersey_id"]; mismatch_history = p_data["mismatch_history"]
            if detected_jersey_num is not None:
                if current_jersey_id is None or detected_jersey_num == current_jersey_id:
                    p_data["jersey_id"], p_data["jersey_confidence"] = detected_jersey_num, ocr_confidence; mismatch_history.clear()
                else:
                    mismatch_history.append(detected_jersey_num)
                    if len(mismatch_history) >= MISMATCH_CONSISTENCY_FRAMES and all(num == detected_jersey_num for num in mismatch_history):
                        p_data["jersey_id"], p_data["jersey_confidence"] = detected_jersey_num, ocr_confidence; mismatch_history.clear()
            else: mismatch_history.clear()
            assigned_jersey_id, assigned_confidence = p_data["jersey_id"], p_data["jersey_confidence"]
            current_player_data[track_id] = p_data
        else:
            found_match = False
            if detected_jersey_num is not None and detected_jersey_num in recently_lost_jerseys:
                potential_matches = []
                for lost_track_info in reversed(recently_lost_jerseys[detected_jersey_num]):
                    time_diff = frame_idx - lost_track_info["last_seen"]
                    if time_diff < LOST_TRACK_MEMORY_FRAMES and lost_track_info["team_id"] == team_id: potential_matches.append((lost_track_info, time_diff))
                if potential_matches:
                    potential_matches.sort(key=lambda x: x[1]); best_match_info, _ = potential_matches[0]
                    assigned_jersey_id, assigned_confidence = detected_jersey_num, ocr_confidence
                    try: recently_lost_jerseys[detected_jersey_num].remove(best_match_info)
                    except ValueError: pass
                    found_match = True
            if not found_match: assigned_jersey_id, assigned_confidence = detected_jersey_num, ocr_confidence
            current_player_data[track_id] = {"jersey_id": assigned_jersey_id, "jersey_confidence": assigned_confidence, "last_seen": frame_idx, "team_id": team_id, "mismatch_history": deque(maxlen=MISMATCH_CONSISTENCY_FRAMES)}
        display_id = f"T{track_id}"
        if assigned_jersey_id is not None: conf_str = f" ({assigned_confidence:.1f})" if assigned_confidence is not None else ""; display_id = f"#{assigned_jersey_id}{conf_str}"
        final_labels.append(display_id)

    # --- Previous Debug Print (still useful) ---
    if is_debug_frame and len(tracked_detections) > 0: print(f"[Debug] Generated labels: {final_labels}")

    # 6. Update Global Player Data & Handle Lost Tracks
    # (Logic remains the same)
    lost_tracker_ids = set(player_data.keys()) - current_frame_tracker_ids
    for lost_id in lost_tracker_ids:
        lost_info = player_data[lost_id]
        if lost_info["jersey_id"] is not None: recently_lost_jerseys[lost_info["jersey_id"]].append({"tracker_id": lost_id, "last_seen": lost_info["last_seen"], "team_id": lost_info["team_id"]})
    player_data = current_player_data

    # 7. Annotation
    # (Logic remains the same - includes error checks)
    annotated_frame = frame.copy()
    if len(tracked_detections) > 0:
        if len(final_labels) == len(tracked_detections):
            try:
                annotated_frame = ellipse_annotator.annotate(scene=annotated_frame, detections=tracked_detections)
                annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=tracked_detections, labels=final_labels)
            except Exception as e: print(f"[Frame {frame_idx}] Error during player/referee annotation: {e}"); traceback.print_exc()
        else: print(f"[Frame {frame_idx}] Warning: Mismatch between tracks ({len(tracked_detections)}) and labels ({len(final_labels)}). Skipping player annotation.")
    if len(ball_detections) > 0:
         try: annotated_frame = triangle_annotator.annotate(scene=annotated_frame, detections=ball_detections)
         except Exception as e: print(f"[Frame {frame_idx}] Error during ball annotation: {e}"); traceback.print_exc()

    return annotated_frame

# ----- Main Video Processing Loop -----
# (Loop remains the same - includes error checks)
try:
    with tqdm(total=total_frames, desc="Processing video with BoTSORT + PaddleOCR") as pbar:
        for frame_idx, frame in enumerate(frame_generator):
            if frame is None: print("Warning: Received None frame, ending processing."); break
            try:
                annotated_frame = process_frame(frame, frame_idx)
                video_writer.write(annotated_frame)
            except Exception as e: print(f"\nError processing frame {frame_idx}: {e}"); traceback.print_exc()
            pbar.update(1)
except KeyboardInterrupt: print("Processing interrupted by user.")
finally:
    video_writer.release()
    print(f"Finished processing. Annotated video saved to: {OUTPUT_VIDEO_PATH}")
    if DEVICE == torch.device('cuda'): torch.cuda.empty_cache()



Using device: cuda:0


[32m2025-04-14 13:11:25.345[0m | [1mINFO    [0m | [36mboxmot.utils.torch_utils[0m:[36mselect_device[0m:[36m52[0m - [1mYolo Tracking v12.0.5 🚀 Python-3.11.11 torch-2.5.1+cu121
CUDA:0 (NVIDIA L4, 22478MiB)[0m


PaddleOCR initialized successfully.
Resized position embedding: %s to %s torch.Size([197, 768]) torch.Size([129, 768])
Position embedding resize to height:16 width: 8


[32m2025-04-14 13:11:27.604[0m | [32m[1mSUCCESS [0m | [36mboxmot.appearance.reid_model_factory[0m:[36mload_pretrained_weights[0m:[36m183[0m - [32m[1mLoaded pretrained weights from clip_market1501.pt[0m


Tracker initialized. Using ReID: True
Video Info: 1920x1080, FPS: 25, Total Frames: 750
Lost track memory set to 500 frames (20 seconds)


Processing video with BoTSORT + PaddleOCR:   0%|          | 0/750 [00:00<?, ?it/s]


--- Frame 0 ---
[Debug] Initial Detections: 23 (IDs: [2 2 2 2 2 2 2 2 2 2 3 2 3 2 2 2 2 2 2 2 2 3 2])
[Debug] Ball Detections: 0, People Detections (Raw): 23
[Debug] People Detections (After NMS): 23
[Debug] Pre-classification counts: Players=20, GK=0, Ref=3


Embedding extraction: 1it [00:00,  8.39it/s]


[Debug] Player Team IDs assigned: [1 1 1 1 1 0 1 0 0 1 0 0 0 1 1 0 0 1 0 0]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 23
[Debug] Input to tracker shape: (23, 6), dtype: float64
[Debug] Input confidences: [    0.91953     0.90255     0.89822     0.89176     0.89118     0.88861     0.88785     0.88748     0.88565     0.87901     0.86872     0.86543       0.865     0.86226     0.85987     0.85151     0.83763     0.81968     0.81329     0.79433     0.87135     0.86626     0.80767]
[Debug] Input class IDs: [          1           1           1           1           1           0           1           0           0           1           0           0           0           1           1           0           0           1           0           0           2           2           2]
[Debug] Number of tracked detections OUTPUT: 23


Processing video with BoTSORT + PaddleOCR:   0%|          | 1/750 [00:04<53:26,  4.28s/it]

[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19', 'T20', 'T21', 'T22', 'T23']


Embedding extraction: 1it [00:00,  8.52it/s]
Embedding extraction: 1it [00:00,  8.55it/s] 0%|          | 2/750 [00:04<24:55,  2.00s/it]
Embedding extraction: 1it [00:00,  8.57it/s] 0%|          | 3/750 [00:05<16:04,  1.29s/it]
Embedding extraction: 1it [00:00,  8.51it/s] 1%|          | 4/750 [00:05<11:33,  1.08it/s]
Embedding extraction: 1it [00:00,  8.62it/s] 1%|          | 5/750 [00:05<09:14,  1.34it/s]
Embedding extraction: 1it [00:00,  8.59it/s] 1%|          | 6/750 [00:06<07:45,  1.60it/s]
Embedding extraction: 1it [00:00,  8.54it/s] 1%|          | 7/750 [00:06<06:43,  1.84it/s]
Embedding extraction: 1it [00:00,  8.54it/s] 1%|          | 8/750 [00:07<06:21,  1.95it/s]
Embedding extraction: 1it [00:00,  8.67it/s] 1%|          | 9/750 [00:07<06:04,  2.03it/s]
Embedding extraction: 1it [00:00,  8.76it/s] 1%|▏         | 10/750 [00:07<05:45,  2.14it/s]
Embedding extraction: 1it [00:00,  9.43it/s] 1%|▏         | 11/750 [00:08<05:34,  2.21it/s]
Embedding extraction: 1it [00:00,  9.41it/s


--- Frame 50 ---
[Debug] Initial Detections: 24 (IDs: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 2])
[Debug] Ball Detections: 0, People Detections (Raw): 24
[Debug] People Detections (After NMS): 23
[Debug] Pre-classification counts: Players=19, GK=0, Ref=4


Embedding extraction: 1it [00:00,  8.55it/s]


[Debug] Player Team IDs assigned: [1 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 1 0 1]
[Debug] Referee Team IDs assigned: [2 2 2 2]
[Debug] Total Detections Merged for Tracking: 23
[Debug] Input to tracker shape: (23, 6), dtype: float64
[Debug] Input confidences: [    0.91549     0.90833      0.9071     0.89885     0.89652     0.89615     0.89537     0.89489     0.89465     0.89415     0.89057     0.88972      0.8888     0.88812     0.88757      0.8866     0.88506     0.87781     0.86233     0.83954     0.81767     0.76286     0.60015]
[Debug] Input class IDs: [          1           1           1           0           1           0           0           0           0           0           1           1           1           1           0           0           1           0           1           2           2           2           2]
[Debug] Number of tracked detections OUTPUT: 23
[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11', 'T12', 'T14', 'T15', 'T16

Embedding extraction: 1it [00:00,  8.55it/s] 7%|▋         | 51/750 [00:25<04:43,  2.46it/s]
Embedding extraction: 1it [00:00,  8.38it/s] 7%|▋         | 52/750 [00:25<04:42,  2.47it/s]
Embedding extraction: 1it [00:00,  8.37it/s] 7%|▋         | 53/750 [00:25<04:40,  2.49it/s]
Embedding extraction: 1it [00:00,  8.43it/s] 7%|▋         | 54/750 [00:26<04:39,  2.49it/s]
Embedding extraction: 1it [00:00,  8.37it/s] 7%|▋         | 55/750 [00:26<04:44,  2.44it/s]
Embedding extraction: 1it [00:00,  8.49it/s] 7%|▋         | 56/750 [00:27<04:42,  2.45it/s]
Embedding extraction: 1it [00:00,  8.70it/s] 8%|▊         | 57/750 [00:27<05:01,  2.30it/s]
Embedding extraction: 1it [00:00,  8.58it/s] 8%|▊         | 58/750 [00:27<04:57,  2.32it/s]
Embedding extraction: 1it [00:00,  8.60it/s] 8%|▊         | 59/750 [00:28<04:59,  2.31it/s]
Embedding extraction: 1it [00:00,  8.54it/s] 8%|▊         | 60/750 [00:28<04:47,  2.40it/s]
Embedding extraction: 1it [00:00,  8.58it/s] 8%|▊         | 61/750 [00:29<04:44,


--- Frame 100 ---
[Debug] Initial Detections: 24 (IDs: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 3 3 0])
[Debug] Ball Detections: 1, People Detections (Raw): 23
[Debug] People Detections (After NMS): 23
[Debug] Pre-classification counts: Players=20, GK=0, Ref=3


Embedding extraction: 1it [00:00,  8.54it/s]


[Debug] Player Team IDs assigned: [0 1 0 1 1 0 0 0 1 0 1 1 0 1 0 0 1 1 0 1]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 23
[Debug] Input to tracker shape: (23, 6), dtype: float64
[Debug] Input confidences: [    0.93728     0.91781     0.91625     0.91539     0.90856     0.90826     0.90467     0.89724     0.89414     0.89134     0.88889     0.88765      0.8868     0.88362     0.86977     0.86606     0.86248     0.86112     0.86046     0.85327     0.86519     0.85108      0.7562]
[Debug] Input class IDs: [          0           1           0           1           1           0           0           0           1           0           1           1           0           1           0           0           1           1           0           1           2           2           2]
[Debug] Number of tracked detections OUTPUT: 23


Processing video with BoTSORT + PaddleOCR:  13%|█▎        | 101/750 [00:46<04:40,  2.31it/s]

[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', '#7 (0.8)', 'T10', 'T11', 'T12', 'T14', 'T15', 'T16', 'T18', 'T20', 'T21', 'T22', 'T23', '#3 (0.7)', 'T19', 'T25']


Embedding extraction: 1it [00:00,  8.60it/s]
Embedding extraction: 1it [00:00,  8.59it/s]14%|█▎        | 102/750 [00:47<04:30,  2.40it/s]
Embedding extraction: 1it [00:00,  8.56it/s]14%|█▎        | 103/750 [00:47<04:29,  2.40it/s]
Embedding extraction: 1it [00:00,  8.46it/s]14%|█▍        | 104/750 [00:47<04:21,  2.47it/s]
Embedding extraction: 1it [00:00,  8.55it/s]14%|█▍        | 105/750 [00:48<04:29,  2.39it/s]
Embedding extraction: 1it [00:00,  8.56it/s]14%|█▍        | 106/750 [00:48<04:27,  2.41it/s]
Embedding extraction: 1it [00:00,  7.80it/s]14%|█▍        | 107/750 [00:49<04:19,  2.48it/s]
Embedding extraction: 1it [00:00,  7.84it/s]14%|█▍        | 108/750 [00:49<04:24,  2.43it/s]
Embedding extraction: 1it [00:00,  7.82it/s]15%|█▍        | 109/750 [00:49<04:25,  2.41it/s]
Embedding extraction: 1it [00:00,  8.49it/s]15%|█▍        | 110/750 [00:50<04:31,  2.36it/s]
Embedding extraction: 1it [00:00,  8.50it/s]15%|█▍        | 111/750 [00:50<04:29,  2.38it/s]
Embedding extraction: 1it


--- Frame 150 ---
[Debug] Initial Detections: 25 (IDs: [2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 3 3 2 2 2])
[Debug] Ball Detections: 0, People Detections (Raw): 25
[Debug] People Detections (After NMS): 24
[Debug] Pre-classification counts: Players=21, GK=0, Ref=3


Embedding extraction: 1it [00:00,  7.80it/s]


[Debug] Player Team IDs assigned: [1 0 1 1 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 1 0]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 24
[Debug] Input to tracker shape: (24, 6), dtype: float64
[Debug] Input confidences: [    0.90915     0.90839     0.90816     0.90471     0.90393     0.89703     0.89635     0.89625     0.89387     0.89275     0.88071     0.88056     0.88015     0.87354     0.86215     0.85716     0.84681     0.84672     0.84578     0.66841     0.59744      0.8891     0.81441     0.76233]
[Debug] Input class IDs: [          1           0           1           1           1           0           1           1           1           0           0           0           0           0           0           1           1           0           1           1           0           2           2           2]
[Debug] Number of tracked detections OUTPUT: 23


Processing video with BoTSORT + PaddleOCR:  20%|██        | 151/750 [01:07<04:14,  2.36it/s]

[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', '#7 (0.8)', 'T10', 'T11', 'T12', 'T14', 'T15', 'T16', 'T18', 'T20', 'T21', 'T22', 'T23', '#3 (0.7)', 'T19', 'T25']


Embedding extraction: 1it [00:00,  7.83it/s]
Embedding extraction: 1it [00:00,  7.31it/s]20%|██        | 152/750 [01:08<04:16,  2.33it/s]
Embedding extraction: 1it [00:00,  7.35it/s]20%|██        | 153/750 [01:08<04:13,  2.35it/s]
Embedding extraction: 1it [00:00,  7.36it/s]21%|██        | 154/750 [01:09<04:10,  2.38it/s]
Embedding extraction: 1it [00:00,  7.77it/s]21%|██        | 155/750 [01:09<04:14,  2.33it/s]
Embedding extraction: 1it [00:00,  7.80it/s]21%|██        | 156/750 [01:10<04:07,  2.40it/s]
Embedding extraction: 1it [00:00,  7.78it/s]21%|██        | 157/750 [01:10<04:08,  2.38it/s]
Embedding extraction: 1it [00:00,  7.82it/s]21%|██        | 158/750 [01:10<04:04,  2.42it/s]
Embedding extraction: 1it [00:00,  7.81it/s]21%|██        | 159/750 [01:11<04:08,  2.38it/s]
Embedding extraction: 1it [00:00,  7.81it/s]21%|██▏       | 160/750 [01:11<04:17,  2.30it/s]
Embedding extraction: 1it [00:00,  7.47it/s]21%|██▏       | 161/750 [01:12<04:14,  2.31it/s]
Embedding extraction: 1it


--- Frame 200 ---
[Debug] Initial Detections: 26 (IDs: [2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 3 3 2 3 0])
[Debug] Ball Detections: 1, People Detections (Raw): 25
[Debug] People Detections (After NMS): 23
[Debug] Pre-classification counts: Players=20, GK=0, Ref=3


Embedding extraction: 1it [00:00,  8.47it/s]


[Debug] Player Team IDs assigned: [0 1 1 1 0 0 1 1 0 0 0 1 1 1 1 0 0 1 0 1]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 23
[Debug] Input to tracker shape: (23, 6), dtype: float64
[Debug] Input confidences: [    0.92195     0.91523     0.91015      0.9083     0.90365     0.89154     0.88645     0.88639     0.88411     0.88403      0.8833     0.88202     0.88191     0.87994     0.87928     0.86634     0.86146     0.85498     0.84601     0.83715     0.88464     0.74829     0.69732]
[Debug] Input class IDs: [          0           1           1           1           0           0           1           1           0           0           0           1           1           1           1           0           0           1           0           1           2           2           2]


Processing video with BoTSORT + PaddleOCR:  27%|██▋       | 201/750 [01:29<03:54,  2.34it/s]

[Debug] Number of tracked detections OUTPUT: 23
[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', '#2 (0.7)', '#7 (0.8)', 'T10', 'T11', 'T12', 'T14', 'T15', 'T16', 'T18', 'T20', 'T21', 'T22', '#3 (0.7)', 'T25', 'T26', 'T23']


Embedding extraction: 1it [00:00,  8.61it/s]
Embedding extraction: 1it [00:00,  8.44it/s]27%|██▋       | 202/750 [01:29<03:45,  2.43it/s]
Embedding extraction: 1it [00:00,  8.44it/s]27%|██▋       | 203/750 [01:29<03:40,  2.48it/s]
Embedding extraction: 1it [00:00,  8.40it/s]27%|██▋       | 204/750 [01:30<03:40,  2.48it/s]
Embedding extraction: 1it [00:00,  8.41it/s]27%|██▋       | 205/750 [01:30<03:39,  2.48it/s]
Embedding extraction: 1it [00:00,  8.31it/s]27%|██▋       | 206/750 [01:31<03:41,  2.45it/s]
Embedding extraction: 1it [00:00,  8.32it/s]28%|██▊       | 207/750 [01:31<03:36,  2.50it/s]
Embedding extraction: 1it [00:00,  8.28it/s]28%|██▊       | 208/750 [01:31<03:32,  2.55it/s]
Embedding extraction: 1it [00:00,  8.40it/s]28%|██▊       | 209/750 [01:32<03:30,  2.56it/s]
Embedding extraction: 1it [00:00,  8.40it/s]28%|██▊       | 210/750 [01:32<03:29,  2.58it/s]
Embedding extraction: 1it [00:00,  8.36it/s]28%|██▊       | 211/750 [01:33<03:33,  2.52it/s]
Embedding extraction: 1it


--- Frame 250 ---
[Debug] Initial Detections: 24 (IDs: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 3 3])
[Debug] Ball Detections: 0, People Detections (Raw): 24
[Debug] People Detections (After NMS): 24
[Debug] Pre-classification counts: Players=21, GK=0, Ref=3


Embedding extraction: 1it [00:00,  7.72it/s]


[Debug] Player Team IDs assigned: [1 0 0 1 0 1 1 1 0 1 0 0 0 0 1 1 1 1 0 1 0]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 24
[Debug] Input to tracker shape: (24, 6), dtype: float64
[Debug] Input confidences: [    0.91288      0.9119     0.90803     0.90446     0.90427     0.90404     0.90276     0.90131     0.90056     0.90014     0.89494     0.89169     0.88666     0.88516     0.88431     0.88224     0.85834     0.84713     0.82788     0.82053     0.77079     0.86286     0.75135     0.72746]
[Debug] Input class IDs: [          1           0           0           1           0           1           1           1           0           1           0           0           0           0           1           1           1           1           0           1           0           2           2           2]
[Debug] Number of tracked detections OUTPUT: 24


Processing video with BoTSORT + PaddleOCR:  33%|███▎      | 251/750 [01:49<03:41,  2.25it/s]

[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', '#2 (0.7)', '#7 (0.8)', 'T10', 'T11', 'T12', 'T14', 'T15', 'T16', 'T18', 'T20', 'T21', 'T22', 'T25', 'T26', 'T23', 'T13', 'T27']


Embedding extraction: 1it [00:00,  7.69it/s]
Embedding extraction: 1it [00:00,  7.74it/s]34%|███▎      | 252/750 [01:49<03:35,  2.31it/s]
Embedding extraction: 1it [00:00,  7.74it/s]34%|███▎      | 253/750 [01:50<03:30,  2.37it/s]
Embedding extraction: 1it [00:00,  7.70it/s]34%|███▍      | 254/750 [01:50<03:25,  2.41it/s]
Embedding extraction: 1it [00:00,  8.46it/s]34%|███▍      | 255/750 [01:50<03:31,  2.34it/s]
Embedding extraction: 1it [00:00,  8.38it/s]34%|███▍      | 256/750 [01:51<03:23,  2.43it/s]
Embedding extraction: 1it [00:00,  8.49it/s]34%|███▍      | 257/750 [01:51<03:18,  2.49it/s]
Embedding extraction: 1it [00:00,  8.34it/s]34%|███▍      | 258/750 [01:52<03:15,  2.52it/s]
Embedding extraction: 1it [00:00,  8.41it/s]35%|███▍      | 259/750 [01:52<03:12,  2.56it/s]
Embedding extraction: 1it [00:00,  8.36it/s]35%|███▍      | 260/750 [01:52<03:11,  2.56it/s]
Embedding extraction: 1it [00:00,  8.39it/s]35%|███▍      | 261/750 [01:53<03:09,  2.59it/s]
Embedding extraction: 1it


--- Frame 300 ---
[Debug] Initial Detections: 25 (IDs: [2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 3 2 2 2 3 0])
[Debug] Ball Detections: 1, People Detections (Raw): 24
[Debug] People Detections (After NMS): 24
[Debug] Pre-classification counts: Players=21, GK=0, Ref=3


Embedding extraction: 1it [00:00,  7.68it/s]


[Debug] Player Team IDs assigned: [1 1 1 0 0 0 1 0 1 1 0 1 1 0 1 0 0 1 0 0 1]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 24
[Debug] Input to tracker shape: (24, 6), dtype: float64
[Debug] Input confidences: [    0.91456     0.90725     0.90692     0.90527     0.90073     0.89614      0.8882     0.88661     0.88573     0.87833     0.87523     0.87406     0.87185     0.87005     0.87005     0.86688       0.861     0.86092     0.81473     0.80849      0.7906     0.87097     0.83075     0.73958]
[Debug] Input class IDs: [          1           1           1           0           0           0           1           0           1           1           0           1           1           0           1           0           0           1           0           0           1           2           2           2]
[Debug] Number of tracked detections OUTPUT: 24
[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', '#2 (0.7)', '#7 (0.8)', '

Embedding extraction: 1it [00:00,  7.65it/s]40%|████      | 301/750 [02:09<03:03,  2.45it/s]
Embedding extraction: 1it [00:00,  7.65it/s]40%|████      | 302/750 [02:09<03:00,  2.48it/s]
Embedding extraction: 1it [00:00,  8.38it/s]40%|████      | 303/750 [02:10<02:59,  2.49it/s]
Embedding extraction: 1it [00:00,  8.33it/s]41%|████      | 304/750 [02:10<02:56,  2.52it/s]
Embedding extraction: 1it [00:00,  8.36it/s]41%|████      | 305/750 [02:11<02:54,  2.55it/s]
Embedding extraction: 1it [00:00,  7.65it/s]41%|████      | 306/750 [02:11<02:58,  2.49it/s]
Embedding extraction: 1it [00:00,  7.62it/s]41%|████      | 307/750 [02:11<02:57,  2.49it/s]
Embedding extraction: 1it [00:00,  7.68it/s]41%|████      | 308/750 [02:12<03:02,  2.42it/s]
Embedding extraction: 1it [00:00,  7.61it/s]41%|████      | 309/750 [02:12<02:59,  2.46it/s]
Embedding extraction: 1it [00:00,  7.76it/s]41%|████▏     | 310/750 [02:13<03:02,  2.41it/s]
Embedding extraction: 1it [00:00,  7.65it/s]41%|████▏     | 311/750 [0


--- Frame 350 ---
[Debug] Initial Detections: 26 (IDs: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 3 2 2 2 3 2 2 0])
[Debug] Ball Detections: 1, People Detections (Raw): 25
[Debug] People Detections (After NMS): 24
[Debug] Pre-classification counts: Players=21, GK=0, Ref=3


Embedding extraction: 1it [00:00,  7.70it/s]


[Debug] Player Team IDs assigned: [1 1 1 1 0 0 0 1 0 1 1 0 0 1 1 0 0 0 1 1 1]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 24
[Debug] Input to tracker shape: (24, 6), dtype: float64
[Debug] Input confidences: [    0.90852     0.90707     0.90545     0.90515     0.90222     0.89917     0.89895     0.89812     0.89034     0.88874     0.88831      0.8791     0.87883      0.8787     0.87102     0.86607     0.86121      0.8487     0.82566     0.80061     0.63928     0.86858     0.85274     0.70915]
[Debug] Input class IDs: [          1           1           1           1           0           0           0           1           0           1           1           0           0           1           1           0           0           0           1           1           1           2           2           2]
[Debug] Number of tracked detections OUTPUT: 24
[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', '#2 (0.7)', '#7 (0.8)', '

Embedding extraction: 1it [00:00,  7.71it/s]47%|████▋     | 351/750 [02:30<02:41,  2.47it/s]
Embedding extraction: 1it [00:00,  7.66it/s]47%|████▋     | 352/750 [02:30<02:48,  2.36it/s]
Embedding extraction: 1it [00:00,  7.68it/s]47%|████▋     | 353/750 [02:31<02:50,  2.32it/s]
Embedding extraction: 1it [00:00,  7.69it/s]47%|████▋     | 354/750 [02:31<02:47,  2.37it/s]
Embedding extraction: 1it [00:00,  7.69it/s]47%|████▋     | 355/750 [02:31<02:44,  2.40it/s]
Embedding extraction: 1it [00:00,  7.69it/s]47%|████▋     | 356/750 [02:32<02:41,  2.43it/s]
Embedding extraction: 1it [00:00,  7.69it/s]48%|████▊     | 357/750 [02:32<02:40,  2.46it/s]
Embedding extraction: 1it [00:00,  7.69it/s]48%|████▊     | 358/750 [02:33<02:42,  2.41it/s]
Embedding extraction: 1it [00:00,  7.70it/s]48%|████▊     | 359/750 [02:33<02:44,  2.37it/s]
Embedding extraction: 1it [00:00,  7.67it/s]48%|████▊     | 360/750 [02:33<02:46,  2.34it/s]
Embedding extraction: 1it [00:00,  7.63it/s]48%|████▊     | 361/750 [0


--- Frame 400 ---
[Debug] Initial Detections: 24 (IDs: [2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 3 2 3 0])
[Debug] Ball Detections: 1, People Detections (Raw): 23
[Debug] People Detections (After NMS): 23
[Debug] Pre-classification counts: Players=20, GK=0, Ref=3


Embedding extraction: 1it [00:00,  8.36it/s]


[Debug] Player Team IDs assigned: [1 1 1 0 1 0 0 0 1 1 0 1 1 0 0 0 0 0 1 1]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 23
[Debug] Input to tracker shape: (23, 6), dtype: float64
[Debug] Input confidences: [    0.91951     0.90654     0.90633     0.90584      0.9024     0.90201     0.90167     0.89788     0.89116     0.88975      0.8869     0.88297     0.87421      0.8715      0.8708     0.86956     0.86621     0.86219      0.8611     0.81184     0.87199     0.85011     0.57799]
[Debug] Input class IDs: [          1           1           1           0           1           0           0           0           1           1           0           1           1           0           0           0           0           0           1           1           2           2           2]


Processing video with BoTSORT + PaddleOCR:  53%|█████▎    | 401/750 [02:50<02:11,  2.65it/s]

[Debug] Number of tracked detections OUTPUT: 23
[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', '#2 (0.7)', '#7 (0.8)', 'T10', 'T11', 'T12', 'T14', 'T15', 'T18', 'T20', 'T21', 'T22', 'T25', 'T13', 'T27', 'T16', 'T23']


Embedding extraction: 1it [00:00,  8.39it/s]
Embedding extraction: 1it [00:00,  8.40it/s]54%|█████▎    | 402/750 [02:50<02:11,  2.65it/s]
Embedding extraction: 1it [00:00,  8.25it/s]54%|█████▎    | 403/750 [02:51<02:12,  2.61it/s]
Embedding extraction: 1it [00:00,  8.45it/s]54%|█████▍    | 404/750 [02:51<02:12,  2.61it/s]
Embedding extraction: 1it [00:00,  8.44it/s]54%|█████▍    | 405/750 [02:51<02:12,  2.61it/s]
Embedding extraction: 1it [00:00,  8.38it/s]54%|█████▍    | 406/750 [02:52<02:11,  2.62it/s]
Embedding extraction: 1it [00:00,  8.38it/s]54%|█████▍    | 407/750 [02:52<02:10,  2.63it/s]
Embedding extraction: 1it [00:00,  8.32it/s]54%|█████▍    | 408/750 [02:52<02:10,  2.62it/s]
Embedding extraction: 1it [00:00,  8.43it/s]55%|█████▍    | 409/750 [02:53<02:10,  2.62it/s]
Embedding extraction: 1it [00:00,  8.37it/s]55%|█████▍    | 410/750 [02:53<02:09,  2.62it/s]
Embedding extraction: 1it [00:00,  8.41it/s]55%|█████▍    | 411/750 [02:54<02:08,  2.63it/s]
Embedding extraction: 1it


--- Frame 450 ---
[Debug] Initial Detections: 23 (IDs: [2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2 3])
[Debug] Ball Detections: 0, People Detections (Raw): 23
[Debug] People Detections (After NMS): 23
[Debug] Pre-classification counts: Players=20, GK=0, Ref=3


Embedding extraction: 1it [00:00,  8.33it/s]


[Debug] Player Team IDs assigned: [1 0 1 1 0 1 1 0 1 0 0 1 0 0 0 0 0 1 1 1]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 23
[Debug] Input to tracker shape: (23, 6), dtype: float64
[Debug] Input confidences: [    0.91785     0.90791      0.9059     0.90278     0.90146     0.90127     0.90017     0.89899     0.88566     0.88461     0.88427     0.88291     0.88138     0.87519     0.87487     0.87262     0.86955       0.866     0.86203     0.84044     0.87593     0.86855     0.73023]
[Debug] Input class IDs: [          1           0           1           1           0           1           1           0           1           0           0           1           0           0           0           0           0           1           1           1           2           2           2]


Processing video with BoTSORT + PaddleOCR:  60%|██████    | 451/750 [03:09<01:58,  2.53it/s]

[Debug] Number of tracked detections OUTPUT: 23
[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', '#2 (0.7)', '#7 (0.8)', 'T10', 'T11', 'T12', 'T14', 'T15', 'T18', 'T20', 'T21', 'T22', 'T25', 'T13', 'T27', 'T16', 'T28']


Embedding extraction: 1it [00:00,  8.36it/s]
Embedding extraction: 1it [00:00,  8.31it/s]60%|██████    | 452/750 [03:09<01:57,  2.55it/s]
Embedding extraction: 1it [00:00,  8.27it/s]60%|██████    | 453/750 [03:10<01:55,  2.56it/s]
Embedding extraction: 1it [00:00,  9.25it/s]61%|██████    | 454/750 [03:10<01:54,  2.59it/s]
Embedding extraction: 1it [00:00,  8.34it/s]61%|██████    | 455/750 [03:11<01:52,  2.62it/s]
Embedding extraction: 1it [00:00,  8.30it/s]61%|██████    | 456/750 [03:11<01:55,  2.56it/s]
Embedding extraction: 1it [00:00,  8.36it/s]61%|██████    | 457/750 [03:11<01:54,  2.56it/s]
Embedding extraction: 1it [00:00,  8.37it/s]61%|██████    | 458/750 [03:12<01:53,  2.58it/s]
Embedding extraction: 1it [00:00,  8.36it/s]61%|██████    | 459/750 [03:12<01:53,  2.57it/s]
Embedding extraction: 1it [00:00,  8.28it/s]61%|██████▏   | 460/750 [03:12<01:51,  2.59it/s]
Embedding extraction: 1it [00:00,  8.27it/s]61%|██████▏   | 461/750 [03:13<01:51,  2.60it/s]
Embedding extraction: 1it


--- Frame 500 ---
[Debug] Initial Detections: 23 (IDs: [2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 3 2 2 2 3 0])
[Debug] Ball Detections: 1, People Detections (Raw): 22
[Debug] People Detections (After NMS): 22
[Debug] Pre-classification counts: Players=19, GK=0, Ref=3


Embedding extraction: 1it [00:00,  8.52it/s]


[Debug] Player Team IDs assigned: [1 0 0 0 1 0 1 1 1 1 0 1 0 1 0 0 0 0 1]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 22
[Debug] Input to tracker shape: (22, 6), dtype: float64
[Debug] Input confidences: [    0.92029     0.91521     0.91366      0.9076      0.9075     0.90474     0.90344      0.9032     0.89907     0.89691     0.89099     0.88772     0.88692     0.88581     0.87076     0.86925     0.86138     0.82626     0.80532     0.89608     0.86527     0.72894]
[Debug] Input class IDs: [          1           0           0           0           1           0           1           1           1           1           0           1           0           1           0           0           0           0           1           2           2           2]
[Debug] Number of tracked detections OUTPUT: 22


Processing video with BoTSORT + PaddleOCR:  67%|██████▋   | 501/750 [03:29<01:41,  2.45it/s]

[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', '#2 (0.7)', '#7 (0.8)', 'T10', 'T11', 'T12', 'T14', 'T15', 'T18', 'T20', 'T21', 'T22', 'T25', 'T13', 'T27', 'T16', 'T28']


Embedding extraction: 1it [00:00,  8.34it/s]
Embedding extraction: 1it [00:00,  8.29it/s]67%|██████▋   | 502/750 [03:29<01:41,  2.45it/s]
Embedding extraction: 1it [00:00,  8.28it/s]67%|██████▋   | 503/750 [03:30<01:38,  2.50it/s]
Embedding extraction: 1it [00:00,  8.35it/s]67%|██████▋   | 504/750 [03:30<01:37,  2.53it/s]
Embedding extraction: 1it [00:00,  8.30it/s]67%|██████▋   | 505/750 [03:31<01:37,  2.50it/s]
Embedding extraction: 1it [00:00,  8.27it/s]67%|██████▋   | 506/750 [03:31<01:41,  2.39it/s]
Embedding extraction: 1it [00:00,  8.34it/s]68%|██████▊   | 507/750 [03:31<01:41,  2.41it/s]
Embedding extraction: 1it [00:00,  8.32it/s]68%|██████▊   | 508/750 [03:32<01:40,  2.41it/s]
Embedding extraction: 1it [00:00,  8.30it/s]68%|██████▊   | 509/750 [03:32<01:37,  2.48it/s]
Embedding extraction: 1it [00:00,  8.34it/s]68%|██████▊   | 510/750 [03:33<01:35,  2.52it/s]
Embedding extraction: 1it [00:00,  8.27it/s]68%|██████▊   | 511/750 [03:33<01:33,  2.56it/s]
Embedding extraction: 1it


--- Frame 550 ---
[Debug] Initial Detections: 24 (IDs: [2 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 2 2 2 2 2 2 3 3])
[Debug] Ball Detections: 0, People Detections (Raw): 24
[Debug] People Detections (After NMS): 24
[Debug] Pre-classification counts: Players=20, GK=0, Ref=4


Embedding extraction: 1it [00:00,  8.31it/s]


[Debug] Player Team IDs assigned: [1 1 1 0 0 1 0 0 1 0 0 0 0 1 1 0 1 1 1 0]
[Debug] Referee Team IDs assigned: [2 2 2 2]
[Debug] Total Detections Merged for Tracking: 24
[Debug] Input to tracker shape: (24, 6), dtype: float64
[Debug] Input confidences: [    0.92076     0.91069     0.90671      0.9065     0.90403     0.89805     0.89362     0.89224     0.89205     0.88412     0.88348     0.87048     0.86767     0.85758     0.85728     0.84087     0.83789     0.82723     0.82344     0.82029     0.89157     0.86974     0.51766      0.5157]
[Debug] Input class IDs: [          1           1           1           0           0           1           0           0           1           0           0           0           0           1           1           0           1           1           1           0           2           2           2           2]
[Debug] Number of tracked detections OUTPUT: 23
[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', '#2 (0.7)', '#7 (0.8)', 'T10', 'T11',

Embedding extraction: 1it [00:00,  8.33it/s]73%|███████▎  | 551/750 [03:49<01:21,  2.46it/s]
Embedding extraction: 1it [00:00,  8.35it/s]74%|███████▎  | 552/750 [03:50<01:19,  2.50it/s]
Embedding extraction: 1it [00:00,  8.28it/s]74%|███████▎  | 553/750 [03:50<01:17,  2.53it/s]
Embedding extraction: 1it [00:00,  8.32it/s]74%|███████▍  | 554/750 [03:51<01:16,  2.55it/s]
Embedding extraction: 1it [00:00,  8.27it/s]74%|███████▍  | 555/750 [03:51<01:18,  2.49it/s]
Embedding extraction: 1it [00:00,  8.26it/s]74%|███████▍  | 556/750 [03:51<01:19,  2.45it/s]
Embedding extraction: 1it [00:00,  8.21it/s]74%|███████▍  | 557/750 [03:52<01:18,  2.47it/s]
Embedding extraction: 1it [00:00,  8.24it/s]74%|███████▍  | 558/750 [03:52<01:16,  2.51it/s]
Embedding extraction: 1it [00:00,  8.26it/s]75%|███████▍  | 559/750 [03:53<01:15,  2.54it/s]
Embedding extraction: 1it [00:00,  8.20it/s]75%|███████▍  | 560/750 [03:53<01:13,  2.58it/s]
Embedding extraction: 1it [00:00,  8.27it/s]75%|███████▍  | 561/750 [0


--- Frame 600 ---
[Debug] Initial Detections: 24 (IDs: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 3 0])
[Debug] Ball Detections: 1, People Detections (Raw): 23
[Debug] People Detections (After NMS): 23
[Debug] Pre-classification counts: Players=20, GK=0, Ref=3


Embedding extraction: 1it [00:00,  8.24it/s]


[Debug] Player Team IDs assigned: [1 0 1 1 0 0 1 1 1 0 1 0 1 1 0 0 0 1 0 0]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 23
[Debug] Input to tracker shape: (23, 6), dtype: float64
[Debug] Input confidences: [    0.91864     0.89711     0.89665     0.89151     0.88875     0.88486     0.88348     0.88311     0.88075     0.87978     0.87802      0.8775     0.87642     0.87612     0.87473     0.87408     0.85746     0.85737     0.85531     0.78641     0.87434     0.84246     0.75983]
[Debug] Input class IDs: [          1           0           1           1           0           0           1           1           1           0           1           0           1           1           0           0           0           1           0           0           2           2           2]


Processing video with BoTSORT + PaddleOCR:  80%|████████  | 601/750 [04:09<00:57,  2.59it/s]

[Debug] Number of tracked detections OUTPUT: 23
[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', '#2 (0.7)', '#7 (0.8)', 'T10', 'T11', 'T12', 'T14', 'T15', 'T18', 'T20', 'T21', 'T22', 'T25', 'T13', 'T27', 'T16', 'T7', 'T6', 'T30']


Embedding extraction: 1it [00:00,  8.01it/s]
Embedding extraction: 1it [00:00,  8.19it/s]80%|████████  | 602/750 [04:09<01:00,  2.46it/s]
Embedding extraction: 1it [00:00,  8.19it/s]80%|████████  | 603/750 [04:10<00:58,  2.51it/s]
Embedding extraction: 1it [00:00,  8.23it/s]81%|████████  | 604/750 [04:10<00:57,  2.54it/s]
Embedding extraction: 1it [00:00,  8.32it/s]81%|████████  | 605/750 [04:10<00:58,  2.48it/s]
Embedding extraction: 1it [00:00,  8.22it/s]81%|████████  | 606/750 [04:11<00:58,  2.44it/s]
Embedding extraction: 1it [00:00,  8.16it/s]81%|████████  | 607/750 [04:11<00:57,  2.49it/s]
Embedding extraction: 1it [00:00,  8.27it/s]81%|████████  | 608/750 [04:12<00:56,  2.52it/s]
Embedding extraction: 1it [00:00,  8.20it/s]81%|████████  | 609/750 [04:12<00:55,  2.55it/s]
Embedding extraction: 1it [00:00,  8.28it/s]81%|████████▏ | 610/750 [04:12<00:56,  2.50it/s]
Embedding extraction: 1it [00:00,  8.26it/s]81%|████████▏ | 611/750 [04:13<00:56,  2.46it/s]
Embedding extraction: 1it


--- Frame 650 ---
[Debug] Initial Detections: 25 (IDs: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 2 2 2 2 3 2 0])
[Debug] Ball Detections: 1, People Detections (Raw): 24
[Debug] People Detections (After NMS): 23
[Debug] Pre-classification counts: Players=20, GK=0, Ref=3


Embedding extraction: 1it [00:00,  8.26it/s]


[Debug] Player Team IDs assigned: [1 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 1 1 0 1]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 23
[Debug] Input to tracker shape: (23, 6), dtype: float64
[Debug] Input confidences: [    0.90494     0.89907      0.8977     0.89038     0.88883     0.88675     0.88322      0.8815     0.88029     0.87696     0.87563        0.87     0.86602     0.86121     0.86046     0.84755     0.78739     0.78377     0.77518     0.76754     0.85018     0.80435     0.75633]
[Debug] Input class IDs: [          1           1           0           0           1           1           0           1           0           1           0           0           0           0           0           1           1           1           0           1           2           2           2]
[Debug] Number of tracked detections OUTPUT: 23


Processing video with BoTSORT + PaddleOCR:  87%|████████▋ | 651/750 [04:29<00:41,  2.39it/s]

[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', 'T5', '#2 (0.7)', '#7 (0.8)', 'T10', 'T11', 'T12', 'T14', 'T15', 'T20', 'T21', 'T22', 'T25', 'T13', 'T27', 'T16', 'T7', 'T6', 'T30', 'T18']


Embedding extraction: 1it [00:00,  8.20it/s]
Embedding extraction: 1it [00:00,  7.61it/s]87%|████████▋ | 652/750 [04:29<00:41,  2.37it/s]
Embedding extraction: 1it [00:00,  8.23it/s]87%|████████▋ | 653/750 [04:30<00:40,  2.41it/s]
Embedding extraction: 1it [00:00,  8.25it/s]87%|████████▋ | 654/750 [04:30<00:39,  2.42it/s]
Embedding extraction: 1it [00:00,  8.21it/s]87%|████████▋ | 655/750 [04:30<00:40,  2.37it/s]
Embedding extraction: 1it [00:00,  8.23it/s]87%|████████▋ | 656/750 [04:31<00:40,  2.31it/s]
Embedding extraction: 1it [00:00,  8.30it/s]88%|████████▊ | 657/750 [04:31<00:40,  2.27it/s]
Embedding extraction: 1it [00:00,  8.25it/s]88%|████████▊ | 658/750 [04:32<00:40,  2.25it/s]
Embedding extraction: 1it [00:00,  8.26it/s]88%|████████▊ | 659/750 [04:32<00:40,  2.24it/s]
Embedding extraction: 1it [00:00,  8.28it/s]88%|████████▊ | 660/750 [04:33<00:39,  2.29it/s]
Embedding extraction: 1it [00:00,  8.27it/s]88%|████████▊ | 661/750 [04:33<00:38,  2.31it/s]
Embedding extraction: 1it


--- Frame 700 ---
[Debug] Initial Detections: 23 (IDs: [2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 2 2 2 2 2 3 0])
[Debug] Ball Detections: 1, People Detections (Raw): 22
[Debug] People Detections (After NMS): 22
[Debug] Pre-classification counts: Players=19, GK=0, Ref=3


Embedding extraction: 1it [00:00,  8.28it/s]


[Debug] Player Team IDs assigned: [1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 1 1 1 0]
[Debug] Referee Team IDs assigned: [2 2 2]
[Debug] Total Detections Merged for Tracking: 22
[Debug] Input to tracker shape: (22, 6), dtype: float64
[Debug] Input confidences: [    0.91019     0.90707     0.89973     0.89223     0.88977     0.88661     0.88151     0.88043     0.87942     0.87316     0.87164     0.87044     0.86242     0.85803     0.83588     0.81979       0.812     0.80262     0.80165     0.86164      0.8544      0.6732]
[Debug] Input class IDs: [          1           1           1           1           0           0           0           0           0           0           1           1           1           0           0           1           1           1           0           2           2           2]


Processing video with BoTSORT + PaddleOCR:  93%|█████████▎| 701/750 [04:49<00:18,  2.62it/s]

[Debug] Number of tracked detections OUTPUT: 22
[Debug] Generated labels: ['T1', 'T2', 'T3', 'T4', '#2 (0.7)', '#7 (0.8)', 'T10', 'T11', 'T12', 'T14', 'T15', 'T21', 'T22', 'T25', 'T13', 'T27', 'T16', 'T7', 'T30', 'T20', 'T5', 'T6']


Embedding extraction: 1it [00:00,  8.42it/s]
Embedding extraction: 1it [00:00,  8.20it/s]94%|█████████▎| 702/750 [04:49<00:18,  2.63it/s]
Embedding extraction: 1it [00:00,  8.25it/s]94%|█████████▎| 703/750 [04:50<00:17,  2.64it/s]
Embedding extraction: 1it [00:00,  8.41it/s]94%|█████████▍| 704/750 [04:50<00:17,  2.66it/s]
Embedding extraction: 1it [00:00,  8.27it/s]94%|█████████▍| 705/750 [04:50<00:17,  2.62it/s]
Embedding extraction: 1it [00:00,  8.43it/s]94%|█████████▍| 706/750 [04:51<00:16,  2.65it/s]
Embedding extraction: 1it [00:00,  8.32it/s]94%|█████████▍| 707/750 [04:51<00:16,  2.67it/s]
Embedding extraction: 1it [00:00,  8.36it/s]94%|█████████▍| 708/750 [04:52<00:15,  2.67it/s]
Embedding extraction: 1it [00:00,  8.39it/s]95%|█████████▍| 709/750 [04:52<00:15,  2.67it/s]
Embedding extraction: 1it [00:00,  8.33it/s]95%|█████████▍| 710/750 [04:52<00:14,  2.68it/s]
Embedding extraction: 1it [00:00,  8.31it/s]95%|█████████▍| 711/750 [04:53<00:14,  2.63it/s]
Embedding extraction: 1it

Finished processing. Annotated video saved to: 0bfacc_0_botsort_paddleocr_reid_debug_tracking.mp4





### Method 3
Enhanced Visuals

In [9]:
import supervision as sv
from tqdm import tqdm
import numpy as np
from boxmot import BotSort # Using BoTSORT as requested
import cv2
from pathlib import Path
import torch
from collections import defaultdict, deque
import warnings
import logging
import traceback # Import traceback for detailed error printing
import os # Added for directory creation
import random # Added for sparkle effect

# Suppress most logging messages
logging.basicConfig(level=logging.WARNING) # Show only warnings and errors
logging.disable(logging.INFO) # Disable INFO messages specifically
warnings.filterwarnings('ignore', category=UserWarning, module='paddle')
warnings.filterwarnings('ignore', category=UserWarning, module='torchvision') # Ignore potential torchvision warnings

# Attempt to import PaddleOCR
try:
    from paddleocr import PaddleOCR
    PADDLEOCR_AVAILABLE = True
except ImportError:
    print("Warning: PaddleOCR not found. Please install it (`pip install paddlepaddle paddleocr`). OCR functionality will be disabled.")
    PADDLEOCR_AVAILABLE = False

# ----- Configuration -----
SOURCE_VIDEO_PATH = "app/test_data/raw/0bfacc_0.mp4"
OUTPUT_VIDEO_PATH = "0bfacc_0_tracking_magical_trail.mp4" # New output name
OCR_DEBUG_DIR = "ocr_debug_crops" # Directory to save OCR debug crops
DEVICE = torch.device(0) if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {DEVICE}")

# Create OCR debug directory if it doesn't exist
os.makedirs(OCR_DEBUG_DIR, exist_ok=True)
print(f"OCR debug crops will be saved to: {OCR_DEBUG_DIR}")

# Class IDs (initial detection model)
BALL_ID = 0
GOALKEEPER_ID = 1
PLAYER_ID = 2
REFEREE_ID = 3

# Team/Role Class IDs (assigned *after* classification)
TEAM_A_ID = 0 # Example ID for Team A
TEAM_B_ID = 1 # Example ID for Team B
REFEREE_TEAM_ID = 2 # Example ID for Referee

# OCR Configuration
OCR_CONFIDENCE_THRESHOLD = 0.6
MIN_JERSEY_DIGITS = 1
MAX_JERSEY_DIGITS = 2

# ID Management Configuration
LOST_TRACK_MEMORY_SECONDS = 20
MISMATCH_CONSISTENCY_FRAMES = 3

# Ball Trail Configuration
BALL_TRAIL_SECONDS = 3 # Shorter trail for magical effect?
SPARKLE_COUNT = 3 # Number of sparkles per point
SPARKLE_RADIUS = 0 # Radius 0 for single pixel, 1 for small dot
SPARKLE_OFFSET = 3 # Max random offset for sparkles

# ----- Initialize PaddleOCR -----
ocr_model = None
if PADDLEOCR_AVAILABLE:
    try:
        ocr_model = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=(DEVICE.type == 'cuda'), show_log=False)
        print("PaddleOCR initialized successfully.")
    except Exception as e:
        print(f"Error initializing PaddleOCR: {e}. Disabling OCR.")
        PADDLEOCR_AVAILABLE = False

# ----- OCR Function -----
def perform_ocr_on_crop(crop: np.ndarray) -> tuple[str | None, float | None]:
    """Performs OCR on a given crop, returning the best digit sequence and confidence."""
    if not PADDLEOCR_AVAILABLE or ocr_model is None or crop.size == 0:
        return None, None
    try:
        result = ocr_model.ocr(crop, cls=False)
        best_num, highest_conf = None, 0.0
        if result and result[0]:
             for res_item in result[0]:
                  if len(res_item) == 2 and isinstance(res_item[1], tuple) and len(res_item[1]) == 2:
                      text, confidence = res_item[1]
                      if (isinstance(text, str) and text.isdigit() and
                          MIN_JERSEY_DIGITS <= len(text) <= MAX_JERSEY_DIGITS and
                          isinstance(confidence, (float, int)) and
                          confidence > OCR_CONFIDENCE_THRESHOLD):
                           if confidence > highest_conf:
                               highest_conf, best_num = confidence, text
        return best_num, highest_conf if best_num else None
    except Exception as e:
        print(f"Error during PaddleOCR inference: {e}")
        return None, None

# ----- Color Calculation -----
DEFAULT_TEAM_A_COLOR = sv.Color.from_hex('#FF0000') # Red
DEFAULT_TEAM_B_COLOR = sv.Color.from_hex('#FFFF00') # Yellow
DEFAULT_REFEREE_COLOR = sv.Color.from_hex('#00FFFF') # Cyan
FALLBACK_COLOR = sv.Color.from_hex('#808080') # Grey

def calculate_average_color(frame: np.ndarray, detections: sv.Detections, central_fraction: float = 0.5) -> sv.Color | None:
    """Calculates the average color from the central region of detection boxes."""
    if len(detections) == 0: return None
    avg_colors = []
    height, width, _ = frame.shape
    for xyxy in detections.xyxy:
        x1, y1, x2, y2 = map(int, xyxy)
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(width, x2), min(height, y2)
        if x1 >= x2 or y1 >= y2: continue
        box_w, box_h = x2 - x1, y2 - y1
        center_x, center_y = x1 + box_w // 2, y1 + box_h // 2
        central_w, central_h = int(box_w * central_fraction), int(box_h * central_fraction)
        cx1 = max(x1, center_x - central_w // 2)
        cy1 = max(y1, center_y - central_h // 2)
        cx2 = min(x2, center_x + central_w // 2)
        cy2 = min(y2, center_y + central_h // 2)
        if cx1 >= cx2 or cy1 >= cy2: continue
        crop = frame[cy1:cy2, cx1:cx2]
        if crop.size > 0:
            avg_bgr = cv2.mean(crop)[:3]
            avg_colors.append(avg_bgr)
    if not avg_colors: return None
    final_avg_bgr = np.mean(avg_colors, axis=0)
    b, g, r = map(int, final_avg_bgr)
    min_intensity = 50
    if r < min_intensity and g < min_intensity and b < min_intensity:
        r, g, b = min_intensity, min_intensity, min_intensity
    return sv.Color(r=r, g=g, b=b)

# ----- Annotation Parameters -----
ELLIPSE_THICKNESS = 1
LABEL_TEXT_COLOR = sv.Color.BLACK
LABEL_TEXT_POSITION = sv.Position.BOTTOM_CENTER
LABEL_TEXT_SCALE = 0.4 # Reduced scale
LABEL_TEXT_THICKNESS = 1
# Ball trail colors (BGR format for OpenCV)
BALL_TRAIL_BASE_COLOR = (255, 255, 0) # Bright Cyan
BALL_TRAIL_THICKNESS = 1 # Thin base line
SPARKLE_BASE_INTENSITY = 150 # Base brightness for oldest sparkles
SPARKLE_MAX_INTENSITY = 255 # Brightness for newest sparkles
# Current ball marker
CURRENT_BALL_MARKER_RADIUS = 4
CURRENT_BALL_MARKER_COLOR = (255, 255, 255) # White (BGR)
CURRENT_BALL_MARKER_THICKNESS = -1 # Filled circle

# ----- Tracker Initialization -----
REID_WEIGHTS_PATH = Path('clip_market1501.pt')
tracker = BotSort(
    reid_weights=REID_WEIGHTS_PATH if REID_WEIGHTS_PATH.exists() else None,
    device=DEVICE,
    half=False,
    with_reid=REID_WEIGHTS_PATH.exists(),
)

# ----- Player ID & Ball Trail State -----
player_data = {}
recently_lost_jerseys = defaultdict(lambda: deque(maxlen=10))
ball_positions = None # Will be initialized after knowing FPS

# ----- Video Processing Setup -----
try:
    video_info = sv.VideoInfo.from_video_path(str(SOURCE_VIDEO_PATH))
    width, height, fps = video_info.width, video_info.height, video_info.fps
    total_frames = video_info.total_frames if video_info.total_frames else 0
    print(f"Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames if total_frames > 0 else 'Unknown'}")
except Exception as e:
    print(f"Warning: Could not get video info using supervision. Using OpenCV. Error: {e}")
    cap = cv2.VideoCapture(str(SOURCE_VIDEO_PATH))
    if not cap.isOpened(): raise IOError(f"Cannot open video file: {SOURCE_VIDEO_PATH}")
    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    if not fps or fps <= 0: fps = 30
    if total_frames <= 0: total_frames = 0
    print(f"Fallback Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames if total_frames > 0 else 'Unknown'}")

# Initialize ball_positions deque
if fps > 0:
    trail_maxlen = int(fps * BALL_TRAIL_SECONDS)
    ball_positions = deque(maxlen=trail_maxlen)
    print(f"Ball trail deque initialized with maxlen={trail_maxlen} ({BALL_TRAIL_SECONDS} seconds)")
else:
    print("Warning: Could not determine FPS. Ball trail disabled.")
    ball_positions = deque(maxlen=1)

LOST_TRACK_MEMORY_FRAMES = int(fps * LOST_TRACK_MEMORY_SECONDS) if fps > 0 else 30 * LOST_TRACK_MEMORY_SECONDS
print(f"Lost track memory set to {LOST_TRACK_MEMORY_FRAMES} frames ({LOST_TRACK_MEMORY_SECONDS} seconds)")

frame_generator = sv.get_video_frames_generator(source_path=str(SOURCE_VIDEO_PATH), stride=1)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(str(OUTPUT_VIDEO_PATH), fourcc, fps if fps > 0 else 30, (width, height))

# ----- Frame Processing Function -----
def process_frame(frame: np.ndarray, frame_idx: int):
    """
    Processes a single frame: detects, classifies, performs OCR, tracks, manages IDs, annotates.
    """
    global player_data, recently_lost_jerseys, ball_positions

    # 1. Detection
    results = PLAYER_DETECTION_MODEL.predict(frame, conf=0.3, iou=0.5, device=DEVICE, verbose=False)
    if not results or len(results) == 0: return frame
    detections = sv.Detections.from_ultralytics(results[0])

    # 2. Pre-processing & Ball Position Update
    ball_detections = detections[detections.class_id == BALL_ID]
    people_detections = detections[detections.class_id != BALL_ID]

    if len(ball_detections) > 0 and ball_positions is not None:
        x1, y1, x2, y2 = ball_detections.xyxy[0]
        center_x = int((x1 + x2) / 2)
        center_y = int((y1 + y2) / 2)
        ball_positions.append((center_x, center_y))

    # 3. Team/Role Classification (Same logic)
    players_detections = people_detections[people_detections.class_id == PLAYER_ID]
    goalkeepers_detections = people_detections[people_detections.class_id == GOALKEEPER_ID]
    referees_detections = people_detections[people_detections.class_id == REFEREE_ID]
    classified_players = sv.Detections.empty()
    if len(players_detections) > 0:
        players_crops = []
        valid_indices = []
        for i, xyxy in enumerate(players_detections.xyxy):
            crop = sv.crop_image(frame, xyxy)
            if crop is not None and crop.size > 0:
                 players_crops.append(crop)
                 valid_indices.append(i)
        if players_crops:
             predicted_team_ids = team_classifier.predict(players_crops)
             if predicted_team_ids is not None and len(predicted_team_ids) == len(players_crops):
                 assigned_ids = np.full(len(players_detections), -1, dtype=int)
                 for i, pred_id in enumerate(predicted_team_ids):
                      original_index = valid_indices[i]
                      assigned_ids[original_index] = pred_id
                 valid_classification_mask = (assigned_ids != -1)
                 players_detections.class_id = assigned_ids
                 classified_players = players_detections[valid_classification_mask]

    classified_gks = sv.Detections.empty()
    if len(goalkeepers_detections) > 0:
        gk_team_ids = resolve_goalkeepers_team_id(classified_players, goalkeepers_detections)
        if gk_team_ids is not None and len(gk_team_ids) == len(goalkeepers_detections):
            goalkeepers_detections.class_id = gk_team_ids
            classified_gks = goalkeepers_detections

    classified_refs = sv.Detections.empty()
    if len(referees_detections) > 0:
        ref_team_ids = np.full(len(referees_detections), REFEREE_TEAM_ID)
        referees_detections.class_id = ref_team_ids
        classified_refs = referees_detections

    # --- Calculate Dynamic Team Colors ---
    team_a_detections = classified_players[classified_players.class_id == TEAM_A_ID]
    team_b_detections = classified_players[classified_players.class_id == TEAM_B_ID]
    current_team_a_color = calculate_average_color(frame, team_a_detections) or DEFAULT_TEAM_A_COLOR
    current_team_b_color = calculate_average_color(frame, team_b_detections) or DEFAULT_TEAM_B_COLOR
    current_referee_color = DEFAULT_REFEREE_COLOR
    dynamic_color_map = {
        TEAM_A_ID: current_team_a_color,
        TEAM_B_ID: current_team_b_color,
        REFEREE_TEAM_ID: current_referee_color
    }

    # --- Merge Detections for Tracking ---
    detections_to_track = sv.Detections.merge([classified_players, classified_gks, classified_refs])

    # 4. Tracking using BoTSORT (Same logic)
    tracked_detections = sv.Detections.empty()
    current_frame_tracker_ids = set()
    if len(detections_to_track) > 0 and tracker is not None:
        boxmot_input = np.hstack((
            detections_to_track.xyxy,
            detections_to_track.confidence[:, np.newaxis],
            detections_to_track.class_id[:, np.newaxis]
        ))
        try:
            tracks = tracker.update(boxmot_input, frame)
            if tracks.shape[0] > 0:
                tracked_detections = sv.Detections(
                    xyxy=tracks[:, 0:4],
                    tracker_id=tracks[:, 4].astype(int),
                    confidence=tracks[:, 5],
                    class_id=tracks[:, 6].astype(int)
                )
                current_frame_tracker_ids = set(tracked_detections.tracker_id)
        except Exception as e:
            print(f"[Frame {frame_idx}] Error during tracker update: {e}")
            tracked_detections = sv.Detections.empty()
    elif tracker is not None:
         try: tracker.update(np.empty((0, 6)), frame)
         except Exception as e: print(f"[Frame {frame_idx}] Error updating tracker with empty input: {e}")

    # 5. OCR and Player ID Management (Label Generation)
    final_labels = []
    current_player_data = {}
    if len(tracked_detections) > 0:
        for i in range(len(tracked_detections)):
            track_id = tracked_detections.tracker_id[i]
            team_id = tracked_detections.class_id[i]
            bbox = tracked_detections.xyxy[i]

            # --- OCR & Debug Saving ---
            x1, y1, x2, y2 = map(int, bbox)
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(width, x2), min(height, y2)
            detected_jersey_num, ocr_confidence = None, None
            player_crop, gray_crop = None, None
            if x1 < x2 and y1 < y2:
                player_crop = frame[y1:y2, x1:x2]
                gray_crop = cv2.cvtColor(player_crop, cv2.COLOR_BGR2GRAY)
                detected_jersey_num, ocr_confidence = perform_ocr_on_crop(gray_crop)
                if detected_jersey_num is not None and player_crop is not None and gray_crop is not None:
                    try:
                        player_filename = os.path.join(OCR_DEBUG_DIR, f"frame{frame_idx}_track{track_id}_player.png")
                        ocr_input_filename = os.path.join(OCR_DEBUG_DIR, f"frame{frame_idx}_track{track_id}_ocr_input.png")
                        cv2.imwrite(player_filename, player_crop)
                        cv2.imwrite(ocr_input_filename, gray_crop)
                    except Exception as write_e:
                        print(f"[Frame {frame_idx}] Error saving OCR debug crop for track {track_id}: {write_e}")

            # --- ID Management Logic (Same as before) ---
            assigned_jersey_id = None
            if track_id in player_data:
                p_data = player_data[track_id]; p_data["last_seen"] = frame_idx; p_data["team_id"] = team_id
                current_jersey_id = p_data["jersey_id"]; mismatch_history = p_data["mismatch_history"]
                if detected_jersey_num is not None:
                    if current_jersey_id is None or detected_jersey_num == current_jersey_id:
                        p_data["jersey_id"] = detected_jersey_num; p_data["jersey_confidence"] = ocr_confidence; mismatch_history.clear()
                    else:
                        mismatch_history.append(detected_jersey_num)
                        if len(mismatch_history) >= MISMATCH_CONSISTENCY_FRAMES and all(num == detected_jersey_num for num in mismatch_history):
                            p_data["jersey_id"] = detected_jersey_num; p_data["jersey_confidence"] = ocr_confidence; mismatch_history.clear()
                else: mismatch_history.clear()
                assigned_jersey_id = p_data["jersey_id"]
                current_player_data[track_id] = p_data
            else: # New track ID
                found_match = False
                if detected_jersey_num is not None and detected_jersey_num in recently_lost_jerseys:
                    potential_matches = []
                    for lost_track_info in reversed(recently_lost_jerseys[detected_jersey_num]):
                        time_diff = frame_idx - lost_track_info["last_seen"]
                        if time_diff < LOST_TRACK_MEMORY_FRAMES and lost_track_info["team_id"] == team_id: potential_matches.append((lost_track_info, time_diff))
                    if potential_matches:
                        potential_matches.sort(key=lambda x: x[1]); best_match_info, _ = potential_matches[0]
                        assigned_jersey_id = detected_jersey_num
                        p_data = {"jersey_id": assigned_jersey_id, "jersey_confidence": ocr_confidence, "last_seen": frame_idx, "team_id": team_id, "mismatch_history": deque(maxlen=MISMATCH_CONSISTENCY_FRAMES)}
                        current_player_data[track_id] = p_data
                        try: recently_lost_jerseys[detected_jersey_num].remove(best_match_info)
                        except ValueError: pass
                        found_match = True
                if not found_match:
                    assigned_jersey_id = detected_jersey_num
                    current_player_data[track_id] = {"jersey_id": assigned_jersey_id, "jersey_confidence": ocr_confidence if detected_jersey_num is not None else None, "last_seen": frame_idx, "team_id": team_id, "mismatch_history": deque(maxlen=MISMATCH_CONSISTENCY_FRAMES)}

            # --- Generate Final Label String (Same as before) ---
            if team_id == TEAM_A_ID: team_prefix = "T1"
            elif team_id == TEAM_B_ID: team_prefix = "T2"
            elif team_id == REFEREE_TEAM_ID: team_prefix = "Ref"
            else: team_prefix = f"T{team_id}"
            base_label = f"{team_prefix} P{track_id}"
            display_id = base_label
            if assigned_jersey_id is not None: display_id = f"{base_label} #{assigned_jersey_id}"
            final_labels.append(display_id)

    # 6. Update Global Player Data & Handle Lost Tracks (Same as before)
    lost_tracker_ids = set(player_data.keys()) - current_frame_tracker_ids
    for lost_id in lost_tracker_ids:
        lost_info = player_data[lost_id]
        if lost_info.get("jersey_id") is not None:
             recently_lost_jerseys[lost_info["jersey_id"]].append({"tracker_id": lost_id, "last_seen": lost_info["last_seen"], "team_id": lost_info["team_id"]})
    if frame_idx > 0 and fps > 0 and frame_idx % (int(fps) * 60) == 0:
        for jersey_num in list(recently_lost_jerseys.keys()):
            q = recently_lost_jerseys[jersey_num]
            valid_entries = deque([entry for entry in q if (frame_idx - entry["last_seen"]) < LOST_TRACK_MEMORY_FRAMES * 2], maxlen=10)
            if valid_entries: recently_lost_jerseys[jersey_num] = valid_entries
            else: del recently_lost_jerseys[jersey_num]
    player_data = current_player_data

    # 7. Annotation
    annotated_frame = frame.copy()

    # --- Annotate "Magical" Ball Trail ---
    if ball_positions is not None and len(ball_positions) >= 2:
        num_points = len(ball_positions)
        for i in range(1, num_points):
            # --- Draw main trail line segment ---
            pt1 = ball_positions[i-1]
            pt2 = ball_positions[i]
            if isinstance(pt1, tuple) and isinstance(pt2, tuple) and len(pt1) == 2 and len(pt2) == 2:
                 cv2.line(annotated_frame, pt1, pt2, BALL_TRAIL_BASE_COLOR, BALL_TRAIL_THICKNESS)

                 # --- Draw sparkles around the newer point (pt2) ---
                 # Intensity fades from max (newest) to base (oldest visible point)
                 alpha_fraction = (i - 1) / max(1, num_points - 1) # Fraction of the trail's visible age
                 sparkle_intensity = int(SPARKLE_BASE_INTENSITY + (SPARKLE_MAX_INTENSITY - SPARKLE_BASE_INTENSITY) * alpha_fraction)
                 sparkle_color = (sparkle_intensity, sparkle_intensity, sparkle_intensity) # Fading white sparkles

                 for _ in range(SPARKLE_COUNT):
                     offset_x = random.randint(-SPARKLE_OFFSET, SPARKLE_OFFSET)
                     offset_y = random.randint(-SPARKLE_OFFSET, SPARKLE_OFFSET)
                     sparkle_pt = (pt2[0] + offset_x, pt2[1] + offset_y)
                     # Draw small circle/dot for sparkle
                     cv2.circle(annotated_frame, sparkle_pt, SPARKLE_RADIUS, sparkle_color, -1) # Filled dot/circle

    # --- Annotate Current Ball Position (Keep the circle) ---
    if ball_positions is not None and len(ball_positions) > 0:
         last_pos = ball_positions[-1]
         if isinstance(last_pos, tuple) and len(last_pos) == 2:
              cv2.circle(annotated_frame, last_pos, CURRENT_BALL_MARKER_RADIUS, CURRENT_BALL_MARKER_COLOR, CURRENT_BALL_MARKER_THICKNESS)

    # --- Annotate Tracked People ---
    if len(tracked_detections) > 0:
        if len(final_labels) == len(tracked_detections):
            unique_team_ids = np.unique(tracked_detections.class_id)
            for current_team_id in unique_team_ids:
                team_mask = (tracked_detections.class_id == current_team_id)
                team_detections = tracked_detections[team_mask]
                team_labels = [label for i, label in enumerate(final_labels) if team_mask[i]]
                if len(team_detections) == 0: continue
                team_color = dynamic_color_map.get(current_team_id, FALLBACK_COLOR)
                temp_ellipse_annotator = sv.EllipseAnnotator(color=team_color, thickness=ELLIPSE_THICKNESS)
                temp_label_annotator = sv.LabelAnnotator(color=team_color, text_color=LABEL_TEXT_COLOR, text_position=LABEL_TEXT_POSITION, text_scale=LABEL_TEXT_SCALE, text_thickness=LABEL_TEXT_THICKNESS)
                try:
                    annotated_frame = temp_ellipse_annotator.annotate(annotated_frame, team_detections)
                    annotated_frame = temp_label_annotator.annotate(annotated_frame, team_detections, team_labels)
                except Exception as e:
                    print(f"[Frame {frame_idx}] Error during annotation for team {current_team_id} (Color: {team_color.as_hex()}): {e}")

    return annotated_frame

# ----- Main Video Processing Loop -----
try:
    tqdm_total = total_frames if total_frames and total_frames > 0 else None
    with tqdm(total=tqdm_total, desc="Processing video", unit="frame") as pbar:
        for frame_idx, frame in enumerate(frame_generator):
            if frame is None:
                print(f"\nWarning: Received None frame at index {frame_idx}, ending processing.")
                break
            try:
                annotated_frame = process_frame(frame, frame_idx)
                if annotated_frame is not None:
                    video_writer.write(annotated_frame)
                else:
                     print(f"\nError: process_frame returned None for frame {frame_idx}. Writing original frame.")
                     video_writer.write(frame)
            except Exception as e:
                print(f"\n--- CRITICAL ERROR processing frame {frame_idx}: {e} ---")
                traceback.print_exc()
                print("Attempting to continue and write original frame...")
                video_writer.write(frame)
            pbar.update(1)
except KeyboardInterrupt:
    print("\nProcessing interrupted by user.")
except Exception as e:
    print(f"\n--- UNHANDLED EXCEPTION in main loop: {e} ---")
    traceback.print_exc()
finally:
    video_writer.release()
    print(f"\nFinished processing. Annotated video saved to: {OUTPUT_VIDEO_PATH}")
    if DEVICE.type == 'cuda':
        try:
            torch.cuda.empty_cache()
            print("CUDA cache cleared.")
        except Exception as e:
            print(f"Error clearing CUDA cache: {e}")
    print("Done.")

Using device: cuda:0
OCR debug crops will be saved to: ocr_debug_crops


[32m2025-04-15 14:39:20.789[0m | [1mINFO    [0m | [36mboxmot.utils.torch_utils[0m:[36mselect_device[0m:[36m52[0m - [1mYolo Tracking v12.0.5 🚀 Python-3.11.11 torch-2.5.1+cu121
CUDA:0 (NVIDIA L4, 22478MiB)[0m


PaddleOCR initialized successfully.
Resized position embedding: %s to %s torch.Size([197, 768]) torch.Size([129, 768])
Position embedding resize to height:16 width: 8


[32m2025-04-15 14:39:22.664[0m | [32m[1mSUCCESS [0m | [36mboxmot.appearance.reid_model_factory[0m:[36mload_pretrained_weights[0m:[36m183[0m - [32m[1mLoaded pretrained weights from clip_market1501.pt[0m


Video Info: 1920x1080, FPS: 25, Total Frames: 750
Ball trail deque initialized with maxlen=75 (3 seconds)
Lost track memory set to 500 frames (20 seconds)


Embedding extraction: 1it [00:00,  8.56it/s]:00<?, ?frame/s]
Embedding extraction: 1it [00:00,  8.47it/s]:00<05:13,  2.39frame/s]
Embedding extraction: 1it [00:00,  9.46it/s]:00<04:52,  2.56frame/s]
Embedding extraction: 1it [00:00,  8.55it/s]:01<04:57,  2.51frame/s]
Embedding extraction: 1it [00:00,  8.57it/s]:01<04:49,  2.58frame/s]
Embedding extraction: 1it [00:00,  8.53it/s]:01<04:55,  2.52frame/s]
Embedding extraction: 1it [00:00,  8.58it/s]:02<04:53,  2.53frame/s]
Embedding extraction: 1it [00:00,  8.52it/s]:02<04:47,  2.59frame/s]
Embedding extraction: 1it [00:00,  8.58it/s]:03<05:01,  2.46frame/s]
Embedding extraction: 1it [00:00,  8.63it/s]:03<05:02,  2.45frame/s]
Embedding extraction: 1it [00:00,  8.71it/s]0:04<05:02,  2.45frame/s]
Embedding extraction: 1it [00:00,  9.37it/s]0:04<05:01,  2.45frame/s]
Embedding extraction: 1it [00:00,  9.50it/s]0:04<04:48,  2.56frame/s]
Embedding extraction: 1it [00:00,  8.69it/s]0:05<04:42,  2.61frame/s]
Embedding extraction: 1it [00:00,  8.5


Finished processing. Annotated video saved to: 0bfacc_0_tracking_magical_trail.mp4
CUDA cache cleared.
Done.





## Tracking Method 4 with GTA-Link

### Imports

In [11]:
# Import necessary libraries
import torch
import numpy as np
import supervision as sv
from ultralytics import YOLO
from boxmot import BotSort
from tqdm import tqdm
import cv2
from pathlib import Path
from collections import defaultdict
import copy
import sys
import os
import pickle # For potential saving/loading if needed later

# --- Logging ---
from loguru import logger

from gta_link.reid import torchreid
logger.remove() # Remove default handler
logger.add(sys.stderr, level="INFO") # Add console logger

# --- Image Handling ---
from PIL import Image
import torchvision.transforms as T


from gta_link.Tracklet import Tracklet
# import torchreid.utils import FeatureExtractor
# --- Imports for Refine Tracklets Logic ---
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
# Note: matplotlib/seaborn only needed if using display_Dist, skipping for now


### Helper functions

In [36]:
# =============================================================================
# == Modified Tracklet Class (to include class_id) ==
# =============================================================================
class Tracklet:
    """Represents a tracklet with detections, features, and class IDs."""
    def __init__(self, track_id=None, frames=None, scores=None, bboxes=None, feats=None, class_ids=None):
        """
        Initializes the Tracklet.

        Args:
            track_id (int, optional): Unique identifier for the track. Defaults to None.
            frames (list or int, optional): Frame numbers (1-based). Defaults to None.
            scores (list or float, optional): Detection scores. Defaults to None.
            bboxes (list of lists or list, optional): Bounding boxes [l, t, w, h]. Defaults to None.
            feats (list of np.array, optional): Feature vectors. Defaults to None.
            class_ids (list or int, optional): Class IDs (e.g., team/role). Defaults to None.
        """
        self.track_id = track_id
        self.parent_id = track_id # Keep track of original parent if split
        self.scores = self._listify(scores)
        self.times = self._listify(frames) # Frame numbers (1-based)
        self.bboxes = self._listify_bboxes(bboxes) # List of [l, t, w, h]
        self.features = feats if feats is not None else [] # List of numpy arrays
        self.class_ids = self._listify(class_ids) # List of class IDs

    def _listify(self, item):
        """Helper to ensure item is a list."""
        if item is None:
            return []
        return item if isinstance(item, list) else [item]

    def _listify_bboxes(self, bboxes):
        """Helper to ensure bboxes is a list of lists."""
        if bboxes is None:
            return []
        # Check if it's already a list of lists
        if isinstance(bboxes, list) and bboxes and isinstance(bboxes[0], list):
            return bboxes
        # Check if it's a single bbox list
        elif isinstance(bboxes, list) and len(bboxes) == 4 and not isinstance(bboxes[0], list):
             return [bboxes]
        # Otherwise return empty
        return []


    def append_det(self, frame, score, bbox_ltwh, class_id):
        """Appends a detection (frame, score, bbox, class_id) to the tracklet."""
        if not isinstance(bbox_ltwh, list) or len(bbox_ltwh) != 4:
             logger.warning(f"Tracklet {self.track_id}: Invalid bbox format {bbox_ltwh}. Skipping append.")
             return
        self.scores.append(score)
        self.times.append(frame)
        self.bboxes.append(bbox_ltwh)
        self.class_ids.append(class_id)

    def append_feat(self, feat):
        """Appends a feature vector."""
        self.features.append(feat)

    def extract(self, start_idx, end_idx):
        """Extracts a sub-tracklet based on list indices."""
        if not (0 <= start_idx <= end_idx < len(self.times)):
             logger.error(f"Tracklet {self.track_id}: Invalid indices for extract ({start_idx}, {end_idx}). Length is {len(self.times)}")
             return None # Return None or raise error

        subtrack = Tracklet(
            track_id=self.track_id, # Keep original ID for now
            frames=self.times[start_idx : end_idx + 1],
            scores=self.scores[start_idx : end_idx + 1],
            bboxes=self.bboxes[start_idx : end_idx + 1],
            feats=self.features[start_idx : end_idx + 1] if self.features else None,
            class_ids=self.class_ids[start_idx : end_idx + 1] if self.class_ids else None
        )
        subtrack.parent_id = self.parent_id # Assign parent ID
        return subtrack

    def __len__(self):
        """Return the number of detections in the tracklet."""
        return len(self.times)

# =============================================================================
# == Functions Copied/Adapted from refine_tracklets.py ==
# =============================================================================

def find_consecutive_segments(track_times):
    """Identifies start/end indices of consecutive frame segments."""
    if not track_times: return []
    segments = []
    start_index = 0
    for i in range(1, len(track_times)):
        if track_times[i] != track_times[i-1] + 1:
            segments.append((start_index, i - 1))
            start_index = i
    segments.append((start_index, len(track_times) - 1)) # Add the last segment
    return segments

def query_subtracks(seg1_indices, seg2_indices, track1, track2):
    """Pairs up segments from two tracks based on temporal order."""
    subtracks = []
    seg1 = list(seg1_indices) # Make copies to modify
    seg2 = list(seg2_indices)

    while seg1 and seg2:
        s1_start_idx, s1_end_idx = seg1[0]
        s2_start_idx, s2_end_idx = seg2[0]

        # Extract subtracks using indices
        subtrack_1 = track1.extract(s1_start_idx, s1_end_idx)
        subtrack_2 = track2.extract(s2_start_idx, s2_end_idx)

        if subtrack_1 is None or subtrack_2 is None: # Handle extraction errors
             logger.warning("Subtrack extraction failed in query_subtracks.")
             # Decide how to handle: break, continue, remove problematic segment?
             # For now, remove both and continue
             if seg1: seg1.pop(0)
             if seg2: seg2.pop(0)
             continue

        s1_start_frame = subtrack_1.times[0]
        s1_end_frame = subtrack_1.times[-1]
        s2_start_frame = subtrack_2.times[0]
        s2_end_frame = subtrack_2.times[-1]

        # Check for temporal ordering (allow touching, i.e. end_frame <= start_frame)
        if s1_end_frame <= s2_start_frame: # Segment 1 comes before Segment 2
            subtracks.append(subtrack_1)
            subtracks.append(subtrack_2)
            seg1.pop(0)
            seg2.pop(0)
        elif s2_end_frame <= s1_start_frame: # Segment 2 comes before Segment 1
            subtracks.append(subtrack_2)
            subtracks.append(subtrack_1)
            seg1.pop(0)
            seg2.pop(0)
        else:
            # This case implies overlap or incorrect segment logic, should ideally not happen if segments are correct
            logger.warning(f"Unexpected overlap/order in query_subtracks: T1({s1_start_frame}-{s1_end_frame}), T2({s2_start_frame}-{s2_end_frame}). Removing earlier starting segment.")
            # As a fallback, remove the segment that starts earlier to try and resolve
            if s1_start_frame <= s2_start_frame:
                 seg1.pop(0)
            else:
                 seg2.pop(0)

    # Add remaining segments (optional: add length filter here if desired)
    seg_remain, track_remain = (seg1, track1) if seg1 else (seg2, track2)
    while seg_remain:
        s_start_idx, s_end_idx = seg_remain.pop(0)
        subtrack = track_remain.extract(s_start_idx, s_end_idx)
        if subtrack: # Add if extraction was successful
             subtracks.append(subtrack)

    return subtracks

def get_spatial_constraints(tid2track, factor):
    """Calculates max spatial range based on bounding box centers."""
    min_x, max_x = float('inf'), -float('inf')
    min_y, max_y = float('inf'), -float('inf')

    for track in tid2track.values():
        for bbox in track.bboxes:
            if len(bbox) == 4:
                l, t, w, h = bbox
                center_x = l + w / 2
                center_y = t + h / 2
                min_x, max_x = min(min_x, center_x), max(max_x, center_x)
                min_y, max_y = min(min_y, center_y), max(max_y, center_y)

    if min_x == float('inf'): # Handle case with no valid bboxes
        return 0.0, 0.0

    x_range = abs(max_x - min_x) * factor
    y_range = abs(max_y - min_y) * factor
    return x_range, y_range

def get_distance(track1, track2):
    """Calculates cosine distance between avg features of two tracks."""
    # Check for temporal overlap
    if set(track1.times) & set(track2.times):
        return 1.0 # Max distance if they overlap in time

    if not track1.features or not track2.features:
        return 1.0 # Max distance if features are missing

    try:
        # Use average feature for distance calculation (simpler than pairwise)
        avg_feat1 = np.mean(np.stack(track1.features), axis=0, keepdims=True)
        avg_feat2 = np.mean(np.stack(track2.features), axis=0, keepdims=True)

        # Calculate cosine distance (1 - cosine_similarity)
        dist = cdist(avg_feat1, avg_feat2, metric='cosine')[0, 0]
        return dist if not np.isnan(dist) else 1.0 # Handle potential NaN
    except Exception as e:
        logger.error(f"Error calculating distance between T{track1.track_id} and T{track2.track_id}: {e}")
        return 1.0 # Return max distance on error

def get_distance_matrix(tid2track):
    """Constructs distance matrix between tracklets."""
    tids = sorted(list(tid2track.keys()))
    num_tracks = len(tids)
    dist_matrix = np.full((num_tracks, num_tracks), 1.0) # Initialize with max distance

    tid_to_idx = {tid: i for i, tid in enumerate(tids)}

    for i in range(num_tracks):
        for j in range(i, num_tracks): # Calculate upper triangle including diagonal
            tid1 = tids[i]
            tid2 = tids[j]
            track1 = tid2track[tid1]
            track2 = tid2track[tid2]

            if i == j:
                dist_matrix[i, j] = 0.0 # Distance to self is 0
            else:
                distance = get_distance(track1, track2)
                dist_matrix[i, j] = distance
                dist_matrix[j, i] = distance # Symmetric matrix
    return dist_matrix, tid_to_idx, tids


def check_spatial_constraints(trk_1, trk_2, max_x_range, max_y_range):
    """Checks if end of trk_1 and start of trk_2 are spatially close."""
    seg_1_indices = find_consecutive_segments(trk_1.times)
    seg_2_indices = find_consecutive_segments(trk_2.times)

    if not seg_1_indices or not seg_2_indices:
        return False # Cannot check if segments are missing

    # Get temporally ordered subtracks (ignoring internal connections)
    subtracks = query_subtracks(seg_1_indices, seg_2_indices, trk_1, trk_2)

    if len(subtracks) < 2:
        return True # No connection points to check, trivially true? Or false? Assume true.

    for i in range(len(subtracks) - 1):
        subtrack_1st = subtracks[i]
        subtrack_2nd = subtracks[i+1]

        # Only check if they originate from different parent tracklets
        if subtrack_1st.parent_id == subtrack_2nd.parent_id:
            continue

        # Check spatial distance between end of 1st and start of 2nd
        if not subtrack_1st.bboxes or not subtrack_2nd.bboxes: continue # Skip if bboxes missing

        l1, t1, w1, h1 = subtrack_1st.bboxes[-1]
        l2, t2, w2, h2 = subtrack_2nd.bboxes[0]
        center_x1 = l1 + w1 / 2
        center_y1 = t1 + h1 / 2
        center_x2 = l2 + w2 / 2
        center_y2 = t2 + h2 / 2

        dx = abs(center_x1 - center_x2)
        dy = abs(center_y1 - center_y2)

        if dx > max_x_range or dy > max_y_range:
            # logger.debug(f"Spatial constraint failed: T{trk_1.track_id}(end) vs T{trk_2.track_id}(start). dx={dx:.1f}, dy={dy:.1f}")
            return False # Constraint failed

    return True # All connection points passed

def merge_tracklets(tracklets, dist_matrix, tid_to_idx, tids, max_x_range, max_y_range, merge_dist_thres):
    """Performs hierarchical merging based on distance and spatial constraints."""
    current_tracklets = copy.deepcopy(tracklets) # Work on a copy
    dist = dist_matrix.copy()
    current_tid_to_idx = tid_to_idx.copy()
    current_tids = tids.copy()

    while True:
        num_tracks = dist.shape[0]
        if num_tracks <= 1: break # Nothing left to merge

        # Find minimum distance below threshold (excluding diagonal)
        min_val = np.inf
        idx1, idx2 = -1, -1
        for r in range(num_tracks):
            for c in range(r + 1, num_tracks): # Check upper triangle
                 if dist[r, c] < min_val:
                      min_val = dist[r, c]
                      idx1, idx2 = r, c

        # Check if minimum distance is below threshold
        if min_val >= merge_dist_thres:
            break # No more pairs to merge below threshold

        # Get original track IDs and objects
        tid1 = current_tids[idx1]
        tid2 = current_tids[idx2]
        track1 = current_tracklets[tid1]
        track2 = current_tracklets[tid2]

        # Check spatial constraints
        if check_spatial_constraints(track1, track2, max_x_range, max_y_range):
            logger.debug(f"Merging T{tid1} and T{tid2} (Dist: {min_val:.4f})")

            # Merge track2 into track1 (append data)
            # Ensure temporal order when merging (simple append might not be right)
            # Re-create merged tracklet data in order
            merged_times = track1.times + track2.times
            merged_scores = track1.scores + track2.scores
            merged_bboxes = track1.bboxes + track2.bboxes
            merged_features = track1.features + track2.features
            merged_class_ids = track1.class_ids + track2.class_ids

            # Sort based on time
            sort_indices = np.argsort(merged_times)
            track1.times = [merged_times[i] for i in sort_indices]
            track1.scores = [merged_scores[i] for i in sort_indices]
            track1.bboxes = [merged_bboxes[i] for i in sort_indices]
            track1.features = [merged_features[i] for i in sort_indices]
            track1.class_ids = [merged_class_ids[i] for i in sort_indices]
            # Keep track1's ID, remove track2
            current_tracklets.pop(tid2)

            # Update distance matrix: Remove row/col for track2, update row/col for track1
            # Remove track2 (idx2)
            dist = np.delete(dist, idx2, axis=0)
            dist = np.delete(dist, idx2, axis=1)
            # Update tids and tid_to_idx mapping
            current_tids.pop(idx2)
            current_tid_to_idx = {tid: i for i, tid in enumerate(current_tids)}
            # Get the new index for track1
            new_idx1 = current_tid_to_idx[tid1]

            # Recalculate distances for the merged track (track1)
            for i in range(dist.shape[0]):
                 if i == new_idx1:
                      dist[new_idx1, i] = 0.0
                 else:
                      other_tid = current_tids[i]
                      new_dist = get_distance(current_tracklets[tid1], current_tracklets[other_tid])
                      dist[new_idx1, i] = new_dist
                      dist[i, new_idx1] = new_dist

        else:
            # Spatial constraint failed, prevent this pair from being merged again
            logger.debug(f"Spatial constraint failed for T{tid1} and T{tid2}. Setting dist to 1.0")
            dist[idx1, idx2] = 1.0
            dist[idx2, idx1] = 1.0
            # Need to ensure the loop progresses, maybe find next smallest?
            # For simplicity, just set to 1.0 and let the loop find the next minimum.

    logger.info(f"Merging finished. Final tracklet count: {len(current_tracklets)}")
    return current_tracklets


def detect_id_switch(embs, eps, min_samples, max_clusters):
    """Detects ID switches within a tracklet using DBSCAN."""
    if len(embs) < max(2, min_samples): # Not enough samples to cluster
        return False, np.zeros(len(embs), dtype=int) # Treat as single cluster

    # Subsample if too many embeddings to speed up DBSCAN
    subsample_rate = 1
    if len(embs) > 15000:
        subsample_rate = 2
    embs_subset = np.stack(embs[::subsample_rate])

    if embs_subset.shape[0] < min_samples: # Check after subsampling
         return False, np.zeros(len(embs), dtype=int)

    # Standardize features
    scaler = StandardScaler()
    embs_scaled = scaler.fit_transform(embs_subset)

    # Apply DBSCAN
    try:
        db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine').fit(embs_scaled)
        labels_subset = db.labels_
    except ValueError as e:
         logger.warning(f"DBSCAN failed: {e}. Treating as single cluster.")
         return False, np.zeros(len(embs), dtype=int)

    # Map labels back to original embeddings if subsampled
    labels = np.full(len(embs), -1, dtype=int) # Initialize all as noise
    original_indices = np.arange(len(embs))[::subsample_rate]
    labels[original_indices] = labels_subset

    # Count clusters (excluding noise -1)
    unique_labels = np.unique(labels_subset[labels_subset != -1])
    n_clusters = len(unique_labels)

    # Handle noise points: assign to nearest cluster (optional, from refine_tracklets)
    # This part can be complex and might not be strictly necessary if noise is acceptable
    # Simplified: If noise exists AND multiple clusters exist, it might indicate switch
    has_noise = -1 in labels
    id_switch_potential = n_clusters > 1 or (n_clusters == 1 and has_noise) # Consider noise as potential switch if only one cluster found

    # Apply max_clusters constraint (if needed, complex merging logic omitted for now)
    # Simplified: If n_clusters > max_clusters, consider it an ID switch.
    if n_clusters > max_clusters:
         logger.info(f"Found {n_clusters} clusters, exceeding max_k={max_clusters}. Treating as ID switch.")
         # Here you would implement cluster merging if needed, for now just return True
         return True, labels # Return original labels before merging for splitting

    return id_switch_potential, labels # Return labels for splitting


def split_tracklets(tracklets_in, eps, max_k, min_samples, len_thres):
    """Splits tracklets based on internal feature clustering (ID switch detection)."""
    logger.info(f"Splitting tracklets (eps={eps}, min_samples={min_samples}, max_k={max_k}, len_thres={len_thres})...")
    tracklets_out = {}
    new_id_counter = (max(tracklets_in.keys()) if tracklets_in else 0) + 1

    for tid, trklet in tqdm(tracklets_in.items(), desc="Splitting tracklets"):
        if len(trklet) < len_thres:
            tracklets_out[tid] = trklet # Keep short tracklets as is
            continue

        # Detect ID switches using DBSCAN
        id_switch_detected, cluster_labels = detect_id_switch(
            trklet.features, eps=eps, min_samples=min_samples, max_clusters=max_k
        )

        if not id_switch_detected:
            tracklets_out[tid] = trklet # No split needed
        else:
            logger.debug(f"Splitting detected for T{tid}. Labels: {np.unique(cluster_labels)}")
            unique_labels = sorted(list(set(cluster_labels)))
            if -1 in unique_labels: unique_labels.remove(-1) # Ignore noise for splitting

            if not unique_labels: # Only noise found
                 tracklets_out[tid] = trklet
                 continue

            # Create new tracklets for each cluster
            for label in unique_labels:
                indices = np.where(cluster_labels == label)[0]
                if len(indices) < min_samples: continue # Skip very small clusters resulting from split

                new_tracklet = Tracklet(track_id=new_id_counter)
                new_tracklet.parent_id = trklet.track_id # Store original parent ID

                # Extract data for the new tracklet using indices
                new_tracklet.times = [trklet.times[i] for i in indices]
                new_tracklet.scores = [trklet.scores[i] for i in indices]
                new_tracklet.bboxes = [trklet.bboxes[i] for i in indices]
                new_tracklet.features = [trklet.features[i] for i in indices]
                new_tracklet.class_ids = [trklet.class_ids[i] for i in indices]

                if new_tracklet.times: # Ensure it's not empty
                    tracklets_out[new_id_counter] = new_tracklet
                    new_id_counter += 1

    logger.info(f"Splitting finished. Input tracklets: {len(tracklets_in)}, Output tracklets: {len(tracklets_out)}")
    return tracklets_out

In [37]:
# ----- Configuration -----
from torchreid.reid.utils import FeatureExtractor


SOURCE_VIDEO_PATH = "app/test_data/raw/0bfacc_0.mp4"
OUTPUT_VIDEO_PATH = "0bfacc_0_botsort_gta_linked_final.mp4"
YOLO_MODEL_PATH = "app/models/yolo11_football_v2/weights/best.pt"
BOTSORT_REID_WEIGHTS_PATH = Path('clip_market1501.pt') # Assumed in current dir

# --- GTA-Link Config ---
# Path relative to the notebook in the main directory
GTA_LINK_DIR = Path('./gta_link').resolve()
GTA_REID_CHECKPOINTS_DIR = GTA_LINK_DIR / 'reid_checkpoints'
GTA_REID_MODEL_PATH = str(GTA_REID_CHECKPOINTS_DIR / 'sports_model.pth.tar-60')

# --- Refine Tracklets Parameters ---
USE_SPLIT = True # Enable/disable splitting phase
USE_CONNECT = True # Enable/disable merging/connecting phase
# Splitting Params (Defaults from refine_tracklets.py args)
SPLIT_MIN_LEN = 100 # Min tracklet length to consider for splitting
SPLIT_EPS = 0.7 # DBSCAN eps
SPLIT_MIN_SAMPLES = 10 # DBSCAN min_samples
SPLIT_MAX_K = 3 # Max clusters allowed after splitting (not fully implemented in copied code)
# Merging Params (Defaults from refine_tracklets.py args)
MERGE_SPATIAL_FACTOR = 1.0 # Factor for calculating spatial range
MERGE_DIST_THRES = 0.4 # Cosine distance threshold for merging


DEVICE = torch.device(0) if torch.cuda.is_available() else torch.device('cpu')
logger.info(f"Using device: {DEVICE}")

# Check GTA ReID model exists
if not Path(GTA_REID_MODEL_PATH).is_file():
    logger.error(f"GTA-Link ReID model not found at {GTA_REID_MODEL_PATH}")
else:
    logger.info(f"Found GTA-Link ReID model: {GTA_REID_MODEL_PATH}")

# Class IDs
BALL_ID = 0
GOALKEEPER_ID = 1
PLAYER_ID = 2
REFEREE_ID = 3

# ----- Annotators -----
TEAM_COLORS = ['#00BFFF', '#FF1493', '#808080'] # Blue, Pink, Gray
BALL_COLOR = '#FFD700' # Yellow
ellipse_annotator = sv.EllipseAnnotator(color=sv.ColorPalette.from_hex(TEAM_COLORS), thickness=2)
label_annotator = sv.LabelAnnotator(color=sv.ColorPalette.from_hex(TEAM_COLORS), text_color=sv.Color.from_hex('#000000'), text_position=sv.Position.BOTTOM_CENTER, text_scale=0.5, text_thickness=1)
triangle_annotator = sv.TriangleAnnotator(color=sv.Color.from_hex(BALL_COLOR), base=25, height=21, outline_thickness=1)
logger.info("Annotators initialized.")

# ----- Tracker Initialization (BoTSORT) -----
logger.info("Initializing BoTSORT tracker...")
use_botsort_reid = BOTSORT_REID_WEIGHTS_PATH.is_file()
if not use_botsort_reid: logger.warning(f"BoTSORT ReID weights not found at {BOTSORT_REID_WEIGHTS_PATH}. Tracking without ReID.")
tracker = BotSort(
    reid_weights=Path('clip_market1501.pt'),
    device=DEVICE,
    half=False,
    with_reid=True,
)
logger.info(f"BoTSORT tracker initialized {'with' if use_botsort_reid else 'without'} ReID.")

# ----- GTA-Link Feature Extractor Initialization -----
logger.info("Initializing GTA-Link Feature Extractor...")
try:
    gta_val_transforms = T.Compose([
        T.Resize([256, 128]), T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    gta_feature_extractor = FeatureExtractor(
        model_name='osnet_x1_0', model_path=GTA_REID_MODEL_PATH, # device=DEVICE
    )
    logger.info("GTA-Link Feature Extractor initialized successfully.")
except Exception as e:
    logger.error(f"Error initializing GTA-Link Feature Extractor: {e}")

# ----- Video Processing Setup -----
logger.info(f"Getting video info for: {SOURCE_VIDEO_PATH}")
try:
    # ... (video info loading code remains the same) ...
    video_info = sv.VideoInfo.from_video_path(SOURCE_VIDEO_PATH)
    width, height, fps = video_info.width, video_info.height, video_info.fps
    if not video_info.total_frames or video_info.total_frames == -1:
         cap = cv2.VideoCapture(SOURCE_VIDEO_PATH)
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         cap.release()
         if total_frames <= 0: total_frames = int(fps * 30) # Estimate
    else:
        total_frames = video_info.total_frames
    logger.info(f"Video Info: {width}x{height}, FPS: {fps}, Total Frames: {total_frames}")
    if width == 0 or height == 0 or fps == 0: raise ValueError("Invalid video properties.")
except Exception as e:
    logger.error(f"Error getting video info: {e}. Exiting.")

frame_generator = sv.get_video_frames_generator(source_path=SOURCE_VIDEO_PATH, stride=1)
raw_tracklet_data = defaultdict(list) # Stores raw data before feature extraction


[32m2025-04-14 09:44:16.487[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mUsing device: cuda:0[0m
[32m2025-04-14 09:44:16.487[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mFound GTA-Link ReID model: /home/ubuntu/projects/sure-football-analysis/gta_link/reid_checkpoints/sports_model.pth.tar-60[0m
[32m2025-04-14 09:44:16.488[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m50[0m - [1mAnnotators initialized.[0m
[32m2025-04-14 09:44:16.488[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m53[0m - [1mInitializing BoTSORT tracker...[0m
[32m2025-04-14 09:44:16.490[0m | [1mINFO    [0m | [36mboxmot.utils.torch_utils[0m:[36mselect_device[0m:[36m52[0m - [1mYolo Tracking v12.0.5 🚀 Python-3.11.11 torch-2.5.1+cu121
CUDA:0 (NVIDIA L4, 22478MiB)[0m


Resized position embedding: %s to %s torch.Size([197, 768]) torch.Size([129, 768])
Position embedding resize to height:16 width: 8


[32m2025-04-14 09:44:18.253[0m | [32m[1mSUCCESS [0m | [36mboxmot.appearance.reid_model_factory[0m:[36mload_pretrained_weights[0m:[36m183[0m - [32m[1mLoaded pretrained weights from clip_market1501.pt[0m
[32m2025-04-14 09:44:18.289[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m62[0m - [1mBoTSORT tracker initialized with ReID.[0m
[32m2025-04-14 09:44:18.290[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m65[0m - [1mInitializing GTA-Link Feature Extractor...[0m
[32m2025-04-14 09:44:18.546[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m74[0m - [1mGTA-Link Feature Extractor initialized successfully.[0m
[32m2025-04-14 09:44:18.546[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m79[0m - [1mGetting video info for: app/test_data/raw/0bfacc_0.mp4[0m
[32m2025-04-14 09:44:18.563[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m91[0m - [1mVideo Info: 1920x1080, FPS: 25, Total 

Model: osnet_x1_0
- params: 2,193,616
- flops: 978,878,352
Successfully loaded pretrained weights from "/home/ubuntu/projects/sure-football-analysis/gta_link/reid_checkpoints/sports_model.pth.tar-60"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']


In [38]:
# =============================================================================
# == Main Script Logic ==
# =============================================================================


# =============================================================================
# == PHASE 1: Initial Tracking & Raw Data Collection ==
# =============================================================================
logger.info("Starting Phase 1: Tracking and Raw Data Collection...")
with tqdm(total=total_frames, desc="Phase 1: Tracking") as pbar:
    # ... (Detection, Classification, BoTSORT Tracking loop remains the same) ...
    # ... Ensure raw_tracklet_data is populated correctly with 'crop' (RGB numpy) ...
    for frame_idx, frame in enumerate(frame_generator):
        if frame is None: break
        # --- Detection ---
        results = PLAYER_DETECTION_MODEL.predict(frame, conf=0.3, device=DEVICE, verbose=False)
        if not results or not hasattr(results[0], 'boxes') or results[0].boxes is None:
             pbar.update(1); continue
        detections = sv.Detections.from_ultralytics(results[0])
        # --- Pre-processing & Classification ---
        ball_detections = detections[detections.class_id == BALL_ID]
        people_detections = detections[detections.class_id != BALL_ID]
        if len(people_detections) > 0:
            people_detections = people_detections.with_nms(threshold=0.5, class_agnostic=True)
            players_mask = people_detections.class_id == PLAYER_ID
            goalkeepers_mask = people_detections.class_id == GOALKEEPER_ID
            referees_mask = people_detections.class_id == REFEREE_ID
            players_detections = people_detections[players_mask]
            goalkeepers_detections = people_detections[goalkeepers_mask]
            referees_detections = people_detections[referees_mask]
            if len(players_detections) > 0:
                players_crops = [sv.crop_image(frame, xyxy) for xyxy in players_detections.xyxy]
                player_team_ids = team_classifier.predict(players_crops)
                players_detections.class_id = player_team_ids
            if len(goalkeepers_detections) > 0:
                goalkeeper_team_ids = resolve_goalkeepers_team_id(players_detections, goalkeepers_detections)
                goalkeepers_detections.class_id = goalkeeper_team_ids
            if len(referees_detections) > 0:
                referees_detections.class_id = np.full(len(referees_detections), 2)
            detections_to_track = sv.Detections.merge([players_detections, goalkeepers_detections, referees_detections])
        else:
            detections_to_track = sv.Detections.empty()
        # --- Tracking using BoTSORT ---
        if len(detections_to_track) > 0:
            boxmot_input = np.hstack((detections_to_track.xyxy, detections_to_track.confidence[:, np.newaxis], detections_to_track.class_id[:, np.newaxis]))
            update_frame = frame if use_botsort_reid else None
            tracks = tracker.update(boxmot_input, update_frame)
            if tracks.shape[0] > 0:
                current_frame_tracks = sv.Detections(xyxy=tracks[:, 0:4], confidence=tracks[:, 5], class_id=tracks[:, 6].astype(int), tracker_id=tracks[:, 4].astype(int))
                # --- Collect Raw Data ---
                for track_idx, bot_sort_id in enumerate(current_frame_tracks.tracker_id):
                    bbox_xyxy = current_frame_tracks.xyxy[track_idx]
                    x1, y1, x2, y2 = map(int, bbox_xyxy); x1, y1 = max(0, x1), max(0, y1); x2, y2 = min(width - 1, x2), min(height - 1, y2)
                    crop_img = None
                    if x1 < x2 and y1 < y2: crop_img = cv2.cvtColor(frame[y1:y2, x1:x2], cv2.COLOR_BGR2RGB)
                    raw_entry = {'frame': frame_idx + 1, 'bbox_xyxy': bbox_xyxy, 'conf': current_frame_tracks.confidence[track_idx], 'crop': crop_img, 'class_id': current_frame_tracks.class_id[track_idx]}
                    raw_tracklet_data[bot_sort_id].append(raw_entry)
        else:
            tracker.update(np.empty((0, 6)), frame if use_botsort_reid else None)
        pbar.update(1)

logger.info(f"Finished Phase 1. Collected raw data for {len(raw_tracklet_data)} initial tracklets.")

[32m2025-04-14 09:44:18.575[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mStarting Phase 1: Tracking and Raw Data Collection...[0m
Phase 1: Tracking: 100%|██████████| 750/750 [00:44<00:00, 16.82it/s]
[32m2025-04-14 09:45:03.156[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m62[0m - [1mFinished Phase 1. Collected raw data for 0 initial tracklets.[0m


In [39]:
# =============================================================================
# == PHASE 2: Feature Extraction & Tracklet Object Generation ==
# =============================================================================
logger.info("Starting Phase 2: Feature Extraction & Tracklet Generation...")
tracklets_with_features = {} # Store final Tracklet objects {bot_sort_id: Tracklet}

with tqdm(total=len(raw_tracklet_data), desc="Phase 2a: Features") as pbar:
    # ... (Feature extraction loop remains the same, using the modified Tracklet class) ...
    # ... Ensure Tracklet objects are populated with class_id ...
    for bot_sort_id, entries in raw_tracklet_data.items():
        if not entries: continue
        tracklet_obj = Tracklet(track_id=bot_sort_id)
        pil_crops = []; valid_indices = []
        for i, entry in enumerate(entries):
            if entry['crop'] is not None and entry['crop'].size > 0:
                 pil_crops.append(Image.fromarray(entry['crop'])); valid_indices.append(i)
        if not pil_crops: pbar.update(1); continue
        transformed_crops = [gta_val_transforms(crop) for crop in pil_crops]
        input_batch = torch.stack(transformed_crops).to(DEVICE)
        with torch.no_grad(): features = gta_feature_extractor(input_batch)
        feats_np = features.cpu().numpy()
        feature_idx = 0
        for i, entry in enumerate(entries):
             if i in valid_indices:
                 x1, y1, x2, y2 = entry['bbox_xyxy']; l, t = x1, y1; w, h = x2 - x1, y2 - y1; bbox_ltwh = [l, t, w, h]
                 feat = feats_np[feature_idx]; feat /= np.linalg.norm(feat)
                 # Append det including class_id
                 tracklet_obj.append_det(entry['frame'], entry['conf'], bbox_ltwh, entry['class_id'])
                 tracklet_obj.append_feat(feat)
                 feature_idx += 1
        if len(tracklet_obj) > 0: tracklets_with_features[bot_sort_id] = tracklet_obj
        pbar.update(1)

logger.info(f"Generated {len(tracklets_with_features)} tracklets with features.")

[32m2025-04-14 09:45:03.165[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mStarting Phase 2: Feature Extraction & Tracklet Generation...[0m
Phase 2a: Features: 0it [00:00, ?it/s]
[32m2025-04-14 09:45:03.167[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mGenerated 0 tracklets with features.[0m
Phase 2a: Features: 0it [00:00, ?it/s]
[32m2025-04-14 09:45:03.167[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mGenerated 0 tracklets with features.[0m


In [40]:
# =============================================================================
# == PHASE 3: Tracklet Splitting and Merging (GTA-Link Core Logic) ==
# =============================================================================
logger.info("Starting Phase 3: Tracklet Splitting and Merging...")
processed_tracklets = copy.deepcopy(tracklets_with_features)

# --- 3a. Splitting ---
if USE_SPLIT:
    if not processed_tracklets:
         logger.warning("Skipping splitting: No tracklets with features available.")
    else:
         processed_tracklets = split_tracklets(
             processed_tracklets,
             eps=SPLIT_EPS,
             max_k=SPLIT_MAX_K,
             min_samples=SPLIT_MIN_SAMPLES,
             len_thres=SPLIT_MIN_LEN
         )
else:
    logger.info("Splitting phase skipped by configuration.")

# --- 3b. Merging (Connecting) ---
final_tracklets = {}
if USE_CONNECT:
    if not processed_tracklets:
         logger.warning("Skipping merging: No tracklets available after potential splitting.")
         final_tracklets = {} # Ensure it's an empty dict
    else:
        logger.info("Calculating distance matrix for merging...")
        dist_matrix, tid_to_idx, tids = get_distance_matrix(processed_tracklets)
        logger.info(f"Distance matrix calculated ({dist_matrix.shape}).")

        logger.info("Calculating spatial constraints...")
        max_x_range, max_y_range = get_spatial_constraints(processed_tracklets, MERGE_SPATIAL_FACTOR)
        logger.info(f"Spatial constraints: max_x_range={max_x_range:.2f}, max_y_range={max_y_range:.2f}")

        logger.info(f"Merging tracklets (threshold={MERGE_DIST_THRES})...")
        final_tracklets = merge_tracklets(
            processed_tracklets,
            dist_matrix,
            tid_to_idx,
            tids,
            max_x_range=max_x_range,
            max_y_range=max_y_range,
            merge_dist_thres=MERGE_DIST_THRES
        )
else:
    logger.info("Merging phase skipped by configuration.")
    final_tracklets = processed_tracklets # Use tracklets after splitting (or original if split was skipped)

logger.info(f"Finished Phase 3. Final tracklet count: {len(final_tracklets)}")


[32m2025-04-14 09:45:03.174[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mStarting Phase 3: Tracklet Splitting and Merging...[0m
[32m2025-04-14 09:45:03.176[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [1mFinished Phase 3. Final tracklet count: 0[0m
[32m2025-04-14 09:45:03.176[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [1mFinished Phase 3. Final tracklet count: 0[0m


In [31]:
# =============================================================================
# == PHASE 4: Final Annotation and Video Writing ==
# =============================================================================
logger.info("Starting Phase 4: Final annotation and video writing...")
detections_for_annotation = defaultdict(lambda: {'tracked': sv.Detections.empty(), 'ball': sv.Detections.empty()})

# --- Populate detections_for_annotation with DEBUG logging ---
if final_tracklets:
    logger.info("Mapping final IDs back to detections for annotation...")
    final_keys_sorted = sorted(final_tracklets.keys())
    final_key_to_global_id = {key: i + 1 for i, key in enumerate(final_keys_sorted)}
    logger.debug(f"Final Key to Global ID mapping: {final_key_to_global_id}")

    for final_key, tracklet_obj in final_tracklets.items():
        global_id = final_key_to_global_id[final_key]
        if not isinstance(tracklet_obj, Tracklet): logger.warning(f"Key {final_key} not a Tracklet. Skipping."); continue
        logger.debug(f"Processing final tracklet Key={final_key}, GlobalID={global_id}, Len={len(tracklet_obj)}")

        for i, frame_num in enumerate(tracklet_obj.times):
            if i >= len(tracklet_obj.bboxes) or i >= len(tracklet_obj.scores) or i >= len(tracklet_obj.class_ids):
                logger.warning(f"Data mismatch in T:{final_key}/G:{global_id} at index {i}. Skipping."); continue

            frame_idx_0based = frame_num - 1
            bbox_ltwh = tracklet_obj.bboxes[i]
            conf = tracklet_obj.scores[i]
            class_id = tracklet_obj.class_ids[i]

            if len(bbox_ltwh) == 4:
                 l, t, w, h = bbox_ltwh; x1, y1, x2, y2 = l, t, l + w, t + h
                 bbox_xyxy = np.array([x1, y1, x2, y2])
            else: logger.warning(f"Invalid bbox {bbox_ltwh} in T:{final_key}/G:{global_id}. Skipping."); continue

            det = sv.Detections(
                xyxy=np.array([bbox_xyxy]), confidence=np.array([conf]),
                class_id=np.array([class_id]), tracker_id=np.array([global_id])
            )
            # DEBUG: Log the detection being added
            logger.debug(f"  Frame {frame_idx_0based}: Adding det GID={global_id}, Class={class_id}, xyxy={bbox_xyxy.tolist()}")
            detections_for_annotation[frame_idx_0based]['tracked'] = sv.Detections.merge([
                detections_for_annotation[frame_idx_0based]['tracked'], det
            ])
else: logger.warning("No final tracklets found. Annotation will only include ball.")

# --- Annotation Loop with DEBUG logging ---
logger.info("Annotating frames and writing video...")
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, fps, (width, height))
frame_generator = sv.get_video_frames_generator(source_path=SOURCE_VIDEO_PATH, stride=1)

# Select specific frames to log detailed info for (e.g., first few frames with expected players)
FRAMES_TO_DEBUG = {10, 50, 100}

with tqdm(total=total_frames, desc="Phase 4: Annotating") as pbar:
    for frame_idx, frame in enumerate(frame_generator):
        if frame is None: break
        annotated_frame = frame.copy()
        # Retrieve tracked detections for the current frame
        tracked_detections_for_frame = detections_for_annotation[frame_idx]['tracked']

        # --- Log details for specific frames ---
        if frame_idx in FRAMES_TO_DEBUG:
             logger.debug(f"--- Debugging Frame {frame_idx} ---")
             logger.debug(f"Retrieved 'tracked_detections_for_frame': {tracked_detections_for_frame}")
             if len(tracked_detections_for_frame) > 0:
                  logger.debug(f"  Track IDs: {tracked_detections_for_frame.tracker_id}")
                  logger.debug(f"  Class IDs: {tracked_detections_for_frame.class_id}")
                  logger.debug(f"  BBoxes (xyxy): {tracked_detections_for_frame.xyxy}")

        # --- Annotate Tracked Objects ---
        if len(tracked_detections_for_frame) > 0:
            try:
                # Ensure class IDs are integers for the palette
                tracked_detections_for_frame.class_id = tracked_detections_for_frame.class_id.astype(int)

                labels = [f"#{tid} T{cid}" if cid in [0, 1] else f"#{tid} R"
                          for tid, cid in zip(tracked_detections_for_frame.tracker_id, tracked_detections_for_frame.class_id)]

                if frame_idx in FRAMES_TO_DEBUG: logger.debug(f"  Attempting to annotate {len(labels)} tracked objects.")

                annotated_frame = ellipse_annotator.annotate(scene=annotated_frame, detections=tracked_detections_for_frame)
                annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=tracked_detections_for_frame, labels=labels)

                if frame_idx in FRAMES_TO_DEBUG: logger.debug(f"  Annotation applied successfully for Frame {frame_idx}.")

            except Exception as e:
                 logger.error(f"Error during annotation on Frame {frame_idx}: {e}")
                 if frame_idx in FRAMES_TO_DEBUG:
                      logger.error(f"  Problematic Detections Data: {tracked_detections_for_frame}")


        # --- Annotate Ball (Re-detect) ---
        results = PLAYER_DETECTION_MODEL.predict(frame, classes=[BALL_ID], conf=0.1, device=DEVICE, verbose=False)
        if results and hasattr(results[0], 'boxes') and results[0].boxes is not None:
             all_detections = sv.Detections.from_ultralytics(results[0])
             ball_detections = all_detections[all_detections.class_id == BALL_ID]
             if len(ball_detections) > 0:
                 ball_detections.xyxy = sv.pad_boxes(xyxy=ball_detections.xyxy, px=10)
                 annotated_frame = triangle_annotator.annotate(scene=annotated_frame, detections=ball_detections)


        video_writer.write(annotated_frame)
        pbar.update(1)

# Release video writer
video_writer.release()
logger.info(f"Finished Phase 4. Annotated video saved to: {OUTPUT_VIDEO_PATH}")

[32m2025-04-14 09:34:38.605[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mStarting Phase 4: Final annotation and video writing...[0m
[32m2025-04-14 09:34:38.609[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m45[0m - [1mAnnotating frames and writing video...[0m
[32m2025-04-14 09:34:38.609[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m45[0m - [1mAnnotating frames and writing video...[0m
Phase 4: Annotating: 100%|██████████| 750/750 [00:39<00:00, 18.98it/s]
[32m2025-04-14 09:35:18.153[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m106[0m - [1mFinished Phase 4. Annotated video saved to: 0bfacc_0_botsort_gta_linked_final.mp4[0m
