In [1]:
import os
from pathlib import Path
from dotenv import load_dotenv

# Starting point for the search: the kernel's current working directory
# (assumed to be the folder that contains this notebook).
NOTEBOOK_DIR = Path().resolve()

# Walk upward from the working directory until a "notebooks/.env" file is found.
for parent in [NOTEBOOK_DIR, *NOTEBOOK_DIR.parents]:
    env_candidate = parent / "notebooks" / ".env"
    if env_candidate.exists():
        ENV_PATH = env_candidate
        break
else:
    # for/else: reached only when the loop finished without `break`
    raise FileNotFoundError("Could not find notebooks/.env in any parent directory!")

print("Loading .env from:", ENV_PATH)

# Values from the file take precedence over variables already set in the process
load_dotenv(ENV_PATH, override=True)

# Both credentials are required by later cells; fail early with a clear message.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise RuntimeError("HF_TOKEN not found in the .env file")

roboflow_key = os.getenv("ROBOFLOW_API_KEY")
if not roboflow_key:
    raise RuntimeError("ROBOFLOW_API_KEY not found in the .env file")

# Export each secret under its OWN name. The Roboflow key must not be written
# to HF_TOKEN, otherwise the Hugging Face token is overwritten.
os.environ["HF_TOKEN"] = hf_token
os.environ["ROBOFLOW_API_KEY"] = roboflow_key


Loading .env from: /home/valentinweyer/projects/handball-computer-vision/notebooks/.env


In [2]:
!nvidia-smi

Thu Nov 27 23:04:14 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GB10                    On  |   0000000F:01:00.0  On |                  N/A |
| N/A   43C    P0             11W /  N/A  | Not Supported          |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [3]:
import torch
import torchvision

# Report the installed library versions and whether PyTorch can see a CUDA device
print("PyTorch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)
print("CUDA is available:", torch.cuda.is_available())

PyTorch version: 2.9.1+cu130
Torchvision version: 0.24.1
CUDA is available: True


In [4]:
import os
# Directory the kernel is currently running in.
# NOTE: `Path` is not imported in this cell; it comes from the import in the first cell.
HOME = Path.cwd()

print("Current working directory:", HOME)

Current working directory: /home/valentinweyer/projects/handball-computer-vision/notebooks


In [5]:
def _patch_decord_for_ffmpeg6(decord_root):
    """Edit the decord sources in place so they compile against FFmpeg 6.

    Args:
        decord_root: ``pathlib.Path`` of the cloned decord repository.

    Raises:
        RuntimeError: if the ``avcodec.h`` include that anchors the first patch
            cannot be found in ``ffmpeg_common.h``.
    """
    # 1) ffmpeg_common.h – add bsf.h
    ffmpeg_common = decord_root / "src" / "video" / "ffmpeg" / "ffmpeg_common.h"
    txt = ffmpeg_common.read_text()
    needle = "#include <libavcodec/avcodec.h>\n"
    insert = needle + "#include <libavcodec/bsf.h>\n"
    if "#include <libavcodec/bsf.h>" not in txt:
        if needle not in txt:
            raise RuntimeError("avcodec.h include not found in ffmpeg_common.h")
        ffmpeg_common.write_text(txt.replace(needle, insert))

    # 2) video_reader.cc – const AVCodec + cast in av_find_best_stream
    video_reader = decord_root / "src" / "video" / "video_reader.cc"
    vr = video_reader.read_text()
    vr = vr.replace("    AVCodec *dec = nullptr;\n", "    const AVCodec *dec = nullptr;\n")
    vr = vr.replace("    AVCodec *dec = NULL;\n", "    const AVCodec *dec = nullptr;\n")
    if "&dec, 0);" in vr:
        vr = vr.replace("&dec, 0);", "(const AVCodec**)&dec, 0);")
    video_reader.write_text(vr)

    # 3) cuda_threaded_decoder.{h,cc} – AVInputFormat* -> const AVInputFormat*
    cuda_h = decord_root / "src" / "video" / "nvcodec" / "cuda_threaded_decoder.h"
    cuda_cc = decord_root / "src" / "video" / "nvcodec" / "cuda_threaded_decoder.cc"

    h = cuda_h.read_text()
    h = h.replace(
        "CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, AVInputFormat *iformat);",
        "CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, const AVInputFormat *iformat);",
    )
    h = h.replace(
        "void InitBitStreamFilter(AVCodecParameters *codecpar, AVInputFormat *iformat);",
        "void InitBitStreamFilter(AVCodecParameters *codecpar, const AVInputFormat *iformat);",
    )
    cuda_h.write_text(h)

    c = cuda_cc.read_text()
    c = c.replace(
        "CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, AVInputFormat *iformat)",
        "CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, const AVInputFormat *iformat)",
    )
    c = c.replace(
        "void CUThreadedDecoder::InitBitStreamFilter(AVCodecParameters *codecpar, AVInputFormat *iformat)",
        "void CUThreadedDecoder::InitBitStreamFilter(AVCodecParameters *codecpar, const AVInputFormat *iformat)",
    )
    cuda_cc.write_text(c)


def install_decord_ffmpeg6():
    """Build decord from source with CUDA + FFmpeg 6 support and install it into the CURRENT env.

    Steps: uninstall any existing decord, clone the default branch of
    dmlc/decord into a temporary directory, apply the FFmpeg 6 compatibility
    patches, build the C++ backend with CMake/make, pip-install the Python
    bindings, and finally import the package as a smoke test.

    The temporary build directory is removed even when a step fails.

    Raises:
        subprocess.CalledProcessError: if git, cmake, make or pip exits non-zero.
        RuntimeError: if the expected include is missing while patching.
        ImportError: if decord cannot be imported after installation.
    """
    import importlib
    import os
    import shutil
    import subprocess
    import sys
    import tempfile
    from pathlib import Path

    print("Using Python:", sys.executable)

    # 0) Uninstall any broken/old decord. Repeated a few times, presumably to
    #    peel off several stacked installs; failures are ignored (check=False).
    print("[-] Uninstalling existing decord (if any)…")
    for _ in range(3):
        subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "decord"], check=False)

    # 1) Work in a temporary directory so the repository stays clean
    tmpdir = Path(tempfile.mkdtemp(prefix="build_decord_"))
    print("[+] Temp dir:", tmpdir)

    try:
        print("[+] Cloning decord…")
        subprocess.run(
            ["git", "clone", "--recursive", "https://github.com/dmlc/decord.git"],
            cwd=tmpdir,
            check=True,
        )
        decord_root = tmpdir / "decord"

        # --- Patches for FFmpeg 6 ---
        _patch_decord_for_ffmpeg6(decord_root)

        # --- Build C++ backend ---
        build_dir = decord_root / "build"
        build_dir.mkdir()
        print("[+] Running CMake…")
        subprocess.run(
            ["cmake", "..", "-DUSE_CUDA=ON", "-DCMAKE_BUILD_TYPE=Release"],
            cwd=build_dir,
            check=True,
        )
        print("[+] Running make…")
        subprocess.run(
            ["make", f"-j{os.cpu_count() or 4}"],
            cwd=build_dir,
            check=True,
        )

        # --- Install Python package ---
        print("[+] Installing Python bindings…")
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "."],
            cwd=decord_root / "python",
            check=True,
        )

        # --- Verify ---
        # Drop any previously imported decord modules and refresh the import
        # system's caches so the package installed a moment ago is the one found.
        for mod_name in [m for m in sys.modules if m == "decord" or m.startswith("decord.")]:
            del sys.modules[mod_name]
        importlib.invalidate_caches()

        import decord
        from decord import cpu, VideoReader

        print("[✓] decord installed from:", getattr(decord, "__file__", None))
        print("[✓] cpu:", cpu, "VideoReader:", VideoReader)
    finally:
        # Always clean up; ignore_errors keeps a cleanup problem from masking
        # the real build/install exception.
        print("[+] Cleaning temp dir:", tmpdir)
        shutil.rmtree(tmpdir, ignore_errors=True)

    print("[✓] Done.")


# >>> Run the installer
install_decord_ffmpeg6()

Using Python: /home/valentinweyer/miniforge3/envs/handball-computer-vision/bin/python
[-] Uninstalling existing decord (if any)…
Found existing installation: decord 0.6.0
Uninstalling decord-0.6.0:
  Successfully uninstalled decord-0.6.0


[0mCloning into 'decord'...


[+] Temp dir: /tmp/build_decord_iy8b9ur5
[+] Cloning decord…


Submodule '3rdparty/dlpack' (https://github.com/dmlc/dlpack) registered for path '3rdparty/dlpack'
Submodule '3rdparty/dmlc-core' (https://github.com/dmlc/dmlc-core) registered for path '3rdparty/dmlc-core'
Cloning into '/tmp/build_decord_iy8b9ur5/decord/3rdparty/dlpack'...
Cloning into '/tmp/build_decord_iy8b9ur5/decord/3rdparty/dmlc-core'...


Submodule path '3rdparty/dlpack': checked out '5c792cef3aee54ad8b7000111c9dc1797f327b59'
Submodule path '3rdparty/dmlc-core': checked out 'd07fb7a443b5db8a89d65a15a024af6a425615a5'
[+] Running CMake…
-- The C compiler identification is GNU 13.3.0
-- The CXX compiler identification is GNU 13.3.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info


  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CMake that the project requires at least <min> but has been updated
  to work with policies introduced by <max> or earlier.

[0m


-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found PkgConfig: /usr/bin/pkg-config (found version "1.8.1")
-- Checking for module 'libavcodec'
--   Found libavcodec, version 60.31.102
-- Checking for module 'libavformat'
--   Found libavformat, version 60.16.100
-- Checking for module 'libavutil'
--   Found libavutil, version 58.29.100
-- Checking for module 'libavdevice'
--   Package 'libavdevice', required by 'virtual:world', not found
-- Checking for module 'libavfilter'
--   Found libavfilter, version 9.12.100
-- Checking for module 'libswresample'
--   Found libswresample, version 4.12.100
-- Unable to find libavdevice, device input API will not work!
-- Found FFMPEG or Libav: /usr/lib/aarch64-linux-gnu/libavformat.so;/usr/lib/aarch64-linux-gnu/libavfilter.so;/usr/lib/aarch64-linux-gnu/libavcodec.so;/usr/lib/aarch64-linux-gnu/libavutil.so;/usr/lib/aar

[0mFFMPEG_INCLUDE_DIR = /usr/include/aarch64-linux-gnu [0m
[0mFFMPEG_LIBRARIES = /usr/lib/aarch64-linux-gnu/libavformat.so;/usr/lib/aarch64-linux-gnu/libavfilter.so;/usr/lib/aarch64-linux-gnu/libavcodec.so;/usr/lib/aarch64-linux-gnu/libavutil.so;/usr/lib/aarch64-linux-gnu/libswresample.so [0m
  Policy CMP0146 is not set: The FindCUDA module is removed.  Run "cmake
  --help-policy CMP0146" for policy details.  Use the cmake_policy command to

Call Stack (most recent call first):
  cmake/modules/CUDA.cmake:19 (find_cuda)
  CMakeLists.txt:92 (include)
[0m
  Policy CMP0104 is not set: CMAKE_CUDA_ARCHITECTURES now detected for NVCC,
  empty CUDA_ARCHITECTURES not allowed.  Run "cmake --help-policy CMP0104"
  for policy details.  Use the cmake_policy command to set the policy and

  CUDA_ARCHITECTURES is empty for target "decord".
[0m


-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- Found CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-13.0
-- Found CUDA_CUDA_LIBRARY=/usr/local/cuda-13.0/lib64/stubs/libcuda.so
-- Found CUDA_CUDART_LIBRARY=/usr/local/cuda-13.0/lib64/libcudart.so
-- Found CUDA_NVRTC_LIBRARY=/usr/local/cuda-13.0/lib64/libnvrtc.so
-- Found CUDA_CUDNN_LIBRARY=/usr/lib/aarch64-linux-gnu/libcudnn.so
-- Found CUDA_CUBLAS_LIBRARY=/usr/local/cuda-13.0/lib64/libcublas.so
-- Found CUDA_NVIDIA_ML_LIBRARY=/usr/local/cuda-13.0/lib64/stubs/libnvidia-ml.so
-- Found CUDA_NVCUVID_LIBRARY=/usr/local/cuda-13.0/lib64/libnvcuvid.so
-- Build with CUDA support
-- Configuring done (1.6s)
-- Generating done (0.0s)
-- Build files have been written to: /tmp/build_decord_iy8b9ur5/decord/build
[+] Running make…
[  2%] [32mBuilding CXX object CMakeFiles/decord.dir/src/audio/audio_interface.cc.o[0m
[  5%] [32mBuilding CXX object CMakeFiles/decord.dir/src/runtime/cpu_device_api.cc.o[0m
[  8%] [32mBuilding CX

/tmp/build_decord_iy8b9ur5/decord/src/audio/audio_reader.cc: In member function ‘int decord::AudioReader::Decode(std::string, int)’:
  131 |                 numChannels = tempCodecParameters->channels;
      |                                                    ^~~~~~~~
In file included from /usr/include/aarch64-linux-gnu/libavcodec/avcodec.h:53,
                 from /tmp/build_decord_iy8b9ur5/decord/src/audio/../../include/decord/../../src/video/ffmpeg/ffmpeg_common.h:23,
                 from /tmp/build_decord_iy8b9ur5/decord/src/audio/../../include/decord/audio_interface.h:9,
                 from /tmp/build_decord_iy8b9ur5/decord/src/audio/audio_reader.h:10,
                 from /tmp/build_decord_iy8b9ur5/decord/src/audio/audio_reader.cc:5:
/usr/include/aarch64-linux-gnu/libavcodec/codec_par.h:166:14: note: declared here
  166 |     int      channels;
      |              ^~~~~~~~
  131 |                 numChannels = tempCodecParameters->channels;
      |                         

[ 67%] [32mBuilding CXX object CMakeFiles/decord.dir/src/video/ffmpeg/filter_graph.cc.o[0m
[ 70%] [32mBuilding CXX object CMakeFiles/decord.dir/src/video/ffmpeg/threaded_decoder.cc.o[0m
[ 72%] [32mBuilding CXX object CMakeFiles/decord.dir/src/video/nvcodec/cuda_context.cc.o[0m
[ 75%] [32mBuilding CXX object CMakeFiles/decord.dir/src/video/nvcodec/cuda_decoder_impl.cc.o[0m


/tmp/build_decord_iy8b9ur5/decord/src/video/video_reader.cc: In member function ‘virtual double decord::VideoReader::GetRotation() const’:
  557 |     uint8_t* displaymatrix = av_stream_get_side_data(active_st, AV_PKT_DATA_DISPLAYMATRIX, NULL);
      |                              ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /tmp/build_decord_iy8b9ur5/decord/src/video/ffmpeg/ffmpeg_common.h:25,
                 from /tmp/build_decord_iy8b9ur5/decord/src/video/threaded_decoder_interface.h:10,
                 from /tmp/build_decord_iy8b9ur5/decord/src/video/video_reader.h:10,
                 from /tmp/build_decord_iy8b9ur5/decord/src/video/video_reader.cc:7:
/usr/include/aarch64-linux-gnu/libavformat/avformat.h:1913:10: note: declared here
 1913 | uint8_t *av_stream_get_side_data(const AVStream *stream,
      |          ^~~~~~~~~~~~~~~~~~~~~~~
/tmp/build_decord_iy8b9ur5/decord/src/video/ffmpeg/threaded_decoder.cc: In member function ‘void de

[ 78%] [32mBuilding CXX object CMakeFiles/decord.dir/src/video/nvcodec/cuda_mapped_frame.cc.o[0m
[ 81%] [32mBuilding CXX object CMakeFiles/decord.dir/src/video/nvcodec/cuda_parser.cc.o[0m
[ 83%] [32mBuilding CXX object CMakeFiles/decord.dir/src/video/nvcodec/cuda_stream.cc.o[0m
[ 86%] [32mBuilding CXX object CMakeFiles/decord.dir/src/video/nvcodec/cuda_texture.cc.o[0m
[ 89%] [32mBuilding CXX object CMakeFiles/decord.dir/src/video/nvcodec/cuda_threaded_decoder.cc.o[0m
[ 91%] [32mBuilding CXX object CMakeFiles/decord.dir/src/runtime/cuda/cuda_device_api.cc.o[0m
[ 94%] [32mBuilding CXX object CMakeFiles/decord.dir/src/runtime/cuda/cuda_module.cc.o[0m
[ 97%] [32mBuilding CUDA object CMakeFiles/decord.dir/src/improc/improc.cu.o[0m


In function ‘const char* decord::cuda::GetVideoCodecString(cudaVideoCodec)’,
    inlined from ‘const char* decord::cuda::GetVideoCodecString(cudaVideoCodec)’ at /tmp/build_decord_iy8b9ur5/decord/src/video/nvcodec/cuda_decoder_impl.cc:15:14:
   44 |             return aCodecName[eCodec].name;
      |                    ~~~~~~~~~~~~~~~~~^
/tmp/build_decord_iy8b9ur5/decord/src/video/nvcodec/cuda_decoder_impl.cc: In function ‘const char* decord::cuda::GetVideoCodecString(cudaVideoCodec)’:
/tmp/build_decord_iy8b9ur5/decord/src/video/nvcodec/cuda_decoder_impl.cc:19:7: note: while referencing ‘aCodecName’
   19 |     } aCodecName [] = {
      |       ^~~~~~~~~~
/tmp/build_decord_iy8b9ur5/decord/src/video/nvcodec/cuda_threaded_decoder.cc: In member function ‘void decord::cuda::CUThreadedDecoder::LaunchThread()’:
  304 |   } catch (dmlc::Error error) {
      |                        ^~~~~


[100%] [1m[32mLinking CXX shared library libdecord.so[0m
[100%] Built target decord
[+] Installing Python bindings…
Processing /tmp/build_decord_iy8b9ur5/decord/python
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: decord
  Building wheel for decord (pyproject.toml): started
  Building wheel for decord (pyproject.toml): finished with status 'done'
  Created wheel for decord: filename=decord-0.6.0-cp311-cp311-linux_aarch64.whl size=5913590 sha256=f11ac6f1efafcf34150091dd26861b4a0beead760e26740fee809e2c08632ab6
  Stored in directory: /tmp/pip-ephem-wheel-cache-jcxg9zg9/wheels/96/34/7b/44cdeb71fb003e75b2ab4fca0cdb99586439cb75879cb2484a
Successfully built decord
Install

In [6]:
%cd ../
!git clone https://github.com/facebookresearch/sam3.git
%cd sam3
!pip install -e .
%cd ../
!rm -rf sam3/


/home/valentinweyer/projects/handball-computer-vision
fatal: destination path 'sam3' already exists and is not an empty directory.
/home/valentinweyer/projects/handball-computer-vision/sam3
Obtaining file:///home/valentinweyer/projects/handball-computer-vision/sam3
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: sam3
  Building editable for sam3 (pyproject.toml) ... [?25ldone
[?25h  Created wheel for sam3: filename=sam3-0.1.0-0.editable-py3-none-any.whl size=15232 sha256=4b2814b1c664a3dfa867528eeed684e3cb47e35edc89f9d329f43951427e06ad
  Stored in directory: /tmp/pip-ephem-wheel-cache-c0yn3y2l/wheels/12/21/ec/794e47b3d9c99484dc80e108804fb5126dbbebb5f20f9d2aae
Successfully built sam3
Installing collected packages: sam3
  Attempting uninstall: sam3


In [7]:
!pip install -q supervision jupyter_bbox_widget

In [8]:
import torch

# Enter a bfloat16 CUDA autocast context that stays active for the following cells.
# NOTE(review): __enter__() is called directly and this notebook never calls the
# matching __exit__(), so the context is never closed explicitly.
torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()

# Allow TF32 in matmul and cuDNN when GPU 0 reports a compute capability major >= 8.
if torch.cuda.get_device_properties(0).major >= 8:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (8.0) - (12.0)
    


## Use SAM3

In [None]:
from sam3.model.sam3_image_processor import Sam3Processor

from sam3.model_builder import build_sam3_image_model


# Build the SAM3 image model with the builder's default arguments
model = build_sam3_image_model()
# Wrap the model in a processor configured with confidence_threshold=0.3
processor = Sam3Processor(model, confidence_threshold=0.3)

In [None]:
import supervision as sv

def from_sam(sam_result: dict) -> sv.Detections:
    """Convert a SAM3 result dict into a supervision ``Detections`` object.

    Reads the ``"boxes"``, ``"scores"`` and ``"masks"`` entries of
    ``sam_result``. Boxes and scores are cast to float32, masks to bool, and
    everything is moved to the CPU as NumPy arrays. The mask tensor is reshaped
    to its axes 0, 2 and 3, i.e. axis 1 is dropped (assumes that axis has
    size 1 — TODO confirm against the processor output).
    """
    def _as_float_array(tensor):
        # float32 on CPU, as a NumPy array
        return tensor.to(torch.float32).cpu().numpy()

    boxes_np = _as_float_array(sam_result["boxes"])
    scores_np = _as_float_array(sam_result["scores"])

    bool_masks = sam_result["masks"].to(torch.bool)
    count, height, width = (bool_masks.shape[axis] for axis in (0, 2, 3))
    masks_np = bool_masks.reshape(count, height, width).cpu().numpy()

    return sv.Detections(xyxy=boxes_np, confidence=scores_np, mask=masks_np)

from PIL import Image
from typing import Optional


# Twelve-colour palette shared by the mask, box and label annotators in `annotate`
COLOR = sv.ColorPalette.from_hex([
    "#ffff00", "#ff9b00", "#ff8080", "#ff66b2", "#ff66ff", "#b266ff",
    "#9999ff", "#3399ff", "#66ffff", "#33ff99", "#66ff66", "#99ff00"
])


def annotate(image: Image.Image, detections: sv.Detections, label: Optional[str] = None) -> Image.Image:
    """Draw masks, boxes and (optionally) text labels for ``detections`` on a copy of ``image``.

    Args:
        image: Source image; it is copied and never modified.
        detections: Detections to draw. Colours are picked per detection index
            from the module-level ``COLOR`` palette.
        label: When truthy, every detection is labelled ``"<label> <confidence>"``
            with the confidence formatted to two decimals. If the detections
            carry no confidence values, the bare label is used instead.

    Returns:
        The annotated copy of ``image``.
    """
    mask_annotator = sv.MaskAnnotator(
        color=COLOR,
        color_lookup=sv.ColorLookup.INDEX,
        opacity=0.6
    )
    box_annotator = sv.BoxAnnotator(
        color=COLOR,
        color_lookup=sv.ColorLookup.INDEX,
        thickness=1
    )
    # The text scale is fixed at 0.4 here rather than derived from the image resolution.
    label_annotator = sv.LabelAnnotator(
        color=COLOR,
        color_lookup=sv.ColorLookup.INDEX,
        text_scale=0.4,
        text_padding=5,
        text_color=sv.Color.BLACK,
        text_thickness=1
    )

    annotated_image = image.copy()
    annotated_image = mask_annotator.annotate(annotated_image, detections)
    annotated_image = box_annotator.annotate(annotated_image, detections)

    if label:
        if detections.confidence is None:
            # No scores available: fall back to the plain label text
            labels = [label] * len(detections)
        else:
            labels = [
                f"{label} {confidence:.2f}"
                for confidence in detections.confidence
            ]
        annotated_image = label_annotator.annotate(annotated_image, detections, labels)

    return annotated_image


Install the `roboflow` library with pip so that the dataset can be downloaded.



In [None]:
!pip install -q roboflow

## Download Roboflow Dataset



In [None]:

from roboflow import Roboflow


# Authenticate with the Roboflow API key that the first cell read from .env
rf = Roboflow(api_key=roboflow_key)

# Select the workspace and the project to download from
project = rf.workspace("valentin-weyer-xasiu").project("player-and-handball-detection-3z9xf")

# Download version 1 in the "yolov8" export format; later cells read `dataset.location`
dataset = project.version(1).download("yolov8")

print("Dataset downloaded successfully.")

## Run SAM3 Inference and Visualize




In [None]:
import glob
import random
import os
from PIL import Image
from IPython.display import display

# Validation images are expected under <dataset>/valid/images
# (adjust if the exported dataset uses a different layout)
IMAGE_DIR = os.path.join(dataset.location, "valid", "images")

# Every .jpg file directly inside that folder
image_paths = glob.glob(os.path.join(IMAGE_DIR, "*.jpg"))

if not image_paths:
    print(f"No images found in {IMAGE_DIR}")
else:
    # Draw at most three images at random
    sample_paths = random.sample(image_paths, min(3, len(image_paths)))

    PROMPT = "jersey number"
    print(f"Running inference on {len(sample_paths)} images with prompt: '{PROMPT}'\n")

    for img_path in sample_paths:
        image = Image.open(img_path).convert("RGB")

        # SAM3: register the image first, then apply the text prompt to that state
        inference_state = processor.set_image(image)
        inference_state = processor.set_text_prompt(
            state=inference_state,
            prompt=PROMPT,
        )

        # Convert to supervision detections and keep those scoring above 0.4
        detections = from_sam(sam_result=inference_state)
        keep = detections.confidence > 0.4
        detections = detections[keep]

        # Show the annotated image with a short textual summary
        print(f"Image: {os.path.basename(img_path)}")
        print(f"Found {len(detections)} detections.")
        annotated_image = annotate(image, detections, label=PROMPT)
        display(annotated_image)
        print("-" * 50)

# Task
Parse the `data.yaml` file from the downloaded dataset (located at `dataset.location`) to retrieve class names. Iterate through the train, valid, and test image sets to convert existing YOLO labels to COCO format. For each image, also run SAM3 inference to detect "jersey number", assigning these detections a new category ID. Consolidate all annotations into a single `_annotations.coco.json` file. Upload this file to the Roboflow project to update annotations, trigger the generation of dataset Version 2 via the API, and summarize the process.

## Load Dataset Metadata

### Subtask:
Parse the `data.yaml` file to extract existing class names and define the ID for the new class.


In [None]:
import yaml
import os

# Location of the dataset's data.yaml
yaml_path = os.path.join(dataset.location, "data.yaml")

# Parse it; an empty file yields None from safe_load, so fall back to an empty dict
with open(yaml_path, 'r') as f:
    data_yaml = yaml.safe_load(f) or {}

# YOLO data.yaml 'names' is typically a list of strings or an {id: name} dictionary.
raw_names = data_yaml.get('names', [])

# Normalise both layouts to {int id: class name}
if isinstance(raw_names, dict):
    existing_classes = {int(k): v for k, v in raw_names.items()}
else:
    existing_classes = {i: name for i, name in enumerate(raw_names)}

# The new class takes the first id after the highest existing one. For the usual
# contiguous ids 0..n-1 this equals len(existing_classes), but unlike len() it
# cannot collide with an existing id when the ids have gaps.
new_class_name = "jersey number"
new_class_id = max(existing_classes, default=-1) + 1

print("Existing classes:", existing_classes)
print(f"New class '{new_class_name}' will be assigned ID: {new_class_id}")

## Generate Consolidated COCO JSON

### Subtask:
Iterate through the dataset, converting existing YOLO labels and generating new SAM3 predictions into a single COCO JSON file.


In [None]:
import json
import glob
import os   
from tqdm import tqdm
from PIL import Image
import numpy as np

# COCO skeleton: metadata, empty image/annotation lists, and the category table.
# Categories are one entry per existing class (in dict order) followed by the
# new class.
coco_dataset = {
    "info": {
        "description": "Merged Dataset with SAM3 Predictions",
        "url": "",
        "version": "1.0",
        "year": 2025,
        "contributor": "Agent",
        "date_created": "2025-11-21"
    },
    "licenses": [],
    "images": [],
    "annotations": [],
    "categories": [
        *(
            {"id": int(class_id), "name": class_name, "supercategory": "none"}
            for class_id, class_name in existing_classes.items()
        ),
        {"id": new_class_id, "name": new_class_name, "supercategory": "none"},
    ],
}

# Helper for YOLO to COCO conversion
def yolo_to_coco(x_center, y_center, w, h, img_w, img_h):
    """Convert a normalised YOLO box to a COCO ``[x_min, y_min, width, height]`` box in pixels.

    ``x_center``, ``y_center``, ``w`` and ``h`` are fractions of the image
    size; ``img_w`` and ``img_h`` are the image dimensions they are scaled by.
    """
    box_w, box_h = w * img_w, h * img_h
    left = x_center * img_w - box_w / 2
    top = y_center * img_h - box_h / 2
    return [left, top, box_w, box_h]

# Running ids for the COCO "annotations" and "images" lists
annotation_id = 0
image_id_counter = 0

splits = ['train', 'valid', 'test']

print("Starting dataset processing...")

for split in splits:
    image_dir = os.path.join(dataset.location, split, "images")
    label_dir = os.path.join(dataset.location, split, "labels")

    if not os.path.exists(image_dir):
        print(f"Skipping {split} (directory not found)")
        continue

    image_files = glob.glob(os.path.join(image_dir, "*.jpg"))
    print(f"Processing {len(image_files)} images in {split}...")

    for img_path in tqdm(image_files):
        filename = os.path.basename(img_path)

        # Everything for this image is staged locally and committed only after
        # the whole image succeeded. A failure therefore cannot leave an image
        # entry or partial annotations behind, and ids are never reused.
        pending_annotations = []

        try:
            image = Image.open(img_path).convert("RGB")
            img_w, img_h = image.size

            image_entry = {
                "id": image_id_counter,
                "file_name": filename,
                "width": img_w,
                "height": img_h,
                "license": None,
                "date_captured": None
            }

            # 1. Process Existing YOLO Labels
            # splitext swaps only the final extension; str.replace would also
            # rewrite a ".jpg" that appears earlier in the file name.
            label_path = os.path.join(label_dir, os.path.splitext(filename)[0] + ".txt")
            if os.path.exists(label_path):
                with open(label_path, "r") as f:
                    for line in f:
                        parts = line.strip().split()
                        # Skip blank or truncated lines instead of failing the whole image
                        if len(parts) < 5:
                            continue
                        cls_id = int(parts[0])
                        # YOLO format: class x_center y_center width height
                        bbox = yolo_to_coco(
                            float(parts[1]), float(parts[2]),
                            float(parts[3]), float(parts[4]),
                            img_w, img_h
                        )

                        pending_annotations.append({
                            "id": annotation_id + len(pending_annotations),
                            "image_id": image_id_counter,
                            "category_id": cls_id,
                            "bbox": bbox,
                            "area": bbox[2] * bbox[3],
                            "segmentation": [],
                            "iscrowd": 0
                        })

            # 2. Process SAM3 Predictions (New Class)
            inference_state = processor.set_image(image)
            inference_state = processor.set_text_prompt(state=inference_state, prompt=new_class_name)
            sam_results = from_sam(sam_result=inference_state)

            # Filter by confidence
            mask = sam_results.confidence > 0.3
            filtered_detections = sam_results[mask]

            for xyxy, score in zip(filtered_detections.xyxy, filtered_detections.confidence):
                # xyxy corners -> COCO [x_min, y_min, width, height]
                x_min = float(xyxy[0])
                y_min = float(xyxy[1])
                w_box = float(xyxy[2] - xyxy[0])
                h_box = float(xyxy[3] - xyxy[1])

                pending_annotations.append({
                    "id": annotation_id + len(pending_annotations),
                    "image_id": image_id_counter,
                    "category_id": new_class_id,
                    "bbox": [x_min, y_min, w_box, h_box],
                    "area": w_box * h_box,
                    "segmentation": [],
                    "iscrowd": 0,
                    "score": float(score)
                })

        except Exception as e:
            # Best effort: report the problem and move on to the next image
            print(f"Error processing {filename}: {e}")
            continue

        # Commit the staged results and advance both counters together
        coco_dataset["images"].append(image_entry)
        coco_dataset["annotations"].extend(pending_annotations)
        annotation_id += len(pending_annotations)
        image_id_counter += 1

output_path = "_annotations.coco.json"
with open(output_path, "w") as f:
    json.dump(coco_dataset, f)

print(f"\nSuccessfully created {output_path}")
print(f"Total Images: {len(coco_dataset['images'])}")
print(f"Total Annotations: {len(coco_dataset['annotations'])}")

In [None]:
import json
import glob
import os
from tqdm import tqdm
from PIL import Image
import numpy as np
import torch
import torchvision
import yaml

# --- Configuration ---
CONFIDENCE_THRESHOLD = 0.4
SCOREBOARD_CONFIDENCE = 0.5
NMS_THRESHOLD = 0.25
# ---------------------

# Ensure we have the class names (re-read so this cell does not rely on an earlier one)
yaml_path = os.path.join(dataset.location, "data.yaml")
with open(yaml_path, 'r') as f:
    # An empty file yields None from safe_load; fall back to an empty dict
    data_yaml = yaml.safe_load(f) or {}

# 'names' may be a list of strings or an {id: name} dictionary; normalise to {int id: name}
raw_names = data_yaml.get('names', [])
if isinstance(raw_names, dict):
    existing_classes = {int(k): v for k, v in raw_names.items()}
else:
    existing_classes = {i: name for i, name in enumerate(raw_names)}

# Identify Referee Class ID automatically: first class whose name contains
# "referee" (case-insensitive); -1 means no such class exists.
referee_ids = [k for k, v in existing_classes.items() if 'referee' in v.lower()]
referee_id = referee_ids[0] if referee_ids else -1
if referee_id != -1:
    print(f"Referee filtering enabled for Class ID: {referee_id} ({existing_classes[referee_id]})")
else:
    print("Warning: 'Referee' class not found. Referee filtering will be skipped.")

# The new class takes the first id after the highest existing one. For the usual
# contiguous ids 0..n-1 this equals len(existing_classes), but unlike len() it
# cannot collide with an existing id when the ids have gaps.
new_class_name = "jersey number"
new_class_id = max(existing_classes, default=-1) + 1

# COCO skeleton for this run: metadata plus empty image/annotation/category lists
coco_dataset = {
    "info": {
        "description": "Dataset with Scoreboard and Referee Filtering",
        "version": "1.4",
        "year": 2025,
        "contributor": "Agent",
        "date_created": "2025-11-21"
    },
    "licenses": [],
    "images": [],
    "annotations": [],
    "categories": []
}

# Category table: every existing class in dict order, then the new class last
coco_dataset["categories"].extend(
    {"id": int(class_id), "name": class_name, "supercategory": "none"}
    for class_id, class_name in existing_classes.items()
)
coco_dataset["categories"].append(
    {"id": new_class_id, "name": new_class_name, "supercategory": "none"}
)

def yolo_to_coco(x_center, y_center, w, h, img_w, img_h):
    """Scale a normalised YOLO box to pixels and return it as COCO ``[x_min, y_min, w, h]``.

    The centre and size are given as fractions of the image; the top-left
    corner is the scaled centre minus half the scaled size.
    """
    width_px = w * img_w
    height_px = h * img_h
    top_left = (
        (x_center * img_w) - (width_px / 2),
        (y_center * img_h) - (height_px / 2),
    )
    return [top_left[0], top_left[1], width_px, height_px]

def is_center_inside_any(bbox, target_boxes):
    """
    Report whether the midpoint of `bbox` falls within at least one target box.

    bbox: [x1, y1, x2, y2]
    target_boxes: list of [x1, y1, x2, y2]

    Box edges count as inside. Returns False when `target_boxes` is empty.
    """
    left, top, right, bottom = bbox
    mid_x = (left + right) / 2
    mid_y = (top + bottom) / 2

    return any(
        t_left <= mid_x <= t_right and t_top <= mid_y <= t_bottom
        for t_left, t_top, t_right, t_bottom in target_boxes
    )

# json.dump is used at the end of this cell, but `json` is not part of this
# cell's import list. Import it here so the cell does not fail on a fresh
# kernel after the whole inference pass has already run.
import json

annotation_id = 0
image_id_counter = 0
splits = ['train', 'valid', 'test']

print("Starting full dataset processing with DYNAMIC Scoreboard & REFEREE filtering...")

for split in splits:
    image_dir = os.path.join(dataset.location, split, "images")
    label_dir = os.path.join(dataset.location, split, "labels")

    if not os.path.exists(image_dir):
        print(f"Skipping {split}: directory not found.")
        continue

    # Sorted so that image ids are assigned in the same order on every run
    # (glob itself returns files in arbitrary order).
    image_files = sorted(glob.glob(os.path.join(image_dir, "*.jpg")))
    print(f"Processing {len(image_files)} images in {split}...")

    for img_path in tqdm(image_files):
        filename = os.path.basename(img_path)
        try:
            # Close the file handle as soon as the pixel data has been converted.
            with Image.open(img_path) as source_image:
                image = source_image.convert("RGB")
            img_w, img_h = image.size

            # Image entry
            coco_dataset["images"].append({
                "id": image_id_counter,
                "file_name": filename,
                "width": img_w,
                "height": img_h,
                "license": None,
                "date_captured": None
            })

            referee_boxes = []  # Referee boxes (xyxy) of this image, used by filter 3

            # 1. Ground Truth (YOLO)
            # Swap only the file extension; str.replace would also rewrite a
            # ".jpg" that appears earlier in the file name.
            label_path = os.path.join(label_dir, os.path.splitext(filename)[0] + ".txt")
            if os.path.exists(label_path):
                with open(label_path, "r") as f:
                    for line in f:
                        parts = line.strip().split()
                        # Only the first five fields (class, cx, cy, w, h) are read.
                        # NOTE(review): lines with more than five fields are treated
                        # the same way - confirm the labels are plain bounding boxes.
                        if len(parts) >= 5:
                            cls_id = int(parts[0])
                            bbox = yolo_to_coco(float(parts[1]), float(parts[2]), float(parts[3]), float(parts[4]), img_w, img_h)

                            # Add to dataset
                            coco_dataset["annotations"].append({
                                "id": annotation_id,
                                "image_id": image_id_counter,
                                "category_id": cls_id,
                                "bbox": bbox,
                                "area": bbox[2] * bbox[3],
                                "segmentation": [],
                                "iscrowd": 0
                            })
                            annotation_id += 1

                            # Collect Referee Boxes for filtering
                            if cls_id == referee_id:
                                # Convert COCO [x, y, w, h] to [x1, y1, x2, y2]
                                rx1, ry1 = bbox[0], bbox[1]
                                rx2, ry2 = rx1 + bbox[2], ry1 + bbox[3]
                                referee_boxes.append([rx1, ry1, rx2, ry2])

            # 2. Set Image for Inference
            inference_state = processor.set_image(image)

            # 3. Detect Scoreboard (Dynamic Filter)
            inference_state = processor.set_text_prompt(state=inference_state, prompt="scoreboard")
            sb_results = from_sam(sam_result=inference_state)

            sb_boxes = []
            if len(sb_results.xyxy) > 0:
                sb_mask = sb_results.confidence > SCOREBOARD_CONFIDENCE
                sb_detections = sb_results[sb_mask]
                sb_boxes = sb_detections.xyxy.tolist()

            # 4. Detect Jersey Numbers
            inference_state = processor.set_text_prompt(state=inference_state, prompt=new_class_name)
            jn_results = from_sam(sam_result=inference_state)

            # Filter by confidence
            mask = jn_results.confidence > CONFIDENCE_THRESHOLD
            filtered = jn_results[mask]

            # Filter 1: NMS (Overlap Removal)
            if len(filtered.xyxy) > 0:
                boxes_t = torch.from_numpy(filtered.xyxy).float()
                scores_t = torch.from_numpy(filtered.confidence).float()
                keep_indices = torchvision.ops.nms(boxes_t, scores_t, iou_threshold=NMS_THRESHOLD)
                filtered = filtered[keep_indices.numpy()]

            # Add filtered predictions to COCO
            for i, xyxy in enumerate(filtered.xyxy):
                x1, y1, x2, y2 = float(xyxy[0]), float(xyxy[1]), float(xyxy[2]), float(xyxy[3])
                w_box = x2 - x1
                h_box = y2 - y1
                bbox_xyxy = [x1, y1, x2, y2]

                # Filter 2: Dynamic Scoreboard Check
                if is_center_inside_any(bbox_xyxy, sb_boxes):
                    continue

                # Filter 3: Referee Overlap Check
                if is_center_inside_any(bbox_xyxy, referee_boxes):
                    continue

                # Add to annotations. "score" is only present on predicted boxes,
                # never on the ground-truth entries added above.
                bbox_coco = [x1, y1, w_box, h_box]
                coco_dataset["annotations"].append({
                    "id": annotation_id,
                    "image_id": image_id_counter,
                    "category_id": new_class_id,
                    "bbox": bbox_coco,
                    "area": w_box * h_box,
                    "segmentation": [],
                    "iscrowd": 0,
                    "score": float(filtered.confidence[i])
                })
                annotation_id += 1

        except Exception as e:
            # Best effort: report the failing image and carry on with the next one.
            # Entries appended before the failure stay in coco_dataset.
            print(f"Error processing {filename}: {e}")

        # The id advances for failed images too, so ids stay unique.
        image_id_counter += 1

output_path = "_annotations.coco.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(coco_dataset, f)

print(f"\nSuccessfully created {output_path}")
print(f"Total Images: {len(coco_dataset['images'])}")
print(f"Total Annotations: {len(coco_dataset['annotations'])}")

## Visualize Existing COCO Annotations

### Subtask:
Load the generated COCO annotations file and visualize a random sample of images with bounding boxes to verify the merged labels.


**Reasoning**:
Load the COCO annotations JSON file, parse it to extract image paths and annotations, and then visualize a few random samples using the supervision library to verify the correctness of the merged dataset.



In [None]:
import json
import os
import random
from PIL import Image
import supervision as sv
import numpy as np
from IPython.display import display

# Consolidated annotations file to inspect
annotation_file = "_annotations.coco.json"

if not os.path.exists(annotation_file):
    print(f"Annotation file {annotation_file} not found.")
else:
    # Parse the JSON and pull out its three top-level lists
    with open(annotation_file, "r") as f:
        coco_data = json.load(f)

    images, annotations, categories = (
        coco_data["images"],
        coco_data["annotations"],
        coco_data["categories"],
    )

    print(f"Loaded JSON with {len(images)} images, {len(annotations)} annotations, and {len(categories)} categories.")

    # Lookup table: category id -> category name
    category_map = dict((cat["id"], cat["name"]) for cat in categories)
    print("Category Mapping:", category_map)

    # Draw up to ten images at random
    selected_images = random.sample(images, min(10, len(images)))

    for img_entry in selected_images:
        file_name, image_id = img_entry["file_name"], img_entry["id"]

        # Probe each split folder in turn; the first existing file wins
        for split in ["train", "valid", "test"]:
            found_path = os.path.join(dataset.location, split, "images", file_name)
            if os.path.exists(found_path):
                break
        else:
            found_path = None

        if not found_path:
            print(f"Image file not found: {file_name}")
            continue

        image = Image.open(found_path).convert("RGB")

        # Annotations that belong to this image
        img_anns = [ann for ann in annotations if ann["image_id"] == image_id]

        if not img_anns:
            print(f"No annotations found for {file_name}")
            display(image)
            print("-" * 50)
            continue

        # COCO boxes are [x, y, w, h]; the Detections object is given xyxy corners.
        boxes = [
            [x, y, x + w, y + h]
            for x, y, w, h in (ann["bbox"] for ann in img_anns)
        ]
        class_ids = [ann["category_id"] for ann in img_anns]
        # Entries without a "score" key are shown with confidence 1.0
        scores = [ann.get("score", 1.0) for ann in img_anns]

        detections = sv.Detections(
            xyxy=np.array(boxes),
            class_id=np.array(class_ids),
            confidence=np.array(scores)
        )

        box_annotator = sv.BoxAnnotator()
        label_annotator = sv.LabelAnnotator()

        # One "<class name> <score>" caption per box
        labels = []
        for class_id, confidence in zip(class_ids, scores):
            labels.append(f"{category_map[class_id]} {confidence:.2f}")

        # Draw on a copy so `image` itself stays untouched
        annotated_image = box_annotator.annotate(scene=image.copy(), detections=detections)
        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)

        print(f"Visualizing: {file_name}")
        display(annotated_image)
        print("-" * 50)

## Upload annotations to Roboflow

In [None]:
import json
import os
import glob
from tqdm import tqdm

# Read back the COCO JSON produced by the merge step
annotation_file = "_annotations.coco.json"
with open(annotation_file, "r") as f:
    coco_data = json.load(f)

# Category id -> category name
category_map = dict((cat["id"], cat["name"]) for cat in coco_data["categories"])

# Image id -> list of its annotations, in file order
image_ann_map = {}
for ann in coco_data["annotations"]:
    image_ann_map.setdefault(ann["image_id"], []).append(ann)

# File name -> image record (a later record with the same name replaces an earlier one)
filename_to_img = {}
for img in coco_data["images"]:
    filename_to_img[img["file_name"]] = img

def create_pascal_voc_xml(filename, width, height, annotations, output_path, category_names=None):
    """Write a Pascal VOC style XML annotation file for a single image.

    Args:
        filename: Image file name written to the <filename> element.
        width: Image width written to <size>/<width>.
        height: Image height written to <size>/<height>.
        annotations: Iterable of COCO-style dicts. Each needs "category_id" and
            "bbox" as [x_min, y_min, width, height].
        output_path: Path of the XML file to create; an existing file is overwritten.
        category_names: Optional mapping of category id -> class name. When
            omitted, the module-level `category_map` is used. Ids missing from
            the mapping are written as "unknown".

    Returns:
        None. The XML is written to `output_path`.
    """
    # Local import so the function does not depend on the cell's import list.
    from xml.sax.saxutils import escape

    names = category_map if category_names is None else category_names

    xml_content = [
        "<annotation>",
        "    <folder></folder>",
        # Escape &, < and > so unusual file names cannot produce malformed XML.
        f"    <filename>{escape(str(filename))}</filename>",
        "    <size>",
        f"        <width>{width}</width>",
        f"        <height>{height}</height>",
        "        <depth>3</depth>",
        "    </size>",
    ]

    for ann in annotations:
        name = names.get(ann["category_id"], "unknown")

        # COCO bbox is [x_min, y_min, width, height]; VOC stores the two corners.
        x_min, y_min, box_w, box_h = ann["bbox"]
        xmin = x_min
        ymin = y_min
        xmax = x_min + box_w
        ymax = y_min + box_h

        xml_content.extend([
            "    <object>",
            f"        <name>{escape(str(name))}</name>",
            "        <bndbox>",
            f"            <xmin>{xmin}</xmin>",
            f"            <ymin>{ymin}</ymin>",
            f"            <xmax>{xmax}</xmax>",
            f"            <ymax>{ymax}</ymax>",
            "        </bndbox>",
            "    </object>",
        ])

    xml_content.append("</annotation>")

    # Explicit encoding: the platform default may not handle non-ASCII names.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(xml_content))

print("Starting upload process... This may take a while as we iterate through images.")

splits = ['train', 'valid', 'test']
failed_uploads = []  # file names whose upload raised, reported once at the end

for split in splits:
    image_dir = os.path.join(dataset.location, split, "images")
    if not os.path.exists(image_dir):
        continue

    image_files = glob.glob(os.path.join(image_dir, "*.jpg"))
    print(f"Uploading {len(image_files)} images from {split}...")

    for img_path in tqdm(image_files):
        filename = os.path.basename(img_path)

        # Images without an entry in the COCO file are skipped.
        img_info = filename_to_img.get(filename)
        if not img_info:
            continue

        img_id = img_info["id"]
        # An image with no annotations still gets an (object-free) XML file.
        anns = image_ann_map.get(img_id, [])

        # Create temporary XML annotation file next to the image.
        # Swap only the extension: str.replace on the whole path would also
        # rewrite a ".jpg" inside a directory name or earlier in the file name.
        xml_path = os.path.splitext(img_path)[0] + ".xml"
        create_pascal_voc_xml(filename, img_info["width"], img_info["height"], anns, xml_path)

        try:
            # Upload Image + XML to Roboflow.
            # NOTE(review): presumably this updates the existing image with the
            # new annotations - confirm against the Roboflow upload behaviour.
            project.upload(image_path=img_path, annotation_path=xml_path, split=split, num_retry_uploads=3, batch_name="SAM3_Inference_filtered")
        except Exception as e:
            print(f"Failed to upload {filename}: {e}")
            failed_uploads.append(filename)
        finally:
            # Cleanup temp file
            if os.path.exists(xml_path):
                os.remove(xml_path)

print("Upload complete.")
if failed_uploads:
    # Per-file messages are easy to miss in the progress output, so repeat the total.
    print(f"{len(failed_uploads)} upload(s) failed.")