
# COCO → Pascal VOC Converter (Read Image Size from Actual Files)

This notebook converts an **MS COCO** annotation JSON (e.g., `instances_train.json`) into **Pascal VOC** XML files.
It **reads image width/height from the actual image files** (via PIL), not from the JSON.

**Features**
- Per-image VOC XML in `Annotations/`
- Optional copy of images into `JPEGImages/`
- Optional generation of `ImageSets/Main/<set-name>.txt`
- Optional category filtering: keep only selected class names
- Clamps bboxes to image bounds; discards invalid/empty boxes
- Maps COCO `iscrowd=1` → VOC `difficult=1`

> Tip: If an image file is missing or cannot be opened to read its size, that image is **skipped**: no XML is written for it and it is left out of the image set list.


In [None]:

from pathlib import Path
import json
import shutil
import xml.etree.ElementTree as ET
from PIL import Image

# ==== User config (example) ====
# The paths below are placeholders; point them at your own dataset before running.

# COCO annotation JSON to convert (its "images", "annotations" and "categories" lists are read).
COCO_JSON = Path("/path/to/annotations/instances_train.json")
# Directory holding the actual image files; each image is opened from here to read its width/height.
IMAGES_DIR = Path("/path/to/images")
# Output root; will contain Annotations/ and, depending on the flags below, JPEGImages/ and ImageSets/Main/.
OUT_DIR    = Path("/path/to/output_voc")
# Base name of the image set list: ImageSets/Main/<SET_NAME>.txt
SET_NAME   = "train"
# When True, write the image set list (one file stem per converted image).
GEN_IMAGE_SET = True
# When True, copy each converted image into OUT_DIR/JPEGImages.
COPY_IMAGES   = True
# Category names to keep, e.g., {"person","car","dog"}; None = keep all categories.
CATEGORIES    = None

# ==== End user config ====


In [None]:

def ensure_dir(p: Path):
    """Create directory ``p`` together with any missing parents.

    Does nothing if the directory already exists.
    """
    mkdir_options = {"parents": True, "exist_ok": True}
    p.mkdir(**mkdir_options)

def clamp(v, lo, hi):
    """Limit ``v`` to the closed range ``[lo, hi]``.

    The upper bound is applied first and the lower bound second, so if the
    bounds are inverted (``lo > hi``) the result is ``lo``.
    """
    capped = v if v < hi else hi
    return capped if capped > lo else lo

def indent_xml(elem: ET.Element, level: int = 0):
    """Pretty-print ``elem`` in place by rewriting whitespace-only text/tail.

    Each nesting level is indented by two spaces. Text or tail values that
    contain non-whitespace characters are left untouched.

    Args:
        elem: Element whose subtree is re-indented (modified in place).
        level: Nesting depth of ``elem``; 0 for the root, whose own tail is
            never modified.
    """
    pad = "\n" + "  " * level
    children = list(elem)

    if children:
        # Text before the first child: newline + indentation one level deeper.
        if not (elem.text and elem.text.strip()):
            elem.text = pad + "  "

        for child in children:
            indent_xml(child, level + 1)

        # The last child's tail precedes this element's closing tag, so it
        # is pulled back to this element's own indentation.
        last = children[-1]
        if not (last.tail and last.tail.strip()):
            last.tail = pad

    if level and not (elem.tail and elem.tail.strip()):
        elem.tail = pad


In [None]:

def coco_to_voc_xml(
    img_info: dict,
    objs: list,
    cat_id_to_name: dict,
    src_images_dir: Path,
    dst_images_dir: Path | None = None,
) -> ET.Element | None:
    """Build a Pascal VOC XML Element for a given image and its annotations.
    
    - Reads width/height from the **actual image** using PIL.
    - If the image file cannot be opened, returns None (caller may skip).
    - If dst_images_dir is provided, it's used only to fill the 'path' and 'folder' metadata,
      and to know where a copied image would live; reading size always uses src_images_dir.
    """
    file_name = img_info.get("file_name")
    src_img_path = src_images_dir / file_name

    try:
        with Image.open(src_img_path) as im:
            width, height = im.size  # (W,H)
    except Exception as e:
        print(f"[WARN] Cannot open image for size: {src_img_path} ({e}). Skipping.")
        return None

    # XML root
    annotation = ET.Element("annotation")

    # folder & filename & path
    folder_name = (dst_images_dir or src_images_dir).name
    ET.SubElement(annotation, "folder").text = folder_name
    ET.SubElement(annotation, "filename").text = file_name
    ET.SubElement(annotation, "path").text = str(((dst_images_dir or src_images_dir) / file_name).resolve())

    # source
    source = ET.SubElement(annotation, "source")
    ET.SubElement(source, "database").text = "Unknown"

    # size
    size = ET.SubElement(annotation, "size")
    ET.SubElement(size, "width").text = str(int(width))
    ET.SubElement(size, "height").text = str(int(height))
    ET.SubElement(size, "depth").text = "3"

    ET.SubElement(annotation, "segmented").text = "0"

    # objects
    for ann in objs:
        cat_id = ann["category_id"]
        name = cat_id_to_name.get(cat_id, str(cat_id))

        bbox = ann.get("bbox", [0,0,0,0])
        if not bbox or len(bbox) != 4:
            continue
        x, y, w, h = bbox
        if w <= 0 or h <= 0:
            continue

        xmin = int(round(x))
        ymin = int(round(y))
        xmax = int(round(x + w))
        ymax = int(round(y + h))

        xmin = clamp(xmin, 1, width)
        ymin = clamp(ymin, 1, height)
        xmax = clamp(xmax, 1, width)
        ymax = clamp(ymax, 1, height)

        if xmax <= xmin or ymax <= ymin:
            continue

        obj = ET.SubElement(annotation, "object")
        ET.SubElement(obj, "name").text = name
        ET.SubElement(obj, "pose").text = "Unspecified"
        difficult = 1 if int(ann.get("iscrowd", 0)) == 1 else 0
        ET.SubElement(obj, "truncated").text = "0"
        ET.SubElement(obj, "difficult").text = str(difficult)

        bndbox = ET.SubElement(obj, "bndbox")
        ET.SubElement(bndbox, "xmin").text = str(xmin)
        ET.SubElement(bndbox, "ymin").text = str(ymin)
        ET.SubElement(bndbox, "xmax").text = str(xmax)
        ET.SubElement(bndbox, "ymax").text = str(ymax)

    return annotation


In [None]:

def convert_coco_to_voc(
    coco_json_path: Path,
    images_dir: Path,
    out_dir: Path,
    set_name: str = "all",
    gen_image_set: bool = False,
    copy_images: bool = False,
    categories_keep: set | None = None,
):
    """Convert a COCO annotation JSON into per-image Pascal VOC XML files.

    For every entry in the JSON's ``"images"`` list, one XML file named after
    the image's file stem is written to ``out_dir/Annotations``. Images whose
    record has no file name, or whose file cannot be opened to read its size,
    are skipped with a warning. An image with no (remaining) annotations still
    gets an XML file, just without ``object`` entries.

    Args:
        coco_json_path: COCO annotation file (read as UTF-8 JSON).
        images_dir: Directory containing the image files.
        out_dir: Output root for ``Annotations/``, ``JPEGImages/`` and
            ``ImageSets/Main/``.
        set_name: Base name of the image set list file.
        gen_image_set: If true, write ``ImageSets/Main/<set_name>.txt`` with
            one stem per converted image (only when at least one was converted).
        copy_images: If true, copy each converted image into
            ``out_dir/JPEGImages`` under its base file name.
        categories_keep: If not ``None``, only annotations whose category name
            is in this collection are kept.

    Raises:
        KeyError: If an image, annotation or category record lacks a required
            key (``"id"``, ``"image_id"``, ``"category_id"``, ``"name"``).
    """
    data = json.loads(coco_json_path.read_text(encoding="utf-8"))
    images = data.get("images", [])
    annotations = data.get("annotations", [])
    categories = data.get("categories", [])

    img_id_to_info = {im["id"]: im for im in images}
    cat_id_to_name = {c["id"]: c["name"] for c in categories}

    # Index annotations by image_id
    img_to_anns = {}
    for ann in annotations:
        img_to_anns.setdefault(ann["image_id"], []).append(ann)

    # Prepare output dirs
    ann_dir = out_dir / "Annotations"
    jpg_dir = out_dir / "JPEGImages"
    set_dir = out_dir / "ImageSets" / "Main"
    ensure_dir(ann_dir)
    if copy_images:
        ensure_dir(jpg_dir)
    if gen_image_set:
        ensure_dir(set_dir)

    kept_stems = []
    total = len(img_id_to_info)
    for i, (img_id, info) in enumerate(img_id_to_info.items(), start=1):
        file_name = info.get("file_name")
        if not file_name:
            # Path(None) would raise TypeError and abort the whole conversion.
            print(f"[WARN] Image id {img_id!r} has no file_name. Skipping.")
            continue
        stem = Path(file_name).stem
        src_img_path = images_dir / file_name

        anns = img_to_anns.get(img_id, [])
        if categories_keep is not None:
            anns = [a for a in anns if cat_id_to_name.get(a["category_id"], "") in categories_keep]

        xml_root = coco_to_voc_xml(
            info,
            anns,
            cat_id_to_name,
            src_images_dir=images_dir,
            dst_images_dir=(jpg_dir if copy_images else images_dir),
        )
        if xml_root is None:
            # cannot read image size → skip
            continue

        indent_xml(xml_root)
        tree = ET.ElementTree(xml_root)
        tree.write(ann_dir / f"{stem}.xml", encoding="utf-8", xml_declaration=True)

        if copy_images:
            # jpg_dir was created before the loop, so no per-image mkdir is needed.
            dst = jpg_dir / Path(file_name).name
            try:
                shutil.copy2(src_img_path, dst)
            except shutil.SameFileError:
                # Source and destination are the same file (e.g. images_dir is
                # already out_dir/JPEGImages): the image is in place, nothing to copy.
                pass
            except FileNotFoundError:
                print(f"[WARN] Missing source image (copy skipped): {src_img_path}")

        kept_stems.append(stem)

        if i % 100 == 0 or i == total:
            print(f"[{i}/{total}] Wrote {stem}.xml")

    if gen_image_set and kept_stems:
        list_path = set_dir / f"{set_name}.txt"
        list_path.write_text("\n".join(kept_stems), encoding="utf-8")
        print(f"Wrote image set list: {list_path} ({len(kept_stems)} ids)")

    print("Done.")



## Run Conversion
Adjust the **User config** cell above, then execute the cell below.


In [None]:

# Prepare category filter (if any).
# - None keeps every category.
# - A bare string is treated as ONE class name; set("person") would split it
#   into single characters and silently filter out every annotation.
# - Any other iterable of names (set, list, tuple, ...) becomes a set.
if CATEGORIES is None:
    cats = None
elif isinstance(CATEGORIES, str):
    cats = {CATEGORIES}
else:
    cats = set(CATEGORIES)

convert_coco_to_voc(
    coco_json_path=COCO_JSON,
    images_dir=IMAGES_DIR,
    out_dir=OUT_DIR,
    set_name=SET_NAME,
    gen_image_set=GEN_IMAGE_SET,
    copy_images=COPY_IMAGES,
    categories_keep=cats
)
