# 01 — Bootstrap Labels & Synthesize YOLO-Seg Dataset

**Pipeline:** raw board photos → legacy OpenCV detection → geometric QA →
YOLO-seg labels (real) → synthetic composites → YOLO-seg labels (synthetic) →
final dataset with `data.yaml`

**Inputs:**
- `data/raw/` — 38 board photos (4000×3000 JPGs)
- `data/backgrounds/` — 9 background textures (desk, fabric, wood)

**Output:**
- `data/yolo-seg-board/` — complete YOLO-seg dataset ready for `yolo segment train`

## 1. Setup & Configuration

In [None]:
import cv2
import math
import random
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from pathlib import Path
from tqdm.notebook import tqdm

# Reproducibility: seed both the stdlib and the legacy NumPy global generators.
SEED = 67
random.seed(SEED)
np.random.seed(SEED)

# --- Paths ---
# Parent of the notebook's working directory.
# NOTE(review): this used to be annotated as "prototyping/", yet DATA_DIR below
# appends "prototyping/data" to it, which only resolves correctly when this is
# the directory that *contains* prototyping/ — confirm the intended cwd.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "prototyping/data"
RAW_DIR = DATA_DIR / "raw"
BG_DIR = DATA_DIR / "backgrounds"

# Output dataset (YOLO format)
DATASET_DIR = DATA_DIR / "yolo-seg-board"
DATASET_IMAGES_TRAIN = DATASET_DIR / "images" / "train"
DATASET_IMAGES_VAL = DATASET_DIR / "images" / "val"
DATASET_LABELS_TRAIN = DATASET_DIR / "labels" / "train"
DATASET_LABELS_VAL = DATASET_DIR / "labels" / "val"

# --- Config ---
RESIZE_HEIGHT = 1500  # matches parse_boggle_board() default
SYNTH_CANVAS_SIZE = (1280, 1280)  # uniform canvas for synthetic images

# Detection params (tuned values from parse_boggle_board, NOT the
# detect_boggle_board_contour defaults — these work better on our photos)
DETECTION_PARAMS = dict(
    n_top_contours_to_consider=200,
    min_board_area_threshold=0.15,
    max_board_area_threshold=0.8,
    board_contour_expansion_size=25,
    polygon_approximation_epsilon=0.05,
    binary_threshold_value=100,
)


def count_image_files(directory, suffixes=(".jpg", ".jpeg", ".png")):
    """Count files in *directory* whose suffix (case-insensitive) is in *suffixes*.

    Uses the same suffix filter as the cells that later load the raw photos and
    backgrounds, so the numbers printed here match what is actually processed.
    Returns 0 when the directory does not exist.
    """
    if not directory.is_dir():
        return 0
    return sum(1 for entry in directory.iterdir() if entry.suffix.lower() in suffixes)


print(f"Raw photos:  {RAW_DIR} ({count_image_files(RAW_DIR)} files)")
print(f"Backgrounds: {BG_DIR} ({count_image_files(BG_DIR)} files)")
print(f"Output:      {DATASET_DIR}")

## 2. Board Detection Functions (inlined from legacy)

Copied from `prototyping/legacy/board_detection.py` with broken imports removed.
Only the board-contour-detection pipeline is needed — tile detection and OCR are not used.

In [None]:
def convert_to_greyscale(img):
    """Return the single-channel greyscale rendering of a BGR image."""
    grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return grey


def apply_binary_thresholding(img, threshold=127):
    """Binarise a greyscale image: pixels above *threshold* become 255, the rest 0."""
    result = cv2.threshold(img, threshold, 255, cv2.THRESH_BINARY)
    # cv2.threshold returns (threshold_used, image); only the image is wanted.
    return result[1]


def detect_contours(
    img,
    hierarchy_algorithm=cv2.RETR_TREE,
    contour_approximation=cv2.CHAIN_APPROX_NONE,
    apply_preprocessing=False,
):
    """Run ``cv2.findContours`` on an image and return ``(contours, hierarchy)``.

    When *apply_preprocessing* is true the image is first converted to
    greyscale and binarised with the default threshold; otherwise it is passed
    to OpenCV unchanged.
    """
    source = (
        apply_binary_thresholding(convert_to_greyscale(img))
        if apply_preprocessing
        else img
    )
    contours, hierarchy = cv2.findContours(source, hierarchy_algorithm, contour_approximation)
    return contours, hierarchy


def hierarchy_to_dataframe(hierarchy):
    """Flatten a ``cv2.findContours`` hierarchy into a DataFrame of contour relationships.

    Args:
        hierarchy: The hierarchy array returned by ``cv2.findContours`` (indexed
            here as ``hierarchy[0, i] = [next_sibling, _, first_child, parent]``),
            or ``None`` when no contours were found.

    Returns:
        DataFrame with one row per visited contour and the columns
        ``contour_idx``, ``hierarchy_level`` (0 for roots), ``parent``,
        ``children`` (list of child indices or ``None``) and ``n_children``.
        Rows are in depth-first pre-order starting from each root contour.
        Only the first four nesting levels (0-3) are visited. An empty frame
        with the same columns is returned when *hierarchy* is ``None``.
    """
    columns = ["contour_idx", "hierarchy_level", "parent", "children"]
    if hierarchy is None:
        # No contours at all: return an empty, correctly-shaped frame instead
        # of failing on ``None`` indexing.
        return pd.DataFrame(columns=[*columns, "n_children"])

    nodes = hierarchy[0]
    rows = []

    def collect_children(contour_idx):
        """Follow first_child, then the next_sibling chain, to list direct children."""
        children = []
        child = nodes[contour_idx][2]
        while child != -1:
            children.append(child)
            child = nodes[child][0]
        return children

    def dfs(contour_idx, level, max_levels=4):
        # Deeper nesting levels are not needed for board detection and are skipped.
        if max_levels is not None and level >= max_levels:
            return
        parent = nodes[contour_idx][3]
        children = collect_children(contour_idx)
        rows.append([
            contour_idx,
            level,
            parent if parent != -1 else None,
            children if children else None,
        ])
        for child in children:
            dfs(child, level + 1, max_levels)

    # Root contours are the ones without a parent.
    for contour_idx in np.where(nodes[:, 3] == -1)[0]:
        dfs(contour_idx, 0)

    df = pd.DataFrame(rows, columns=columns)
    df["n_children"] = df["children"].apply(lambda x: len(x) if x else 0)
    return df.drop_duplicates(subset=["contour_idx"])


def approximate_polygon_from_contour(contour, epsilon=0.05):
    """Simplify a closed contour with ``cv2.approxPolyDP``.

    The approximation tolerance is *epsilon* times the contour's perimeter.
    """
    tolerance = epsilon * cv2.arcLength(contour, True)
    return cv2.approxPolyDP(contour, tolerance, True)


def expand_contour(input_image, contour, dilation_size=5):
    """Grow a contour outwards by dilating its filled mask and re-tracing the outline.

    The contour is rasterised onto a blank mask the size of *input_image*,
    dilated once with a ``dilation_size`` x ``dilation_size`` kernel of ones,
    and the largest external contour of the result is returned after a 2 %
    perimeter polygon approximation.
    """
    canvas = np.zeros(input_image.shape[:2], dtype=np.uint8)
    cv2.drawContours(canvas, [contour], -1, 255, thickness=-1)

    structuring_element = np.ones((dilation_size, dilation_size), np.uint8)
    grown = cv2.dilate(canvas, structuring_element, iterations=1)

    outlines, _ = cv2.findContours(grown, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    biggest = max(outlines, key=cv2.contourArea)
    tolerance = 0.02 * cv2.arcLength(biggest, True)
    return cv2.approxPolyDP(biggest, tolerance, True)


def detect_boggle_board_contour(
    input_img,
    n_top_contours_to_consider=20,
    min_board_area_threshold=0.2,
    max_board_area_threshold=0.9,
    board_contour_expansion_size=5,
    polygon_approximation_epsilon=0.05,
    binary_threshold_value=127,
):
    """Detect the Boggle board contour in an image. Returns a polygon contour.

    Pipeline: greyscale -> binary threshold -> contour tree -> keep the
    ``n_top_contours_to_consider`` contours with the most direct children ->
    keep those whose polygon approximation has 4 vertices and whose area lies
    within the given fraction-of-image thresholds -> take the largest ->
    dilate it by ``board_contour_expansion_size`` -> approximate again.

    Args:
        input_img: BGR image.
        n_top_contours_to_consider: How many contours (ranked by child count)
            are examined.
        min_board_area_threshold: Minimum polygon area as a fraction of the image.
        max_board_area_threshold: Maximum polygon area as a fraction of the image.
        board_contour_expansion_size: Dilation kernel size used to grow the
            selected contour.
        polygon_approximation_epsilon: ``approxPolyDP`` tolerance as a fraction
            of the contour perimeter.
        binary_threshold_value: Greyscale threshold used for binarisation.

    Raises Exception if no suitable quadrilateral contour is found.
    """
    img_area = input_img.shape[0] * input_img.shape[1]
    greyscale_img = convert_to_greyscale(input_img)
    thresholded_img = apply_binary_thresholding(greyscale_img, threshold=binary_threshold_value)
    contours, hierarchy = detect_contours(thresholded_img)
    if hierarchy is None:
        # findContours yields no hierarchy when the thresholded image has no
        # contours; fail with a clear message instead of a TypeError downstream.
        raise Exception("There were no contours detected in the image.")

    hierarchy_df = hierarchy_to_dataframe(hierarchy)
    hierarchy_df["contour"] = hierarchy_df["contour_idx"].apply(
        lambda idx: contours[idx] if idx >= 0 else None
    )

    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of hierarchy_df (avoids SettingWithCopyWarning).
    candidates = (
        hierarchy_df.sort_values("n_children", ascending=False)
        .head(n_top_contours_to_consider)
        .copy()
    )
    candidates["approx_polygon"] = candidates["contour"].apply(
        lambda contour: approximate_polygon_from_contour(
            contour, epsilon=polygon_approximation_epsilon
        )
    )
    candidates["approx_polygon_n_sides"] = candidates["approx_polygon"].apply(len)
    candidates = candidates.query("approx_polygon_n_sides == 4").copy()

    if len(candidates) == 0:
        raise Exception("There were no square contours detected in the image.")

    candidates["approx_polygon_area"] = candidates["approx_polygon"].apply(cv2.contourArea)
    candidates["approx_polygon_area_pct_of_image"] = (
        candidates["approx_polygon_area"] / img_area
    )

    candidates = (
        candidates.query(
            "approx_polygon_area_pct_of_image >= @min_board_area_threshold"
        )
        .query("approx_polygon_area_pct_of_image <= @max_board_area_threshold")
        .copy()
    )
    if len(candidates) == 0:
        raise Exception(
            "No contours were found that were within the specified area thresholds."
        )

    # The board is taken to be the largest remaining quadrilateral.
    largest = candidates.sort_values(
        "approx_polygon_area_pct_of_image", ascending=False
    ).iloc[0]

    expanded_contour = expand_contour(
        input_img,
        largest["contour"],
        dilation_size=board_contour_expansion_size,
    )

    return approximate_polygon_from_contour(
        expanded_contour, epsilon=polygon_approximation_epsilon
    )


def resize_image(image, desired_height):
    """Scale an image to *desired_height* pixels tall, keeping its aspect ratio.

    The target width is the truncated product of the height and the source
    width/height ratio; resampling uses ``cv2.INTER_AREA``.
    """
    src_height, src_width = image.shape[:2]
    width_per_height = src_width / src_height
    target_size = (int(desired_height * width_per_height), desired_height)
    return cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)


print("Legacy functions loaded.")

## 3. Detect Board Contours in All Raw Photos

Runs the legacy OpenCV pipeline on each photo. Catches exceptions for outright failures
and applies geometric sanity checks to flag suspicious detections.

In [None]:
def geometric_sanity_checks(contour, img_shape):
    """Validate that a detected contour looks like a roughly square board.

    Checks, in order: vertex count is 4, min-area-rect side ratio is at least
    0.7, the contour is convex, its area is 10-85 % of the image, and every
    corner angle lies within 60-120 degrees.

    Returns a list of issue strings (empty if all checks pass).
    """
    problems = []
    vertices = contour.reshape(-1, 2).astype(float)
    n_vertices = len(vertices)
    height, width = img_shape[:2]

    # 1. Vertex count: a board outline should be a quadrilateral.
    if n_vertices != 4:
        problems.append(f"n_points={n_vertices}")

    # 2. Squareness: short side / long side of the minimum-area rectangle.
    _, rect_dims, _ = cv2.minAreaRect(contour)
    short_side, long_side = min(rect_dims), max(rect_dims)
    if long_side > 0:
        side_ratio = short_side / long_side
        if side_ratio < 0.7:
            problems.append(f"aspect_ratio={side_ratio:.2f}")

    # 3. Convexity.
    if not cv2.isContourConvex(contour):
        problems.append("non_convex")

    # 4. Fraction of the image covered by the contour.
    coverage = cv2.contourArea(contour) / (height * width)
    if not (0.10 <= coverage <= 0.85):
        problems.append(f"area_pct={coverage:.2f}")

    # 5. Corner angles, measured between the edges to the two neighbouring vertices.
    if n_vertices >= 3:
        for i, corner in enumerate(vertices):
            to_prev = vertices[i - 1] - corner
            to_next = vertices[(i + 1) % n_vertices] - corner
            scale = np.linalg.norm(to_prev) * np.linalg.norm(to_next)
            if scale < 1e-8:
                # Coincident vertices: the angle is undefined.
                problems.append(f"degenerate_corner_{i}")
                continue
            cosine = np.clip(np.dot(to_prev, to_next) / scale, -1, 1)
            angle = np.degrees(np.arccos(cosine))
            if not (60 <= angle <= 120):
                problems.append(f"corner_{i}_angle={angle:.1f}")

    return problems

In [None]:
# Collect every raw photo with a supported image extension (case-insensitive),
# in sorted order so the run is deterministic.
raw_files = sorted(f for f in RAW_DIR.iterdir() if f.suffix.lower() in (".jpg", ".jpeg", ".png"))
print(f"Found {len(raw_files)} raw images")

# One record per photo; later cells read "status", "contour" and "resized_img".
results = []

for fpath in tqdm(raw_files, desc="Detecting boards"):
    # status is one of: "ok", "load_failed", "detection_failed", "suspicious".
    record = {
        "filename": fpath.name,
        "filepath": fpath,
        "status": "ok",
        "contour": None,
        "resized_img": None,
        "issues": [],
    }

    # cv2.imread signals an unreadable file by returning None rather than raising.
    img = cv2.imread(str(fpath))
    if img is None:
        record["status"] = "load_failed"
        record["issues"].append("cv2.imread returned None")
        results.append(record)
        continue

    # Detection runs on the resized image, so contour coordinates are in
    # resized-image pixels. The resized image is kept in memory for later cells.
    img = resize_image(img, RESIZE_HEIGHT)
    record["resized_img"] = img

    # Any exception from the detector is recorded as a failure for this photo
    # instead of aborting the whole run.
    try:
        contour = detect_boggle_board_contour(img, **DETECTION_PARAMS)
    except Exception as e:
        record["status"] = "detection_failed"
        record["issues"].append(str(e))
        results.append(record)
        continue

    record["contour"] = contour

    # Geometric sanity checks
    issues = geometric_sanity_checks(contour, img.shape)
    if issues:
        record["status"] = "suspicious"
        record["issues"] = issues

    results.append(record)

# Summary
# Compact per-photo table (no image data) for display.
results_df = pd.DataFrame([{
    "filename": r["filename"],
    "status": r["status"],
    "n_points": len(r["contour"].reshape(-1, 2)) if r["contour"] is not None else None,
    "issues": "; ".join(r["issues"]) if r["issues"] else "",
} for r in results])

print("\n--- Detection Summary ---")
print(results_df["status"].value_counts().to_string())
print()
# display() is not imported in this notebook — presumably the IPython/Jupyter builtin.
display(results_df)

## 4. Visual QA — Review All Detections

Grid of all images with detected contours overlaid.
- **Green** = ok
- **Orange** = suspicious (geometric check flagged)
- **Red title** = detection or load failed (there is no contour to overlay)

Review the grid and add any bad filenames to `EXCLUDE_FILES` in the next cell.

In [None]:
# BGR colours used when drawing onto the image with OpenCV.
STATUS_COLORS = {
    "ok": (0, 255, 0),
    "suspicious": (0, 165, 255),
    "detection_failed": (0, 0, 255),
    "load_failed": (0, 0, 255),
}

# Matplotlib colour names used for the subplot titles.
TITLE_COLORS = {
    "ok": "green",
    "suspicious": "orange",
    "detection_failed": "red",
    "load_failed": "red",
}

n_images = len(results)
n_cols = 6
# At least one row, so the figure can still be created when there are no results.
n_rows = max(1, math.ceil(n_images / n_cols))

# squeeze=False always yields a 2-D axes array, which flatten() handles uniformly.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(3.5 * n_cols, 3.5 * n_rows), squeeze=False
)
axes = axes.flatten()

for idx, record in enumerate(results):
    ax = axes[idx]
    img = record["resized_img"]
    status = record["status"]

    if img is None:
        # The photo could not be read at all: show a placeholder tile.
        ax.text(0.5, 0.5, "LOAD\nFAILED", ha="center", va="center", fontsize=12, color="red")
        ax.set_facecolor("black")
    else:
        vis = img.copy()
        color = STATUS_COLORS[status]
        if record["contour"] is not None:
            cv2.drawContours(vis, [record["contour"]], -1, color, thickness=8)
        else:
            # Detection failed: there is no contour to draw, so frame the whole
            # photo in the status colour (the red border promised by the legend).
            frame_h, frame_w = vis.shape[:2]
            cv2.rectangle(vis, (0, 0), (frame_w - 1, frame_h - 1), color, thickness=40)
        vis_rgb = cv2.cvtColor(vis, cv2.COLOR_BGR2RGB)
        # Downsample for display
        display_h = 300
        display_w = int(display_h * vis_rgb.shape[1] / vis_rgb.shape[0])
        vis_small = cv2.resize(vis_rgb, (display_w, display_h))
        ax.imshow(vis_small)

    title = f"{record['filename'][:20]}\n[{status}]"
    ax.set_title(title, fontsize=7, color=TITLE_COLORS[status])
    ax.axis("off")

# Hide the unused cells of the grid.
for idx in range(n_images, len(axes)):
    axes[idx].set_visible(False)

plt.tight_layout()
plt.show()

### Exclusion List

Review the grid above. Add filenames of any images with bad detections to the
`EXCLUDE_FILES` set below. Load failures, detection failures, and suspicious
detections are excluded automatically. After editing, re-run the cell below and
then everything from Section 5 onward.

In [None]:
# --- USER EDIT: Add filenames to exclude from the dataset ---
# Anything that failed to load, failed detection, or was flagged as suspicious
# is dropped automatically; list here only "ok" images that look wrong in the grid.
EXCLUDE_FILES = {
    "20230809_201820(1).jpg",
    "20230809_201820.jpg",
}

# Statuses that disqualify an image without manual review.
_AUTO_EXCLUDE_STATUSES = ("detection_failed", "load_failed", "suspicious")

auto_excluded = set()
for r in results:
    if r["status"] in _AUTO_EXCLUDE_STATUSES:
        auto_excluded.add(r["filename"])
all_excluded = EXCLUDE_FILES.union(auto_excluded)

# Keep only records that survived both exclusion lists and carry a contour.
good_results = []
for r in results:
    if r["contour"] is None or r["filename"] in all_excluded:
        continue
    good_results.append(r)

print(f"Total images:     {len(results)}")
print(f"Auto-excluded:    {len(auto_excluded)} (detection failures + suspicious)")
print(f"Manual-excluded:  {len(EXCLUDE_FILES)}")
print(f"Good detections:  {len(good_results)}")

if auto_excluded:
    print("\nAuto-excluded files:")
    for f in sorted(auto_excluded):
        print(f"  - {f}")

## 5. Generate YOLO-Seg Labels for Real Photos

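For each good detection, normalize the polygon coordinates to `[0, 1]` and emit
a YOLO-seg format label line. This section only defines the label helpers and
collects the image/contour pairs in memory; the resized images and label files
are written to disk in Section 7.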

In [None]:
def contour_to_yolo_seg_label(contour, img_height, img_width, class_id=0):
    """Convert an OpenCV contour to a YOLO-seg format label line.

    YOLO-seg format: <class_id> <x1> <y1> <x2> <y2> ... (normalized 0-1)

    Args:
        contour: Array of polygon points in pixel coordinates; any shape that
            reshapes to (N, 2) as (x, y).
        img_height: Image height in pixels (divides y).
        img_width: Image width in pixels (divides x).
        class_id: Class index written at the start of the line.

    Returns:
        The label line without a trailing newline. Coordinates are formatted
        with six decimals and clamped to ``[0, 1]``, so polygons that poke
        slightly outside the image (possible for synthetic composites) still
        produce a label within the normalized range.
    """
    pts = contour.reshape(-1, 2).astype(float)
    normalized = np.clip(pts / np.array([img_width, img_height], dtype=float), 0.0, 1.0)
    # ravel() is row-major, giving x1 y1 x2 y2 ... as the format requires.
    coords = " ".join(f"{value:.6f}" for value in normalized.ravel())
    return f"{class_id} {coords}"


def write_yolo_label(label_path, contour, img_h, img_w):
    """Write a one-line YOLO-seg label file (class 0) for *contour* to *label_path*."""
    line = contour_to_yolo_seg_label(contour, img_h, img_w, class_id=0)
    label_path.write_text(f"{line}\n")


# In-memory image/contour pairs for the real photos; they are written to disk
# during dataset assembly. Each entry: {"stem": ..., "img": ..., "contour": ...}
real_pairs = [
    {
        "stem": Path(r["filename"]).stem,
        "img": r["resized_img"],
        "contour": r["contour"],
    }
    for r in good_results
]

print(f"Prepared {len(real_pairs)} real image/label pairs")

## 6. Synthetic Composite Generation

Cut out each detected board, paste onto varied backgrounds with randomized
augmentation. Labels are computed from the known placement coordinates — they're free.

In [None]:
def extract_board_cutout(img, contour):
    """Crop the board out of *img* and attach an alpha mask built from *contour*.

    Returns:
        bgra: The bounding-box crop as BGRA; alpha is 255 inside the polygon
            and 0 outside it.
        local_contour: The polygon as an Nx2 float array, translated so the
            crop's top-left corner is the origin.
    """
    x0, y0, box_w, box_h = cv2.boundingRect(contour)
    rows, cols = slice(y0, y0 + box_h), slice(x0, x0 + box_w)

    # Rasterise the polygon at full image size, then crop it like the pixels.
    full_mask = np.zeros(img.shape[:2], dtype=np.uint8)
    cv2.fillPoly(full_mask, [contour.reshape(-1, 1, 2)], 255)

    bgra = cv2.cvtColor(img[rows, cols], cv2.COLOR_BGR2BGRA)
    bgra[:, :, 3] = full_mask[rows, cols]

    origin = np.array([x0, y0], dtype=float)
    local_contour = contour.reshape(-1, 2).astype(float) - origin
    return bgra, local_contour


def _cutout_record(result):
    """Build the cutout entry (filename, BGRA crop, local polygon) for one detection."""
    bgra, local_contour = extract_board_cutout(result["resized_img"], result["contour"])
    return {
        "filename": result["filename"],
        "bgra": bgra,
        "local_contour": local_contour,
    }


# One cutout per good detection, in the same order as good_results.
cutouts = [_cutout_record(r) for r in tqdm(good_results, desc="Extracting cutouts")]

print(f"Extracted {len(cutouts)} board cutouts")

In [None]:
# Load every readable background texture, in sorted filename order.
bg_files = sorted(
    path for path in BG_DIR.iterdir() if path.suffix.lower() in (".png", ".jpg", ".jpeg")
)
_loaded_backgrounds = ((bf.name, cv2.imread(str(bf))) for bf in bg_files)
# cv2.imread returns None for unreadable files; those are dropped.
backgrounds = [
    {"filename": name, "img": pixels}
    for name, pixels in _loaded_backgrounds
    if pixels is not None
]

print(f"Loaded {len(backgrounds)} backgrounds:")
for b in backgrounds:
    h, w = b["img"].shape[:2]
    print(f"  {b['filename']}: {w}x{h}")

In [None]:
def generate_synthetic_composite(
    cutout_bgra,
    local_contour,
    bg_img,
    canvas_size=(1280, 1280),
    scale_range=(0.40, 0.80),
    rotation_range=(-15.0, 15.0),
    brightness_range=(0.7, 1.3),
    contrast_range=(0.8, 1.2),
    noise_sigma_max=15.0,
    perspective_strength=0.05,
    rng=None,
):
    """Generate one synthetic composite image with computed polygon label.

    The cutout is scaled, rotated, optionally perspective-warped, pasted at a
    random position on the (stretched) background, and the whole composite is
    then photometrically augmented. The polygon is pushed through the same
    geometric transforms, so it stays aligned with the pasted pixels.

    Args:
        cutout_bgra: Board cutout with alpha channel (BGRA).
        local_contour: Nx2 polygon in the cutout's coordinate system.
        bg_img: BGR background image; resized to the canvas without keeping
            its aspect ratio.
        canvas_size: Output size as ``(height, width)``.
        scale_range: Range for the cutout's longest edge, as a fraction of the
            canvas's shorter side.
        rotation_range: Rotation range in degrees.
        brightness_range: Range of the brightness factor; applied as an
            additive offset of ``(factor - 1) * 127``.
        contrast_range: Range of the multiplicative contrast factor.
        noise_sigma_max: Upper bound for the Gaussian noise sigma.
        perspective_strength: Maximum corner jitter of the perspective warp, as
            a fraction of the rotated cutout's width/height. ``0`` disables it.
        rng: ``numpy.random.Generator``; a fresh unseeded one is created if
            omitted.

    Returns:
        composite: BGR image of shape canvas_size.
        polygon_pts: Nx2 float array of polygon points in canvas coordinates.
    """
    if rng is None:
        rng = np.random.default_rng()

    canvas_h, canvas_w = canvas_size

    # 1. Resize background to fill canvas
    bg_resized = cv2.resize(bg_img, (canvas_w, canvas_h))

    # 2. Scale the board cutout
    scale = rng.uniform(*scale_range)
    angle_deg = rng.uniform(*rotation_range)

    cutout_h, cutout_w = cutout_bgra.shape[:2]
    longest_edge = max(cutout_h, cutout_w)
    target_longest = int(min(canvas_h, canvas_w) * scale)
    resize_factor = target_longest / longest_edge
    new_w = int(cutout_w * resize_factor)
    new_h = int(cutout_h * resize_factor)

    resized_cutout = cv2.resize(cutout_bgra, (new_w, new_h), interpolation=cv2.INTER_AREA)
    resized_contour = (local_contour * resize_factor).astype(np.float32)

    # 3. Rotate about the cutout centre, enlarging the canvas to the rotated
    #    bounding box so no corner is cut off.
    center = (new_w / 2, new_h / 2)
    M_rot = cv2.getRotationMatrix2D(center, angle_deg, 1.0)

    cos_a = abs(M_rot[0, 0])
    sin_a = abs(M_rot[0, 1])
    rot_w = int(new_h * sin_a + new_w * cos_a)
    rot_h = int(new_h * cos_a + new_w * sin_a)
    M_rot[0, 2] += (rot_w - new_w) / 2
    M_rot[1, 2] += (rot_h - new_h) / 2

    rotated_cutout = cv2.warpAffine(
        resized_cutout, M_rot, (rot_w, rot_h),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=(0, 0, 0, 0),
    )

    # Transform contour through rotation
    ones = np.ones((len(resized_contour), 1), dtype=np.float32)
    pts_hom = np.hstack([resized_contour, ones])
    rotated_contour = (M_rot @ pts_hom.T).T

    # 4. Optional mild perspective distortion
    if perspective_strength > 0:
        src_pts = np.array(
            [[0, 0], [rot_w, 0], [rot_w, rot_h], [0, rot_h]], dtype=np.float32
        )
        jitter = (
            rng.uniform(-perspective_strength, perspective_strength, size=(4, 2))
            * np.array([rot_w, rot_h])
        )
        dst_pts = src_pts + jitter
        # Corners may be jittered outwards. Shift the warped quad so it starts
        # at the origin and size the output to its bounding box; otherwise the
        # warp would crop board pixels that the label polygon still covers.
        dst_pts -= dst_pts.min(axis=0)
        warp_w = max(1, int(np.ceil(dst_pts[:, 0].max())))
        warp_h = max(1, int(np.ceil(dst_pts[:, 1].max())))
        dst_pts = dst_pts.astype(np.float32)

        M_persp = cv2.getPerspectiveTransform(src_pts, dst_pts)
        rotated_cutout = cv2.warpPerspective(
            rotated_cutout, M_persp, (warp_w, warp_h),
            borderMode=cv2.BORDER_CONSTANT,
            borderValue=(0, 0, 0, 0),
        )
        pts_h = np.hstack(
            [rotated_contour, np.ones((len(rotated_contour), 1))]
        ).astype(np.float32)
        transformed = (M_persp @ pts_h.T).T
        # Homogeneous divide back to 2-D coordinates.
        rotated_contour = transformed[:, :2] / transformed[:, 2:3]
        rot_w, rot_h = warp_w, warp_h

    # 5. Random placement within canvas
    max_x = canvas_w - rot_w
    max_y = canvas_h - rot_h
    if max_x < 0 or max_y < 0:
        # Cutout too big — shrink to fit
        shrink = min(canvas_w / rot_w, canvas_h / rot_h) * 0.9
        rot_w_new = int(rot_w * shrink)
        rot_h_new = int(rot_h * shrink)
        rotated_cutout = cv2.resize(rotated_cutout, (rot_w_new, rot_h_new))
        rotated_contour = rotated_contour * shrink
        rot_w, rot_h = rot_w_new, rot_h_new
        max_x = canvas_w - rot_w
        max_y = canvas_h - rot_h

    # The upper bound of rng.integers is exclusive; +1 makes the far-edge
    # placement reachable and keeps the range valid when max_x/max_y is 0.
    offset_x = int(rng.integers(0, max_x + 1))
    offset_y = int(rng.integers(0, max_y + 1))

    # 6. Alpha-blend onto background
    composite = bg_resized.copy()
    alpha = rotated_cutout[:, :, 3:4] / 255.0
    rgb = rotated_cutout[:, :, :3]

    roi = composite[offset_y : offset_y + rot_h, offset_x : offset_x + rot_w]
    blended = (rgb * alpha + roi * (1 - alpha)).astype(np.uint8)
    composite[offset_y : offset_y + rot_h, offset_x : offset_x + rot_w] = blended

    canvas_contour = rotated_contour + np.array([offset_x, offset_y])

    # 7. Photometric augmentation (applied to entire composite)
    brightness = rng.uniform(*brightness_range)
    contrast = rng.uniform(*contrast_range)
    composite = np.clip(
        composite.astype(np.float32) * contrast + (brightness - 1.0) * 127, 0, 255
    ).astype(np.uint8)

    # Gaussian noise
    sigma = rng.uniform(0, noise_sigma_max)
    if sigma > 0:
        noise = rng.normal(0, sigma, composite.shape).astype(np.float32)
        composite = np.clip(composite.astype(np.float32) + noise, 0, 255).astype(np.uint8)

    # Slight per-channel color shift
    for c in range(3):
        shift = rng.uniform(-10, 10)
        composite[:, :, c] = np.clip(
            composite[:, :, c].astype(np.float32) + shift, 0, 255
        ).astype(np.uint8)

    return composite, canvas_contour

In [None]:
# --- Generate synthetic composites ---
# Phase 1 renders every (cutout, background) combination once.
# Phase 2 tops the set up with randomly chosen pairings.

SYNTH_TARGET_COUNT = len(cutouts) * len(backgrounds)  # one per pair
SYNTH_EXTRA_COUNT = 50  # extra images on top of base combos

synthetic_records = []
synth_idx = 0


def _render_synthetic_record(index, cutout, bg, rng):
    """Render one composite and wrap it in the record layout used downstream."""
    composite, canvas_contour = generate_synthetic_composite(
        cutout["bgra"],
        cutout["local_contour"],
        bg["img"],
        canvas_size=SYNTH_CANVAS_SIZE,
        rng=rng,
    )
    return {
        "synth_idx": index,
        "source_file": cutout["filename"],
        "bg_file": bg["filename"],
        "composite": composite,
        "contour": canvas_contour,
    }


# Phase 1: exhaustive pairings. Each image gets its own generator seeded from
# SEED + index, so any single composite can be reproduced independently.
for cutout in tqdm(cutouts, desc="Generating synthetics"):
    for bg in backgrounds:
        rng = np.random.default_rng(SEED + synth_idx)
        synthetic_records.append(_render_synthetic_record(synth_idx, cutout, bg, rng))
        synth_idx += 1

# Phase 2: the per-image generator also picks which cutout and background to use
# (cutout first, then background) before rendering.
_final_count = SYNTH_TARGET_COUNT + SYNTH_EXTRA_COUNT
while len(synthetic_records) < _final_count:
    rng = np.random.default_rng(SEED + synth_idx)
    cutout = cutouts[rng.integers(len(cutouts))]
    bg = backgrounds[rng.integers(len(backgrounds))]
    synthetic_records.append(_render_synthetic_record(synth_idx, cutout, bg, rng))
    synth_idx += 1

print(f"Generated {len(synthetic_records)} synthetic composites")

In [None]:
# Visual QA: draw the label polygon over a random sample of composites.
n_sample = min(18, len(synthetic_records))
sample_indices = random.sample(range(len(synthetic_records)), n_sample)

n_cols_synth = 6
n_rows_synth = math.ceil(n_sample / n_cols_synth)

fig, axes = plt.subplots(n_rows_synth, n_cols_synth, figsize=(4.5 * n_cols_synth, 4.5 * n_rows_synth))
axes = axes.flatten()

for ax, si in zip(axes, sample_indices):
    rec = synthetic_records[si]
    overlay = rec["composite"].copy()
    # cv2.polylines needs integer points shaped (N, 1, 2).
    outline = rec["contour"].reshape(-1, 1, 2).astype(np.int32)
    cv2.polylines(overlay, [outline], isClosed=True, color=(0, 255, 0), thickness=3)
    ax.imshow(cv2.cvtColor(overlay, cv2.COLOR_BGR2RGB))
    ax.set_title(f"synth_{rec['synth_idx']:04d} | {rec['bg_file'][:12]}", fontsize=8)
    ax.axis("off")

# Blank out grid cells that received no sample.
for spare_ax in axes[n_sample:]:
    spare_ax.set_visible(False)

plt.suptitle("Synthetic Composite QA Sample", fontsize=14)
plt.tight_layout()
plt.show()

## 7. Assemble Final YOLO-Seg Dataset

Create the YOLO directory structure, perform train/val split, write all images
and labels, and generate `data.yaml`.

In [None]:
# Clean and create directory structure
# NOTE: this deletes any previously generated dataset at DATASET_DIR, so every
# run starts from an empty directory tree.
if DATASET_DIR.exists():
    shutil.rmtree(DATASET_DIR)

for d in [DATASET_IMAGES_TRAIN, DATASET_IMAGES_VAL, DATASET_LABELS_TRAIN, DATASET_LABELS_VAL]:
    d.mkdir(parents=True, exist_ok=True)

# --- Train/Val Split ---
# Both shuffles draw from the global `random` state seeded in the setup cell,
# so the resulting split depends on every earlier `random` call in the session
# (including the QA sampling cell) — re-run from the top for a reproducible split.

# Real images: 80/20 random split
real_indices = list(range(len(real_pairs)))
random.shuffle(real_indices)
# max(1, ...) guarantees at least one validation image.
# NOTE(review): with zero real pairs this still reports 1 val / -1 train.
n_real_val = max(1, int(len(real_indices) * 0.2))
real_val_indices = set(real_indices[:n_real_val])

# Synthetic images: 80/20 random split
# NOTE(review): this split is independent of the real split and of each
# composite's "source_file", so composites of a board can land in train while
# the same board (real or synthetic) is in val — possible train/val leakage;
# consider splitting by source board.
synth_indices = list(range(len(synthetic_records)))
random.shuffle(synth_indices)
n_synth_val = max(1, int(len(synth_indices) * 0.2))
synth_val_indices = set(synth_indices[:n_synth_val])

n_real_train = len(real_pairs) - n_real_val
n_synth_train = len(synthetic_records) - n_synth_val

print(f"Real:      {n_real_train} train / {n_real_val} val")
print(f"Synthetic: {n_synth_train} train / {n_synth_val} val")
print(f"Total:     {n_real_train + n_synth_train} train / {n_real_val + n_synth_val} val")

In [None]:
# Write real images and labels
# cv2.imwrite reports failure by returning False instead of raising, so its
# result is checked explicitly; otherwise a label could be written for an image
# that never reached the disk.
for idx, pair in enumerate(tqdm(real_pairs, desc="Writing real images")):
    split = "val" if idx in real_val_indices else "train"
    img_dst = DATASET_DIR / "images" / split / f"real_{pair['stem']}.jpg"
    lbl_dst = DATASET_DIR / "labels" / split / f"real_{pair['stem']}.txt"

    h, w = pair["img"].shape[:2]
    if not cv2.imwrite(str(img_dst), pair["img"]):
        raise OSError(f"cv2.imwrite failed for {img_dst}")
    write_yolo_label(lbl_dst, pair["contour"], h, w)

# Write synthetic images and labels
for idx, rec in enumerate(tqdm(synthetic_records, desc="Writing synthetic images")):
    split = "val" if idx in synth_val_indices else "train"
    name = f"synth_{rec['synth_idx']:04d}"
    img_dst = DATASET_DIR / "images" / split / f"{name}.jpg"
    lbl_dst = DATASET_DIR / "labels" / split / f"{name}.txt"

    h, w = rec["composite"].shape[:2]
    if not cv2.imwrite(str(img_dst), rec["composite"]):
        raise OSError(f"cv2.imwrite failed for {img_dst}")
    write_yolo_label(lbl_dst, rec["contour"], h, w)

print("Done writing images and labels.")

In [None]:
# Write data.yaml: dataset root, image directories relative to it, and the
# class-id -> name mapping.
yaml_path = DATASET_DIR / "data.yaml"
data_yaml = dict(
    path=str(DATASET_DIR.resolve()),
    train="images/train",
    val="images/val",
    names={0: "board"},
)

# sort_keys=False keeps the keys in the order given above.
with yaml_path.open("w") as f:
    yaml.dump(data_yaml, f, default_flow_style=False, sort_keys=False)

print(f"Wrote {yaml_path}")
print()
print(yaml_path.read_text())

## 8. Final Verification

Spot-check random images from the written dataset by loading them back and rendering
the labels. Verify the data.yaml is parseable by Ultralytics.

In [None]:
def load_and_visualize_yolo_label(img_path, label_path, ax):
    """Load an image and its YOLO-seg label, draw the polygon(s) on *ax*.

    Every non-empty line of the label file is treated as one object
    (``<class_id> x1 y1 x2 y2 ...`` with normalized coordinates); each polygon
    is outlined in green and filled with a semi-transparent overlay.

    Args:
        img_path: Path of the image to read.
        label_path: Path of the matching YOLO-seg label file.
        ax: Matplotlib axes to render into.

    Raises:
        FileNotFoundError: If the image cannot be read.
        ValueError: If a label line has an odd number of coordinates or a
            non-numeric value.
    """
    img = cv2.imread(str(img_path))
    if img is None:
        # cv2.imread returns None instead of raising for unreadable files.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    h, w = img.shape[:2]

    polygons = []
    for line in label_path.read_text().splitlines():
        parts = line.split()
        if len(parts) < 2:
            continue  # blank line or a class id without coordinates
        coords = np.array([float(value) for value in parts[1:]])
        # Denormalize: x by width, y by height, then truncate to whole pixels.
        pts = (coords.reshape(-1, 2) * (w, h)).astype(np.int32)
        polygons.append(pts.reshape(-1, 1, 2))

    vis = img.copy()
    if polygons:
        cv2.polylines(vis, polygons, isClosed=True, color=(0, 255, 0), thickness=3)

        # Also fill with semi-transparent overlay
        overlay = vis.copy()
        cv2.fillPoly(overlay, polygons, (0, 255, 0))
        vis = cv2.addWeighted(overlay, 0.15, vis, 0.85, 0)

    ax.imshow(cv2.cvtColor(vis, cv2.COLOR_BGR2RGB))
    ax.set_title(img_path.name, fontsize=7)
    ax.axis("off")


# Spot-check up to 3 written images from train and up to 3 from val.
train_imgs = sorted(DATASET_IMAGES_TRAIN.glob("*.jpg"))
val_imgs = sorted(DATASET_IMAGES_VAL.glob("*.jpg"))

sample = []
for pool in (train_imgs, val_imgs):
    sample += random.sample(pool, min(3, len(pool)))

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
for ax, img_path in zip(axes.flatten(), sample):
    # images/<split>/<stem>.jpg  ->  labels/<split>/<stem>.txt
    split_dir = img_path.parent
    dataset_root = split_dir.parent.parent
    label_path = dataset_root / "labels" / split_dir.name / (img_path.stem + ".txt")
    load_and_visualize_yolo_label(img_path, label_path, ax)

plt.suptitle("Final Dataset Spot Check (green = label polygon)", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Dataset statistics and integrity check
def _n_matching(directory, pattern):
    """Number of entries in *directory* matching the glob *pattern*."""
    return sum(1 for _ in directory.glob(pattern))


train_count = _n_matching(DATASET_IMAGES_TRAIN, "*.jpg")
val_count = _n_matching(DATASET_IMAGES_VAL, "*.jpg")
train_labels = _n_matching(DATASET_LABELS_TRAIN, "*.txt")
val_labels = _n_matching(DATASET_LABELS_VAL, "*.txt")

print("=== YOLO-Seg Dataset Summary ===")
print(f"Train images:  {train_count}")
print(f"Train labels:  {train_labels}")
print(f"Val images:    {val_count}")
print(f"Val labels:    {val_labels}")
print(f"Total:         {train_count + val_count}")
print(f"data.yaml:     {DATASET_DIR / 'data.yaml'}")
print()

# Counts must agree and both splits must be non-empty.
assert train_count == train_labels, f"Train image/label mismatch: {train_count} vs {train_labels}"
assert val_count == val_labels, f"Val image/label mismatch: {val_count} vs {val_labels}"
assert train_count > 0, "No training images!"
assert val_count > 0, "No validation images!"

# Equal counts are not enough: the file stems must pair up one-to-one.
for split in ("train", "val"):
    images_dir = DATASET_DIR / "images" / split
    labels_dir = DATASET_DIR / "labels" / split
    img_stems = {p.stem for p in images_dir.glob("*.jpg")}
    lbl_stems = {p.stem for p in labels_dir.glob("*.txt")}
    assert img_stems == lbl_stems, f"{split}: image/label stem mismatch"

print("All integrity checks passed.")

In [None]:
# Verify Ultralytics can parse the dataset
# ultralytics is imported in this cell only; none of the cells above need it.
from ultralytics.data.utils import check_det_dataset

# Hand the generated data.yaml to Ultralytics' dataset checker. The result is
# indexed below by "train", "val" and "names".
# NOTE(review): presumably this raises if the YAML or the referenced image
# directories are invalid — confirm against the Ultralytics documentation.
dataset_info = check_det_dataset(str(DATASET_DIR / "data.yaml"))
print("Ultralytics dataset check passed!")
print(f"  Train: {dataset_info['train']}")
print(f"  Val:   {dataset_info['val']}")
print(f"  Names: {dataset_info['names']}")