### System Setup

In [7]:
# Libraries

import json
import os
import re
from PIL import Image, ImageDraw

In [8]:
# Variables

# Base directory
source_base_dir = "../yuvi"  # Base directory for source data

# JSON file name
json_file_name = "_annotations.coco.json"

### Prepare Data

In [9]:
# Function to process a single directory
def process_directory(source_dir):
    json_path = os.path.join(source_dir, json_file_name)
    output_json_path = json_path  # Save the updated JSON in the same directory

    # Read the original COCO JSON file
    with open(json_path, "r") as f:
        coco_data = json.load(f)

    # Get list of images in the source directory (long names)
    image_files = {img for img in os.listdir(source_dir) if img.lower().endswith(('.jpg', '.jpeg', '.png'))}

    # Create a mapping dictionary long names -> short names
    name_mapping = {}

    for img_name in image_files:
        # Extract the main identifier from the short name (example: "image_00003")
        base_name_match = re.match(r"(image_\d+)", img_name)
        if base_name_match:
            base_name = base_name_match.group(1) + ".jpg"  # Desired short format
            name_mapping[img_name] = base_name

    # Rename files in the source directory
    for old_name, new_name in name_mapping.items():
        old_path = os.path.join(source_dir, old_name)
        new_path = os.path.join(source_dir, new_name)
        os.rename(old_path, new_path)

    # Filter images and update names in the JSON
    filtered_images = []
    for img in coco_data["images"]:
        if img["file_name"] in name_mapping:
            img["file_name"] = name_mapping[img["file_name"]]  # Replace with the short name
            filtered_images.append(img)

    # Get the IDs of the selected images
    selected_image_ids = {img["id"] for img in filtered_images}

    # Filter annotations related to the selected images
    filtered_annotations = [ann for ann in coco_data["annotations"] if ann["image_id"] in selected_image_ids]

    # Create the new JSON with corrected names
    filtered_coco = {
        "images": filtered_images,
        "annotations": filtered_annotations,
        "categories": coco_data["categories"]  # Keep original categories
    }

    # Save the new JSON in the same directory
    with open(output_json_path, "w") as f:
        json.dump(filtered_coco, f, indent=4)

    print(f"✅ Files renamed and new JSON saved at: {output_json_path}")

In [10]:
# Process all subdirectories in the source base directory
for subdir in os.listdir(source_base_dir):
    source_dir = os.path.join(source_base_dir, subdir)
    if os.path.isdir(source_dir):
        process_directory(source_dir)

✅ Files renamed and new JSON saved at: ../yuvi/train/_annotations.coco.json


### Process Images

In [11]:
# Function to crop and save images
def crop_and_save(image_path, bbox, output_path):
    image = Image.open(image_path)
    left, upper, width, height = bbox
    right = left + width
    lower = upper + height
    cropped_image = image.crop((left, upper, right, lower))
    cropped_image.save(output_path)

# Function to draw bounding boxes on the image and save it
def draw_bounding_boxes(image_path, bboxes, output_path):
    image = Image.open(image_path)
    draw = ImageDraw.Draw(image)
    for bbox in bboxes:
        left, upper, width, height = bbox
        right = left + width
        lower = upper + height
        draw.rectangle([left, upper, right, lower], outline="red", width=2)
    image.save(output_path)

# Function to process a single directory
def process_directory(source_dir):
    json_path = os.path.join(source_dir, json_file_name)
    processed_dir = os.path.join(source_dir, 'processed')
    os.makedirs(processed_dir, exist_ok=True)

    # Load annotations from the JSON file
    with open(json_path, 'r') as f:
        data = json.load(f)

    # Create a dictionary to map image_id to file_name
    image_id_to_filename = {image['id']: image['file_name'] for image in data['images']}

    # Dictionary to group bounding boxes by image_id
    image_bboxes = {}

    # Iterate over annotations and group bounding boxes by image_id
    for annotation in data['annotations']:
        image_id = annotation['image_id']
        bbox = annotation['bbox']
        if image_id not in image_bboxes:
            image_bboxes[image_id] = []
        image_bboxes[image_id].append(bbox)

    # Iterate over images and process them
    for image_id, bboxes in image_bboxes.items():
        image_filename = image_id_to_filename[image_id]
        image_path = os.path.join(source_dir, image_filename)
        
        # Save the image with bounding boxes drawn
        output_path_with_boxes = os.path.join(processed_dir, f"{os.path.splitext(image_filename)[0]}_with_boxes.jpg")
        draw_bounding_boxes(image_path, bboxes, output_path_with_boxes)
        
        # Crop and save each bounding box individually
        for i, bbox in enumerate(bboxes):
            output_path = os.path.join(processed_dir, f"{os.path.splitext(image_filename)[0]}_crop_{i}.jpg")
            crop_and_save(image_path, bbox, output_path)
        
        print(f"Processed {image_filename} with {len(bboxes)} bounding boxes")

In [12]:
# Process all subdirectories in the source base directory
for subdir in os.listdir(source_base_dir):
    source_dir = os.path.join(source_base_dir, subdir)
    if os.path.isdir(source_dir):
        process_directory(source_dir)

Processed image_02115.jpg with 1 bounding boxes
Processed image_00983.jpg with 1 bounding boxes
Processed image_02814.jpg with 4 bounding boxes
Processed image_00700.jpg with 1 bounding boxes
Processed image_01282.jpg with 134 bounding boxes
Processed image_07910.jpg with 1 bounding boxes
Processed image_04030.jpg with 20 bounding boxes
Processed image_06102.jpg with 3 bounding boxes
Processed image_07774.jpg with 5 bounding boxes
Processed image_07847.jpg with 30 bounding boxes
Processed image_01063.jpg with 1 bounding boxes
Processed image_04951.jpg with 6 bounding boxes
Processed image_01912.jpg with 1 bounding boxes
Processed image_04915.jpg with 7 bounding boxes
Processed image_00052.jpg with 9 bounding boxes
Processed image_04326.jpg with 5 bounding boxes
Processed image_03545.jpg with 7 bounding boxes
Processed image_00805.jpg with 1 bounding boxes
Processed image_03531.jpg with 39 bounding boxes
Processed image_07905.jpg with 1 bounding boxes
Processed image_06693.jpg with 2 bo

ValueError: cannot write empty image as JPEG