In [None]:
# --- Import Required Libraries ---

# File and data handling
import os
import json
import random
import shutil
from collections import defaultdict
from pathlib import Path

# Data processing and visualization
from sklearn.model_selection import train_test_split
import yaml

# Deep learning
import torch
from ultralytics import YOLO
from tqdm.notebook import tqdm  

In [None]:
# All project paths are derived from the kernel's working directory, so the
# notebook refuses to continue unless it is started from 'YOLO_Ultralytics'.
NOTEBOOK_DIR = os.getcwd()

if NOTEBOOK_DIR.endswith("YOLO_Ultralytics"):
    # BASE_DIR is the directory three levels above the notebook folder.
    BASE_DIR = os.path.join(NOTEBOOK_DIR, os.pardir, os.pardir, os.pardir)
else:
    raise ValueError(
        "Please set the working directory to 'YOLO_Ultralytics' folder. Currently it is set to: " + NOTEBOOK_DIR
    )

In [None]:
# --- 1. Configuration Section ---

# Every location is expressed relative to BASE_DIR.
# Inputs: annotation JSONs and the Manga109 page images.
JSON_DIR = os.path.join(BASE_DIR, 'data', 'MangaSegmentation/jsons_processed')
IMAGE_ROOT_DIR = os.path.join(BASE_DIR, 'data', 'Manga109_released_2023_12_07', 'images')
# Outputs: the generated YOLO dataset (and its dataset.yaml) plus the model results folder.
DATASET_DIR = os.path.join(BASE_DIR, 'data', 'YOLO_data')
yaml_path = Path(DATASET_DIR) / 'dataset.yaml'
PROJECT_DIR = os.path.join(BASE_DIR, 'models', 'bubble-detection', 'YOLOv8n_Training_Results')

# Fail fast if an input directory is missing; otherwise show a few of its entries
print("\nValidating directories...")
for path in (JSON_DIR, IMAGE_ROOT_DIR):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Directory not found: {path}")
    print(f"Found directory: {path}")
    # Preview at most five entries as a quick sanity check
    contents = os.listdir(path)[:5]
    print(f"Sample contents: {contents}")

# Build the images/<split> and labels/<split> output folders (no-op if they already exist)
print("\nCreating dataset directories...")
for split in ('train', 'val'):
    for subdir in ('images', 'labels'):
        dir_path = os.path.join(DATASET_DIR, f'{subdir}/{split}')
        os.makedirs(dir_path, exist_ok=True)
        print(f"Created: {dir_path}")

# Category to extract from the source annotations.
# The ID is compared against each annotation's 'category_id' during data preparation;
# the name is used in log messages and as the class name in dataset.yaml.
TARGET_CATEGORY_ID = 5  # Fixed category ID for balloon
TARGET_CATEGORY_NAME = "balloon"  # Fixed category name

print("Target Category Configuration:")
# f-strings are required here; without the prefix the placeholders are printed literally
print(f"Category ID: {TARGET_CATEGORY_ID}")
print(f"Category Name: {TARGET_CATEGORY_NAME}")

In [None]:
# --- 2. Data Preparation ---

print("\n--- 2. Preparing Data from Processed JSONs ---")

def prepare_manga_balloon_data(json_dir, image_root):
    """
    Build one record per image that contains at least one target-category polygon.

    Every ``*.json`` file in ``json_dir`` is read (in sorted filename order) and
    its ``images`` / ``annotations`` lists are merged into lookups keyed by
    image id. Annotations whose ``category_id`` equals ``TARGET_CATEGORY_ID``
    and whose ``segmentation`` is non-empty are kept and relabelled as class 0.
    This logic is adapted from 'train_v3 copy.ipynb'.

    Args:
        json_dir: Directory holding the pre-processed annotation JSON files.
        image_root: Directory that each image's ``file_name`` is joined onto.

    Returns:
        list[dict]: One dict per retained image with keys ``file_name``
        (joined path), ``image_id``, ``height``, ``width`` and ``annotations``
        (list of ``{"segmentation": ..., "category_id": 0}``).
    """
    all_images = {}
    all_annotations = defaultdict(list)

    print("Loading and parsing PRE-PROCESSED JSON files...")
    # os.listdir returns entries in arbitrary order; sorting makes the record
    # order (and anything seeded downstream) reproducible across machines.
    json_files = sorted(f for f in os.listdir(json_dir) if f.endswith('.json'))

    for json_file in tqdm(json_files, desc="Processing JSONs"):
        # Explicit UTF-8 so parsing does not depend on the platform default encoding
        with open(os.path.join(json_dir, json_file), 'r', encoding='utf-8') as f:
            data = json.load(f)

        # NOTE(review): image ids are assumed to be unique across ALL JSON files;
        # if ids restart per file, later files overwrite earlier images and their
        # annotations get merged -- confirm against the dataset.
        for img_info in data.get('images', []):
            all_images[img_info['id']] = img_info
        for ann_info in data.get('annotations', []):
            all_annotations[ann_info['image_id']].append(ann_info)

    print(f"Loaded data for {len(all_images)} total images.")

    dataset_records = []
    for img_id, img_info in all_images.items():
        # Keep only target-category annotations that actually carry segmentation data
        balloon_annotations = [
            {
                "segmentation": ann['segmentation'],
                "category_id": 0,  # All balloons will be class 0
            }
            for ann in all_annotations.get(img_id, [])
            if ann.get('category_id') == TARGET_CATEGORY_ID and ann.get('segmentation')
        ]

        # Only add images that contain at least one balloon
        if not balloon_annotations:
            continue

        dataset_records.append({
            "file_name": os.path.join(image_root, img_info['file_name']),
            "image_id": img_id,
            "height": img_info['height'],
            "width": img_info['width'],
            "annotations": balloon_annotations,
        })

    print(f"Data preparation complete. Found {len(dataset_records)} images containing '{TARGET_CATEGORY_NAME}'.")
    return dataset_records

# Run the data preparation
all_data = prepare_manga_balloon_data(JSON_DIR, IMAGE_ROOT_DIR)

In [None]:
# --- 3. Split and Prepare YOLO Dataset ---

# `all_data` is already built by the data-preparation cell; it is not rebuilt
# here because that would parse every JSON file a second time.

# --- Group data by manga title ---
print("\nGrouping data by manga series for a robust train/val split...")
grouped_data = defaultdict(list)
for record in all_data:
    # The parent folder of each image path is used as the series name
    manga_name = Path(record['file_name']).parts[-2]
    grouped_data[manga_name].append(record)
print(f"Found {len(grouped_data)} unique manga series.")

# Split manga titles to prevent data leakage.
# Titles are sorted first: train_test_split shuffles by position, so the fixed
# random_state only yields the same split if the input order is stable.
manga_titles = sorted(grouped_data)
train_titles, val_titles = train_test_split(manga_titles, test_size=0.2, random_state=42)
print(f"Splitting into {len(train_titles)} training series and {len(val_titles)} validation series.")

# Reconstruct train/val lists based on the title split
train_data = [record for title in train_titles for record in grouped_data[title]]
val_data = [record for title in val_titles for record in grouped_data[title]]
# Seeded shuffles so pages from one series are not contiguous in either list
random.Random(42).shuffle(train_data)
random.Random(42).shuffle(val_data)
print(f"Final training set size: {len(train_data)} images")
print(f"Final validation set size: {len(val_data)} images")

In [None]:
def _normalize_polygon(poly, img_width, img_height):
    """
    Convert one flat pixel-space polygon ``[x1, y1, x2, y2, ...]`` to normalized form.

    Returns:
        A flat list of floats, each clamped to [0, 1], or ``None`` when ``poly``
        is not a list/tuple, has an odd number of values, or has fewer than
        three points (six values).
    """
    if not isinstance(poly, (list, tuple)):
        return None
    if len(poly) < 6 or len(poly) % 2 != 0:
        return None

    normalized = []
    for x, y in zip(poly[0::2], poly[1::2]):
        # Clamp so vertices slightly outside the image do not produce out-of-range labels
        normalized.append(min(max(x / img_width, 0.0), 1.0))
        normalized.append(min(max(y / img_height, 0.0), 1.0))
    return normalized


def process_dataset_split_segmentation(data_split, split_type):
    """
    Processes a dataset to create a YOLO INSTANCE SEGMENTATION dataset.

    For every record whose image exists, the image is copied to
    ``DATASET_DIR/images/<split_type>/`` and a label file with one line per
    polygon (``0 x1 y1 x2 y2 ...``, normalized coordinates) is written to
    ``DATASET_DIR/labels/<split_type>/``.

    Args:
        data_split: Iterable of records with keys ``file_name``, ``height``,
            ``width`` and optionally ``annotations`` (each with a ``segmentation``
            list of flat ``[x1, y1, x2, y2, ...]`` polygons).
        split_type: Sub-folder name under ``images/`` and ``labels/``
            (the notebook passes ``'train'`` or ``'val'``).

    Returns:
        tuple[int, int]: ``(images actually written, polygons written)``.
        Records skipped because of a missing image or invalid size are not counted.
    """
    processed_images = 0
    total_annotations = 0
    skipped_polygons = 0

    for record in tqdm(data_split, desc=f"Processing {split_type} split"):
        original_img_path = record['file_name']
        img_height = record['height']
        img_width = record['width']

        # Check if image exists
        if not os.path.exists(original_img_path):
            print(f"Warning: Image not found at {original_img_path}. Skipping.")
            continue

        # A zero/missing dimension would make normalization divide by zero
        if not img_width or not img_height:
            print(f"Warning: Invalid image size {img_width}x{img_height} for {original_img_path}. Skipping.")
            continue

        # Create unique identifiers for images to avoid duplicate names:
        # page file names repeat across series, so prefix with the parent folder name
        manga_title = Path(original_img_path).parts[-2]
        img_stem = Path(original_img_path).stem
        img_identifier = f"{manga_title}_{img_stem}"

        # 1. Copy images to folder train/val
        # NOTE(review): the destination is always named '.jpg' regardless of the
        # source extension -- assumes all source images are JPEG; confirm.
        dest_img_path = os.path.join(DATASET_DIR, f'images/{split_type}', f"{img_identifier}.jpg")
        shutil.copy2(original_img_path, dest_img_path)

        # 2. Create corresponding .txt label file
        label_path = os.path.join(DATASET_DIR, f'labels/{split_type}', f"{img_identifier}.txt")

        # 3. Write normalized polygon coordinates to label file
        with open(label_path, 'w', encoding='utf-8') as f:
            for ann in record.get('annotations', []):
                # Each 'ann' corresponds to a balloon (an object)
                segmentation = ann.get('segmentation')
                if not segmentation:
                    continue
                # Anything that is not a list of polygons cannot be written as a polygon line
                if not isinstance(segmentation, (list, tuple)):
                    skipped_polygons += 1
                    continue

                # Each object can have multiple polygons (complex cases);
                # every polygon is written as its own label line
                for poly in segmentation:
                    normalized_poly = _normalize_polygon(poly, img_width, img_height)
                    if normalized_poly is None:
                        skipped_polygons += 1
                        continue

                    # Write to file in format: class_id x1 y1 x2 y2 ...
                    # Class ID is always 0 because we only have 1 class "balloon"
                    f.write(f"0 {' '.join(map(str, normalized_poly))}\n")
                    total_annotations += 1

        processed_images += 1

    if skipped_polygons:
        print(f"Warning: Skipped {skipped_polygons} malformed polygon(s) in {split_type} split.")

    return processed_images, total_annotations

In [None]:
# --- Convert both splits to YOLO segmentation format ---
print("Processing training split...")
train_images_count, train_annotations_count = process_dataset_split_segmentation(train_data, 'train')
print("Processing validation split...")
val_images_count, val_annotations_count = process_dataset_split_segmentation(val_data, 'val')

# --- Report the counts returned by the conversion ---
print(
    "Dataset created successfully:",
    f"Training images: {train_images_count}",
    f"Training annotations (polygons): {train_annotations_count}",
    f"Validation images: {val_images_count}",
    f"Validation annotations (polygons): {val_annotations_count}",
    sep="\n",
)

# --- Cross-check against what is actually on disk ---
final_train_images = len(os.listdir(os.path.join(DATASET_DIR, 'images/train')))
final_val_images = len(os.listdir(os.path.join(DATASET_DIR, 'images/val')))

print(
    "Final verification from disk:",
    f"Total training images in folder: {final_train_images}",
    f"Total validation images in folder: {final_val_images}",
    sep="\n",
)

# The folder counts must equal the size of each split; any difference is only reported
if final_train_images != len(train_data) or final_val_images != len(val_data):
    print("\nVerification WARNING: Mismatch in image counts.")
else:
    print("\nVerification successful: All images were copied correctly.")

In [None]:
# Create YAML Configuration
print("\n--- 4. Creating dataset.yaml Configuration File ---")

# 'train' and 'val' are given relative to 'path'; 'names' maps class index -> class name
dataset_config = {
    'path': os.path.abspath(DATASET_DIR),
    'train': 'images/train',
    'val': 'images/val',
    'names': {
        0: TARGET_CATEGORY_NAME
    }
}

# Serialize once so the text printed below is exactly what is written to disk
# (a second yaml.dump with default options would sort the keys differently).
yaml_text = yaml.dump(dataset_config, default_flow_style=False, sort_keys=False)

with open(yaml_path, 'w', encoding='utf-8') as f:
    f.write(yaml_text)

print(f"YAML configuration saved to: {yaml_path}")
print("\nYAML Content:")
print(yaml_text)