### 1. Set Up the Environment

In [None]:
!pip install torch torchvision transformers

### 2. Split the dataset into train/val/test folders

In [None]:
import os
import shutil
import random

# Where the raw Manga109 page images live, and where the split copy goes.
image_root = '../Manga109/images/'
output_root = '../Manga109/dataset_split/'

# Fractions of each title used for training and validation; whatever is
# left over after these two becomes the test set.
train_ratio = 0.7
val_ratio = 0.15

# One top-level folder per subset underneath output_root.
train_dir, val_dir, test_dir = (
    os.path.join(output_root, subset) for subset in ('train', 'val', 'test')
)

# Create the subset folders up front; existing folders are left untouched.
for split_dir in (train_dir, val_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)

def split_dataset(image_root, train_ratio, val_ratio, seed=42):
    """Copy each manga title's pages into train/val/test sub-folders.

    Every sub-directory of ``image_root`` is treated as one title. Its
    ``.jpg`` files are shuffled and partitioned: the first ``train_ratio``
    share goes to train, the next ``val_ratio`` share to val, and the rest
    to test. Files are copied (the source folder is left intact) into the
    module-level ``train_dir`` / ``val_dir`` / ``test_dir`` folders, each
    under a sub-folder named after the title.

    Args:
        image_root: Directory holding one folder of images per title.
        train_ratio: Fraction of a title's images assigned to train.
        val_ratio: Fraction of a title's images assigned to val.
        seed: Seed for the per-title shuffle. With a fixed seed, re-running
            the split reproduces the same partition, so files from an
            earlier run are overwritten in place instead of ending up in a
            different subset. Pass ``None`` for a fresh random split on
            every call.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    # Sorted so titles are processed (and logged) in a stable order.
    for manga_title in sorted(os.listdir(image_root)):
        manga_path = os.path.join(image_root, manga_title)

        # Only directories are titles; skip stray files in image_root.
        if not os.path.isdir(manga_path):
            continue

        # Sort first so the shuffle starts from the same order on every run.
        image_files = sorted(f for f in os.listdir(manga_path) if f.endswith('.jpg'))

        # A private generator seeded per title keeps each title's split
        # independent of directory listing order and of the global RNG state.
        if seed is None:
            rng = random.Random()
        else:
            rng = random.Random(f"{seed}:{manga_title}")
        rng.shuffle(image_files)

        # Counts are truncated, so any rounding remainder goes to test.
        total_images = len(image_files)
        train_count = int(total_images * train_ratio)
        val_count = int(total_images * val_ratio)

        train_files = image_files[:train_count]
        val_files = image_files[train_count:train_count + val_count]
        test_files = image_files[train_count + val_count:]

        manga_train_dir = os.path.join(train_dir, manga_title)
        manga_val_dir = os.path.join(val_dir, manga_title)
        manga_test_dir = os.path.join(test_dir, manga_title)

        os.makedirs(manga_train_dir, exist_ok=True)
        os.makedirs(manga_val_dir, exist_ok=True)
        os.makedirs(manga_test_dir, exist_ok=True)

        move_files(manga_path, manga_train_dir, train_files)
        move_files(manga_path, manga_val_dir, val_files)
        move_files(manga_path, manga_test_dir, test_files)

        # Report the real list sizes rather than the requested counts.
        print(f"Processed {manga_title}: {len(train_files)} train, {len(val_files)} val, {len(test_files)} test files.")

def move_files(src_folder, dest_folder, files):
    """Copy the named files from ``src_folder`` into ``dest_folder``.

    Despite the name, the originals are not removed: each file is
    duplicated with ``shutil.copyfile`` under the same file name.

    Args:
        src_folder: Directory that currently contains the files.
        dest_folder: Existing directory that receives the copies.
        files: Iterable of file names relative to ``src_folder``.
    """
    for name in files:
        shutil.copyfile(
            os.path.join(src_folder, name),
            os.path.join(dest_folder, name),
        )

# Run the split over every title found under image_root.
split_dataset(
    image_root=image_root,
    train_ratio=train_ratio,
    val_ratio=val_ratio,
)


### 3. Convert Manga109 annotations to COCO format for DETR usage

In [4]:
import xml.etree.ElementTree as ET
import os

def convert_bbox_to_coco_format(x1, y1, x2, y2):
    """Turn corner coordinates into a COCO-style ``[x, y, width, height]`` box.

    Args:
        x1: Left edge (xmin).
        y1: Top edge (ymin).
        x2: Right edge (xmax).
        y2: Bottom edge (ymax).

    Returns:
        A list ``[x1, y1, x2 - x1, y2 - y1]``.
    """
    return [x1, y1, x2 - x1, y2 - y1]

def convert_manga109_annotations_to_coco_format(xml_path):
    """Read one Manga109 annotation XML and return COCO-style records.

    Every child element of every ``<page>`` becomes one annotation holding
    the page index, the element's ``id``, its tag as the category name, the
    matching category id, and an ``[x, y, width, height]`` box built from
    the element's ``xmin``/``ymin``/``xmax``/``ymax`` attributes.

    Args:
        xml_path: Path to the annotation XML file to parse.

    Returns:
        A dict with two keys: ``"annotations"`` (list of records as
        described above) and ``"categories"`` (the fixed face/body/text/frame
        table with ids 0-3).

    Raises:
        ValueError: If a page child has a tag outside the four categories.
        KeyError: If a page child lacks ``id`` or a box attribute.
    """
    # Position in this list doubles as the category id.
    objects = ['face', 'body', 'text', 'frame']

    book = ET.parse(xml_path).getroot()

    annotations = [
        {
            "page_index": page.get('index'),
            "object_id": element.attrib['id'],
            "category": element.tag,
            "category_id": objects.index(element.tag),
            "bbox": convert_bbox_to_coco_format(
                *(int(element.attrib[key]) for key in ('xmin', 'ymin', 'xmax', 'ymax'))
            ),
        }
        for page in book.findall(".//page")
        for element in page
    ]

    return {
        "annotations": annotations,
        "categories": [
            {"name": label, "id": idx} for idx, label in enumerate(objects)
        ],
    }


# Convert the annotations of a single title as a worked example.
manga109_annotation_file = '../Manga109/annotations/AisazuNihaIrarenai.xml'

parsed_xml_coco_annotation = convert_manga109_annotations_to_coco_format(
    xml_path=manga109_annotation_file,
)

In [5]:
import json

# Serialize the converted annotations with 4-space indentation and print
# them so the structure can be inspected in the cell output.
json_string = json.dumps(parsed_xml_coco_annotation, indent=4)

print(json_string)

{
    "annotations": [
        {
            "page_index": "2",
            "object_id": "0000097a",
            "category": "face",
            "category_id": 0,
            "bbox": [
                1071,
                687,
                41,
                32
            ]
        },
        {
            "page_index": "2",
            "object_id": "0000097c",
            "category": "frame",
            "category_id": 3,
            "bbox": [
                480,
                700,
                266,
                411
            ]
        },
        {
            "page_index": "2",
            "object_id": "0000097d",
            "category": "text",
            "category_id": 2,
            "bbox": [
                664,
                458,
                58,
                218
            ]
        },
        {
            "page_index": "2",
            "object_id": "0000097e",
            "category": "body",
            "category_id": 1,
            "bbox": [
      