### 1. Set Up the Environment

In [None]:
!pip install torch torchvision transformers

### 2. Split the dataset into train/val/test folders

In [1]:
import os
import shutil
import random

# Define paths and ratios.
# split_dataset() expects image_root to contain one sub-folder per manga title.
image_root = '../Manga109/images/'
output_root = '../Manga109/dataset_split/'
train_ratio = 0.7
val_ratio = 0.15  # pages left over after train + val form the test split

# Seed the global RNG used by random.shuffle() so that re-running this cell
# reproduces the same split (given the same directory listing order).
SEED = 42
random.seed(SEED)

train_dir = os.path.join(output_root, 'train')
val_dir = os.path.join(output_root, 'val')
test_dir = os.path.join(output_root, 'test')

# Create output directories if they don't exist
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

def split_dataset(image_root, train_ratio, val_ratio, seed=None):
    """Split the pages of every manga title under ``image_root`` into train/val/test.

    For each title folder the ``.jpg`` files are shuffled; the first
    ``int(n * train_ratio)`` files go to train, the next ``int(n * val_ratio)``
    to val and the remainder to test. Files are written through ``move_files``
    into the module-level ``train_dir``, ``val_dir`` and ``test_dir`` folders.
    One summary line per title is printed.

    Args:
        image_root: Folder containing one sub-folder per manga title.
        train_ratio: Fraction of each title's pages assigned to train.
        val_ratio: Fraction of each title's pages assigned to val.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives the shuffles so the split is reproducible; when ``None``
            the global ``random`` state is used, as before.
    """
    rng = random if seed is None else random.Random(seed)

    # os.listdir() returns entries in arbitrary order. Sorting keeps the
    # sequence of shuffles (and therefore the split) stable for a given seed.
    for manga_title in sorted(os.listdir(image_root)):
        manga_path = os.path.join(image_root, manga_title)

        # Skip non-directory items
        if not os.path.isdir(manga_path):
            continue

        # Get all image files in the manga title folder
        image_files = sorted(f for f in os.listdir(manga_path) if f.endswith('.jpg'))

        # Shuffle images to ensure random splits
        rng.shuffle(image_files)

        # Calculate the number of images for each set
        total_images = len(image_files)
        train_count = int(total_images * train_ratio)
        val_count = int(total_images * val_ratio)

        # Split images into train, val, and test (test takes the remainder)
        train_files = image_files[:train_count]
        val_files = image_files[train_count:train_count + val_count]
        test_files = image_files[train_count + val_count:]

        # Copy and rename files into the corresponding split folders
        move_files(manga_path, train_dir, train_files, manga_title)
        move_files(manga_path, val_dir, val_files, manga_title)
        move_files(manga_path, test_dir, test_files, manga_title)

        print(f"Processed {manga_title}: {train_count} train, {val_count} val, {len(test_files)} test files.")

def move_files(src_folder, dest_folder, files, manga_title):
    """Copy ``files`` from ``src_folder`` into ``dest_folder`` as ``<title>_<page>.jpg``.

    The page part of the new name is taken from the source file name (its
    stem, zero-padded to three digits when numeric), NOT from the position of
    the file in ``files``. ``gather_images`` and ``gather_annotations`` look
    pages up as ``<title>_<zero-padded XML page index>``, so a positional index
    over a shuffled list would pair images with another page's annotations.

    Assumes each source file stem is the page index used in the annotation
    XML (e.g. ``000.jpg``) -- TODO confirm against the dataset layout.

    Note: the destination name depends on the page, so clear the split folders
    before re-splitting with a different shuffle, otherwise copies from the
    previous run remain.

    Args:
        src_folder: Folder that currently holds the files.
        dest_folder: Existing folder to copy into.
        files: File names (relative to ``src_folder``) to copy.
        manga_title: Title used as the prefix of every new file name.
    """
    for file in files:
        src_path = os.path.join(src_folder, file)

        # Keep the original page number so the name still identifies the page.
        page_id = os.path.splitext(file)[0]
        if page_id.isdigit():
            page_id = page_id.zfill(3)

        new_filename = f"{manga_title}_{page_id}.jpg"
        dest_path = os.path.join(dest_folder, new_filename)

        # Copy (the source is left in place) under the new name
        shutil.copyfile(src_path, dest_path)

# Run the split for every title folder found under image_root.
split_dataset(image_root, train_ratio, val_ratio)


Processed AisazuNihaIrarenai: 65 train, 14 val, 15 test files.
Processed AkkeraKanjinchou: 64 train, 13 val, 15 test files.
Processed Akuhamu: 56 train, 12 val, 13 test files.
Processed AosugiruHaru: 73 train, 15 val, 17 test files.
Processed AppareKappore: 67 train, 14 val, 16 test files.
Processed Arisa: 67 train, 14 val, 16 test files.
Processed ARMS: 56 train, 12 val, 13 test files.
Processed BakuretsuKungFuGirl: 67 train, 14 val, 16 test files.
Processed Belmondo: 69 train, 14 val, 16 test files.
Processed BEMADER_P: 79 train, 17 val, 18 test files.
Processed BokuHaSitatakaKun: 69 train, 14 val, 16 test files.
Processed BurariTessenTorimonocho: 78 train, 16 val, 18 test files.
Processed ByebyeC-BOY: 65 train, 14 val, 15 test files.
Processed Count3DeKimeteAgeru: 69 train, 14 val, 16 test files.
Processed DollGun: 67 train, 14 val, 15 test files.
Processed Donburakokko: 62 train, 13 val, 14 test files.
Processed DualJustice: 69 train, 14 val, 16 test files.
Processed EienNoWith: 87

### 3. Convert Manga109 annotations to COCO format for DETR usage

In [2]:
import xml.etree.ElementTree as ET
import json
import os
from datetime import datetime
from PIL import Image  

def get_photo_creation_date(file_path):
    """Return the ctime of ``file_path`` as a ``YYYY-MM-DD HH:MM:SS`` string.

    The timestamp is read with ``os.path.getctime`` and converted with
    ``datetime.fromtimestamp``.
    """
    timestamp = os.path.getctime(file_path)
    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

def convert_bbox_to_coco_format(x1, y1, x2, y2):
    """Convert corner coordinates to a ``[x, y, width, height]`` list.

    ``(x1, y1)`` is kept as the origin; width and height are the differences
    ``x2 - x1`` and ``y2 - y1``.
    """
    return [x1, y1, x2 - x1, y2 - y1]

def gather_images(manga_title, split_dir, xml_root):
    """Build the COCO ``images`` entries for one manga title within one split.

    A ``<page>`` element of ``xml_root`` is included when a file named
    ``<manga_title>_<index zero-padded to 3>.jpg`` exists in ``split_dir``.

    Args:
        manga_title: Title used as the file-name prefix.
        split_dir: Folder of the split (train/val/test) to inspect.
        xml_root: Root element of the title's annotation XML.

    Returns:
        A list of dicts with ``id`` (the integer page index), ``width``,
        ``height``, ``file_name`` and ``date_captured``.
    """
    # Set of file names (with '.jpg' removed) for O(1) membership tests.
    available_pages = {name.replace('.jpg', '') for name in os.listdir(split_dir)}

    images = []
    for page in xml_root.findall(".//page"):
        page_index = page.get('index')
        page_name = f"{manga_title}_{page_index.zfill(3)}"
        if page_name not in available_pages:
            continue

        file_name = f"{page_name}.jpg"
        images.append({
            "id": int(page_index),
            # Use the size recorded on the <page> element when present; the
            # previous hard-coded 1654x1170 is kept as the fallback.
            "width": int(page.get('width', 1654)),
            "height": int(page.get('height', 1170)),
            "file_name": file_name,
            "date_captured": get_photo_creation_date(os.path.join(split_dir, file_name)),
        })

    return images

def gather_annotations(manga_title, split_dir, xml_root):
    """Build the COCO ``annotations`` entries for one manga title within one split.

    Pages are selected exactly as in ``gather_images``: a ``<page>`` is used
    when ``<manga_title>_<index zero-padded to 3>.jpg`` exists in ``split_dir``.
    Every child element of a selected page whose tag is one of
    ``face``/``body``/``text``/``frame`` becomes one annotation.

    Args:
        manga_title: Title used as the file-name prefix.
        split_dir: Folder of the split (train/val/test) to inspect.
        xml_root: Root element of the title's annotation XML.

    Returns:
        A list of dicts with ``id`` (copied verbatim from the XML ``id``
        attribute), ``image_id`` (integer page index), ``category_id``
        (position of the tag in the category list), ``bbox`` as
        ``[x, y, width, height]``, ``area`` and ``iscrowd``.
    """
    objects = ['face', 'body', 'text', 'frame']

    # Set of file names (with '.jpg' removed) for O(1) membership tests.
    available_pages = {name.replace('.jpg', '') for name in os.listdir(split_dir)}

    annotations = []
    for page in xml_root.findall(".//page"):
        page_index = page.get('index')
        if f"{manga_title}_{page_index.zfill(3)}" not in available_pages:
            continue

        for element in page:
            # Elements without a matching category cannot be encoded; skip
            # them instead of failing on objects.index().
            if element.tag not in objects:
                continue

            attrs = element.attrib
            coco_bbox = convert_bbox_to_coco_format(
                int(attrs['xmin']), int(attrs['ymin']),
                int(attrs['xmax']), int(attrs['ymax'])
            )
            annotations.append({
                "id": attrs['id'],
                "image_id": int(page_index),
                "category_id": objects.index(element.tag),
                "bbox": coco_bbox,
                # COCO object-detection fields that consumers commonly read.
                "area": coco_bbox[2] * coco_bbox[3],
                "iscrowd": 0,
            })

    return annotations

def gather_categories():
    """Return the COCO ``categories`` section as ``{"categories": [...]}``.

    Category ids are the positions 0..3 of face, body, text and frame.
    """
    names = ('face', 'body', 'text', 'frame')
    return {
        "categories": [{"id": idx, "name": name} for idx, name in enumerate(names)]
    }

def save_coco_annotations(images, annotations, categories_data, coco_json_destination):
    """Write a COCO-style JSON document to ``coco_json_destination``.

    The document holds ``images`` and ``annotations`` followed by every
    top-level key of ``categories_data``, serialised with a 4-space indent.
    """
    payload = {"images": images, "annotations": annotations}
    payload.update(categories_data)  # merge the caller-supplied top-level keys

    with open(coco_json_destination, 'w') as out_file:
        json.dump(payload, out_file, indent=4)

def create_json_for_splits(fp_split_dataset,
                           image_root='../Manga109/images',
                           annotation_root='../Manga109/annotations'):
    """Write one ``annotations.json`` into each of the train/val/test folders.

    For every split, the entries returned by ``gather_images`` and
    ``gather_annotations`` for each title listed under ``image_root`` are
    merged and saved with ``save_coco_annotations``. Titles without a matching
    ``<annotation_root>/<title>.xml`` are skipped with a warning.

    ``gather_images`` uses the page index as image id, which repeats for every
    title. Ids are therefore renumbered here so that each image and each
    annotation is unique within a split, and ``image_id`` is rewritten to
    match. Numbering starts at 1 because some COCO tooling is reported to
    treat id 0 as "no match" -- NOTE(review): confirm for the tools in use.

    Args:
        fp_split_dataset: Folder containing the ``train``, ``val`` and ``test``
            sub-folders.
        image_root: Folder whose entries name the manga titles to process.
        annotation_root: Folder holding one ``<title>.xml`` per manga title.
    """
    for split in ['train', 'val', 'test']:
        # Both paths depend only on the split, so compute them once. Defining
        # the destination here also avoids a NameError when no title is found.
        split_dir = os.path.join(fp_split_dataset, split)
        coco_json_destination = os.path.join(split_dir, 'annotations.json')

        image_field = []
        annotations_field = []
        categories_field = gather_categories()

        # Sorted so that the assigned ids are stable between runs.
        for manga_title in sorted(os.listdir(image_root)):
            xml_path = f'{annotation_root}/{manga_title}.xml'

            if not os.path.exists(xml_path):
                print(f'Warning: {xml_path} does not exist. Skipping this manga title.')
                continue

            xml_root = ET.parse(xml_path).getroot()

            images = gather_images(manga_title, split_dir, xml_root)
            annotations = gather_annotations(manga_title, split_dir, xml_root)

            # Map this title's per-book image ids to ids unique in the split.
            page_to_image_id = {}
            for image in images:
                new_image_id = len(image_field) + 1
                page_to_image_id[image["id"]] = new_image_id
                image_field.append({**image, "id": new_image_id})

            for annotation in annotations:
                new_image_id = page_to_image_id.get(annotation["image_id"])
                if new_image_id is None:
                    # No image entry was produced for this annotation.
                    continue
                annotations_field.append({
                    **annotation,
                    "id": len(annotations_field) + 1,
                    "image_id": new_image_id,
                })

        print(f'Finished converting xml to json format for {split} dataset')
        save_coco_annotations(image_field, annotations_field, categories_field, coco_json_destination)
        print(f'Saved annotation.json file at {split} folder')

# Folder that holds the train/val/test sub-folders.
fp_split_dataset = '../Manga109/dataset_split/'
# Writes an annotations.json into each of those sub-folders.
create_json_for_splits(fp_split_dataset)


Finished converting xml to json format for train dataset
Saved annotation.json file at train folder
Finished converting xml to json format for val dataset
Saved annotation.json file at val folder
Finished converting xml to json format for test dataset
Saved annotation.json file at test folder


### 4. 