### 1. Set Up the Environment

In [None]:
!pip install torch torchvision transformers

### 2. Split the dataset into train/val/test folders

In [1]:
import os
import shutil
import random

def split_dataset(image_root, train_ratio, val_ratio, seed=None):
    """Split every title's pages into the train/val/test folders.

    Each sub-directory of ``image_root`` is treated as one manga title. Its
    ``.jpg`` files are shuffled and cut into three consecutive slices: the
    first ``train_ratio`` share goes to train, the next ``val_ratio`` share
    to val, and whatever is left goes to test. The slices are handed to
    ``move_files``, which copies them into the module-level ``train_dir``,
    ``val_dir`` and ``test_dir`` folders under new, globally numbered names.

    Args:
        image_root: Directory containing one sub-directory per title.
        train_ratio: Fraction of each title's pages assigned to train.
        val_ratio: Fraction of each title's pages assigned to val.
        seed: Optional seed for the shuffle. When given, the same inputs
            always produce the same split; when ``None`` the module-level
            ``random`` state is used, as before.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    global unique_id_counter

    # Without this check an over-allocated train/val silently empties the
    # later splits while the log line reports counts that were never copied.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got {train_ratio} and {val_ratio}"
        )

    unique_id_counter = 1  # Start counting IDs from 1

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    # os.listdir returns entries in arbitrary order; the ids handed out by
    # move_files depend on the order titles are visited, so fix that order.
    for manga_title in sorted(os.listdir(image_root), key=str.casefold):
        manga_path = os.path.join(image_root, manga_title)

        if not os.path.isdir(manga_path):
            continue

        # Sort before shuffling so a seeded shuffle starts from a known order
        image_files = sorted(f for f in os.listdir(manga_path) if f.endswith('.jpg'))
        shuffle(image_files)

        # Calculate the number of images for each set
        total_images = len(image_files)
        train_count = int(total_images * train_ratio)
        val_count = int(total_images * val_ratio)

        # Consecutive slices of the shuffled list; test takes the remainder
        train_files = image_files[:train_count]
        val_files = image_files[train_count:train_count + val_count]
        test_files = image_files[train_count + val_count:]

        # Copy and rename files into the corresponding split folders
        move_files(manga_path, train_dir, train_files, manga_title)
        move_files(manga_path, val_dir, val_files, manga_title)
        move_files(manga_path, test_dir, test_files, manga_title)

        # Report the sizes of the slices that were actually copied
        print(f"Processed {manga_title}: {len(train_files)} train, {len(val_files)} val, {len(test_files)} test files.")

def move_files(src_folder, dest_folder, files, manga_title):
    """Copy ``files`` from ``src_folder`` into ``dest_folder`` under new names.

    Despite the name, the originals are left in place: each file is copied
    with ``shutil.copyfile``. The copy is named
    ``<manga_title>_<id>.jpg`` where ``<id>`` is the current value of the
    module-level ``unique_id_counter`` zero-padded to five digits; the
    counter is advanced by one after every successful copy.

    Args:
        src_folder: Directory the files currently live in.
        dest_folder: Directory the renamed copies are written to.
        files: File names (relative to ``src_folder``) to copy, in order.
        manga_title: Title used as the prefix of every new file name.
    """
    global unique_id_counter
    for original_name in files:
        # The running counter keeps names unique across titles and splits
        renamed = "{}_{:05}.jpg".format(manga_title, unique_id_counter)
        shutil.copyfile(
            os.path.join(src_folder, original_name),
            os.path.join(dest_folder, renamed),
        )
        unique_id_counter += 1

# Dataset locations and split proportions
image_root = '../Manga109/images/'
output_root = '../Manga109/dataset_split/'
train_ratio = 0.75
val_ratio = 0.15  # whatever is left after train and val becomes the test split

# One output folder per split; split_dataset reads these three names as globals
train_dir, val_dir, test_dir = (
    os.path.join(output_root, split_name) for split_name in ('train', 'val', 'test')
)
for _split_dir in (train_dir, val_dir, test_dir):
    os.makedirs(_split_dir, exist_ok=True)

# Copy every title's pages into the split folders under standardized, numbered names
split_dataset(image_root, train_ratio, val_ratio)


Processed AisazuNihaIrarenai: 70 train, 14 val, 10 test files.
Processed AkkeraKanjinchou: 69 train, 13 val, 10 test files.
Processed Akuhamu: 60 train, 12 val, 9 test files.
Processed AosugiruHaru: 78 train, 15 val, 12 test files.
Processed AppareKappore: 72 train, 14 val, 11 test files.
Processed Arisa: 72 train, 14 val, 11 test files.
Processed ARMS: 60 train, 12 val, 9 test files.
Processed BakuretsuKungFuGirl: 72 train, 14 val, 11 test files.
Processed Belmondo: 74 train, 14 val, 11 test files.
Processed BEMADER_P: 85 train, 17 val, 12 test files.
Processed BokuHaSitatakaKun: 74 train, 14 val, 11 test files.
Processed BurariTessenTorimonocho: 84 train, 16 val, 12 test files.
Processed ByebyeC-BOY: 70 train, 14 val, 10 test files.
Processed Count3DeKimeteAgeru: 74 train, 14 val, 11 test files.
Processed DollGun: 72 train, 14 val, 10 test files.
Processed Donburakokko: 66 train, 13 val, 10 test files.
Processed DualJustice: 74 train, 14 val, 11 test files.
Processed EienNoWith: 93 t

### 3. Convert Manga109 annotations to COCO format for DETR usage

In [None]:
import xml.etree.ElementTree as ET
import json
import os
from datetime import datetime
from PIL import Image  
import re

def get_photo_creation_date(file_path):
    """Return the ``os.path.getctime`` timestamp of a file as text.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The timestamp converted with ``datetime.fromtimestamp`` and
        formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    timestamp = os.path.getctime(file_path)
    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

def convert_bbox_to_coco_format(x1, y1, x2, y2):
    """Turn corner coordinates into a ``[x, y, width, height]`` list.

    Args:
        x1: Left edge.
        y1: Top edge.
        x2: Right edge.
        y2: Bottom edge.

    Returns:
        ``[x1, y1, x2 - x1, y2 - y1]``.
    """
    return [x1, y1, x2 - x1, y2 - y1]

def gather_images(manga_title, split_dir, xml_root):
    """Build the COCO ``images`` entries for one title in one split folder.

    Files in ``split_dir`` named ``<manga_title>_<digits>.jpg`` are indexed
    by the last two digits of their number. Every ``<page>`` element of the
    XML whose ``index`` attribute equals one of those two-digit suffixes
    produces one entry that points at the matching file. This is the same
    suffix lookup ``gather_annotations`` uses, so image ids and annotation
    ``image_id`` values agree.

    NOTE(review): the numbers in the file names come from the running counter
    used when the split folders were created, so a two-digit suffix that
    equals a page index does not by itself prove the file is that page.
    Confirm the intended page-to-file mapping.

    Args:
        manga_title: Title whose files and pages are processed.
        split_dir: Folder holding the renamed page images of one split.
        xml_root: Root element of the title's annotation XML.

    Returns:
        A list of dicts with ``id``, ``width``, ``height``, ``file_name``
        and ``date_captured`` keys, in XML page order.
    """
    prefix = manga_title + '_'

    # suffix (last two digits) -> full id string; the first file listed wins,
    # mirroring the list.index lookup in gather_annotations.
    page_id_by_suffix = {}
    for entry in os.listdir(split_dir):
        if not entry.startswith(prefix):
            continue
        stem, extension = os.path.splitext(entry[len(prefix):])
        # Skip annotations.json, other titles sharing the prefix, etc.
        if extension != '.jpg' or not stem.isdecimal():
            continue
        page_id_by_suffix.setdefault(stem[-2:], stem)

    images = []
    for page in xml_root.findall(".//page"):
        page_id = page_id_by_suffix.get(page.get('index'))
        if page_id is None:
            continue

        # Build the name directly from the id. Substituting the first digit
        # run of a generated name corrupts titles that contain digits
        # themselves (e.g. "Count3DeKimeteAgeru").
        file_name = f"{prefix}{page_id}.jpg"
        images.append({
            "id": int(page_id),
            # Prefer the size recorded on the page element; fall back to the
            # previously hard-coded dimensions when the attribute is absent.
            "width": int(page.get('width', 1654)),
            "height": int(page.get('height', 1170)),
            "file_name": file_name,
            "date_captured": get_photo_creation_date(os.path.join(split_dir, file_name)),
        })

    print()

    return images

def gather_annotations(manga_title, split_dir, xml_root, images):
    """Build the COCO ``annotations`` entries for one title in one split folder.

    Files in ``split_dir`` named ``<manga_title>_<digits>.jpg`` are indexed
    by the last two digits of their number. For every ``<page>`` element
    whose ``index`` attribute equals one of those suffixes, each child
    element becomes one annotation whose ``image_id`` is the full number of
    the matching file.

    NOTE(review): the numbers in the file names come from the running counter
    used when the split folders were created, so a two-digit suffix that
    equals a page index does not by itself prove the file is that page.
    Confirm the intended page-to-file mapping.

    Args:
        manga_title: Title whose files and pages are processed.
        split_dir: Folder holding the renamed page images of one split.
        xml_root: Root element of the title's annotation XML.
        images: Unused; kept so existing callers keep working.

    Returns:
        A list of dicts with ``id``, ``image_id``, ``category_id``, ``bbox``,
        ``area`` and ``iscrowd`` keys.

    Raises:
        ValueError: If a page contains an element whose tag is not one of
            ``face``, ``body``, ``text`` or ``frame``.
        KeyError: If an element lacks ``id`` or one of the corner attributes.
    """
    prefix = manga_title + '_'
    # Position in this list is the category id (see gather_categories)
    objects = ['face', 'body', 'text', 'frame']

    # suffix (last two digits) -> numeric image id; the first file listed wins
    image_id_by_suffix = {}
    for entry in os.listdir(split_dir):
        if not entry.startswith(prefix):
            continue
        stem, extension = os.path.splitext(entry[len(prefix):])
        # Skip annotations.json, other titles sharing the prefix, etc., which
        # would otherwise crash the int() conversion below.
        if extension != '.jpg' or not stem.isdecimal():
            continue
        image_id_by_suffix.setdefault(stem[-2:], int(stem))

    annotations = []
    for page in xml_root.findall(".//page"):
        image_id = image_id_by_suffix.get(page.get('index'))
        if image_id is None:
            continue

        for element in page:
            attributes = element.attrib
            x, y, width, height = convert_bbox_to_coco_format(
                int(attributes['xmin']), int(attributes['ymin']),
                int(attributes['xmax']), int(attributes['ymax'])
            )
            annotations.append({
                "id": attributes['id'],
                "image_id": image_id,
                "category_id": objects.index(element.tag),
                "bbox": [x, y, width, height],
                # Part of the COCO annotation format; consumers that filter on
                # crowd regions or read box areas fail when these are missing.
                "area": width * height,
                "iscrowd": 0,
            })

    return annotations

def gather_categories():
    """Return the COCO ``categories`` section.

    Returns:
        A dict with a single ``"categories"`` key listing ``face``, ``body``,
        ``text`` and ``frame`` with ids 0 through 3, in that order.
    """
    category_names = ('face', 'body', 'text', 'frame')
    return {
        "categories": [
            {"id": category_id, "name": category_name}
            for category_id, category_name in enumerate(category_names)
        ]
    }

def save_coco_annotations(images, annotations, categories_data, coco_json_destination):
    """Write one COCO-style JSON document to disk.

    Args:
        images: Value stored under the ``"images"`` key.
        annotations: Value stored under the ``"annotations"`` key.
        categories_data: Mapping whose entries are merged into the top level
            after the two keys above (e.g. ``{"categories": [...]}``).
        coco_json_destination: Path of the JSON file to create or overwrite.
    """
    document = {"images": images, "annotations": annotations}
    document.update(categories_data)

    with open(coco_json_destination, 'w') as json_file:
        json.dump(document, json_file, indent=4)

def create_json_for_splits(fp_split_dataset, image_root='../Manga109/images',
                           annotation_root='../Manga109/annotations'):
    """Write an ``annotations.json`` into each of the train/val/test folders.

    For every split, each entry of ``image_root`` is taken as a title name,
    its ``<title>.xml`` is parsed from ``annotation_root``, and the image and
    annotation entries gathered for that title are appended to the split's
    lists. Titles without an XML file are reported and skipped.

    Args:
        fp_split_dataset: Folder containing the ``train``, ``val`` and
            ``test`` sub-folders.
        image_root: Folder whose entry names are the title names.
        annotation_root: Folder holding one ``<title>.xml`` per title.
    """
    for split in ['train', 'val', 'test']:
        split_dir = os.path.join(fp_split_dataset, split)
        # Fix the destination up front. When it was only set inside the title
        # loop, a split with no processed title either hit an unbound name or
        # reused, and overwrote, the previous split's file.
        coco_json_destination = os.path.join(split_dir, 'annotations.json')

        image_field = []
        annotations_field = []
        categories_field = gather_categories()

        for manga_title in os.listdir(image_root):
            xml_path = f'{annotation_root}/{manga_title}.xml'

            if not os.path.exists(xml_path):
                print(f'Warning: {xml_path} does not exist. Skipping this manga title.')
                continue

            xml_root = ET.parse(xml_path).getroot()

            images = gather_images(manga_title, split_dir, xml_root)
            annotations = gather_annotations(manga_title, split_dir, xml_root, images)

            image_field += images
            annotations_field += annotations

        print(f'Finished converting xml to json format for {split} dataset')
        save_coco_annotations(image_field, annotations_field, categories_field, coco_json_destination)
        print(f'Saved annotations.json file at {split} folder')

# Folder holding the train/val/test sub-folders; an annotations.json is written into each
fp_split_dataset = '../Manga109/dataset_split/'
create_json_for_splits(fp_split_dataset)


### 4. 