### 1. Set Up the Environment

In [None]:
!pip install torch torchvision transformers

### 2. Split the dataset into train/val/test folders

In [None]:
import os
import shutil
import random

def split_dataset(image_root, train_ratio, val_ratio):
    """Randomly split every title's pages into train/val/test folders.

    Each sub-folder of ``image_root`` is treated as one manga title. Its
    ``.jpg`` files are shuffled; the first ``train_ratio`` share goes to
    train, the next ``val_ratio`` share to val, and the remainder to test.
    Files are copied with ``move_files`` into the module-level ``train_dir``,
    ``val_dir`` and ``test_dir`` folders.

    Args:
        image_root: Folder containing one sub-folder per manga title.
        train_ratio: Fraction of each title's pages assigned to train.
        val_ratio: Fraction of each title's pages assigned to val.
    """

    def page_number(file_name):
        # Drop the 4-character extension (".jpg"); the rest must be an integer.
        return int(str(file_name)[:-4])

    for index, manga_title in enumerate(os.listdir(image_root)):
        print(index, manga_title)
        manga_path = os.path.join(image_root, manga_title)

        # Anything in image_root that is not a folder holds no pages.
        if not os.path.isdir(manga_path):
            continue

        # Sort first so the shuffle starts from the same ordering regardless
        # of the order os.listdir() happens to return.
        pages = [entry for entry in os.listdir(manga_path) if entry.endswith('.jpg')]
        pages.sort()
        random.shuffle(pages)

        # Split sizes; test receives whatever train and val leave over.
        total_images = len(pages)
        train_count = int(total_images * train_ratio)
        val_count = int(total_images * val_ratio)
        val_end = train_count + val_count

        train_files = pages[:train_count]
        val_files = pages[train_count:val_end]
        test_files = pages[val_end:]

        # Parse every page number before copying anything, so a badly named
        # file aborts the title before any file is written.
        train_pages = [page_number(name) for name in train_files]
        val_pages = [page_number(name) for name in val_files]
        test_pages = [page_number(name) for name in test_files]

        move_files(manga_path, train_dir, train_files, manga_title, index, train_pages)
        print(f"train file count: {train_count}")

        move_files(manga_path, val_dir, val_files, manga_title, index, val_pages)
        print(f"val file count: {val_count}")

        move_files(manga_path, test_dir, test_files, manga_title, index, test_pages)
        print(f"test file count: {total_images - train_count - val_count}")

        print()

def move_files(src_folder, dest_folder, files, manga_title, _index, page_num):
    """Copy pages into ``dest_folder`` under a ``<title>_<unique id>.jpg`` name.

    The unique id is the title index followed by the page number, left-padded
    with zeros to five characters. Pages up to 99 contribute exactly two
    digits; larger page numbers contribute all of their digits. Despite the
    function name, the source files are copied, not moved.

    Args:
        src_folder: Folder the source files live in.
        dest_folder: Folder the renamed copies are written to.
        files: Source file names, relative to ``src_folder``.
        manga_title: Title used as the prefix of each new file name.
        _index: Index of the title, used as the leading part of the id.
        page_num: Page numbers, parallel to ``files``.
    """
    title_part = str(_index)

    for position, file in enumerate(files):
        page = page_num[position]

        if page > 99:
            page_part = str(page)
        else:
            # Last two characters of the zero-padded page number.
            page_part = str(page).zfill(5)[3:]

        unique_id = (title_part + page_part).zfill(5)
        new_filename = f"{manga_title}_{unique_id}.jpg"

        shutil.copyfile(
            os.path.join(src_folder, file),
            os.path.join(dest_folder, new_filename),
        )

# Destination layout: <output_root>/train, <output_root>/val, <output_root>/test
output_root = '../Manga109/dataset_split/'

train_dir, val_dir, test_dir = (
    os.path.join(output_root, split_name) for split_name in ('train', 'val', 'test')
)

# Create the destination folders up front; existing folders are left alone.
for _split_folder in (train_dir, val_dir, test_dir):
    os.makedirs(_split_folder, exist_ok=True)

# Whatever train and val do not take ends up in test.
image_root = '../Manga109/images/'
train_ratio, val_ratio = 0.75, 0.15

split_dataset(image_root, train_ratio, val_ratio)


### 3. Convert Manga109 annotations to COCO format for DETR usage
- Delete the image folders and XML files whose names use the '_vol' format, for ease of annotation:
- HighschoolKimengumi
- LoveHina
- MoeruOnisan
- SaladDays
- ShimatteIkouze

#### As well as manga with numbers in their titles, due to an annotation error where the page's unique id is inserted at the '3'
- GOOD_KISS_Ver2
- Count3DeKimeteAgeru
- PLANET7
- UchuKigekiM774

#### Which leaves us with Manga095


In [None]:
import xml.etree.ElementTree as ET
import json
import os
from datetime import datetime
from PIL import Image  
import re

def get_photo_creation_date(file_path):
    """Return the file's ctime formatted as ``YYYY-MM-DD HH:MM:SS``.

    The timestamp comes from ``os.path.getctime`` and is rendered in local
    time. NOTE(review): per the Python docs, ctime is the creation time on
    Windows but the last metadata change on Unix.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The formatted timestamp string.
    """
    timestamp = os.path.getctime(file_path)
    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

def convert_bbox_to_coco_format(x1, y1, x2, y2):
    """Turn corner coordinates into a ``[x, y, width, height]`` box.

    Args:
        x1: Left edge.
        y1: Top edge.
        x2: Right edge.
        y2: Bottom edge.

    Returns:
        ``[x1, y1, x2 - x1, y2 - y1]``.
    """
    return [x1, y1, x2 - x1, y2 - y1]

def gather_images(manga_title, split_dir, xml_root, filtered_books):
    """Build the COCO ``images`` entries for one title within one split.

    The split's file names (``<title>_<unique id>.jpg``) are matched to the
    ``<page>`` elements of the title's annotation XML through the last two
    digits of the unique id. A suffix smaller than the previous one (in
    ascending id order) is taken to have wrapped past 99 and gets 100 added.
    This is the same heuristic ``gather_annotations`` uses, so image ids and
    annotation ``image_id`` values agree.

    Args:
        manga_title: Title whose pages are collected.
        split_dir: Folder holding the split's image files.
        xml_root: Root element of the title's annotation XML.
        filtered_books: File names from ``split_dir`` to consider; only those
            starting with ``<manga_title>_`` are used.

    Returns:
        A list of image dicts (``id``, ``width``, ``height``, ``file_name``,
        ``date_captured``) in XML page order. Empty when the split holds no
        page of this title.
    """
    prefix = manga_title + '_'

    # os.listdir() gives no ordering guarantee, while the roll-over heuristic
    # below needs ascending ids, so sort numerically first.
    page_id_str = sorted(
        (
            file.replace('.jpg', '').replace(prefix, '')
            for file in filtered_books
            if file.startswith(prefix)
        ),
        key=int,
    )

    # A title may have no page in this split; nothing to describe then.
    if not page_id_str:
        return []

    # Map the XML page index (two-digit suffix, +100 once wrapped) to its id.
    page_index_to_id_str = {}
    previous_num = int(page_id_str[0][-2:])
    for page_id in page_id_str:
        num = int(page_id[-2:])
        if num < previous_num:
            num += 100
        page_index_to_id_str[num] = page_id
        previous_num = num

    images = []
    for page in xml_root.findall(".//page"):
        page_id = page_index_to_id_str.get(int(page.get('index')))
        if page_id is None:
            continue

        # Build the name from the title and id directly; searching the name
        # for "the first number" breaks on titles that contain digits.
        file_name = f"{prefix}{page_id}.jpg"
        images.append({
            "id": int(page_id),
            # NOTE(review): presumably <page> carries width/height attributes;
            # the previous fixed size is kept as the fallback - confirm.
            "width": int(page.get('width', 1654)),
            "height": int(page.get('height', 1170)),
            "file_name": file_name,
            "date_captured": get_photo_creation_date(os.path.join(split_dir, file_name)),
        })

    return images

def gather_annotations(manga_title, split_dir, xml_root, filtered_books):
    """Build the COCO ``annotations`` entries for one title within one split.

    Pages are matched to the split's files through the last two digits of the
    file's unique id. In ascending id order, a suffix smaller than the
    previous one is taken to have wrapped past 99 and gets 100 added.

    Args:
        manga_title: Title whose annotations are collected.
        split_dir: Folder holding the split's image files (not read here;
            kept for a signature parallel to ``gather_images``).
        xml_root: Root element of the title's annotation XML.
        filtered_books: File names from the split to consider; only those
            starting with ``<manga_title>_`` are used.

    Returns:
        A list of annotation dicts (``id``, ``image_id``, ``category_id``,
        ``area``, ``bbox``). Empty when the split holds no page of this title.

    Raises:
        ValueError: If a page contains an element whose tag is not one of
            face/body/text/frame.
    """
    prefix = manga_title + '_'

    # Numeric sort so the roll-over heuristic sees ids in ascending order.
    sorted_page_id_int = sorted(
        int(file.replace('.jpg', '').replace(prefix, ''))
        for file in filtered_books
        if file.startswith(prefix)
    )

    # A title may have no page in this split; nothing to annotate then.
    if not sorted_page_id_int:
        return []

    # Map the XML page index (two-digit suffix, +100 once wrapped) to the id.
    page_index_to_id = {}
    previous_num = int(str(sorted_page_id_int[0]).zfill(5)[-2:])
    for page_id in sorted_page_id_int:
        num = int(str(page_id).zfill(5)[-2:])
        if num < previous_num:
            num += 100
        page_index_to_id[num] = page_id
        previous_num = num

    # Position in this list is the category id.
    objects = ['face', 'body', 'text', 'frame']

    annotations = []
    for page in xml_root.findall(".//page"):
        image_id = page_index_to_id.get(int(page.get('index')))
        if image_id is None:
            continue

        for element in page:
            bbox = element.attrib
            xmin, ymin = int(bbox['xmin']), int(bbox['ymin'])
            xmax, ymax = int(bbox['xmax']), int(bbox['ymax'])

            annotations.append({
                # The id is passed through exactly as it appears in the XML.
                "id": bbox['id'],
                "image_id": image_id,
                "category_id": objects.index(element.tag),
                "area": (xmax - xmin) * (ymax - ymin),
                "bbox": convert_bbox_to_coco_format(xmin, ymin, xmax, ymax),
            })

    return annotations


def gather_categories():
    """Return the COCO ``categories`` section for the four object classes.

    Returns:
        A dict with a single ``categories`` key listing face, body, text and
        frame with ids 0 to 3, in that order.
    """
    names = ('face', 'body', 'text', 'frame')
    return {
        "categories": [
            {"id": category_id, "name": name}
            for category_id, name in enumerate(names)
        ]
    }

def save_coco_annotations(images, annotations, categories_data, coco_json_destination):
    """Write the images, annotations and categories to one JSON file.

    Args:
        images: Value stored under the ``images`` key.
        annotations: Value stored under the ``annotations`` key.
        categories_data: Mapping merged into the top level after the two keys
            above (e.g. the result of ``gather_categories``).
        coco_json_destination: Path of the JSON file to (over)write.
    """
    base_sections = {"images": images, "annotations": annotations}
    coco_data = {**base_sections, **categories_data}

    with open(coco_json_destination, 'w') as out_file:
        json.dump(coco_data, out_file, indent=4)

def create_json_for_splits(fp_split_dataset, image_root='../Manga109/images',
                           annotations_root='../Manga109/annotations'):
    """Write an ``annotations.json`` into each of the train/val/test folders.

    For every entry of ``image_root`` that has a matching
    ``<annotations_root>/<title>.xml``, the title's images and annotations for
    the split are gathered and appended to the split's COCO data. Titles
    without an XML file are reported and skipped.

    Args:
        fp_split_dataset: Folder containing the ``train``, ``val`` and
            ``test`` sub-folders.
        image_root: Folder whose entries name the manga titles to process.
        annotations_root: Folder holding one ``<title>.xml`` per title.
    """
    for split in ['train', 'val', 'test']:
        # Resolve the split's paths before the title loop so the output path
        # is defined even when every title is skipped.
        split_dir = os.path.join(fp_split_dataset, split)
        coco_json_destination = os.path.join(split_dir, 'annotations.json')

        # List the split folder once rather than once per title.
        split_files = os.listdir(split_dir)

        image_field = []
        annotations_field = []
        categories_field = gather_categories()

        for manga_title in os.listdir(image_root):
            xml_path = f'{annotations_root}/{manga_title}.xml'

            if not os.path.exists(xml_path):
                print(f'Warning: {xml_path} does not exist. Skipping this manga title.')
                continue

            filtered_books = [title for title in split_files if manga_title in title]
            xml_root = ET.parse(xml_path).getroot()

            image_field += gather_images(manga_title, split_dir, xml_root, filtered_books)
            annotations_field += gather_annotations(manga_title, split_dir, xml_root, filtered_books)

        print(f'Finished converting xml to json format for {split} dataset')
        save_coco_annotations(image_field, annotations_field, categories_field, coco_json_destination)
        print(f'Saved annotation.json file at {split} folder')

# Generate annotations.json for the train, val and test folders of the split dataset.
fp_split_dataset = '../Manga109/dataset_split/'
create_json_for_splits(fp_split_dataset)
