### 1. Set Up the Environment

In [None]:
# Install torch, torchvision and transformers. `%pip` (instead of `!pip`) runs pip
# for the environment of the running kernel rather than whichever pip is on the shell PATH.
%pip install torch torchvision transformers

### 2. Convert Manga109 annotations to COCO format for DETR usage

In [4]:
import xml.etree.ElementTree as ET
import os

def convert_bbox_to_coco_format(xmin, ymin, xmax, ymax):
    """
    Convert a corner-style bounding box to COCO's ``[x, y, width, height]`` layout.

    Args:
        xmin: Minimum x coordinate of the box.
        ymin: Minimum y coordinate of the box.
        xmax: Maximum x coordinate of the box.
        ymax: Maximum y coordinate of the box.

    Returns:
        list: ``[xmin, ymin, xmax - xmin, ymax - ymin]``.

    Note:
        The inputs are not validated; if ``xmax < xmin`` or ``ymax < ymin`` the
        resulting width or height is negative.
    """
    # A single definition: the notebook previously defined this function twice in
    # the same cell, with the second definition silently shadowing the first.
    return [xmin, ymin, xmax - xmin, ymax - ymin]

def convert_manga109_annotations_to_coco_format(xml_path):
    """
    Parse a Manga109 annotation XML file into a COCO-style dictionary.

    Every direct child of every ``<page>`` element whose tag is one of
    ``face``, ``body``, ``text`` or ``frame`` becomes one annotation record.
    Children with any other tag are skipped.

    Args:
        xml_path: Path to the annotation XML file (anything ``ET.parse`` accepts).

    Returns:
        dict: A dictionary with two keys:

        * ``"annotations"``: list of dicts, each with
          ``"page_index"`` (the page's ``index`` attribute as a string, or
          ``None`` if the attribute is absent), ``"object_id"`` (the element's
          ``id`` attribute), ``"category"`` (the element's tag),
          ``"category_id"`` (integer id of that category) and ``"bbox"``
          (``[x, y, width, height]``).
        * ``"categories"``: list of ``{"name": ..., "id": ...}`` dicts, one
          per supported category.

    Raises:
        KeyError: If an annotated element lacks ``id``, ``xmin``, ``ymin``,
            ``xmax`` or ``ymax``.
        ValueError: If a coordinate attribute cannot be parsed by ``int``.
    """
    # Single source of truth for the categories: the position in this list is
    # the category id, so the lookup table and the emitted "categories" section
    # cannot drift apart.
    category_names = ['face', 'body', 'text', 'frame']
    category_ids = {name: idx for idx, name in enumerate(category_names)}

    root = ET.parse(xml_path).getroot()

    annotations = []
    for page in root.findall(".//page"):
        page_index = page.get('index')
        for element in page:
            category_id = category_ids.get(element.tag)
            if category_id is None:
                # Not one of the supported object categories: skip it rather
                # than failing with an opaque ValueError from list.index().
                continue
            attrs = element.attrib
            annotations.append({
                "page_index": page_index,
                "object_id": attrs['id'],
                "category": element.tag,
                "category_id": category_id,
                "bbox": convert_bbox_to_coco_format(
                    int(attrs['xmin']),
                    int(attrs['ymin']),
                    int(attrs['xmax']),
                    int(attrs['ymax']),
                ),
            })

    return {
        "annotations": annotations,
        "categories": [
            {"name": name, "id": idx} for name, idx in category_ids.items()
        ],
    }


# Path to one annotation XML file under the Manga109 annotations directory,
# given relative to the notebook's working directory.
manga109_annotation_file = '../Manga109/annotations/AisazuNihaIrarenai.xml'

# Convert that file into the dictionary returned by
# convert_manga109_annotations_to_coco_format; later cells read this variable.
parsed_xml_coco_annotation = convert_manga109_annotations_to_coco_format(manga109_annotation_file)


In [5]:
import json

# Serialize the converted annotations to a JSON string, pretty-printed with a
# 4-space indent.
json_string = json.dumps(parsed_xml_coco_annotation, indent=4)

# Uncomment to inspect the serialized JSON.
# print(json_string)