In [1]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Directory (relative to the notebook's working directory) that holds the annotation XML files.
data_path = "../Manga109/annotations"
# Root directory that receives the generated CSV files, one sub-folder per book.
output_path = "clean_manga109"

# Create the output root up front; exist_ok=True keeps re-running this cell harmless.
os.makedirs(output_path, exist_ok=True)

def _parse_object(obj, characters):
    """Convert one annotated page element into a flat dict (id, tag, bounding box, extras)."""
    # Every annotated object carries an id and an integer bounding box.
    obj_data = {
        "id": obj.get("id"),
        "type": obj.tag,
        "xmin": int(obj.get("xmin")),
        "ymin": int(obj.get("ymin")),
        "xmax": int(obj.get("xmax")),
        "ymax": int(obj.get("ymax")),
    }

    if obj.tag in ("face", "body"):
        # Faces and bodies reference a character by id; resolve it to a name,
        # falling back to "Unknown" when the id is not in the character table.
        character_id = obj.get("character")
        obj_data["character_id"] = character_id
        obj_data["character_name"] = characters.get(character_id, "Unknown")
    elif obj.tag == "text":
        # An element without text content has obj.text == None.
        obj_data["text_content"] = obj.text.strip() if obj.text else ""

    return obj_data


def parse_xml_file(xml_file):
    """Parse one annotation XML file into a plain dictionary.

    Args:
        xml_file: Path to (or open file object for) the annotation XML; it is
            handed straight to ``ET.parse``.

    Returns:
        dict with the keys:
            "title": the root element's ``title`` attribute (``None`` if absent).
            "characters": mapping of character id -> character name.
            "pages": list of page dicts, each holding integer "page_number",
                "width" and "height" plus an "objects" list.
        Every object dict has "id", "type" (the XML tag) and integer
        "xmin"/"ymin"/"xmax"/"ymax". "face" and "body" objects additionally
        carry "character_id" and "character_name" ("Unknown" when the id is
        not in the character table); "text" objects carry "text_content".
        A missing ``<characters>`` or ``<pages>`` section yields an empty
        mapping / list instead of an error.

    Raises:
        xml.etree.ElementTree.ParseError: if the input is not well-formed XML.
        TypeError, ValueError: if a page or object lacks one of the numeric
            attributes read here, or the attribute is not an integer.
    """
    root = ET.parse(xml_file).getroot()

    manga_data = {
        "title": root.get("title"),
        "characters": {},
        "pages": [],
    }

    # Element.find returns None when the section is absent; compare against
    # None explicitly because an Element without children is itself falsy.
    characters_node = root.find("characters")
    if characters_node is not None:
        for character in characters_node:
            manga_data["characters"][character.get("id")] = character.get("name")

    pages_node = root.find("pages")
    if pages_node is not None:
        for page in pages_node:
            manga_data["pages"].append({
                "page_number": int(page.get("index")),
                "width": int(page.get("width")),
                "height": int(page.get("height")),
                "objects": [
                    _parse_object(obj, manga_data["characters"]) for obj in page
                ],
            })

    return manga_data


def clean_dataset(manga_data, output_dir):
    """Write one CSV file per page of a parsed manga into ``output_dir``.

    The object dicts of each page become the rows of a table, a
    ``page_number`` column is filled with that page's number, and the table
    is saved without its index as ``<title>_<page number, zero-padded to 3
    digits>.csv``.

    Args:
        manga_data: dict providing a "title" entry and a "pages" list whose
            items each provide "page_number" and "objects".
        output_dir: directory that receives the CSV files.
    """
    book_title = manga_data["title"]
    for page_info in manga_data["pages"]:
        number = page_info["page_number"]
        # Rows come from the page's objects; the page number is repeated on every row.
        table = pd.DataFrame(page_info["objects"]).assign(page_number=number)

        csv_name = f"{book_title}_{number:03}.csv"
        table.to_csv(os.path.join(output_dir, csv_name), index=False)



In [2]:
# Collect the annotation files: every directory entry whose name ends in ".xml".
xml_files = list(filter(lambda entry: entry.endswith('.xml'), os.listdir(data_path)))

for title in xml_files:
    # Strip the 4-character ".xml" suffix; the remainder names the book's output folder.
    manga_title = title[:-len('.xml')]

    # `title` already ends in ".xml", so it is the annotation file's name as listed.
    xml_file = os.path.join(data_path, title)
    manga_data = parse_xml_file(xml_file)

    # Each book gets its own sub-folder under the output root.
    output_path_manga = os.path.join(output_path, manga_title)
    os.makedirs(output_path_manga, exist_ok=True)
    clean_dataset(manga_data, output_path_manga)

    print(f"Data cleaned: {manga_data['title']}")


Data cleaned: AisazuNihaIrarenai
Data cleaned: AkkeraKanjinchou
Data cleaned: Akuhamu
Data cleaned: AosugiruHaru
Data cleaned: AppareKappore
Data cleaned: Arisa
Data cleaned: ARMS
Data cleaned: BakuretsuKungFuGirl
Data cleaned: Belmondo
Data cleaned: BEMADER_P
Data cleaned: BokuHaSitatakaKun
Data cleaned: BurariTessenTorimonocho
Data cleaned: ByebyeC-BOY
Data cleaned: Count3DeKimeteAgeru
Data cleaned: DollGun
Data cleaned: Donburakokko
Data cleaned: DualJustice
Data cleaned: EienNoWith
Data cleaned: EvaLady
Data cleaned: EverydayOsakanaChan
Data cleaned: GakuenNoise
Data cleaned: GarakutayaManta
Data cleaned: GinNoChimera
Data cleaned: GOOD_KISS_Ver2
Data cleaned: Hamlet
Data cleaned: HanzaiKousyouninMinegishiEitarou
Data cleaned: HaruichibanNoFukukoro
Data cleaned: HarukaRefrain
Data cleaned: HealingPlanet
Data cleaned: HeiseiJimen
Data cleaned: HighschoolKimengumi_vol01
Data cleaned: HighschoolKimengumi_vol20
Data cleaned: HinagikuKenzan
Data cleaned: HisokaReturns
Data cleaned: Jang