In [19]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Directory that is scanned for annotation ``.xml`` files by the conversion cell.
data_path = "../Manga109/annotations"
# Root directory under which the per-manga folders of page CSV files are written.
output_path = "clean_manga109"

# Create the output root up front; exist_ok=True keeps re-running this cell safe.
os.makedirs(output_path, exist_ok=True)

def parse_xml_file(xml_file):
    """Parse one annotation XML file into a plain dictionary.

    Args:
        xml_file: Path (or file object) accepted by ``ElementTree.parse``.

    Returns:
        dict with three keys:
            "title": the root element's ``title`` attribute (None if absent).
            "characters": mapping of character id -> character name.
            "pages": list of page dicts, each holding "page_number", "width",
                "height" and "objects". Every object dict has "id", "type"
                (the element tag) and the integer bounding box "xmin", "ymin",
                "xmax", "ymax". ``face``/``body`` objects additionally carry
                "character_id" and "character_name"; ``text`` objects carry
                "text_content".

        A file without a ``<characters>`` or ``<pages>`` section yields an
        empty mapping / list for that part instead of failing.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        TypeError, ValueError: if a page or object lacks one of the integer
            attributes read below, or the attribute is not a valid integer.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # init containers
    manga_data = {
        "title": root.get("title"),
        "characters": {},
        "pages": []
    }

    # extract character id and names
    # find() returns None when the section is missing; compare with `is not None`
    # because an Element without children is falsy.
    characters_node = root.find("characters")
    if characters_node is not None:
        for character in characters_node:
            manga_data["characters"][character.get("id")] = character.get("name")

    # extract pages and objects
    pages_node = root.find("pages")
    if pages_node is None:
        return manga_data

    for page in pages_node:
        page_data = {
            "page_number": int(page.get("index")),
            "width": int(page.get("width")),
            "height": int(page.get("height")),
            "objects": []
        }

        for obj in page:
            # extract id and bounding box coordinates
            obj_data = {
                "id": obj.get("id"),
                "type": obj.tag,
                "xmin": int(obj.get("xmin")),
                "ymin": int(obj.get("ymin")),
                "xmax": int(obj.get("xmax")),
                "ymax": int(obj.get("ymax"))
            }

            if obj.tag in ("face", "body"):
                # resolve the referenced character; ids that are not listed in
                # the characters section are reported as "Unknown"
                character_id = obj.get("character")
                obj_data["character_id"] = character_id
                obj_data["character_name"] = manga_data["characters"].get(character_id, "Unknown")
            elif obj.tag == "text":
                # element text is None for an empty tag
                obj_data["text_content"] = obj.text.strip() if obj.text else ""

            page_data["objects"].append(obj_data)

        manga_data["pages"].append(page_data)

    return manga_data


def clean_dataset(manga_data, output_dir):
    """Write one CSV file per page of a parsed manga.

    Args:
        manga_data: Dictionary with a "title" entry and a "pages" list; each
            page needs "page_number" and an "objects" list of row dicts.
        output_dir: Existing directory that receives the CSV files.

    Each file is named ``<title>_<page number, zero-padded to 3 digits>.csv``
    and holds one row per object plus a ``page_number`` column.
    """
    book_title = manga_data["title"]

    for page_entry in manga_data["pages"]:
        # One row per annotated object, tagged with the page it belongs to.
        table = pd.DataFrame(page_entry["objects"])
        table["page_number"] = page_entry["page_number"]

        csv_name = f"{book_title}_{page_entry['page_number']:03}.csv"
        table.to_csv(os.path.join(output_dir, csv_name), index=False)



In [20]:
# Names of all annotation files found in the input directory.
xml_files = list(filter(lambda name: name.endswith('.xml'), os.listdir(data_path)))

for title in xml_files:
    # `title` is the XML file name; without its ".xml" suffix it doubles as
    # the name of the per-manga output folder.
    manga_title = title[:-len('.xml')]
    xml_file = os.path.join(data_path, title)

    manga_data = parse_xml_file(xml_file)

    # One sub-folder per manga, filled with one CSV per page.
    output_path_manga = os.path.join(output_path, manga_title)
    os.makedirs(output_path_manga, exist_ok=True)
    clean_dataset(manga_data, output_path_manga)

    print(f"Data cleaned: {manga_data['title']}")


Data cleaned: AisazuNihaIrarenai
Data cleaned: AkkeraKanjinchou
Data cleaned: Akuhamu
Data cleaned: AosugiruHaru
Data cleaned: AppareKappore
Data cleaned: Arisa
Data cleaned: ARMS
Data cleaned: BakuretsuKungFuGirl
Data cleaned: Belmondo
Data cleaned: BEMADER_P
Data cleaned: BokuHaSitatakaKun
Data cleaned: BurariTessenTorimonocho
Data cleaned: ByebyeC-BOY
Data cleaned: AisazuNihaIrarenai
Data cleaned: Arisa
Data cleaned: Count3DeKimeteAgeru
Data cleaned: DollGun
Data cleaned: Donburakokko
Data cleaned: DualJustice
Data cleaned: EienNoWith
Data cleaned: EvaLady
Data cleaned: EverydayOsakanaChan
Data cleaned: GakuenNoise
Data cleaned: GarakutayaManta
Data cleaned: GinNoChimera
Data cleaned: GOOD_KISS_Ver2
Data cleaned: Hamlet
Data cleaned: HanzaiKousyouninMinegishiEitarou
Data cleaned: HaruichibanNoFukukoro
Data cleaned: HarukaRefrain
Data cleaned: HealingPlanet
Data cleaned: HeiseiJimen
Data cleaned: HighschoolKimengumi_vol01
Data cleaned: HighschoolKimengumi_vol20
Data cleaned: Hinagiku

In [37]:
def count_faces_in_panels(csv_file_path):
    """Print, for every panel on one page, how many faces lie inside it.

    Args:
        csv_file_path: Path to a page CSV with a ``type`` column and the
            bounding-box columns ``xmin``, ``ymin``, ``xmax``, ``ymax``.

    Rows whose ``type`` is "frame" are treated as panels. A "face" row is
    counted for a panel when its bounding box lies entirely within the panel's
    box (edges inclusive), so a face crossing a panel border is not counted
    for that panel. Panels are numbered from 1 in file order.

    Returns:
        None. Results are printed. If the file cannot be read, an error
        message is printed instead. A CSV without a ``type`` column (which is
        what a page without any objects looks like) is reported as having
        zero panels.
    """
    try:
        page_df = pd.read_csv(csv_file_path)
    except Exception as e:
        print(f"Error loading CSV file: {e}")
        return

    # A page with no annotated objects has no "type" column at all; treat it as
    # an empty page instead of failing with a KeyError.
    if "type" not in page_df.columns:
        print("Total number of panels found: 0")
        return

    panels = page_df[page_df["type"] == "frame"]
    # Select the face rows once instead of re-filtering on every panel.
    faces = page_df[page_df["type"] == "face"]

    print(f"Total number of panels found: {len(panels)}")

    panel_boxes = zip(panels["xmin"], panels["ymin"], panels["xmax"], panels["ymax"])
    for panel_index, (xmin, ymin, xmax, ymax) in enumerate(panel_boxes, start=1):
        # A face belongs to the panel only if it is fully contained in it.
        inside = (
            (faces["xmin"] >= xmin) &
            (faces["xmax"] <= xmax) &
            (faces["ymin"] >= ymin) &
            (faces["ymax"] <= ymax)
        )
        faces_in_panel = int(inside.sum())

        print(f"Panel {panel_index} has {faces_in_panel} face(s)")

# Input usage:
# Build the path with os.path.join: backslashes inside a normal string literal
# are escape characters ("\A" is an invalid escape sequence) and only work as
# path separators on Windows.
csv_file_path = os.path.join(output_path, "AisazuNihaIrarenai", "AisazuNihaIrarenai_002.csv")  # Replace with the path to the CSV file
count_faces_in_panels(csv_file_path)


Total number of panels found: 12
Panel 1 has 2 face(s)
Panel 2 has 1 face(s)
Panel 3 has 0 face(s)
Panel 4 has 0 face(s)
Panel 5 has 0 face(s)
Panel 6 has 0 face(s)
Panel 7 has 1 face(s)
Panel 8 has 1 face(s)
Panel 9 has 0 face(s)
Panel 10 has 0 face(s)
Panel 11 has 0 face(s)
Panel 12 has 1 face(s)


In [35]:
def calculate_avg_faces_and_texts_across_manga(base_dir):
    """Print per-manga totals and per-panel averages of faces and texts.

    Args:
        base_dir: Directory containing one sub-folder per manga, each holding
            page CSV files with a ``type`` column.

    For every sub-folder the rows of type "frame", "face" and "text" are
    summed over all ``.csv`` files. CSVs without a ``type`` column are
    skipped; files that fail to load are reported and skipped. Averages are
    totals divided by the number of panels, or 0 when there are no panels.
    The table is printed as ``|``-separated lines; nothing is returned.
    """
    summary = {}

    for folder_name in os.listdir(base_dir):
        folder_path = os.path.join(base_dir, folder_name)

        # Only sub-directories represent a manga.
        if not os.path.isdir(folder_path):
            continue

        tally = {"frame": 0, "face": 0, "text": 0}

        for file_name in os.listdir(folder_path):
            if not file_name.endswith('.csv'):
                continue

            csv_file_path = os.path.join(folder_path, file_name)

            try:
                page_df = pd.read_csv(csv_file_path)

                # Pages written without any objects have no 'type' column.
                if 'type' not in page_df.columns:
                    continue

                kinds = page_df["type"]
                for kind in tally:
                    tally[kind] += int((kinds == kind).sum())

            except Exception as e:
                print(f"Error processing file {csv_file_path}: {e}")

        panel_count = tally["frame"]
        face_count = tally["face"]
        text_count = tally["text"]

        # Guard against division by zero for folders without any panel.
        if panel_count > 0:
            faces_per_panel = face_count / panel_count
            texts_per_panel = text_count / panel_count
        else:
            faces_per_panel = 0
            texts_per_panel = 0

        summary[folder_name] = {
            "total_panels": panel_count,
            "total_faces": face_count,
            "total_texts": text_count,
            "avg_num_faces_per_panel": faces_per_panel,
            "avg_num_texts_per_panel": texts_per_panel
        }

    # Header first, then one line per manga in the order the folders were listed.
    print("manga_title | total_panel | total_faces | avg_num_faces_per_panel | total_text | avg_num_text_per_panel")
    for manga_title, stats in summary.items():
        print(f"{manga_title} | {stats['total_panels']} | {stats['total_faces']} | {stats['avg_num_faces_per_panel']:.2f} | {stats['total_texts']} | {stats['avg_num_texts_per_panel']:.2f}")

# Example usage: print the summary table for every manga folder found under
# the given directory.
calculate_avg_faces_and_texts_across_manga('clean_manga109')


manga_title | total_panel | total_faces | avg_num_faces_per_panel | total_text | avg_num_text_per_panel
AisazuNihaIrarenai | 1014 | 1139 | 1.12 | 1473 | 1.45
AkkeraKanjinchou | 975 | 736 | 0.75 | 1340 | 1.37
Akuhamu | 873 | 1207 | 1.38 | 1461 | 1.67
AosugiruHaru | 964 | 632 | 0.66 | 1360 | 1.41
AppareKappore | 772 | 930 | 1.20 | 1246 | 1.61
Arisa | 910 | 1228 | 1.35 | 1391 | 1.53
ARMS | 575 | 327 | 0.57 | 1062 | 1.85
BakuretsuKungFuGirl | 1010 | 1104 | 1.09 | 2189 | 2.17
Belmondo | 921 | 1000 | 1.09 | 1347 | 1.46
BEMADER_P | 1141 | 1354 | 1.19 | 1449 | 1.27
BokuHaSitatakaKun | 1065 | 2526 | 2.37 | 2255 | 2.12
BurariTessenTorimonocho | 1335 | 1235 | 0.93 | 1653 | 1.24
ByebyeC-BOY | 1125 | 1035 | 0.92 | 1439 | 1.28
Copy of AisazuNihaIrarenai | 1014 | 1139 | 1.12 | 1473 | 1.45
Copy of Arisa | 910 | 1228 | 1.35 | 1391 | 1.53
Count3DeKimeteAgeru | 947 | 784 | 0.83 | 1228 | 1.30
DollGun | 865 | 1155 | 1.34 | 1289 | 1.49
Donburakokko | 605 | 582 | 0.96 | 935 | 1.55
DualJustice | 783 | 440 | 0