In [1]:
import os
import pandas as pd

clean_dataset_path = f'../CLEAN_Dataset/clean_manga109'

manga_titles = [f for f in os.listdir(clean_dataset_path)]


### 1. Overall JP char count in EVERY Manga | Total JP char in each page

In [2]:
stat_overall_jp_char_count = {}
stat_total_jp_char_page = {}

for manga_title in os.listdir(clean_dataset_path):
  manga_title_path = os.path.join(clean_dataset_path, manga_title)

  jp_char_count_manga = 0

  for manga_page in os.listdir(manga_title_path):
    page = os.path.join (manga_title_path, manga_page)
    # print(page)

    df = pd.read_csv(page)

    if 'text_content' in df.columns:
      text_content_col = df['text_content']
      jp_char_count_page = 0

      for text in text_content_col[text_content_col.notna()]:
        if text.strip():  # check if text is not just whitespace
          jp_char_count_page += len(text) # Total JP character count in a page
          stat_total_jp_char_page[manga_page] = jp_char_count_page

      jp_char_count_manga += jp_char_count_page # OVERALL JP character count in a manga title
    
  stat_overall_jp_char_count[manga_title] = jp_char_count_manga
  # print(f'Overall JP character count on manga {manga_title}: {jp_char_count_manga}')


### 2 Manga Page count

In [3]:
stat_manga_page_count = {}

for manga_title in os.listdir(clean_dataset_path):
  manga_title_path = os.path.join(clean_dataset_path, manga_title)

  stat_manga_page_count[manga_title] = len(os.listdir(manga_title_path)) * 2

# print(stat_manga_page_count)


### 3. Genre/Demographic Target/Year extract

In [4]:
import requests
from io import StringIO
from bs4 import BeautifulSoup

stat_manga_genre = {}
stat_manga_demo_target = {}
stat_manga_year = {}

url = 'http://www.manga109.org/en/explore.html'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

table = soup.find('table', class_='tablesorter')

df = pd.read_html(StringIO(str(soup)), attrs = {'class': 'tablesorter'})[0]

for index, row in df.iterrows():
  folder_name = row["Folder Name"]

  manga_genre = row["Genre"]
  manga_demo_target = row["Target"]
  manga_year = row["Age"]

  stat_manga_genre[folder_name] = manga_genre
  stat_manga_demo_target[folder_name] = manga_demo_target
  stat_manga_year[folder_name] = manga_year


### 4. Average JP char count in each page of every manga

In [5]:
stat_avg_page_jp_char_count = {}

for title, count in stat_overall_jp_char_count.items():
    stat_avg_page_jp_char_count[title] = count / stat_manga_page_count[title]

# print(stat_avg_page_jp_char_count)

### 5. Number of UNIQUE characters in EVERY Manga

In [6]:
import regex as re

stat_manga_unique_chars_count = {}
stat_manga_unique_chars = {}

for manga_title in os.listdir(clean_dataset_path):
  manga_title_path = os.path.join(clean_dataset_path, manga_title)

  unique_vocab_count = set()

  for manga_page in os.listdir(manga_title_path):
    page = os.path.join (manga_title_path, manga_page)

    df = pd.read_csv(page)

    if 'text_content' in df.columns:
      text_content_col = df['text_content']

      for text in text_content_col[text_content_col.notna()]:
        if text.strip():  # check if text is not just whitespace
          no_punctuation_text = re.sub(r'\p{P}+', '', text) # remove any punctuations. only read letters

          for char in no_punctuation_text:
            unique_vocab_count.add(char)

  stat_manga_unique_chars_count[manga_title] = len(unique_vocab_count)
  stat_manga_unique_chars[manga_title] = unique_vocab_count



### 6. Average Face & Text Across Manga

In [7]:
import os
import pandas as pd

# Dictionaries to store the results for each statistic
stat_total_panel_count = {}
stat_total_face_count = {}
stat_total_textbox_count = {}
stat_avg_face_count_per_panel = {}
stat_avg_textbox_count_per_panel = {}

def calculate_avg_faces_and_texts_across_manga(base_dir):
    # Iterate through each folder in the base directory
    for manga_folder in os.listdir(base_dir):
        manga_path = os.path.join(base_dir, manga_folder)

        # Check if it's a directory
        if os.path.isdir(manga_path):
            total_panels = 0
            total_faces = 0
            total_textboxes = 0

            # Iterate through each CSV file in the manga folder
            for csv_file in os.listdir(manga_path):
                if csv_file.endswith('.csv'):
                    csv_file_path = os.path.join(manga_path, csv_file)

                    # Load the DataFrame from the CSV file
                    try:
                        page_df = pd.read_csv(csv_file_path)
                        
                        # Check if 'type' column exists before proceeding
                        if 'type' not in page_df.columns:
                            continue

                        # Count total panels (frames), faces, and texts
                        total_panels += page_df[page_df["type"] == "frame"].shape[0]
                        total_faces += page_df[page_df["type"] == "face"].shape[0]
                        total_textboxes += page_df[page_df["type"] == "text"].shape[0]

                    except Exception as e:
                        print(f"Error processing file {csv_file_path}: {e}")

            # Calculate average number of faces and texts per panel for the current manga
            avg_num_faces_per_panel = total_faces / total_panels if total_panels > 0 else 0
            avg_num_texts_per_panel = total_textboxes / total_panels if total_panels > 0 else 0
            
            # Store results for the current manga in the provided dictionaries
            stat_total_panel_count[manga_folder] = total_panels
            stat_total_face_count[manga_folder] = total_faces
            stat_total_textbox_count[manga_folder] = total_textboxes
            stat_avg_face_count_per_panel[manga_folder] = avg_num_faces_per_panel
            stat_avg_textbox_count_per_panel[manga_folder] = avg_num_texts_per_panel

# Example usage
calculate_avg_faces_and_texts_across_manga('../CLEAN_Dataset/clean_manga109')

# The results are now accessible in the provided dictionaries
print("Total Panels:", stat_total_panel_count)
print("Total Faces:", stat_total_face_count)
print("Total Textboxes:", stat_total_textbox_count)
print("Average Faces per Panel:", stat_avg_face_count_per_panel)
print("Average Texts per Panel:", stat_avg_textbox_count_per_panel)


Total Panels: {'AisazuNihaIrarenai': 1014, 'AkkeraKanjinchou': 975, 'Akuhamu': 873, 'AosugiruHaru': 964, 'AppareKappore': 772, 'Arisa': 910, 'ARMS': 575, 'BakuretsuKungFuGirl': 1010, 'Belmondo': 921, 'BEMADER_P': 1141, 'BokuHaSitatakaKun': 1065, 'BurariTessenTorimonocho': 1335, 'ByebyeC-BOY': 1125, 'Copy of AisazuNihaIrarenai': 1014, 'Copy of Arisa': 910, 'Count3DeKimeteAgeru': 947, 'DollGun': 865, 'Donburakokko': 605, 'DualJustice': 783, 'EienNoWith': 1385, 'EvaLady': 991, 'EverydayOsakanaChan': 911, 'GakuenNoise': 1076, 'GarakutayaManta': 1168, 'GinNoChimera': 751, 'GOOD_KISS_Ver2': 1104, 'Hamlet': 1626, 'HanzaiKousyouninMinegishiEitarou': 1207, 'HaruichibanNoFukukoro': 955, 'HarukaRefrain': 839, 'HealingPlanet': 791, 'HeiseiJimen': 1008, 'HighschoolKimengumi_vol01': 987, 'HighschoolKimengumi_vol20': 960, 'HinagikuKenzan': 817, 'HisokaReturns': 984, 'JangiriPonpon': 1445, 'JijiBabaFight': 862, 'Joouari': 952, 'Jyovolley': 781, 'KarappoHighschool': 1364, 'KimiHaBokuNoTaiyouDa': 1053, 

### 7. Average text length per textbox

In [8]:
stat_avg_text_length_in_textbox = {}

for key, value in stat_overall_jp_char_count.items():
  stat_avg_text_length_in_textbox[key] = (value) / (stat_total_face_count[key])

print(stat_avg_text_length_in_textbox)

{'AisazuNihaIrarenai': 16.501316944688323, 'AkkeraKanjinchou': 21.394021739130434, 'Akuhamu': 14.942833471416735, 'AosugiruHaru': 20.15981012658228, 'AppareKappore': 14.606451612903227, 'Arisa': 13.439739413680782, 'ARMS': 34.43730886850153, 'BakuretsuKungFuGirl': 22.035326086956523, 'Belmondo': 24.469, 'BEMADER_P': 14.368537666174298, 'BokuHaSitatakaKun': 12.600158353127474, 'BurariTessenTorimonocho': 28.711740890688258, 'ByebyeC-BOY': 16.327536231884057, 'Copy of AisazuNihaIrarenai': 16.501316944688323, 'Copy of Arisa': 13.439739413680782, 'Count3DeKimeteAgeru': 17.238520408163264, 'DollGun': 14.251948051948052, 'Donburakokko': 24.575601374570446, 'DualJustice': 24.506818181818183, 'EienNoWith': 20.20948275862069, 'EvaLady': 24.834272829763247, 'EverydayOsakanaChan': 48.57904411764706, 'GakuenNoise': 16.590956887486858, 'GarakutayaManta': 10.254856254856255, 'GinNoChimera': 16.627674750356633, 'GOOD_KISS_Ver2': 17.359197907585003, 'Hamlet': 16.349027237354086, 'HanzaiKousyouninMinegi

### 8. Average Textboxes per page

In [9]:
stat_avg_textbox_count_per_page = {}

for key, value in stat_total_face_count.items():
  stat_avg_textbox_count_per_page[key] = (value) / (stat_manga_page_count[key])

print(stat_avg_textbox_count_per_page)

{'AisazuNihaIrarenai': 6.058510638297872, 'AkkeraKanjinchou': 4.0, 'Akuhamu': 7.450617283950617, 'AosugiruHaru': 3.0095238095238095, 'AppareKappore': 4.793814432989691, 'Arisa': 6.329896907216495, 'ARMS': 2.0185185185185186, 'BakuretsuKungFuGirl': 5.690721649484536, 'Belmondo': 5.05050505050505, 'BEMADER_P': 5.93859649122807, 'BokuHaSitatakaKun': 12.757575757575758, 'BurariTessenTorimonocho': 5.513392857142857, 'ByebyeC-BOY': 5.50531914893617, 'Copy of AisazuNihaIrarenai': 6.058510638297872, 'Copy of Arisa': 6.329896907216495, 'Count3DeKimeteAgeru': 3.95959595959596, 'DollGun': 6.015625, 'Donburakokko': 3.269662921348315, 'DualJustice': 2.2222222222222223, 'EienNoWith': 4.64, 'EvaLady': 4.068807339449541, 'EverydayOsakanaChan': 3.1627906976744184, 'GakuenNoise': 4.803030303030303, 'GarakutayaManta': 6.247572815533981, 'GinNoChimera': 3.4029126213592233, 'GOOD_KISS_Ver2': 6.036842105263158, 'Hamlet': 6.945945945945946, 'HanzaiKousyouninMinegishiEitarou': 5.18, 'HaruichibanNoFukukoro': 3

### 9. Panel-to-Text Ratio:


In [10]:
# stat_ratio_panel_to_text = {}

# for key, value in stat_total_panel_count.items():
#   stat_ratio_panel_to_text[key] = (value) / (stat_total_textbox_count[key])

# print(stat_ratio_panel_to_text)

{'AisazuNihaIrarenai': 0.6883910386965377, 'AkkeraKanjinchou': 0.7276119402985075, 'Akuhamu': 0.5975359342915811, 'AosugiruHaru': 0.7088235294117647, 'AppareKappore': 0.6195826645264848, 'Arisa': 0.6542056074766355, 'ARMS': 0.5414312617702448, 'BakuretsuKungFuGirl': 0.46139789858382824, 'Belmondo': 0.6837416481069042, 'BEMADER_P': 0.7874396135265701, 'BokuHaSitatakaKun': 0.4722838137472284, 'BurariTessenTorimonocho': 0.8076225045372051, 'ByebyeC-BOY': 0.7817929117442669, 'Copy of AisazuNihaIrarenai': 0.6883910386965377, 'Copy of Arisa': 0.6542056074766355, 'Count3DeKimeteAgeru': 0.7711726384364821, 'DollGun': 0.6710628394103957, 'Donburakokko': 0.6470588235294118, 'DualJustice': 0.902073732718894, 'EienNoWith': 0.7905251141552512, 'EvaLady': 0.6580345285524568, 'EverydayOsakanaChan': 0.47846638655462187, 'GakuenNoise': 0.797037037037037, 'GarakutayaManta': 1.018308631211857, 'GinNoChimera': 0.9114077669902912, 'GOOD_KISS_Ver2': 0.7131782945736435, 'Hamlet': 0.5917030567685589, 'HanzaiK

### 10. Story Unique Character per manga count

In [49]:
import xml.etree.ElementTree as ET

data_path = "../Manga109/annotations"

def get_characters(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    manga_data = []

    for character in root.find("characters"):
        character_id = character.get("id")
        character_name = character.get('name')
        manga_data.append(character_name)
      
    return manga_data

xml_files = [f for f in os.listdir(data_path) if f.endswith('.xml')]

stat_unique_faces = {}
for title in xml_files:
  manga_title = title[:-4]

  xml_file = os.path.join(data_path, f'{manga_title}.xml')

  stat_unique_faces[manga_title] = len(get_characters(xml_file))


### Store Data in CSV

In [51]:
import pandas as pd

data = {
    'path_folder': list(stat_manga_page_count.keys()),
    'page_count': list(stat_manga_page_count.values()),

    'total_jp_chars': list(stat_overall_jp_char_count.values()),
    'total_unique_jp_chars': list(stat_manga_unique_chars_count.values()),
    "total_panel_count": list(stat_total_panel_count.values()),
    "total_textbox_count": list(stat_total_textbox_count.values()),
    "total_face_count": list(stat_total_face_count.values()),
    'total_unique_faces': list(stat_unique_faces.values()),

    # 'genre': list(stat_manga_genre.values()),
    # 'target': list(stat_manga_demo_target.values()),
    # 'year': list(stat_manga_year.values()),

    "ratio_jpChars_to_textbox": list(stat_avg_text_length_in_textbox.values()),
    'ratio_jpChars_to_page': list(stat_avg_page_jp_char_count.values()),
    "ratio_textboxes_to_page": list(stat_avg_textbox_count_per_page.values()),
    "ratio_textboxes_to_panel": list(stat_avg_textbox_count_per_panel.values()),
    "ratio_faces_to_panel": list(stat_avg_face_count_per_panel.values()),
    
    # "ratio_panel_to_text": list(stat_ratio_panel_to_text.values()),
}

# df = pd.DataFrame(data)
df = pd.DataFrame(data)

output_csv_file = 'manga_statistics.csv'

df.to_csv(output_csv_file, index=False)

print(f"CSV file '{output_csv_file}' created successfully.")


CSV file 'manga_statistics.csv' created successfully.
