In [1]:
import os
import pandas as pd

# Root folder of the cleaned dataset; every entry directly inside it is treated
# as one manga title by the cells below.
clean_dataset_path = '../CLEAN_Dataset/clean_manga109'

# Names of all entries found directly under the dataset root.
manga_titles = os.listdir(clean_dataset_path)


### 1. Overall JP char count in EVERY Manga | Total JP char in each page

In [2]:
# Character totals gathered from the `text_content` column of every page CSV.
#   stat_overall_jp_char_count: manga title -> characters summed over all its pages
#   stat_total_jp_char_page:    "<manga title>/<page file>" -> characters on that page
stat_overall_jp_char_count = {}
stat_total_jp_char_page = {}

for manga_title in os.listdir(clean_dataset_path):
  manga_title_path = os.path.join(clean_dataset_path, manga_title)

  jp_char_count_manga = 0

  for manga_page in os.listdir(manga_title_path):
    page = os.path.join(manga_title_path, manga_page)

    # dtype=str stops pandas from parsing an all-digit text column as numbers,
    # which would make text.strip() / len(text) fail further down.
    df = pd.read_csv(page, dtype=str)

    if 'text_content' not in df.columns:
      continue

    # len(text) counts every character of each non-missing, non-blank entry
    # (punctuation and inner whitespace included).
    jp_char_count_page = sum(
      len(text) for text in df['text_content'].dropna() if text.strip()
    )

    # Page file names are not guaranteed to be unique across titles, so the key
    # carries the title as well; keying on the file name alone lets one title
    # overwrite another title's page. Pages without text are recorded as 0.
    stat_total_jp_char_page[f'{manga_title}/{manga_page}'] = jp_char_count_page

    jp_char_count_manga += jp_char_count_page # OVERALL JP character count in a manga title

  stat_overall_jp_char_count[manga_title] = jp_char_count_manga
  # print(f'Overall JP character count on manga {manga_title}: {jp_char_count_manga}')


### 2. Manga Page count

In [3]:
# manga title -> page count, computed as twice the number of entries in the
# title's folder.
# NOTE(review): the factor 2 presumably means each file covers a two-page
# spread -- confirm against the dataset layout.
stat_manga_page_count = {
  title: 2 * len(os.listdir(os.path.join(clean_dataset_path, title)))
  for title in os.listdir(clean_dataset_path)
}

# print(stat_manga_page_count)


### 4. Average JP char count in each page of every manga

In [4]:
# manga title -> average number of characters per page
# (total characters of the title divided by its page count).
stat_avg_jp_char_count_per_page = {}

for title, count in stat_overall_jp_char_count.items():
    page_count = stat_manga_page_count[title]
    # A title with no pages gets 0 instead of raising ZeroDivisionError,
    # mirroring how the per-panel averages handle an empty denominator.
    stat_avg_jp_char_count_per_page[title] = count / page_count if page_count > 0 else 0

# print(stat_avg_jp_char_count_per_page)

### 5. Number of UNIQUE characters in EVERY Manga

In [5]:
import regex as re

# Distinct characters used in the `text_content` of each manga title.
#   stat_manga_unique_chars:       manga title -> set of distinct characters
#   stat_manga_unique_chars_count: manga title -> size of that set
stat_manga_unique_chars_count = {}
stat_manga_unique_chars = {}

# Runs of Unicode punctuation. The \p{P} class relies on `re` being the `regex`
# package imported at the top of this cell; the standard-library `re` module
# does not support it.
punctuation_run = re.compile(r'\p{P}+')

for manga_title in os.listdir(clean_dataset_path):
  manga_title_path = os.path.join(clean_dataset_path, manga_title)

  unique_chars = set()

  for manga_page in os.listdir(manga_title_path):
    page = os.path.join(manga_title_path, manga_page)

    # dtype=str stops pandas from parsing an all-digit text column as numbers,
    # which would make text.strip() fail further down.
    df = pd.read_csv(page, dtype=str)

    if 'text_content' not in df.columns:
      continue

    for text in df['text_content'].dropna():
      if text.strip():  # skip entries that are only whitespace
        # Only punctuation is removed: digits, symbols and any whitespace
        # inside the text still count as characters.
        unique_chars.update(punctuation_run.sub('', text))

  stat_manga_unique_chars_count[manga_title] = len(unique_chars)
  stat_manga_unique_chars[manga_title] = unique_chars



### 6. Average Face & Text Across Manga

In [22]:
import os
import pandas as pd

# Per-title results; calculate_avg_faces_and_texts_across_manga() fills these in.
stat_total_panel_count = {}
stat_total_face_count = {}
stat_total_textbox_count = {}
stat_avg_face_count_per_panel = {}
stat_avg_textbox_count_per_panel = {}

def calculate_avg_faces_and_texts_across_manga(base_dir):
    """Count frames, faces and text boxes for every manga folder in `base_dir`.

    Each sub-directory of `base_dir` is one manga title. Every `.csv` file in it
    is read, and the rows whose `type` column equals "frame", "face" or "text"
    are counted. Files without a `type` column are ignored; a file that fails to
    load is reported on stdout and skipped.

    Nothing is returned. The totals and the per-panel averages (0 when a title
    has no panels) are written into the module-level `stat_*` dictionaries,
    keyed by folder name.
    """
    for manga_folder in os.listdir(base_dir):
        manga_path = os.path.join(base_dir, manga_folder)

        # Plain files sitting next to the manga folders are not titles.
        if not os.path.isdir(manga_path):
            continue

        # Running totals for this title, keyed by the value of the `type` column.
        tallies = {"frame": 0, "face": 0, "text": 0}

        csv_names = [name for name in os.listdir(manga_path) if name.endswith('.csv')]
        for csv_name in csv_names:
            csv_file_path = os.path.join(manga_path, csv_name)

            try:
                page_df = pd.read_csv(csv_file_path)

                # Pages lacking a `type` column contribute nothing.
                if 'type' in page_df.columns:
                    kinds = page_df["type"]
                    for label in tallies:
                        tallies[label] += int((kinds == label).sum())

            except Exception as e:
                print(f"Error processing file {csv_file_path}: {e}")

        total_panels = tallies["frame"]
        total_faces = tallies["face"]
        total_textboxes = tallies["text"]

        # Averages fall back to 0 when the title has no panels at all.
        if total_panels > 0:
            faces_per_panel = total_faces / total_panels
            texts_per_panel = total_textboxes / total_panels
        else:
            faces_per_panel = 0
            texts_per_panel = 0

        stat_total_panel_count[manga_folder] = total_panels
        stat_total_face_count[manga_folder] = total_faces
        stat_total_textbox_count[manga_folder] = total_textboxes
        stat_avg_face_count_per_panel[manga_folder] = faces_per_panel
        stat_avg_textbox_count_per_panel[manga_folder] = texts_per_panel

# Fill the per-title panel / face / textbox dictionaries. The shared
# clean_dataset_path is used (rather than repeating the literal path) so this
# section always reads the same dataset as the other sections.
calculate_avg_faces_and_texts_across_manga(clean_dataset_path)

# The results are now accessible in the module-level dictionaries
print("Total Panels:", stat_total_panel_count)
print("Total Faces:", stat_total_face_count)
print("Total Textboxes:", stat_total_textbox_count)
print("Average Faces per Panel:", stat_avg_face_count_per_panel)
print("Average Texts per Panel:", stat_avg_textbox_count_per_panel)


Total Panels: {'AisazuNihaIrarenai': 1014, 'AkkeraKanjinchou': 975, 'Akuhamu': 873, 'AosugiruHaru': 964, 'AppareKappore': 772, 'Arisa': 910, 'ARMS': 575, 'BakuretsuKungFuGirl': 1010, 'Belmondo': 921, 'BEMADER_P': 1141, 'BokuHaSitatakaKun': 1065, 'BurariTessenTorimonocho': 1335, 'ByebyeC-BOY': 1125, 'Count3DeKimeteAgeru': 947, 'DollGun': 865, 'Donburakokko': 605, 'DualJustice': 783, 'EienNoWith': 1385, 'EvaLady': 991, 'EverydayOsakanaChan': 911, 'GakuenNoise': 1076, 'GarakutayaManta': 1168, 'GinNoChimera': 751, 'GOOD_KISS_Ver2': 1104, 'Hamlet': 1626, 'HanzaiKousyouninMinegishiEitarou': 1207, 'HaruichibanNoFukukoro': 955, 'HarukaRefrain': 839, 'HealingPlanet': 791, 'HeiseiJimen': 1008, 'HighschoolKimengumi_vol01': 987, 'HighschoolKimengumi_vol20': 960, 'HinagikuKenzan': 817, 'HisokaReturns': 984, 'JangiriPonpon': 1445, 'JijiBabaFight': 862, 'Joouari': 952, 'Jyovolley': 781, 'KarappoHighschool': 1364, 'KimiHaBokuNoTaiyouDa': 1053, 'KoukouNoHitotachi': 938, 'KuroidoGanka': 1029, 'KyokugenC

### 7. Average text length per textbox

In [7]:
# manga title -> average number of characters per text box
# (total characters of the title divided by its number of text boxes).
stat_avg_text_length_in_textbox = {}

for key, value in stat_overall_jp_char_count.items():
  textbox_count = stat_total_textbox_count[key]
  # A title without any text box gets 0 instead of raising ZeroDivisionError,
  # mirroring how the per-panel averages handle an empty denominator.
  stat_avg_text_length_in_textbox[key] = value / textbox_count if textbox_count > 0 else 0

print(stat_avg_text_length_in_textbox)

{'AisazuNihaIrarenai': 12.759674134419551, 'AkkeraKanjinchou': 11.750746268656716, 'Akuhamu': 12.344969199178644, 'AosugiruHaru': 9.368382352941177, 'AppareKappore': 10.902086677367576, 'Arisa': 11.864845434938893, 'ARMS': 10.603578154425612, 'BakuretsuKungFuGirl': 11.113293741434445, 'Belmondo': 18.165553080920564, 'BEMADER_P': 13.426501035196688, 'BokuHaSitatakaKun': 14.114412416851442, 'BurariTessenTorimonocho': 21.451300665456746, 'ByebyeC-BOY': 11.743571924947881, 'Count3DeKimeteAgeru': 11.0057003257329, 'DollGun': 12.770364623739333, 'Donburakokko': 15.297326203208556, 'DualJustice': 12.422811059907835, 'EienNoWith': 13.380707762557078, 'EvaLady': 14.626826029216467, 'EverydayOsakanaChan': 13.879726890756302, 'GakuenNoise': 11.687407407407408, 'GarakutayaManta': 11.506538796861378, 'GinNoChimera': 14.145631067961165, 'GOOD_KISS_Ver2': 12.862403100775193, 'Hamlet': 15.290029112081514, 'HanzaiKousyouninMinegishiEitarou': 14.331778425655976, 'HaruichibanNoFukukoro': 10.2581743869209

### 8. Average Textboxes per page

In [8]:
# manga title -> average number of text boxes per page
# (text boxes of the title divided by its page count).
stat_avg_textbox_count_per_page = {}

for key, value in stat_total_textbox_count.items():
  page_count = stat_manga_page_count[key]
  # A title with no pages gets 0 instead of raising ZeroDivisionError,
  # mirroring how the per-panel averages handle an empty denominator.
  stat_avg_textbox_count_per_page[key] = value / page_count if page_count > 0 else 0

print(stat_avg_textbox_count_per_page)

{'AisazuNihaIrarenai': 7.835106382978723, 'AkkeraKanjinchou': 7.282608695652174, 'Akuhamu': 9.018518518518519, 'AosugiruHaru': 6.476190476190476, 'AppareKappore': 6.422680412371134, 'Arisa': 7.170103092783505, 'ARMS': 6.555555555555555, 'BakuretsuKungFuGirl': 11.283505154639176, 'Belmondo': 6.803030303030303, 'BEMADER_P': 6.355263157894737, 'BokuHaSitatakaKun': 11.38888888888889, 'BurariTessenTorimonocho': 7.379464285714286, 'ByebyeC-BOY': 7.654255319148936, 'Count3DeKimeteAgeru': 6.202020202020202, 'DollGun': 6.713541666666667, 'Donburakokko': 5.252808988764045, 'DualJustice': 4.383838383838384, 'EienNoWith': 7.008, 'EvaLady': 6.908256880733945, 'EverydayOsakanaChan': 11.069767441860465, 'GakuenNoise': 6.818181818181818, 'GarakutayaManta': 5.567961165048544, 'GinNoChimera': 4.0, 'GOOD_KISS_Ver2': 8.147368421052631, 'Hamlet': 7.427027027027027, 'HanzaiKousyouninMinegishiEitarou': 8.575, 'HaruichibanNoFukukoro': 7.56701030927835, 'HarukaRefrain': 5.797979797979798, 'HealingPlanet': 5.24

### 9. Panel-to-Text Ratio:


In [9]:
# stat_ratio_panel_to_text = {}

# for key, value in stat_total_panel_count.items():
#   stat_ratio_panel_to_text[key] = (value) / (stat_total_textbox_count[key])

# print(stat_ratio_panel_to_text)

### 10. Number of unique story characters per manga

In [10]:
import xml.etree.ElementTree as ET

# Directory scanned below for annotation files: every `*.xml` file found in it
# is parsed, and its file name without the extension is used as the manga title.
data_path = "../Manga109/annotations"

def get_characters(xml_file):
    """Return the `name` attribute of every entry listed under `<characters>`.

    Parameters
    ----------
    xml_file : path or file object
        Anything accepted by `xml.etree.ElementTree.parse`.

    Returns
    -------
    list
        One item per child element of the root's `<characters>` element, in
        document order. An item is None when the child has no `name` attribute.
        The list is empty when the document has no `<characters>` element.
    """
    root = ET.parse(xml_file).getroot()

    characters = root.find("characters")
    # find() returns None when the element is absent; iterating None would raise.
    if characters is None:
        return []

    return [character.get('name') for character in characters]

# Annotation files available in data_path.
xml_files = [name for name in os.listdir(data_path) if name.endswith('.xml')]

# manga title (file name minus the 4-character ".xml" suffix) -> number of
# entries returned by get_characters() for that title's annotation file.
stat_unique_faces = {
  xml_name[:-4]: len(get_characters(os.path.join(data_path, xml_name)))
  for xml_name in xml_files
}


### 11. Extract Genre from Manga109.org

In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

# Per-title metadata taken from the table on the Manga109 "explore" page.
#   stat_manga_genre:        manga title -> 'Genre' column
#   stat_manga_demographics: manga title -> 'Target' column
#   stat_manga_year:         manga title -> 'Age' column
stat_manga_genre = {}
stat_manga_demographics = {}
stat_manga_year = {}

url = 'http://www.manga109.org/en/explore.html'
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# "no tables found" failure while parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

df = pd.read_html(StringIO(str(soup)), attrs={'class': 'tablesorter'})[0]

# Map each 'Folder Name' from the table to its metadata for easy lookup later
metadata_mapping = {
    row['Folder Name']: {
        'genre': row['Genre'],
        'demographics': row['Target'],
        'year': row['Age'],
    }
    for _, row in df.iterrows()
}

# Titles for which the table has no matching row.
unmatched_titles = []

# Assign the metadata to every title present in the text box count dictionary
for manga_title in stat_total_textbox_count.keys():
    # Titles with volume info are looked up by the part before "_vol";
    # split() returns the whole title when "_vol" is absent.
    base_title = manga_title.split('_vol')[0]

    metadata = metadata_mapping.get(base_title)
    if metadata is None:
        unmatched_titles.append(manga_title)
        continue

    stat_manga_genre[manga_title] = metadata['genre']
    stat_manga_demographics[manga_title] = metadata['demographics']
    stat_manga_year[manga_title] = metadata['year']

# Make gaps visible: a skipped title leaves these dictionaries shorter than the
# other per-title statistics.
if unmatched_titles:
    print("No metadata found for:", unmatched_titles)

print("Genre:", stat_manga_genre)
print("Demographics:", stat_manga_demographics)
print("Year:", stat_manga_year)


Genre: {'AisazuNihaIrarenai': 'love romance', 'AkkeraKanjinchou': 'battle', 'Akuhamu': 'four-frame cartoons', 'AosugiruHaru': 'love romance', 'AppareKappore': 'science fiction', 'Arisa': 'romantic comedy', 'ARMS': 'science fiction', 'BakuretsuKungFuGirl': 'romantic comedy', 'Belmondo': 'fantasy', 'BEMADER_P': 'science fiction', 'BokuHaSitatakaKun': 'humor', 'BurariTessenTorimonocho': 'historical drama', 'ByebyeC-BOY': 'science fiction', 'Count3DeKimeteAgeru': 'sports', 'DollGun': 'battle', 'Donburakokko': 'romantic comedy', 'DualJustice': 'battle', 'EienNoWith': 'animal', 'EvaLady': 'science fiction', 'EverydayOsakanaChan': 'animal', 'GakuenNoise': 'battle', 'GarakutayaManta': 'humor', 'GinNoChimera': 'fantasy', 'GOOD_KISS_Ver2': 'love romance', 'Hamlet': 'historical drama', 'HanzaiKousyouninMinegishiEitarou': 'suspense', 'HaruichibanNoFukukoro': 'love romance', 'HarukaRefrain': 'science fiction', 'HealingPlanet': 'science fiction', 'HeiseiJimen': 'humor', 'HighschoolKimengumi_vol01': 

In [26]:
# def filter_lone_vol_with_dupes(data):
#   for base_title in list(data.keys()):
#       # Check if base title has a volume-specific counterpart in stat_total_textbox_count
#       volume_titles = [title for title in stat_total_textbox_count if title.startswith(base_title + "_vol")]
      
#       # If volume-specific entries exist, remove the lone base title from stat_manga_genre
#       if volume_titles:
#           data.pop(base_title)

# # Now `stat_manga_genre` should have the same keys as `stat_total_textbox_count`
# filter_lone_vol_with_dupes(stat_manga_genre)
# filter_lone_vol_with_dupes(stat_manga_demographics)
# filter_lone_vol_with_dupes(stat_manga_year)

# Show how many titles received each kind of metadata.
for label, mapping in (
    ("Genre Dictionary:", stat_manga_genre),
    ("Demographics Dictionary:", stat_manga_demographics),
    ("Year Dictionary:", stat_manga_year),
):
    print(label, len(mapping))


Genre Dictionary: 109
Demographics Dictionary: 109
Year Dictionary: 109


### Store Data in CSV

In [29]:
import pandas as pd

# Output column name -> per-title statistic dictionary (each keyed by manga title).
stat_columns = {
    'page_count': stat_manga_page_count,

    'genre': stat_manga_genre,
    'target': stat_manga_demographics,
    'year': stat_manga_year,

    'total_jp_chars': stat_overall_jp_char_count,
    'total_unique_jp_chars': stat_manga_unique_chars_count,
    "total_panel_count": stat_total_panel_count,
    "total_textbox_count": stat_total_textbox_count,
    "total_face_count": stat_total_face_count,
    'total_unique_faces': stat_unique_faces,

    "avg_text_length_in_textbox": stat_avg_text_length_in_textbox,
    'avg_jp_char_count_per_page': stat_avg_jp_char_count_per_page,
    "avg_textbox_count_per_page": stat_avg_textbox_count_per_page,
    "avg_textbox_count_per_panel": stat_avg_textbox_count_per_panel,
    "avg_face_count_per_panel": stat_avg_face_count_per_panel,

    # "ratio_panel_to_text": stat_ratio_panel_to_text,
}

# Row order follows stat_manga_page_count, one row per title.
titles = list(stat_manga_page_count.keys())

# Report gaps instead of letting them shift or break the table.
for column_name, values in stat_columns.items():
    missing_titles = [title for title in titles if title not in values]
    if missing_titles:
        print(f"Warning: '{column_name}' has no value for {len(missing_titles)} title(s): {missing_titles}")

# Every value is looked up by title, so a row can never mix statistics of
# different titles even when the dictionaries were filled in different orders
# (they come from separate directory listings). Missing values become None.
data = {'path_folder': titles}
for column_name, values in stat_columns.items():
    data[column_name] = [values.get(title) for title in titles]

df = pd.DataFrame(data)

output_csv_file = 'manga_statistics.csv'

df.to_csv(output_csv_file, index=False)

print(f"CSV file '{output_csv_file}' created successfully.")


CSV file 'manga_statistics.csv' created successfully.


In [30]:
# Number of titles per genre, followed by one blank line.
genre_counts = df['genre'].value_counts()
print(genre_counts, end="\n\n")

genre
humor                  15
science fiction        14
love romance           13
romantic comedy        13
fantasy                12
sports                 10
battle                  9
historical drama        6
four-frame cartoons     5
animal                  5
suspense                5
horror                  2
Name: count, dtype: int64

