In [1]:
import os
import pandas as pd

# Root folder of the cleaned dataset; the cells below treat every entry
# directly under it as one manga title.
clean_dataset_path = '../CLEAN_Dataset/clean_manga109'

# Names of all entries under the dataset root, in os.listdir order.
manga_titles = os.listdir(clean_dataset_path)


### 1. Overall JP char count in EVERY Manga | Total JP char in each page

In [2]:
# Total character count per manga title, summed over all of its page files.
stat_overall_jp_char_count = {}
# Character count per page file, keyed by the page's file name. Only pages
# with at least one non-blank text entry get a key.
# NOTE(review): the key is the bare file name, so if two titles contain page
# files with the same name the later one overwrites the earlier -- confirm
# that page file names are unique across titles.
stat_total_jp_char_page = {}

for manga_title in os.listdir(clean_dataset_path):
  manga_title_path = os.path.join(clean_dataset_path, manga_title)
  jp_char_count_manga = 0

  for manga_page in os.listdir(manga_title_path):
    page = os.path.join(manga_title_path, manga_page)

    # Read every column as text. With dtype inference, a page whose text
    # entries are all numeric (e.g. "3") is parsed as numbers, and the
    # .strip() / len() calls below would raise on them.
    df = pd.read_csv(page, dtype=str)

    if 'text_content' not in df.columns:
      continue

    texts = df['text_content'].dropna()
    # Whitespace-only entries are skipped; every other entry counts with its
    # full length (surrounding whitespace and punctuation included).
    jp_char_count_page = sum(len(text) for text in texts if text.strip())

    # A page is recorded only when it contributed at least one character.
    if jp_char_count_page:
      stat_total_jp_char_page[manga_page] = jp_char_count_page

    jp_char_count_manga += jp_char_count_page

  stat_overall_jp_char_count[manga_title] = jp_char_count_manga


### 1.5 Manga Page count

In [3]:
# Page count per manga title: twice the number of entries in the title's folder.
# NOTE(review): presumably each file covers a two-page spread -- confirm.
stat_manga_page_count = {
    manga_title: len(os.listdir(os.path.join(clean_dataset_path, manga_title))) * 2
    for manga_title in os.listdir(clean_dataset_path)
}

print(stat_manga_page_count)


{'AisazuNihaIrarenai': 188, 'AkkeraKanjinchou': 184, 'Akuhamu': 162, 'AosugiruHaru': 210, 'AppareKappore': 194, 'Arisa': 194, 'ARMS': 162, 'BakuretsuKungFuGirl': 194, 'Belmondo': 198, 'BEMADER_P': 228, 'BokuHaSitatakaKun': 198, 'BurariTessenTorimonocho': 224, 'ByebyeC-BOY': 188, 'Copy of AisazuNihaIrarenai': 188, 'Copy of Arisa': 194, 'Count3DeKimeteAgeru': 198, 'DollGun': 192, 'Donburakokko': 178, 'DualJustice': 198, 'EienNoWith': 250, 'EvaLady': 218, 'EverydayOsakanaChan': 172, 'GakuenNoise': 198, 'GarakutayaManta': 206, 'GinNoChimera': 206, 'GOOD_KISS_Ver2': 190, 'Hamlet': 370, 'HanzaiKousyouninMinegishiEitarou': 200, 'HaruichibanNoFukukoro': 194, 'HarukaRefrain': 198, 'HealingPlanet': 190, 'HeiseiJimen': 222, 'HighschoolKimengumi_vol01': 198, 'HighschoolKimengumi_vol20': 188, 'HinagikuKenzan': 178, 'HisokaReturns': 180, 'JangiriPonpon': 220, 'JijiBabaFight': 146, 'Joouari': 194, 'Jyovolley': 186, 'KarappoHighschool': 242, 'KimiHaBokuNoTaiyouDa': 222, 'KoukouNoHitotachi': 134, 'Kuro

### 1.6 Genre/Demographic Target/Year extract

In [4]:
import requests
from io import StringIO
from bs4 import BeautifulSoup

# Metadata taken from the table on the Manga109 "explore" page, each
# dictionary keyed by the table's "Folder Name" column.
stat_manga_genre = {}
stat_manga_demo_target = {}
stat_manga_year = {}

url = 'http://www.manga109.org/en/explore.html'
# The timeout keeps the cell from hanging forever on an unresponsive server;
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

table = soup.find('table', class_='tablesorter')
if table is None:
  # Fail with an explicit message when the page layout no longer matches.
  raise ValueError(f"No table with class 'tablesorter' found at {url}")

df = pd.read_html(StringIO(str(soup)), attrs={'class': 'tablesorter'})[0]

# Pair every folder name with its value column by column; if a folder name
# repeats, the last row wins (same as assigning row by row).
folder_names = df["Folder Name"]
stat_manga_genre.update(zip(folder_names, df["Genre"]))
stat_manga_demo_target.update(zip(folder_names, df["Target"]))
stat_manga_year.update(zip(folder_names, df["Age"]))


### 2. Average JP char count in each page of every manga

In [24]:
# Average character count per page for every title:
# overall character count divided by the title's page count.
stat_avg_page_jp_char_count = {}

for title, count in stat_overall_jp_char_count.items():
    page_count = stat_manga_page_count[title]
    # A title with no pages gets an average of 0 instead of raising
    # ZeroDivisionError (same convention as the per-panel averages).
    stat_avg_page_jp_char_count[title] = count / page_count if page_count > 0 else 0

print(stat_avg_page_jp_char_count)

{'AisazuNihaIrarenai': 99.97340425531915, 'AkkeraKanjinchou': 85.57608695652173, 'Akuhamu': 111.33333333333333, 'AosugiruHaru': 60.67142857142857, 'AppareKappore': 70.02061855670104, 'Arisa': 85.0721649484536, 'ARMS': 69.51234567901234, 'BakuretsuKungFuGirl': 125.39690721649484, 'Belmondo': 123.58080808080808, 'BEMADER_P': 85.32894736842105, 'BokuHaSitatakaKun': 160.74747474747474, 'BurariTessenTorimonocho': 158.29910714285714, 'ByebyeC-BOY': 89.88829787234043, 'Copy of AisazuNihaIrarenai': 99.97340425531915, 'Copy of Arisa': 85.0721649484536, 'Count3DeKimeteAgeru': 68.25757575757575, 'DollGun': 85.734375, 'Donburakokko': 80.35393258426966, 'DualJustice': 54.45959595959596, 'EienNoWith': 93.772, 'EvaLady': 101.04587155963303, 'EverydayOsakanaChan': 153.6453488372093, 'GakuenNoise': 79.68686868686869, 'GarakutayaManta': 64.06796116504854, 'GinNoChimera': 56.58252427184466, 'GOOD_KISS_Ver2': 104.79473684210527, 'Hamlet': 113.55945945945946, 'HanzaiKousyouninMinegishiEitarou': 122.895, 'H

### 3. Number of UNIQUE characters in EVERY Manga

In [25]:
import regex as re

# Number of distinct characters per manga title, and the character sets themselves.
stat_manga_unique_chars_count = {}
stat_manga_unique_chars = {}

# Matches runs of Unicode punctuation (\p{P} needs the `regex` module,
# imported here as `re`). Compiled once instead of once per text entry.
punctuation_pattern = re.compile(r'\p{P}+')

for manga_title in os.listdir(clean_dataset_path):
  manga_title_path = os.path.join(clean_dataset_path, manga_title)

  unique_chars = set()

  for manga_page in os.listdir(manga_title_path):
    page = os.path.join(manga_title_path, manga_page)

    # Read every column as text. With dtype inference, a page whose text
    # entries are all numeric is parsed as numbers, and .strip() / the regex
    # substitution below would raise on them.
    df = pd.read_csv(page, dtype=str)

    if 'text_content' not in df.columns:
      continue

    for text in df['text_content'].dropna():
      if not text.strip():  # skip whitespace-only entries
        continue
      # Only punctuation is removed; any whitespace, digits or symbols left
      # in the text are still added to the set.
      unique_chars.update(punctuation_pattern.sub('', text))

  stat_manga_unique_chars_count[manga_title] = len(unique_chars)
  stat_manga_unique_chars[manga_title] = unique_chars



### 4. Average Face & Text Across Manga

In [26]:
import os
import pandas as pd

# Per-manga result tables, keyed by folder name and filled in by
# calculate_avg_faces_and_texts_across_manga().
stat_total_panels = {}
stat_total_faces = {}
stat_total_texts = {}
stat_avg_num_faces_per_panel = {}
stat_avg_num_texts_per_panel = {}

def calculate_avg_faces_and_texts_across_manga(base_dir):
    """Count frames, faces and texts for every manga folder under ``base_dir``.

    Each sub-directory of ``base_dir`` is treated as one manga. Every ``.csv``
    file inside it that has a ``type`` column contributes the number of rows
    whose ``type`` is ``"frame"``, ``"face"`` or ``"text"``. Files that cannot
    be processed are reported with ``print`` and skipped.

    Nothing is returned; the results are written into the module-level
    dictionaries ``stat_total_panels``, ``stat_total_faces``,
    ``stat_total_texts``, ``stat_avg_num_faces_per_panel`` and
    ``stat_avg_num_texts_per_panel``, keyed by folder name. Both averages are
    0 for a manga without any frame rows.
    """
    for manga_folder in os.listdir(base_dir):
        manga_path = os.path.join(base_dir, manga_folder)

        # Plain files lying next to the manga folders are ignored.
        if not os.path.isdir(manga_path):
            continue

        # Running totals for this manga, one slot per annotation type.
        counts = {"frame": 0, "face": 0, "text": 0}

        for csv_file in os.listdir(manga_path):
            if not csv_file.endswith('.csv'):
                continue

            csv_file_path = os.path.join(manga_path, csv_file)

            try:
                page_df = pd.read_csv(csv_file_path)

                # Pages without a 'type' column carry nothing to count.
                if 'type' not in page_df.columns:
                    continue

                type_col = page_df["type"]
                for kind in counts:
                    counts[kind] += int((type_col == kind).sum())

            except Exception as e:
                print(f"Error processing file {csv_file_path}: {e}")

        n_panels = counts["frame"]
        n_faces = counts["face"]
        n_texts = counts["text"]

        # Guard against manga without any frame rows.
        if n_panels > 0:
            faces_per_panel = n_faces / n_panels
            texts_per_panel = n_texts / n_panels
        else:
            faces_per_panel = 0
            texts_per_panel = 0

        stat_total_panels[manga_folder] = n_panels
        stat_total_faces[manga_folder] = n_faces
        stat_total_texts[manga_folder] = n_texts
        stat_avg_num_faces_per_panel[manga_folder] = faces_per_panel
        stat_avg_num_texts_per_panel[manga_folder] = texts_per_panel

# Run the aggregation over the cleaned dataset folder.
clean_manga_dataset_path = '../CLEAN_Dataset/clean_manga109'
calculate_avg_faces_and_texts_across_manga(clean_manga_dataset_path)

# Dump every result dictionary with its label.
for label, stats in (
    ("Total Panels:", stat_total_panels),
    ("Total Faces:", stat_total_faces),
    ("Total Texts:", stat_total_texts),
    ("Average Faces per Panel:", stat_avg_num_faces_per_panel),
    ("Average Texts per Panel:", stat_avg_num_texts_per_panel),
):
    print(label, stats)


Total Panels: {'AisazuNihaIrarenai': 1014, 'AkkeraKanjinchou': 975, 'Akuhamu': 873, 'AosugiruHaru': 964, 'AppareKappore': 772, 'Arisa': 910, 'ARMS': 575, 'BakuretsuKungFuGirl': 1010, 'Belmondo': 921, 'BEMADER_P': 1141, 'BokuHaSitatakaKun': 1065, 'BurariTessenTorimonocho': 1335, 'ByebyeC-BOY': 1125, 'Copy of AisazuNihaIrarenai': 1014, 'Copy of Arisa': 910, 'Count3DeKimeteAgeru': 947, 'DollGun': 865, 'Donburakokko': 605, 'DualJustice': 783, 'EienNoWith': 1385, 'EvaLady': 991, 'EverydayOsakanaChan': 911, 'GakuenNoise': 1076, 'GarakutayaManta': 1168, 'GinNoChimera': 751, 'GOOD_KISS_Ver2': 1104, 'Hamlet': 1626, 'HanzaiKousyouninMinegishiEitarou': 1207, 'HaruichibanNoFukukoro': 955, 'HarukaRefrain': 839, 'HealingPlanet': 791, 'HeiseiJimen': 1008, 'HighschoolKimengumi_vol01': 987, 'HighschoolKimengumi_vol20': 960, 'HinagikuKenzan': 817, 'HisokaReturns': 984, 'JangiriPonpon': 1445, 'JijiBabaFight': 862, 'Joouari': 952, 'Jyovolley': 781, 'KarappoHighschool': 1364, 'KimiHaBokuNoTaiyouDa': 1053, 

### Store Data in CSV

In [27]:
import pandas as pd

# Output column name -> per-title statistics dictionary feeding that column.
stat_columns = {
    'page_count': stat_manga_page_count,
    'overall_jp_chars': stat_overall_jp_char_count,
    'avg_page_jp_char': stat_avg_page_jp_char_count,
    'total_unique_chars': stat_manga_unique_chars_count,

    # Genre / target / year (stat_manga_genre, stat_manga_demo_target,
    # stat_manga_year) are currently left out of the export.

    "total_panel_count": stat_total_panels,
    "total_face_count": stat_total_faces,
    "total_textbox_count": stat_total_texts,
    "avg_face_per_panel": stat_avg_num_faces_per_panel,
    "avg_textbox_per_panel": stat_avg_num_texts_per_panel,
}

# Row order follows the page-count dictionary.
titles = list(stat_manga_page_count)

# Every statistic must cover exactly the same titles. Looking values up by
# title (instead of relying on each dictionary's insertion order) keeps a row
# from silently receiving another title's numbers.
for column, stats in stat_columns.items():
    missing = [title for title in titles if title not in stats]
    extra = [title for title in stats if title not in stat_manga_page_count]
    if missing or extra:
        raise ValueError(
            f"Column '{column}' does not match the title list: "
            f"missing={missing}, unexpected={extra}"
        )

data = {'path_folder': titles}
for column, stats in stat_columns.items():
    data[column] = [stats[title] for title in titles]

df = pd.DataFrame(data)

output_csv_file = 'manga_statistics.csv'

df.to_csv(output_csv_file, index=False)

print(f"CSV file '{output_csv_file}' created successfully.")


CSV file 'manga_statistics.csv' created successfully.
