In [1]:
import os
import pandas as pd

# Root folder of the cleaned dataset; every entry directly under it is treated as one manga title.
clean_dataset_path = '../CLEAN_Dataset/clean_manga109'

# Names of the entries found under the dataset root, in the order os.listdir returns them.
manga_titles = os.listdir(clean_dataset_path)


### 1. Overall JP char count in EVERY Manga | Total JP char in each page

In [2]:
# Character totals read from the `text_content` column of every page CSV.
# A "character" here is every character of a non-blank text cell (len(text)),
# so punctuation and whitespace inside a cell are included in the count.
stat_overall_jp_char_count = {}  # manga title -> characters summed over all of its pages
stat_total_jp_char_page = {}     # page file name -> characters on that page

# NOTE(review): stat_total_jp_char_page is keyed by the bare page file name. If two titles
# contain a page file with the same name, the later one overwrites the earlier one.
# Confirm page file names are unique across titles, or key by (title, page) instead.

for manga_title in os.listdir(clean_dataset_path):
  manga_title_path = os.path.join(clean_dataset_path, manga_title)

  jp_char_count_manga = 0

  for manga_page in os.listdir(manga_title_path):
    page = os.path.join(manga_title_path, manga_page)

    # Force text_content to be read as strings: without this, a page whose only
    # text cells look numeric is parsed as int/float and text.strip() would fail.
    df = pd.read_csv(page, dtype={'text_content': str})

    jp_char_count_page = 0

    if 'text_content' in df.columns:
      for text in df['text_content'].dropna():
        if text.strip():  # skip cells that contain only whitespace
          jp_char_count_page += len(text)

    # Record the page total once, after the whole page has been summed, so that
    # pages without any text are still present (with 0) in the per-page stats.
    stat_total_jp_char_page[manga_page] = jp_char_count_page

    jp_char_count_manga += jp_char_count_page

  stat_overall_jp_char_count[manga_title] = jp_char_count_manga


### 1.5 Manga Page count

In [None]:
# Page count per title = number of directory entries inside that title's folder.
stat_manga_page_count = {
  title: len(os.listdir(os.path.join(clean_dataset_path, title)))
  for title in os.listdir(clean_dataset_path)
}

print(stat_manga_page_count)


### 1.6 Extract genre, demographic target, and year

In [31]:
import requests
from io import StringIO
from bs4 import BeautifulSoup

# Scrape the title table from the Manga109 "explore" page and build three lookups
# keyed by the table's "Folder Name" column.
url = 'http://www.manga109.org/en/explore.html'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing parse failure below.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Fail early with a clear message if the expected table is not in the page.
table = soup.find('table', class_='tablesorter')
if table is None:
  raise ValueError(f"No <table class='tablesorter'> found at {url}")

df = pd.read_html(StringIO(str(soup)), attrs = {'class': 'tablesorter'})[0]

# Folder Name -> Genre / Target / Age. If a folder name appears in more than one row,
# the last row wins.
stat_manga_genre = dict(zip(df["Folder Name"], df["Genre"]))
stat_manga_demo_target = dict(zip(df["Folder Name"], df["Target"]))
stat_manga_year = dict(zip(df["Folder Name"], df["Age"]))


### 2. Average JP char count in each page of every manga

In [None]:
# Average characters per page for each title: the title's total character count
# divided by the title's page count.
stat_avg_page_jp_char_count = {}

for title, count in stat_overall_jp_char_count.items():
    page_count = stat_manga_page_count[title]
    # A title folder with no entries has page_count == 0; report 0.0 for it
    # instead of stopping the whole cell with ZeroDivisionError.
    stat_avg_page_jp_char_count[title] = count / page_count if page_count else 0.0

print(stat_avg_page_jp_char_count)

### 3. Number of UNIQUE characters in EVERY Manga

In [6]:
import regex as re

# Distinct characters used in each title, after removing punctuation.
stat_manga_unique_chars_count = {}  # manga title -> number of distinct characters
stat_manga_unique_chars = {}        # manga title -> the set of those characters

# `re` in this cell is the third-party `regex` package (imported as `re`), which
# supports the Unicode property class \p{P} (any punctuation). Compiled once, outside the loops.
punctuation_pattern = re.compile(r'\p{P}+')

# NOTE(review): only punctuation is stripped. Whitespace, digits and symbols inside
# a text cell still count as characters. Confirm that is the intended definition.

for manga_title in os.listdir(clean_dataset_path):
  manga_title_path = os.path.join(clean_dataset_path, manga_title)

  unique_chars = set()

  for manga_page in os.listdir(manga_title_path):
    page = os.path.join(manga_title_path, manga_page)

    # Force text_content to be read as strings: without this, a page whose only
    # text cells look numeric is parsed as int/float and text.strip() would fail.
    df = pd.read_csv(page, dtype={'text_content': str})

    if 'text_content' in df.columns:
      for text in df['text_content'].dropna():
        if text.strip():  # skip cells that contain only whitespace
          unique_chars.update(punctuation_pattern.sub('', text))

  stat_manga_unique_chars_count[manga_title] = len(unique_chars)
  stat_manga_unique_chars[manga_title] = unique_chars



### Store in CSV

In [None]:
import pandas as pd

# Assemble one row per title and write the statistics to CSV.
# Every column is looked up by title rather than taken from each dict's .values(),
# so a dict built in a different order (or missing a title) cannot silently shift
# values onto the wrong row; a missing title raises KeyError instead.
titles = list(stat_manga_page_count)

data = {
    'manga_title': titles,
    'page_count': [stat_manga_page_count[title] for title in titles],
    'overall_jp_chars': [stat_overall_jp_char_count[title] for title in titles],
    'avg_page_jp_char': [stat_avg_page_jp_char_count[title] for title in titles],
    'total_unique_chars': [stat_manga_unique_chars_count[title] for title in titles],

    # Optional metadata columns (currently disabled):
    # 'genre': [stat_manga_genre.get(title) for title in titles],
    # 'target': [stat_manga_demo_target.get(title) for title in titles],
    # 'year': [stat_manga_year.get(title) for title in titles],
}

df = pd.DataFrame(data)

output_csv_file = 'manga_statistics.csv'

df.to_csv(output_csv_file, index=False)

print(f"CSV file '{output_csv_file}' created successfully.")
