In [1]:
import ebooklib 
from ebooklib import epub
from bs4 import BeautifulSoup
import json
import re
from collections import defaultdict
from sudachipy import tokenizer
from sudachipy import dictionary

In [2]:
def read_epub(file_path):
    book = epub.read_epub(file_path)
    content = []

    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            soup = BeautifulSoup(item.get_body_content(), 'html.parser')
            content.append(soup.get_text())

    return '\n'.join(content)

file_path = '無職転生 ～異世界行ったら本気だす～.epub'
epub_content = read_epub(file_path)

epub_sentences = epub_content.split()

japanese_char_pattern = re.compile(r'[\u3040-\u309F\u30a0-\u30ff\u4e00-\u9faf]')
filtered_sentences = []
for sentence in epub_sentences:
    filtered_sentence = ''.join(char for char in sentence if japanese_char_pattern.match(char))
    if filtered_sentence: 
        filtered_sentences.append(filtered_sentence)

In [3]:
tokenizer_obj = dictionary.Dictionary(dict="full").create(tokenizer.Tokenizer.SplitMode.C)

def is_kanji(char):
    return '\u4e00' <= char <= '\u9faf'

def is_pure_kanji(token):
    return all(is_kanji(char) for char in token)

def contains_kanji(token):
    return any(is_kanji(char) for char in token)

vocab_freq = defaultdict(int)
kana_words_freq = defaultdict(int)

def tokenize_text(text):
    return [word.surface() for word in tokenizer_obj.tokenize(text)]

for sentence in filtered_sentences:
    tokens = tokenize_text(sentence)
    for token in tokens:
        if not token:
            continue
        if is_pure_kanji(token):
            vocab_freq[token] += 1
        elif contains_kanji(token):
            vocab_freq[token] += 1
        else:
            kana_words_freq[token] += 1

kanji_words = sorted(vocab_freq.items(), key=lambda item: item[1], reverse=True)
kana_words = sorted(kana_words_freq.items(), key=lambda item: item[1], reverse=True)

print("Number of Unique Vocabulary:", len(kanji_words))
print("Number of Removed Words:", len(kana_words))

all_vocab = kanji_words + kana_words
sorted_occurences = sorted(all_vocab, key=lambda x: x[1], reverse=True)
print(f'Total Unique Words: {len(sorted_occurences)}')

total_occurrences = sum(count for word, count in kana_words + kanji_words)
print(f'Total Words: {total_occurrences}')


Number of Unique Vocabulary: 4928
Number of Removed Words: 1647
Total Unique Words: 6575
Total Words: 63158


In [4]:
JPDB_freq = "./JPDB_term_meta_bank_1.json"

all_vocab = sorted(sorted_occurences, key=lambda x: x[1], reverse=True)

new_frequencies = {word: i + 1 for i, (word, _) in enumerate(all_vocab)}

with open(JPDB_freq, 'r', encoding='utf-8') as file:
    jpdb_data = json.load(file)

filtered_data = []
for item in jpdb_data:
    word = item[0]
    if word in new_frequencies:
        filtered_data.append(item)

def update_frequencies(data, freq_dict):
    for item in data:
        word = item[0]
        reading = item[2].get('reading') if 'reading' in item[2] else None
        if word in freq_dict:
            freq = freq_dict[word]
        elif reading in freq_dict:
            freq = freq_dict[reading]
        else:
            continue
        
        if 'frequency' in item[2]:
          item[2]['frequency'] = {
              'value': freq,
              'displayValue': f"{freq}㋕"
          }
print('Success!')

update_frequencies(filtered_data, new_frequencies)

sorted_data = sorted(filtered_data, key=lambda x: x[2]['frequency']['value'] if 'frequency' in x[2] else x[2]['value'])

term_meta_json = 'term_meta_bank_1.json'
with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(sorted_data, file, ensure_ascii=False, indent=2)

print(f"New JSON file {term_meta_json} has been created.")

Success!
New JSON file term_meta_bank_1.json has been created.


In [5]:
with open(term_meta_json, 'r', encoding='utf-8') as file:
    term_data = json.load(file)

unique_data = []
delete_counter = 0
for i in range(len(term_data)):
    is_duplicate = False
    for j in range(i + 1, len(term_data)):
        if term_data[i] == term_data[j]:
            is_duplicate = True
            delete_counter += 1
            break
    if not is_duplicate:
        unique_data.append(term_data[i])

with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(unique_data, file, ensure_ascii=False, indent=4)

print(f"Removed {delete_counter} duplicates successfully!")


Removed 2353 duplicates successfully!


In [6]:
def find_occurrence(word):
    for item in sorted_occurences:
        if item[0] == word:
            return item[1]
    return None

with open(term_meta_json, 'r', encoding='utf-8') as file:
    term_data = json.load(file)

term_data.sort(key=lambda x: x[2]['frequency']['value'] if 'frequency' in x[2] else x[2]['value'])

for index, term in enumerate(term_data):
    if 'frequency' in term[2]:
      term[2]['frequency']['value'] = index + 1
      term[2]['frequency']['displayValue'] = f"{find_occurrence(term[0])}"
    else:
      term[2]['value'] = index + 1
      term[2]['displayValue'] = f"{find_occurrence(term[0])}"
        
with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(term_data, file, ensure_ascii=False, indent=4)


In [7]:
with open(term_meta_json, 'r', encoding='utf-8') as file:
    term_data = json.load(file)

term_data.sort(key=lambda x: (int(x[2]['frequency']['displayValue']) if 'frequency' in x[2] else int(x[2]['displayValue'])), reverse=True)

with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(term_data, file, ensure_ascii=False, indent=4)


In [8]:
with open(term_meta_json, 'r', encoding='utf-8') as file:
    term_data = json.load(file)

for index, term in enumerate(term_data):
    if 'frequency' in term[2]:
      term[2]['frequency']['value'] = index + 1
      term[2]['frequency']['displayValue'] = f"x{term[2]['frequency']['displayValue']}"
    else:
      term[2]['value'] = index + 1
      term[2]['displayValue'] = f"x{term[2]['displayValue']}"
        
with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(term_data, file, ensure_ascii=False, indent=4)

print(f'{term_meta_json} is ready for use!')

term_meta_bank_1.json is ready for use!


In [9]:
all_vocab

[('の', 2832),
 ('て', 2645),
 ('に', 2504),
 ('は', 2487),
 ('た', 2336),
 ('を', 2090),
 ('が', 1841),
 ('と', 1794),
 ('で', 1466),
 ('だ', 1249),
 ('も', 1095),
 ('し', 1061),
 ('か', 904),
 ('ない', 891),
 ('な', 674),
 ('い', 503),
 ('こと', 462),
 ('いる', 447),
 ('から', 394),
 ('俺', 387),
 ('ん', 326),
 ('そう', 323),
 ('いう', 300),
 ('する', 297),
 ('ば', 281),
 ('魔術', 263),
 ('よう', 253),
 ('ある', 245),
 ('パウロ', 245),
 ('だろう', 242),
 ('です', 236),
 ('れ', 229),
 ('それ', 219),
 ('なっ', 197),
 ('いい', 188),
 ('この', 186),
 ('さ', 185),
 ('だっ', 178),
 ('なかっ', 161),
 ('よ', 158),
 ('って', 147),
 ('ね', 144),
 ('だけ', 143),
 ('なる', 143),
 ('その', 142),
 ('もの', 141),
 ('ロキシー', 141),
 ('ます', 139),
 ('ルディ', 132),
 ('たら', 130),
 ('一', 124),
 ('き', 124),
 ('リーリャ', 124),
 ('何', 122),
 ('時', 121),
 ('見', 120),
 ('らしい', 120),
 ('じゃ', 119),
 ('なら', 119),
 ('思っ', 114),
 ('俺の', 113),
 ('自分', 113),
 ('言っ', 112),
 ('でき', 105),
 ('そんな', 100),
 ('なく', 98),
 ('シルフィ', 98),
 ('顔', 97),
 ('どう', 97),
 ('たり', 94),
 ('しれ', 93),
 ('まで', 90),
 ('