In [1]:
import ebooklib 
from ebooklib import epub
from bs4 import BeautifulSoup
import json
import re
from collections import defaultdict
from sudachipy import tokenizer
from sudachipy import dictionary

In [2]:
def read_epub(file_path):
    """Return the plain text of every document item in an EPUB file.

    Each item whose type is ``ebooklib.ITEM_DOCUMENT`` has its body parsed
    with BeautifulSoup's ``html.parser`` and reduced to text. The per-item
    texts are joined with a newline, in the order ``get_items()`` yields them.
    """
    book = epub.read_epub(file_path)
    document_texts = [
        BeautifulSoup(entry.get_body_content(), 'html.parser').get_text()
        for entry in book.get_items()
        if entry.get_type() == ebooklib.ITEM_DOCUMENT
    ]
    return '\n'.join(document_texts)

# EPUB to analyse.
file_path = '無職転生 ～異世界行ったら本気だす～.epub'
epub_content = read_epub(file_path)

# Whitespace-delimited chunks of the book text (not grammatical sentences).
epub_sentences = epub_content.split()

# Matches a single character in U+3040-U+309F, U+30A0-U+30FF or U+4E00-U+9FAF.
japanese_char_pattern = re.compile(r'[\u3040-\u309F\u30a0-\u30ff\u4e00-\u9faf]')

# Keep only the matching characters of each chunk; drop chunks left empty.
filtered_sentences = [
    kept
    for kept in (
        ''.join(japanese_char_pattern.findall(chunk)) for chunk in epub_sentences
    )
    if kept
]

### Tokenize/Lemmatize

In [3]:
# Sudachi tokenizer created from the "full" dictionary with split mode C.
sudachi_split_mode = tokenizer.Tokenizer.SplitMode.C
tokenizer_obj = dictionary.Dictionary(dict="full").create(sudachi_split_mode)

def is_kanji(char):
    """Return True if ``char`` sorts within the range U+4E00 to U+9FAF inclusive."""
    lower_bound, upper_bound = '\u4e00', '\u9faf'
    return not (char < lower_bound or char > upper_bound)

def is_pure_kanji(token):
    """Return True if ``token`` is non-empty and every character passes ``is_kanji``.

    Args:
        token: The string to classify.

    Returns:
        bool: False for an empty token, otherwise whether all characters
        are kanji according to ``is_kanji``.
    """
    # all() is vacuously True for an empty iterable, so reject '' explicitly
    # instead of classifying it as a pure-kanji word.
    if not token:
        return False
    return all(is_kanji(char) for char in token)

def contains_kanji(token):
    """Return True if at least one character of ``token`` passes ``is_kanji``."""
    for char in token:
        if is_kanji(char):
            return True
    return False

# Occurrence counters keyed by lemma: one for pure-kanji lemmas, one for
# lemmas that contain kanji alongside other characters, one for the rest.
pure_kanji_freq, kanji_kana_freq, kana_freq = (defaultdict(int) for _ in range(3))

def tokenize_text(text):
    """Tokenize ``text`` with ``tokenizer_obj`` and return each token's dictionary form."""
    lemmas = []
    for morpheme in tokenizer_obj.tokenize(text):
        lemmas.append(morpheme.dictionary_form())
    return lemmas

# Count every non-empty lemma in the table that matches its character make-up.
for sentence in filtered_sentences:
    for lemma in tokenize_text(sentence):
        if not lemma:
            continue
        if is_pure_kanji(lemma):
            bucket = pure_kanji_freq
        elif contains_kanji(lemma):
            bucket = kanji_kana_freq
        else:
            bucket = kana_freq
        bucket[lemma] += 1


def _by_count_desc(freq_table):
    """Return the table's ``(word, count)`` pairs ordered from highest to lowest count."""
    return sorted(freq_table.items(), key=lambda pair: pair[1], reverse=True)


kanji_words = _by_count_desc(pure_kanji_freq)
kanji_kana_words = _by_count_desc(kanji_kana_freq)
kana_words = _by_count_desc(kana_freq)

for label, ranked in (
    ("Number of Pure Kanji Words:", kanji_words),
    ("Number of Kanji + Kana Words:", kanji_kana_words),
    ("Number of Kana Words:", kana_words),
):
    print(label, len(ranked))

# Merge the three rankings into a single list ordered by descending count.
all_vocab = kanji_words + kanji_kana_words + kana_words
sorted_occurences = sorted(all_vocab, key=lambda pair: pair[1], reverse=True)
print(f'Total Unique Words: {len(sorted_occurences)}')

# Total number of counted tokens across all three tables.
total_occurrences = sum(pair[1] for pair in all_vocab)
print(f'Total Words: {total_occurrences}')


Number of Pure Kanji Words: 2623
Number of Kanji + Kana Words: 1707
Number of Kana Words: 1496
Total Unique Words: 5826
Total Words: 63158


In [10]:
# Preview the ten most frequent lemmas that contain kanji without being pure
# kanji (kanji_kana_words is sorted by descending count).
kanji_kana_words[:10]

[('言う', 237),
 ('思う', 219),
 ('見る', 158),
 ('俺の', 113),
 ('使う', 98),
 ('聞く', 91),
 ('出る', 82),
 ('教える', 62),
 ('持つ', 56),
 ('知る', 53)]

In [5]:
# Path of the JPDB term-meta bank JSON that gets filtered down to the book's vocabulary.
JPDB_freq = "./JPDB_term_meta_bank_1.json"

# Rank every lemma by descending count; rank 1 is the most frequent.
all_vocab = sorted(sorted_occurences, key=lambda pair: pair[1], reverse=True)
new_frequencies = {
    lemma: rank for rank, (lemma, _) in enumerate(all_vocab, start=1)
}

with open(JPDB_freq, 'r', encoding='utf-8') as jpdb_file:
    jpdb_data = json.load(jpdb_file)

# Keep only the entries whose first element (the term) was counted in the book.
filtered_data = [entry for entry in jpdb_data if entry[0] in new_frequencies]

def update_frequencies(data, freq_dict):
    """Overwrite each entry's frequency with its rank from ``freq_dict``.

    Args:
        data: Iterable of entries indexable as ``entry[0]`` (the term) and
            ``entry[2]`` (a meta dict). The meta dict is either nested,
            ``{"reading": ..., "frequency": {...}}``, or flat,
            ``{"value": ..., "displayValue": ...}``. Entries are mutated in place.
        freq_dict: Mapping from a term (or reading) to its new rank.

    Returns:
        None.

    The rank is looked up by term first and by the entry's ``reading`` second;
    entries matching neither are left untouched.
    """
    for item in data:
        word = item[0]
        meta = item[2]
        reading = meta.get('reading')

        if word in freq_dict:
            freq = freq_dict[word]
        elif reading in freq_dict:
            freq = freq_dict[reading]
        else:
            continue

        new_frequency = {'value': freq, 'displayValue': f"{freq}㋕"}
        if 'frequency' in meta:
            meta['frequency'] = new_frequency
        else:
            # Flat layout: the rank lives directly on the meta dict. Without
            # this branch such entries kept their old value even though
            # they matched, and were later sorted against the new ranks.
            meta.update(new_frequency)
update_frequencies(filtered_data, new_frequencies)
# Report success only after the update has actually run; printing it first
# would announce success even when update_frequencies raises.
print('Success!')

# Order entries by their rank, whichever of the two meta layouts holds it.
sorted_data = sorted(filtered_data, key=lambda x: x[2]['frequency']['value'] if 'frequency' in x[2] else x[2]['value'])

# Output file that the following cells re-read and rewrite.
term_meta_json = 'term_meta_bank_1.json'
with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(sorted_data, file, ensure_ascii=False, indent=2)

print(f"New JSON file {term_meta_json} has been created.")

Success!
New JSON file term_meta_bank_1.json has been created.


In [6]:
with open(term_meta_json, 'r', encoding='utf-8') as file:
    term_data = json.load(file)

# Remove duplicate entries in a single pass instead of comparing every pair.
# Entries are identified by their canonical JSON text (sorted keys), so dict
# key order does not matter. Walking the list backwards keeps the LAST
# occurrence of each duplicate, in original order, as the pairwise scan did.
seen_keys = set()
unique_reversed = []
for entry in reversed(term_data):
    entry_key = json.dumps(entry, sort_keys=True, ensure_ascii=False)
    if entry_key in seen_keys:
        continue
    seen_keys.add(entry_key)
    unique_reversed.append(entry)

unique_data = unique_reversed[::-1]
# Every entry that was not kept had an identical entry later in the list.
delete_counter = len(term_data) - len(unique_data)

with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(unique_data, file, ensure_ascii=False, indent=4)

print(f"Removed {delete_counter} duplicates successfully!")


Removed 2801 duplicates successfully!


In [7]:
def find_occurrence(word):
    """Return the count paired with ``word`` in ``sorted_occurences``, or None if absent.

    The first pair whose first element equals ``word`` wins.
    """
    matches = (entry[1] for entry in sorted_occurences if entry[0] == word)
    return next(matches, None)

with open(term_meta_json, 'r', encoding='utf-8') as file:
    term_data = json.load(file)


def _rank_value(term):
    """Return the numeric rank stored in either meta layout of ``term``."""
    meta = term[2]
    if 'frequency' in meta:
        return meta['frequency']['value']
    return meta['value']


term_data = sorted(term_data, key=_rank_value)

# Renumber ranks consecutively from 1 and show the raw occurrence count.
for position, term in enumerate(term_data, start=1):
    meta = term[2]
    holder = meta['frequency'] if 'frequency' in meta else meta
    holder['value'] = position
    holder['displayValue'] = f"{find_occurrence(term[0])}"

with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(term_data, file, ensure_ascii=False, indent=4)


In [8]:
with open(term_meta_json, 'r', encoding='utf-8') as file:
    term_data = json.load(file)


def _occurrence_count(term):
    """Return the entry's ``displayValue`` as an int, from either meta layout."""
    meta = term[2]
    if 'frequency' in meta:
        shown = meta['frequency']['displayValue']
    else:
        shown = meta['displayValue']
    return int(shown)


# Highest occurrence count first.
term_data.sort(key=_occurrence_count, reverse=True)

with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(term_data, file, ensure_ascii=False, indent=4)


In [9]:
with open(term_meta_json, 'r', encoding='utf-8') as file:
    term_data = json.load(file)

# Renumber ranks in the current list order and prefix each display value with "x".
for rank, term in enumerate(term_data, start=1):
    meta = term[2]
    if 'frequency' not in meta:
        meta['value'] = rank
        meta['displayValue'] = f"x{meta['displayValue']}"
        continue
    nested = meta['frequency']
    nested['value'] = rank
    nested['displayValue'] = f"x{nested['displayValue']}"

with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(term_data, file, ensure_ascii=False, indent=4)

print(f'{term_meta_json} is ready for use!')

term_meta_bank_1.json is ready for use!
