In [1]:
import re
import os
import json
import glob
from collections import defaultdict
from sudachipy import tokenizer
from sudachipy import dictionary

In [None]:
file_list = os.listdir('.')
pattern2 = r"^VOL +"


def select_volume_folders(names, root='.', pattern=pattern2):
    """Pick the volume directories out of a directory listing.

    Args:
        names: Entry names, e.g. the result of ``os.listdir(root)``.
        root: Directory the names are relative to.
        pattern: Regular expression applied to each name with ``re.match``.

    Returns:
        The matching names that are directories, sorted lexicographically
        (so "VOL 10" sorts before "VOL 2").
    """
    return sorted(
        name for name in names
        if re.match(pattern, name) and os.path.isdir(os.path.join(root, name))
    )


# sorted() returns a new list, so its result must be kept; the bare
# `sorted(matching_folders)` call used previously left the list unsorted.
matching_folders = select_volume_folders(file_list)
print(f'Found {len(matching_folders)} folders containing text.')

In [None]:
all_sentences = []

# Hiragana (U+3040-U+309F), katakana (U+30A0-U+30FF) and ideographs in
# U+4E00-U+9FAF; every other character is dropped from the text.
japanese_char_pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF]')

for i, folder_path in enumerate(matching_folders):
    for json_fp in sorted(glob.glob(os.path.join(folder_path, '*.json'))):
        with open(json_fp, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Each block becomes a one-element list holding its joined, filtered text.
        file_sentences = []
        for block in data.get("blocks", []):
            joined_text = ''.join(block["lines"])
            japanese_only = ''.join(japanese_char_pattern.findall(joined_text))
            # Blocks with no Japanese characters left are skipped.
            if japanese_only:
                file_sentences.append([japanese_only])

        all_sentences.extend(file_sentences)

    # Progress report for every fifth folder.
    if i % 5 == 0:
        print(f'Finished processing {matching_folders[i]}')

print(f'Found {len(all_sentences)} sentences across {len(matching_folders)} folders.')


#### Make frequency list

In [None]:
# One shared Sudachi tokenizer, built from the "full" dictionary with split mode C.
sudachi_split_mode = tokenizer.Tokenizer.SplitMode.C
tokenizer_obj = dictionary.Dictionary(dict="full").create(sudachi_split_mode)

def is_kanji(char):
    """Return True if ``char`` lies in the range U+4E00..U+9FAF."""
    return '\u4e00' <= char <= '\u9faf'

def is_pure_kanji(token):
    """Return True if ``token`` is non-empty and consists only of kanji.

    ``all()`` over an empty string is vacuously True, so the empty token is
    rejected explicitly.
    """
    return bool(token) and all(is_kanji(char) for char in token)

def contains_kanji(token):
    """Return True if at least one character of ``token`` is kanji."""
    return any(is_kanji(char) for char in token)

# Occurrence counters keyed by token surface; a missing key starts at 0.
# Tokens containing kanji are tallied in vocab_freq, all others in kana_words_freq.
vocab_freq, kana_words_freq = defaultdict(int), defaultdict(int)

def tokenize_text(text):
    """Return the surface string of each morpheme ``tokenizer_obj`` yields for ``text``."""
    surfaces = []
    for morpheme in tokenizer_obj.tokenize(text):
        surfaces.append(morpheme.surface())
    return surfaces

# Tally every token: anything containing a kanji goes to vocab_freq, the rest
# to kana_words_freq.
for sentence_list in all_sentences:
    for sentence in sentence_list:
        for token in tokenize_text(sentence):
            if not token:
                continue
            # A pure-kanji token also "contains kanji", and both cases were
            # counted the same way, so a single check is enough.
            if contains_kanji(token):
                vocab_freq[token] += 1
            else:
                kana_words_freq[token] += 1

# (word, count) pairs, most frequent first.
kanji_words = sorted(vocab_freq.items(), key=lambda item: item[1], reverse=True)
kana_words = sorted(kana_words_freq.items(), key=lambda item: item[1], reverse=True)

print("Number of Unique Vocabulary:", len(kanji_words))
# NOTE(review): the label says "Removed", but these words are still merged
# into all_vocab below - confirm which behaviour is intended.
print("Number of Removed Words:", len(kana_words))

all_vocab = kanji_words + kana_words
sorted_occurences = sorted(all_vocab, key=lambda x: x[1], reverse=True)
print(f'Total Unique Words: {len(sorted_occurences)}')

# Sum over the merged list instead of concatenating the two lists again.
total_occurrences = sum(count for _, count in sorted_occurences)
print(f'Total Words: {total_occurrences}')


In [None]:
# Preview only the first entries; displaying the whole vocabulary list floods
# the cell output and bloats the saved notebook.
all_vocab[:20]

#### Make JSON file

In [None]:
JPDB_freq = "./JPDB_term_meta_bank_1.json"

# Order the (word, count) pairs by descending count.
all_vocab = list(sorted_occurences)
all_vocab.sort(key=lambda x: x[1], reverse=True)

# Rank 1 is the most frequent word in the corpus.
new_frequencies = {}
for rank, (vocab_word, _count) in enumerate(all_vocab, start=1):
    new_frequencies[vocab_word] = rank

with open(JPDB_freq, 'r', encoding='utf-8') as file:
    jpdb_data = json.load(file)

# Keep only the JPDB entries whose term (first field) occurs in the corpus.
filtered_data = [entry for entry in jpdb_data if entry[0] in new_frequencies]

def update_frequencies(data, freq_dict):
    """Overwrite the nested ``'frequency'`` record of entries found in ``freq_dict``.

    For each entry the rank is looked up by term (``entry[0]``) first and by
    the entry's ``'reading'`` second; entries matching neither are skipped.
    Only entries whose metadata (``entry[2]``) has a ``'frequency'`` key are
    modified. ``data`` is mutated in place and nothing is returned.
    """
    for entry in data:
        term, meta = entry[0], entry[2]
        reading = meta.get('reading') if 'reading' in meta else None

        if term in freq_dict:
            rank = freq_dict[term]
        elif reading in freq_dict:
            rank = freq_dict[reading]
        else:
            continue

        # Metadata without a nested 'frequency' record is left untouched.
        if 'frequency' not in meta:
            continue
        meta['frequency'] = {'value': rank, 'displayValue': f"{rank}㋕"}
# Run the update first; the confirmation used to be printed before
# update_frequencies had been called at all.
update_frequencies(filtered_data, new_frequencies)
print('Success!')

# An entry stores its rank either under 'frequency' -> 'value' or directly
# under 'value'; sort ascending by whichever is present.
sorted_data = sorted(
    filtered_data,
    key=lambda x: x[2]['frequency']['value'] if 'frequency' in x[2] else x[2]['value'],
)

term_meta_json = 'term_meta_bank_1.json'
with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(sorted_data, file, ensure_ascii=False, indent=2)

print(f"New JSON file {term_meta_json} has been created.")

#### Delete duplicates

In [None]:
with open(term_meta_json, 'r', encoding='utf-8') as file:
    term_data = json.load(file)

# Single pass instead of comparing every pair of entries. Entries are lists
# holding dicts (unhashable), so a canonical JSON dump with sorted keys serves
# as the set key. Walking backwards keeps the LAST copy of each duplicate,
# exactly as the pairwise scan did.
# NOTE(review): values that are equal but of different JSON types (1 vs 1.0)
# are now treated as distinct - confirm the data only holds ints and strings.
seen_keys = set()
kept_reversed = []
for entry in reversed(term_data):
    entry_key = json.dumps(entry, sort_keys=True, ensure_ascii=False)
    if entry_key in seen_keys:
        continue
    seen_keys.add(entry_key)
    kept_reversed.append(entry)

unique_data = kept_reversed[::-1]
delete_counter = len(term_data) - len(unique_data)

with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(unique_data, file, ensure_ascii=False, indent=4)

print(f"Removed {delete_counter} duplicates successfully!")


#### Reorder terms

In [9]:
# Word -> count lookup built once from sorted_occurences, so each query is a
# dict access instead of a scan of the whole list. Iterating in reverse lets
# the earliest pair for a word win, matching the first-match scan it replaces.
# The table is rebuilt whenever this cell is run.
_occurrence_counts = {word: count for word, count in reversed(sorted_occurences)}

def find_occurrence(word):
    """Return the count recorded for ``word`` in ``sorted_occurences``, or None if absent."""
    return _occurrence_counts.get(word)

with open(term_meta_json, 'r', encoding='utf-8') as file:
    term_data = json.load(file)

# Ascending by the stored rank, nested under 'frequency' or at the top level.
term_data.sort(key=lambda x: x[2]['frequency']['value'] if 'frequency' in x[2] else x[2]['value'])

# Renumber ranks consecutively from 1 and show the raw corpus count instead.
for position, term in enumerate(term_data, start=1):
    meta = term[2]
    rank_holder = meta['frequency'] if 'frequency' in meta else meta
    rank_holder['value'] = position
    rank_holder['displayValue'] = f"{find_occurrence(term[0])}"

with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(term_data, file, ensure_ascii=False, indent=4)


### Sort JSON by word frequency

In [10]:
with open(term_meta_json, 'r', encoding='utf-8') as file:
    term_data = json.load(file)


def _display_count(entry):
    """Return the entry's displayValue, nested under 'frequency' or not, as an int."""
    meta = entry[2]
    holder = meta['frequency'] if 'frequency' in meta else meta
    return int(holder['displayValue'])


# Highest occurrence count first.
term_data = sorted(term_data, key=_display_count, reverse=True)

with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(term_data, file, ensure_ascii=False, indent=4)


### Add marker for displayValue

In [None]:
with open(term_meta_json, 'r', encoding='utf-8') as file:
    term_data = json.load(file)

# Renumber ranks in file order and prefix each displayed count with "x".
for index, term in enumerate(term_data):
    meta = term[2]
    rank_holder = meta['frequency'] if 'frequency' in meta else meta
    rank_holder['value'] = index + 1
    shown = str(rank_holder['displayValue'])
    # Skip values that already carry the marker, so re-running this cell on
    # its own output does not stack prefixes ("xx123").
    rank_holder['displayValue'] = shown if shown.startswith('x') else f"x{shown}"

with open(term_meta_json, 'w', encoding='utf-8') as file:
    json.dump(term_data, file, ensure_ascii=False, indent=4)

print(f'{term_meta_json} is ready for use!')