Using character composition data to construct a proxy for phonetic series.

In [8]:
from collections import defaultdict
import json

In [9]:
# Characters treated as radicals. Membership is tested one character at a time
# (`d in radicals`) by the cell that picks a phonetic-series candidate, which
# skips any component found in this string. Several characters appear more than
# once; that has no effect on the membership test.
radicals = '一丨丶丿乀乁乙乚乛亅二亠人亻儿入八丷冂冖冫几凵刀刂力勹匕匚匸十卜卩㔾厂厶又口囗土士夂夊夕大女子宀寸小尢尣尸屮山巛巜川工己巾干幺广廴廾弋弓彐彑彡彳巜讠⻏阝忄扌氵犭⺮纟艹辶⻏门阝飞饣马心忄戈戶手扌支攴攵文斗斤方无旡日曰月木朩欠止歹歺殳毋比毛氏气水氵火灬爪爫父爻爿片牙牛牜犬犭尣尢王玉礻示罓网⺼月肉见見贝貝车車辶辵长長韦韋风風玄玉王瓜瓦甘生用田疋疒癶白皮皿目矛矢石示礻禸禾穴宂立旡无歺歹罒网衤衣钅金鸟鳥龙龍竹米糸糹纟缶网罒羊羽老而耒耳聿肉⺼月臣自至臼舌舛舟艮色艸艹虍虫血行衣衤西覀⺮竹页頁齐齊見见角言讠谷豆豕豸貝贝赤走足身車车辛辰辵辶邑⻏酉釆里麦麥龟龜魚'

In [10]:
# Load the composition table into `char_to_comp`: one comma-separated record
# per line, where field 0 is the character and fields 2 onward are its
# decomposition strings.
# NOTE(review): field 1 is ignored here -- confirm what it holds in the dataset.
# The file contains CJK text, so the encoding is pinned instead of relying on
# the platform default, which is not UTF-8 everywhere.
# NOTE(review): assumes the dataset is stored as UTF-8 -- confirm.
with open('preprocessing/dataset/phonetic_series_composition.txt', encoding='utf-8') as f:
    char_to_comp = {}
    # U+2FF0-U+2FFB: ideographic description characters, which encode the
    # layout of a decomposition rather than being components themselves.
    desc_range = range(0x2FF0, 0x2FFC)
    for line in f:
        if not line.strip():
            # A blank line would otherwise register a stray '' key.
            continue
        ln = line.strip().split(',')
        char_to_comp[ln[0]] = [
            # Keep only component characters: drop the layout operators and
            # every code point <= 300 (ASCII/Latin range; presumably annotation
            # markup in the dataset -- confirm).
            ''.join(c for c in comp if ord(c) not in desc_range and ord(c) > 300)
            for comp in ln[2:]
        ]

In [11]:
# Map every character to all of its transitive components ("daughters"):
# the characters in its own decompositions, the characters in theirs, and so
# on. Characters whose decompositions are empty get no entry at all.
char_to_all_daughters = {}
for char, comps in char_to_comp.items():
    pending = list(set(''.join(comps)))
    if not pending:
        continue
    checked = set()
    # Only the set of reachable characters matters, so the traversal order is
    # irrelevant; popping from the end keeps each step O(1).
    while pending:
        daughter = pending.pop()
        if daughter in checked:
            continue
        checked.add(daughter)
        # Characters without a composition record are leaves.
        pending.extend(
            c for c in ''.join(char_to_comp.get(daughter, ())) if c not in checked
        )
    # A character can appear in its own decomposition; it is not its own
    # daughter. Sorting makes the list order reproducible: iteration order of
    # a set of strings changes between interpreter runs, and the selection
    # cell further down breaks ties by list position.
    char_to_all_daughters[char] = sorted(checked - {char})

In [12]:
# Number of transitive daughters recorded for each character. Being a
# defaultdict(int), a lookup for a character with no entry yields 0.
char_to_n_daughters = defaultdict(
    int,
    ((char, len(daughters)) for char, daughters in char_to_all_daughters.items()),
)

In [20]:
# For every character, choose as its hypothesized phonetic series the
# non-radical daughter that itself has the most daughters. When there is no
# such daughter, or the best count is zero, the character maps to itself.
char_to_hypothesized_phonetic_series = {}

for char, daughters in char_to_all_daughters.items():
    candidates = [d for d in daughters if d not in radicals]
    # max() returns the first maximal element, so ties are resolved in favour
    # of the candidate that comes earliest in `daughters`.
    top = max(candidates, key=lambda cand: char_to_n_daughters[cand], default=None)
    if top is not None and char_to_n_daughters[top]:
        char_to_hypothesized_phonetic_series[char] = top
    else:
        char_to_hypothesized_phonetic_series[char] = char

𠃊0 間9 門2 𠁣0 𠃛0 
僴 -> ['間']


In [14]:
# Export step, currently disabled: uncomment to write the mapping to JSON.
# NOTE(review): ensure_ascii=False writes the CJK characters unescaped, so the
# file should be opened with an explicit encoding (e.g. encoding='utf-8') when
# this is re-enabled.
# with open('hypothesized_phonetic_series.json', 'w') as f:
#     json.dump(char_to_hypothesized_phonetic_series, f, ensure_ascii=False)