In [29]:
import re
import json
from disjoint_set import DisjointSet
from bs4 import BeautifulSoup
from parse_util import is_cjk, get_pinyin_splits

In [30]:
# Load the parsed dictionary that the cells below iterate as
# {character: entry}. The encoding is given explicitly because the file is
# expected to contain non-ASCII text, and the platform default encoding is not
# guaranteed to be UTF-8.
with open('kroll_parse.json', encoding='utf-8') as f:
    kroll = json.load(f)

In [31]:
# Disabled: would write `kroll` back over kroll_parse.json as indented,
# non-ASCII-escaped UTF-8 JSON.
# with open('kroll_parse.json', 'w', encoding='utf-8') as f:
#     json.dump(kroll, f, indent=2, ensure_ascii=False)

In [32]:
# Every sense string that mentions 'bn.', paired with the head character of
# the entry it came from. Each item is a two-element list [character, sense].
senses_with_bn = [
    [character, sense]
    for character, entry in kroll.items()
    for definition in entry['definitions']
    for sense in definition['senses']
    if 'bn.' in sense
]

In [33]:
# Number of [character, sense] pairs whose sense text contains 'bn.'.
len(senses_with_bn)

1450

In [34]:
# Disjoint-set over (binome, pronunciation) tuples; later cells union() the
# tuples they treat as variants of one another.
same_binomes_ds = DisjointSet()

## 1 occurrence of (bn.), 1 occurrence of ～, no occurrences of ⦿, no \<ol\>: 942 total

In [35]:
# Accumulates every (binome, pronunciation) tuple extracted by the cells below.
binomes_and_prons = set()

In [36]:
pat_1 = re.compile(r' ?(～.|.～)( .*?)( |,|\.)')

ct = 0
for char, sense in senses_with_bn:
    # Only the simplest shape is handled in this pass: exactly one 'bn.' tag,
    # exactly one ～ placeholder, and no ⦿ marker.
    if sense.count('bn.') != 1 or sense.count('～') != 1 or '⦿' in sense:
        continue

    # Keep only the text that follows the '(bn.) ' tag.
    sense_strip_begin = sense[sense.index('(bn.)') + len('(bn.) '):]

    if '<ol' in sense_strip_begin:
        ol_text = BeautifulSoup(sense_strip_begin, 'html.parser').find('ol').text
        # The UTF-8 encoding is longer than the string only when the list text
        # holds non-ASCII characters; those senses are left for a later pass.
        if len(ol_text.encode()) != len(ol_text):
            continue

    text = BeautifulSoup(sense_strip_begin, 'html.parser').text
    match = pat_1.match(text)
    assert match is not None
    char_str, pinyin = match.group(1, 2)

    # Substitute the head character for the ～ placeholder, and drop a stray
    # '(MC' that the lazy pronunciation group can capture.
    binome = char_str.strip().replace('～', char)
    pron = pinyin.replace('(MC', '').strip()

    binomes_and_prons.add((binome, pron))
    # Print anything whose pronunciation does not have exactly one split so
    # it can be reviewed by hand.
    if len(get_pinyin_splits(pron)) != 1:
        print(binome, pron)
    ct += 1
ct

蒼茫 
嫦娥 cháng’é
狙猱 jū'náo
欄杆 lán’gān
茹藘 
瑪瑙 mǎ'nǎo
瑪瑙 mǎ'nǎo
肸蠁 
忀徉 or


1116

In [37]:
# Distinct (binome, pronunciation) tuples collected so far.
len(binomes_and_prons)

874

## 1 occurrence of (bn.), 1 occurrence of ～, 1+ occurrences of ⦿, no \<ol\>: 66 total

In [38]:
pat_2 = re.compile(r'⦿ (..)( .*?)( |,|\.)')

ct = 0
for char, sense in senses_with_bn:
    if (sense.count('bn.') == 1 and
        sense.count('～') == 1 and
        sense.count('⦿') and
        '<ol' not in sense):
        sense_strip_begin = sense[sense.index('(bn.)') + len('(bn.) '):]
        ct += 1

        # The filter above admits one *or more* ⦿ markers, so split into the
        # head text plus every variant chunk instead of unpacking exactly two
        # pieces (which raised ValueError as soon as a second ⦿ appeared).
        sense_1, *variant_chunks = sense_strip_begin.split('⦿')
        assert variant_chunks, 'no ⦿ variant found after the (bn.) tag'

        # Head binome: same extraction as the single-～ pass (pat_1).
        text = BeautifulSoup(sense_1, 'html.parser').text
        match = pat_1.match(text)
        assert match is not None
        char_str, pinyin, _ = match.groups()
        pinyin = pinyin.replace('(MC', '')

        binome = char_str.strip().replace('～', char)
        pron = pinyin.strip()

        # Parse every variant before touching the shared collections, so a
        # failed assertion leaves them unchanged for this sense.
        variants = []
        for chunk in variant_chunks:
            # Restore the marker removed by split() so pat_2 can anchor on it.
            variant_text = '⦿' + BeautifulSoup(chunk, 'html.parser').text
            match = pat_2.match(variant_text)
            assert match is not None
            binome_variant, pron_variant, _ = match.groups()
            variants.append((binome_variant, pron_variant.strip()))

        binomes_and_prons.add((binome, pron))
        binomes_and_prons.update(variants)

        # Print pronunciations that do not have exactly one split, for review.
        if len(get_pinyin_splits(pron)) != 1:
            print(binome, pron)
        for binome_variant, pron_variant in variants:
            if len(get_pinyin_splits(pron_variant)) != 1:
                print(binome_variant, pron_variant)
            # Record the variant as equivalent to the head binome.
            same_binomes_ds.union((binome, pron), (binome_variant, pron_variant))
ct

馬腦 mǎ'nǎo
瑪瑙 mǎ'nǎo


66

In [39]:
# Distinct (binome, pronunciation) tuples collected so far.
len(binomes_and_prons)

942

## 1 occurrence of (bn.), 2+ occurrences of ～, no ⦿, no \<ol\>: 109 total

In [40]:
pat_mc = re.compile(r"( ?\(MC ['‘`A-Za-z+]+-['‘`A-Za-z+]+\))")
for char, sense in senses_with_bn:
    if (sense.count('bn.') == 1 and
        sense.count('～') > 1 and
        '⦿' not in sense and
        '<ol' not in sense):
        sense_strip_begin = sense[sense.index('(bn.)') + len('(bn.) '):]
        # Remove every "(MC x-y)" parenthetical before tokenising.
        for mc in pat_mc.findall(sense_strip_begin):
            sense_strip_begin = sense_strip_begin.replace(mc, '')
        text = BeautifulSoup(sense_strip_begin, 'html.parser').text
        # Raw string: '\.', '\(' and '\)' are invalid escapes in a normal
        # string literal and trigger a warning on recent Python versions.
        # The regex itself is unchanged.
        text = re.sub(r'[,\.\(\);]', '', text).replace('→ ', '').split()
        curr_binome = None
        curr_pron = None
        prev_binome = None
        for substring in text:
            # Skip cross-reference arrows, tokens made only of ASCII or curly
            # quotes, and a few listed romanised words.
            if ('→' in substring or
                all(ord(c) < 128 or c in '’“”' for c in substring) or
                substring in ('Śāriputra', 'śarīra', 'zanjī')):
                continue

            if all(is_cjk(c) or c == '～' for c in substring):
                # A token made only of CJK characters and ～ is a binome;
                # ～ stands for the head character.
                curr_binome = substring.replace('～', char)

                # Remember the first binome seen since the last pronunciation.
                # NOTE(review): with three or more binomes before one
                # pronunciation, only the first and last are linked. Confirm
                # that this is intended.
                if prev_binome is None:
                    prev_binome = curr_binome

            else:
                # Any other token is taken as a pronunciation.
                curr_pron = substring

                if curr_binome is None:
                    continue

                # Presumably registers the pair with the disjoint-set if it is
                # new; confirm against the disjoint_set documentation.
                same_binomes_ds.find((curr_binome, curr_pron))

                if prev_binome:
                    same_binomes_ds.union((curr_binome, curr_pron), (prev_binome, curr_pron))

                # Add every member of the set that now contains this pair.
                for (binome, pron) in [s for s in same_binomes_ds.itersets() if (curr_binome, curr_pron) in s][0]:
                    binomes_and_prons.add((binome, pron))
                    if len(get_pinyin_splits(pron)) != 1:
                        print(binome, pron)

                prev_binome = None

恆娥 héng’é
嫦娥 cháng’é
荷 hé
蘆 lú
金柑 jīn’gān
金錢松 jīnqiánsōng
蛩蛩駏驉 qióngqióngjùxū
蛩 qióng
嬋 chán
荳 dòu
豆 dòu
檳 bīn
儻 tǎng
沙 shā
蕙 huì
紛 fēn
澎 pēng
慇 yīn
舍利弗 shèlìfú
鸜鵒 qú
鸜 qú
羊躑躅 yángzhízhú


In [41]:
def is_in_remaining_set(sense):
    """Return True if `sense` has one of the shapes this function lists.

    The shape comments below give, in order, the counts of 'bn.', '～', '⦿'
    and '<ol' in the sense string.
    """
    bn_count = sense.count('bn.')
    tilde_count = sense.count('～')
    has_ol = '<ol' in sense

    # 1, 1, 0, 1+
    # 1, 2+, 0, 1+
    # 1, 2+, 1+, 1+
    # Flagged only when the <ol> after the '(bn.) ' tag holds non-ASCII text
    # (its UTF-8 encoding is then longer than the string).
    if bn_count == 1 and tilde_count and has_ol:
        after_tag = sense[sense.index('(bn.)') + len('(bn.) '):]
        if '<ol' in after_tag:
            ol_text = BeautifulSoup(after_tag, 'html.parser').find('ol').text
            if len(ol_text.encode()) != len(ol_text):
                return True

    # 1, 0, 0+, 0+
    # 2, 0+, 0+, 0+
    # 3+, 0+, 0+, 0+
    if bn_count and tilde_count == 0:
        return True

    # 1, 2+, 1+, 0
    return bn_count == 1 and tilde_count > 1 and '⦿' in sense and not has_ol

## Remaining cases

In [42]:
# Disabled debugging aid: prints each sense flagged by is_in_remaining_set()
# (HTML tags stripped, newlines removed) and counts them.
# ct = 0
# for char, sense in senses_with_bn:
#     if is_in_remaining_set(sense):
#         ct += 1
#         print(char, BeautifulSoup(sense, 'html.parser').text.strip().replace('\n', ''))
#         print()
# ct

In [43]:
# Hand-entered binomes. One line per group; entries in a group are separated
# by tabs, and each entry is "<binome> <pronunciation>" separated by whitespace.
hand_annotated = '''蓽茇 bìbá	蓽撥 bìbō
刺促 cìcù
的皪 dìlì
氛氳 fēnyùn
芙蓉 fúróng	芙蕖 fúqú
蕉悴 qiáocuì	蕉萃 qiáocuì	憔悴 qiáocuì	憔萃 qiáocuì
驕傲 jiāo’ào
崑崙 kūnlún
芒忽 huǎnghū	恍惚 huǎnghū
獼猴 míhóu
撓挑 náotiāo
磐礡 pánbó	磐薄 pánbó	盤薄 pánbó
蚍蜉 pífú
彯搖 piàoyáo	票姚 piàoyáo
葡萄 pútáo	葡桃 pútáo
蒲伏 púfú	蒲服 púfú	匍匐 púfú
阡眠 qiānmián	阡綿 qiānmián	芊眠 qiānmián	芊綿 qiānmián
憔悴 qiáocuì	憔顇 qiáocuì	蕉萃 qiáocuì
芍藥 sháoyào	勺藥 sháoyào
驌驦 sùshuāng	鷫鷞 sùshuāng
爣朗 tǎnglǎng	爣閬 tǎnglǎng	儻朗 tǎnglǎng	儻閬 tǎnglǎng
婉戀 wǎnliàn
茱萸 zhūyú
茵蒀 yīnyūn	葐蒀 fényūn	氤氳 yīnyūn
鷓鴣 zhègū
獬豸 xièzhì
螽斯 zhōngsī
汎𣶏 fájié
蒼茫 cāngmáng
嫦娥 cháng’é
狙猱 jū'náo
欄杆 lán’gān
茹藘 rúlǘ
瑪瑙 mǎ'nǎo
瑪瑙 mǎ'nǎo
肸蠁 xīxiǎng'''

In [44]:
# Fold the hand-annotated groups into the shared collections: every entry is
# added to binomes_and_prons, and each later entry on a line is unioned with
# the first entry of that line.
for group_line in hand_annotated.split('\n'):
    members = []
    for entry in group_line.split('\t'):
        binome, pron = entry.split()
        pair = (binome, pron)
        members.append(pair)
        binomes_and_prons.add(pair)
    representative, *variants = members
    for variant in variants:
        same_binomes_ds.union(representative, variant)

In [45]:
# Three hand-entered spellings sharing one pronunciation, linked into a single
# equivalence class and added to the collected pairs.
a1, a2, a3 = [(graphs, 'xiāngyáng') for graphs in ('忀徉', '儴佯', '相羊')]

same_binomes_ds.union(a1, a2)
same_binomes_ds.union(a2, a3)
binomes_and_prons.update((a1, a2, a3))

In [46]:
import pickle as pkl

# Persist the results. The pair set is restricted to two-character binomes
# with a non-blank pronunciation (whitespace-stripped); the disjoint-set is
# pickled as it stands.
with open('BINOMES_PARSED_LIST.pkl', 'wb') as f:
    two_char_pairs = {
        (graphs, reading.strip())
        for graphs, reading in binomes_and_prons
        if len(graphs) == 2 and reading.strip()
    }
    pkl.dump(two_char_pairs, f)
with open('BINOMES_PARSED_SAME_DS.pkl', 'wb') as f:
    pkl.dump(same_binomes_ds, f)

# Verify that the parsed binomes are a superset of those in `binomes_df.tsv`

In [45]:
import pandas as pd

In [46]:
# Tab-separated reference table; its `traditional` column is compared against
# the parsed binomes in the next cell.
binomes_df = pd.read_csv('binomes_df.tsv', sep='\t')

In [47]:
# Print every reference binome that the parse did not produce.
# The parsed binome strings are collected once so each row costs a single set
# lookup instead of a full scan of binomes_and_prons.
parsed_binomes = {b for b, _p in binomes_and_prons}
for traditional in binomes_df['traditional']:
    if traditional not in parsed_binomes:
        print(traditional)

黯慘
黯黮
黯黮
翱翔
懊惱
斑剝
崩騰
赑屓
辟易
勃然
參潭
慘悴
燦爛
藏催
嘈啐
嘈雜
參差
差池
瀺灂
嶄巗
瀺灂
悵望
趻踔
哆然
差池
蔥蘢
踧踖
蹴然
蹙然
攢雜
蹉跌
澹泊
澹淡
澹蕩
澹灧
滴瀝
蹀躞
憞溷
峨然
幡然
袢廷
沸濆
紛綸
芬氳
紛紜
蚡緼
豐茸
怫然
嘸然
闞然
杭壯
輷輘
鴻洞
鴻蒙
劃然
萑蔰
萑蘭
荒唐
熀爛
潢然
浹洽
間關
齽齘
啾嘈
局促
跼躅
具然
悁急
屈強
戄然
鏗鍧
鏗鏘
塊然
曠蕩
蘭單
磊落
離麋
離靡
離披
漣洏
聯娟
聯卷
連卷
連軒
戀眷
瀲灧
寥廓
飉濿
嘹唳
寥唳
寥亮
嘹亮
寥落
潦倒
璘班
璘彬
琳瑯
轔轢
燐亂
凜然
轔轢
輘輷
岺巆
凌厲
岺嶙
隆穹
龍鐘
蘢蔥
霡霂
曼羨
曼衍
茫昧
茫然
茫洋
莽蒼
彌漫
密勿
黽勉
膩細
蘖卼
濘溺
嘔軋
盤桓
磅礴
滂沛
濆薄
輣訇
輣闐
漂然
瞥列
瞥裂
嬪然
坡坨
葡服
葡伏
祺然
崎嶷
岐嶷
磧礫
磧歷
鎗玲
踥跌
嶔岑
嶔崎
丘虛
蘧然
蜷跼
逡循
溶與
颯爽
颯沓
騷搔
奭然
儵忽
率爾
率然
斯須
索落
索然
荅然
荅焉
嗒然
嗒焉
曇欒
黮暗
曭朗
曭莽
倜然
岧嵽
岹嵽
岹嶤
岧嶤
岧嶤
膧䑃
秃兀
脫然
妥帖
頑鈍
婉轉
汪翔
萎約
巍峨
萎腇
蓊茸
蓊鬱
齷齪
烏乎
嗚虖
嗚呼
烏呼
烏虖
嗚乎
嗚呼
嗚咽
嗚咽
沕潏
沕穆
翕然
翕赩
屭奰
翕赫
嗑然
洗然
象罔
翛然
蕭灑
瀟灑
瀟瑟
蕭瑟
蕭瑟
瀟森
蕭森
瀟率
蕭率
蕭肅
瀟索
蕭索
蕭條
瀟條
蕭條
蕭閒
澩灂
涓然
嫣然
閹然
菴藹
泱灢
么麼
妖冶
嶢嶭
搖曳
窅眇
窅然
嶪峩
嶪峨
曵曳
依稀
伊邑
伊鬱
依約
夷靡
迤靡
蘙薈
蘙蔚
佚豫
屹崪
听然
顒然
優游
優游
油然
褏然
俞然
蔚結
雜錯
雜襲
嶄然
蹍然
輾轉
展轉
輾轉
贄然
灼然
酌然
岝崿
