In [112]:
import pickle as pkl
import pandas as pd
import numpy as np
import unicodedata as unic
from collections import defaultdict

In [5]:
with open('BINOMES_FULL_LIST.pkl', 'rb') as f:
    binomes_full_list = pkl.load(f)
with open('BINOMES_PARSED_LIST.pkl', 'rb') as f:
    binomes_parsed_list = pkl.load(f)
with open('BINOMES_PARSED_SAME_DS.pkl', 'rb') as f:
    binomes_parsed_same_ds = pkl.load(f)

In [11]:
# for st in binomes_parsed_same_ds.itersets():
#     if len(st) > 1:
#         print(st)

In [21]:
pinyins = set()
with open('pinyin_table.csv') as f:
    for line in f:
        ln = [entry for entry in line.strip().split(',') if entry]
        for entry in ln:
            pinyins.add(unic.normalize('NFC', ''.join([c for c in entry if c.isalpha()])))
pinyins.add('er')

In [30]:
def remove_tone_mark(pinyin_syllable):
    return ''.join([c for c in unic.normalize('NFD', pinyin_syllable) if c.isalpha()])

In [51]:
def change_to_numerals(pinyin_syllable):
    no_tone = remove_tone_mark(pinyin_syllable)
    non_alpha = [c for c in unic.normalize('NFD', pinyin_syllable) if not c.isalpha()]
    if "ā"[1] in non_alpha:
        return no_tone + '1'
    if "é"[1] in non_alpha:
        return no_tone + '2'
    if "ǒ"[1] in non_alpha:
        return no_tone + '3'
    if "ò"[1] in non_alpha:
        return no_tone + '4'

In [61]:
clusters = []
for st in binomes_parsed_same_ds.itersets():
    st_with_numbers = set()
    for (binome, pron) in st:
        if len(binome) != 2:
            continue
        pron_composed = unic.normalize('NFC', pron)
        valid_splits = []
        for i in range(1, len(pron_composed)):
            first, second = pron_composed[:i], pron_composed[i:]
            if remove_tone_mark(first) in pinyins and remove_tone_mark(second) in pinyins:
                valid_splits.append((first, second))
        if len(valid_splits) == 1:
            st_with_numbers.add((binome, change_to_numerals(valid_splits[0][0]), change_to_numerals(valid_splits[0][1])))
        elif "'" in pron_composed:
            first, second = pron_composed.split("'")
            st_with_numbers.add((binome, change_to_numerals(first), change_to_numerals(second)))
        elif "`" in pron_composed:
            first, second = pron_composed.split("`")
            st_with_numbers.add((binome, change_to_numerals(first), change_to_numerals(second)))
        elif "’" in pron_composed:
            first, second = pron_composed.split("’")
            st_with_numbers.add((binome, change_to_numerals(first), change_to_numerals(second)))
        else:
            print(binome, pron)
    if st_with_numbers:
        clusters.append(st_with_numbers)

芍藥 zhuólüè
勺藥 zhuólüè
鸜鵒 qú


In [62]:
clusters

[{('俾倪', 'pi4', 'ni4'), ('俾睨', 'pi4', 'ni4'), ('埤坭', 'pi4', 'ni4')},
 {('篳篥', 'bi4', 'li4'), ('觱篥', 'bi4', 'li4')},
 {('倉庚', 'cang1', 'geng1'), ('鶬鶊', 'cang1', 'geng1')},
 {('孱顏', 'chan2', 'yan2'), ('巉巖', 'chan2', 'yan2')},
 {('硨磲', 'ju1', 'qu2'), ('車渠', 'ju1', 'qu2')},
 {('岨峿', 'ju3', 'yu3'), ('鉏鋙', 'ju3', 'yu3'), ('齟齬', 'ju3', 'yu3')},
 {('蹢躅', 'zhi2', 'zhu2'), ('躑躅', 'zhi2', 'zhu2')},
 {('毒冒', 'dai4', 'mao4'), ('瑇瑁', 'dai4', 'mao4')},
 {('仿佛', 'fang3', 'fu2'), ('彷彿', 'fang3', 'fu2')},
 {('仿偟', 'pang2', 'huang2'),
  ('傍偟', 'pang2', 'huang2'),
  ('彷徨', 'pang2', 'huang2'),
  ('徬徨', 'pang2', 'huang2'),
  ('房皇', 'pang2', 'huang2')},
 {('氤氳', 'yin1', 'yun1'),
  ('絪縕', 'yin1', 'yun1'),
  ('茵蒀', 'yin1', 'yun1'),
  ('葐蒀', 'fen2', 'yun1')},
 {('匍匐', 'pu2', 'fu2'),
  ('扶伏', 'pu2', 'fu2'),
  ('蒲伏', 'pu2', 'fu2'),
  ('蒲服', 'pu2', 'fu2')},
 {('浩汗', 'hao4', 'han4'), ('滈汗', 'hao4', 'han4')},
 {('澒洞', 'hong4', 'dong4'), ('虹洞', 'hong4', 'dong4')},
 {('怳惚', 'huang3', 'hu1'),
  ('恍忽', 'huang3', 'hu1'),

In [66]:
pron_to_cluster_idx = {}
for idx, st in enumerate(clusters):
    for (binome, first, second) in st:
        pron_to_cluster_idx[(first, second)] = idx

cluster_idx_to_prons = defaultdict(list)
for idx, st in enumerate(clusters):
    for (binome, first, second) in st:
        cluster_idx_to_prons[idx].append((first, second))

In [72]:
for i in range(221):
    for (first, second) in cluster_idx_to_prons[i]:
        if any(((first, second) in cluster_idx_to_prons[j]) or ((second, first) in cluster_idx_to_prons[j]) for j in range(i)):
            print(i, first, second)

34 pi4 ni4
34 pi4 ni4
68 long3 cong1
78 nuo3 e1
83 yu4 fu2
90 hu1 huang3
90 hu1 huang3
102 po4 ju1
113 ban1 lan2
118 li2 shi1
119 shi1 li2
121 yi3 li3
126 ping1 ling2
128 qing2 ling2
131 ling2 long2
137 hong4 meng2
140 mie4 meng3
145 xing4 ming3
147 cuo4 mo4
156 pan2 shan1
158 pan2 bi4
166 ren3 ran3
169 se4 suo3
180 tong2 long2
181 tong2 meng2
184 xian3 xi1
198 cha1 ya1
209 zhun1 zhan1


In [273]:
clusters_manually_cleaned = [{('俾倪', 'pi4', 'ni4'), ('俾睨', 'pi4', 'ni4'), ('埤坭', 'pi4', 'ni4'), ('埤堄', 'pi4', 'ni4'), ('睥睨', 'pi4', 'ni4')},
 {('篳篥', 'bi4', 'li4'), ('觱篥', 'bi4', 'li4')},
 {('倉庚', 'cang1', 'geng1'), ('鶬鶊', 'cang1', 'geng1')},
 {('孱顏', 'chan2', 'yan2'), ('巉巖', 'chan2', 'yan2')},
 {('硨磲', 'ju1', 'qu2'), ('車渠', 'ju1', 'qu2')},
 {('岨峿', 'ju3', 'yu3'), ('鉏鋙', 'ju3', 'yu3'), ('齟齬', 'ju3', 'yu3')},
 {('蹢躅', 'zhi2', 'zhu2'), ('躑躅', 'zhi2', 'zhu2')},
 {('芍藥', 'zhuo2', 'lüe4'), ('勺藥', 'zhuo2', 'lüe4')},
 {('毒冒', 'dai4', 'mao4'), ('瑇瑁', 'dai4', 'mao4')},
 {('仿佛', 'fang3', 'fu2'), ('彷彿', 'fang3', 'fu2')},
 {('仿偟', 'pang2', 'huang2'),
  ('傍偟', 'pang2', 'huang2'),
  ('彷徨', 'pang2', 'huang2'),
  ('徬徨', 'pang2', 'huang2'),
  ('房皇', 'pang2', 'huang2')},
 {('氤氳', 'yin1', 'yun1'),
  ('絪縕', 'yin1', 'yun1'),
  ('茵蒀', 'yin1', 'yun1'),
  ('葐蒀', 'fen2', 'yun1')},
 {('匍匐', 'pu2', 'fu2'),
  ('扶伏', 'pu2', 'fu2'),
  ('蒲伏', 'pu2', 'fu2'),
  ('蒲服', 'pu2', 'fu2')},
 {('浩汗', 'hao4', 'han4'), ('滈汗', 'hao4', 'han4')},
 {('澒洞', 'hong4', 'dong4'), ('虹洞', 'hong4', 'dong4')},
 {('怳惚', 'huang3', 'hu1'),
  ('恍忽', 'huang3', 'hu1'),
  ('恍惚', 'huang3', 'hu1'),
  ('慌惚', 'huang3', 'hu1'),
  ('芒忽', 'huang3', 'hu1'),
  ('荒忽', 'huang1', 'hu1'),
  ('惚怳', 'hu1', 'huang3'),
  ('惚恍', 'hu1', 'huang3')},
 {('迦逅', 'xie4', 'hou4'), ('邂逅', 'xie4', 'hou4')},
 {('摎蓼', 'jiu1', 'liu3'), ('糾蓼', 'jiao3', 'liao3')},
 {('君遷', 'jun1', 'qian1'), ('桾櫏', 'jun1', 'qian1')},
 {('坎坷', 'kan3', 'ke3'), ('轗軻', 'kan3', 'ke3')},
 {('崑崙', 'kun1', 'lun2'), ('昆侖', 'kun1', 'lun2')},
 {('昆侖', 'hun2', 'lun2'), ('渾淪', 'hun2', 'lun2')},
 {('昆吾', 'kun1', 'wu2'), ('琨珸', 'kun1', 'wu2')},
 {('藍縷', 'lan2', 'lu3'), ('襤縷', 'lan2', 'lu3'), ('襤褸', 'lan2', 'lu3')},
 {('蘺褷', 'li2', 'shi1'), ('離褷', 'li2', 'shi1'), ('褵褷', 'li2', 'shi1'), ('褷褵', 'shi1', 'li2')},
 {('苓落', 'ling2', 'luo4'), ('零落', 'ling2', 'luo4')},
 {('瑪瑙', 'ma3', 'nao3'), ('馬腦', 'ma3', 'nao3')},
 {('僶俛', 'min3', 'mian3'), ('僶勉', 'min3', 'mian3'), ('閔勉', 'min3', 'mian3')},
 {('茗艼', 'ming2', 'ting1'), ('酩酊', 'ming3', 'ding3')},
 {('兠鍪', 'dou1', 'mou2'), ('兠鞪', 'dou1', 'mou2')},
 {('椅柅', 'yi3', 'ni3'), ('猗狔', 'yi3', 'ni3')},
 {('杌隉', 'wu4', 'nie4'), ('臲卼', 'nie4', 'wu4'), ('阢隉', 'wu4', 'nie4')},
 {('婆娑', 'po2', 'suo1'), ('媻娑', 'po2', 'suo1')},
 {('咆哮', 'pao2', 'xiao1'), ('炰烋', 'pao2', 'xiao1')},
 {('徘徊', 'pai2', 'huai2'), ('裴徊', 'pei2', 'huai2')},
 {('僛丑', 'qi1', 'chou3'), ('魌丑', 'qi1', 'chou3')},
 {('籧篨', 'qu2', 'chu2'), ('蘧蒢', 'qu2', 'chu2')},
 {('劬錄', 'qu2', 'lu4'), ('軥錄', 'qu2', 'lu4')},
 {('鴝鵒', 'qu2', 'yu4'), ('鸜鵒', 'qu2', 'yu4')},
 {('茹蘆', 'ru2', 'lu2'), ('蕠蘆', 'ru2', 'lu2')},
 {('舑舕', 'tan1', 'tan4'), ('舔舕', 'tian3', 'tan4')},
 {('唐突', 'tang2', 'tu1'), ('樘突', 'tang2', 'tu1')},
 {('姚冶', 'yao2', 'ye3'), ('窕冶', 'yao2', 'ye3')},
 {('滂陀', 'pang1', 'tuo2'), ('滂陁', 'pang1', 'tuo2')},
 {('汪漾', 'wang1', 'yang4'), ('瀇漾', 'wang3', 'yang4')},
 {('罔閬', 'wang3', 'liang3'), ('魍魎', 'wang3', 'liang3')},
 {('握齱', 'wo4', 'chuo4'), ('齷齱', 'wo4', 'chuo4')},
 {('孅趨', 'qian1', 'qu1'), ('纖趨', 'xian1', 'qu1')},
 {('䆗窱', 'yao3', 'tiao3'), ('窈窕', 'yao3', 'tiao3')},
 {('慇勤', 'yin1', 'qin2'), ('慇懃', 'yin1', 'qin2'), ('殷勤', 'yin1', 'qin2')},
 {('招搖', 'shao2', 'yao2'), ('逍遙', 'xiao1', 'yao2')},
 {('侜張', 'zhou1', 'zhang1'), ('譸張', 'zhou1', 'zhang1')},
 {('侏儒', 'zhu1', 'ru2'), ('朱儒', 'zhu1', 'ru2')},
 {('倉卒', 'cang1', 'cu4'), ('倉猝', 'cang1', 'cu4')},
 {('蟕蠵', 'zui1', 'xi1'), ('觜蠵', 'zi1', 'xi1')},
 {('靉靆', 'ai4', 'dai4')},
 {('靉靅', 'ai4', 'pei4')},
 {('倉煌', 'cang1', 'huang2'), ('倉皇', 'cang1', 'huang2')},
 {('嶒峵', 'ceng2', 'rong2')},
 {('崚嶒', 'ling2', 'ceng2')},
 {('蠆介', 'chai4', 'jie4'), ('蠆芥', 'chai4', 'jie4')},
 {('蟬聯', 'chan2', 'lian2'), ('蟬連', 'chan2', 'lian2')},
 {('猖獗', 'chang1', 'jue2'), ('猖蹶', 'chang1', 'jue2')},
 {('綝纚', 'shen1', 'xi3')},
 {('綝縿', 'shen1', 'shan1')},
 {('躕躇', 'chu2', 'chu2')},
 {('踟躕', 'chi2', 'chu2')},
 {('葱蘢', 'cong1', 'long3'), ('蘢葱', 'long3', 'cong1')},
 {('綷縩', 'cui4', 'cai4')},
 {('綷粲', 'cui4', 'can4')},
 {('蝴蝶', 'hu2', 'die2')},
 {('蛺蝶', 'jia2', 'die2')},
 {('跌宕', 'die1', 'dang4'), ('跌蕩', 'die1', 'dang4')},
 {('娥媓', 'e2', 'huang2'), ('娥皇', 'e2', 'huang2')},
 {('恆娥', 'heng2', 'e2')},
 {('嫦娥', 'chang2', 'e2')},
 {('婀娜', 'e1', 'nuo3'), ('娜婀', 'nuo3', 'e1')},
 {('放浪', 'fang4', 'lang2')},
 {('放蕩', 'fang4', 'dang4')},
 {('放意', 'fang4', 'yi4')},
 {('岪鬱', 'fu2', 'yu4'), ('鬱岪', 'yu4', 'fu2')},
 {('芙蓉', 'fu2', 'rong2'), ('芙蕖', 'fu2', 'qu2')},
 {('菡萏', 'han4', 'dan4')},
 {('蘆菔', 'lu2', 'fu2')},
 {('萊菔', 'lai2', 'fu2')},
 {('蘿菔', 'luo2', 'fu2')},
 {('鈷䥈', 'gu3', 'mu3'), ('鈷鉧', 'gu3', 'mu3')},
 {('滉瀁', 'huang4', 'yang3')},
 {('捷給', 'jie2', 'ji3')},
 {('捷急', 'jie2', 'ji2')},
 {('桔梗', 'jie2', 'geng3')},
 {('山桔', 'shan1', 'jie2')},
 {('金橘', 'jin1', 'ju2')},
 {('金柑', 'jin1', 'gan1')},
 {('蜻蜒', 'qing1', 'yan2')},
 {('蜻蛉', 'qing1', 'ling2')},
 {('盧橘', 'lu2', 'ju2')},
 {('苴蒪', 'ju1', 'po4'), ('蒪苴', 'po4', 'ju1')},
 {('蘘荷', 'rang2', 'he2')},
 {('駏驉', 'ju4', 'xu1')},
 {('嬋娟', 'chan2', 'juan1')},
 {('㛹娟', 'pian2', 'juan1')},
 {('㝩㝗', 'kang1', 'lang2')},
 {('㝩梁', 'kang1', 'liang2')},
 {('孔雀', 'kong3', 'que4')},
 {('孔翠', 'kong3', 'cui4')},
 {('荳蔻', 'dou4', 'kou4'), ('豆蔻', 'dou4', 'kou4')},
 {('斕斑', 'lan2', 'ban1'), ('斕斒', 'lan2', 'ban1'), ('斒斕', 'ban1', 'lan2')},
 {('檳榔', 'bin1', 'lang2')},
 {('桄榔', 'guang1', 'lang2')},
 {('儻朗', 'tang3', 'lang3'),
  ('儻閬', 'tang3', 'lang3'),
  ('爣朗', 'tang3', 'lang3'),
  ('爣閬', 'tang3', 'lang3')},
 {('荔支', 'li4', 'zhi1'), ('荔枝', 'li4', 'zhi1')},
 {('邐迤', 'li3', 'yi3'), ('迤邐', 'yi3', 'li3')},
 {('俍倡', 'lang3', 'chang1')},
 {('俍傍', 'lang3', 'pang2')},
 {('踉蹌', 'liang4', 'qiang1'), ('踉蹡', 'liang4', 'qiang1')},
 {('伶俜', 'ling2', 'ping1'), ('俜伶', 'ping1', 'ling2')},
 {('㱥殑', 'ling2', 'qing2'), ('殑㱥', 'qing2', 'ling2')},
 {('沙鰡', 'sha1', 'liu4'), ('紗鰡', 'sha1', 'liu4')},
 {('瓏玲', 'long2', 'ling2'), ('玲瓏', 'ling2', 'long2')},
 {('瓏璁', 'long2', 'cong1')},
 {('蘢茸', 'long3', 'rong2')},
 {('蘿勒', 'luo2', 'le4'), ('蘿艻', 'luo2', 'le4')},
 {('蛤蟆', 'ha2', 'ma2'), ('蝦蟆', 'ha2', 'ma2')},
 {('濛澒', 'meng2', 'hong4'), ('澒濛', 'hong4', 'meng2')},
 {('艨艟', 'meng2', 'chong1'), ('艨衝', 'meng2', 'chong1')},
 {('蠓蠛', 'meng3', 'mie4'), ('蠛蠓', 'mie4', 'meng3')},
 {('渺茫', 'miao3', 'mang2')},
 {('渺漫', 'miao3', 'man4')},
 {('渺瀰', 'miao3', 'mi2')},
 {('溟涬', 'ming3', 'xing4'), ('涬溟', 'xing4', 'ming3')},
 {('莫錯', 'mo4', 'cuo4'), ('錯莫', 'cuo4', 'mo4')},
 {('葩華', 'pa1', 'hua2')},
 {('紛葩', 'fen1', 'pa1')},
 {('滂湃', 'peng1', 'pai4'), ('澎湃', 'peng1', 'pai4')},
 {('叛衍', 'pan4', 'yan3')},
 {('叛換', 'pan4', 'huan4')},
 {('媻姍', 'pan2', 'shan1'), ('媻珊', 'pan2', 'shan1'), ('蹣跚', 'pan2', 'shan1')},
 {('磐桓', 'pan2', 'huan2')},
 {('磐辟', 'pan2', 'bi4'), ('蹣辟', 'pan2', 'bi4')},
 {('蹣連', 'pan2', 'lian2')},
 {('磅礡', 'pang2', 'bo2')},
 {('磅唐', 'pang2', 'tang2')},
 {('蓬勃', 'peng2', 'bo2')},
 {('蓬茸', 'peng2', 'rong2')},
 {('戚促', 'qi1', 'cu4')},
 {('戚草', 'qi1', 'cao3')},
 {('苒荏', 'ran3', 'ren3'), ('荏苒', 'ren3', 'ran3')},
 {('萎蕤', 'wei3', 'rui2'), ('葳蕤', 'wei3', 'rui2')},
 {('瑟索', 'se4', 'suo3'), ('瑟縮', 'se4', 'suo3')},
 {('僧祇', 'seng1', 'qi2'), ('僧耆', 'seng1', 'qi2')},
 {('舍利', 'she4', 'li4')},
 {('淰燿', 'shen3', 'shuo4')},
 {('淰躍', 'shen3', 'yue4')},
 {('倏忽', 'shu1', 'hu1')},
 {('倏眒', 'shu1', 'shen1')},
 {('壇曼', 'dan4', 'man4'), ('壇漫', 'dan4', 'man4')},
 {('町畽', 'tian3', 'tuan3'), ('町疃', 'tian3', 'tuan3')},
 {('曈曚', 'tong2', 'meng2'), ('膧朦', 'tong2', 'meng2')},
 {('曈曨', 'tong2', 'long2'), ('膧朧', 'tong2', 'long2')},
 {('汍瀾', 'wan2', 'lan2'), ('汍蘭', 'wan2', 'lan2')},
 {('巇險', 'xi1', 'xian3'), ('險巇', 'xian3', 'xi1')},
 {('恓惶', 'xi1', 'huang2')},
 {('恓屑', 'xi1', 'xie4')},
 {('躞蹀', 'xie4', 'die2')},
 {('惺忪', 'xing1', 'song1'), ('惺鬆', 'xing1', 'song1')},
 {('坱圠', 'yang3', 'ya4'), ('坱軋', 'yang3', 'ya4')},
 {('徜徉', 'chang2', 'yang2')},
 {('彷徉', 'pang2', 'yang2')},
 {('望洋', 'wang4', 'yang2')},
 {('汪洋', 'wang1', 'yang2')},
 {('勺藥', 'shao2', 'yao4'), ('芍藥', 'shao2', 'yao4')},
 {('颻颺', 'yao2', 'yang2')},
 {('飄颻', 'piao1', 'yao2')},
 {('枒杈', 'ya1', 'cha1'), ('杈枒', 'cha1', 'ya1')},
 {('猗儺', 'e3', 'nuo2')},
 {('喑噁', 'yin4', 'wu4')},
 {('喑嗚', 'yin4', 'wu1')},
 {('因巡', 'yin1', 'xun2'), ('因循', 'yin1', 'xun2')},
 {('雍容', 'yong1', 'rong2')},
 {('三雍', 'san1', 'yong1')},
 {('優遊', 'you1', 'you2')},
 {('優柔', 'you1', 'rou2')},
 {('幼妙', 'yao4', 'miao4'), ('幼眇', 'yao4', 'miao4')},
 {('邅迍', 'zhan1', 'zhun1'), ('迍邅', 'zhun1', 'zhan1')},
 {('啁噍', 'zhou1', 'jiao1')},
 {('啁啾', 'zhou1', 'jiu1')},
 {('鷦鷯', 'jiao1', 'liao2')},
 {('蓽撥', 'bi4', 'bo1'), ('蓽茇', 'bi4', 'ba2')},
 {('憔悴', 'qiao2', 'cui4'),
  ('憔萃', 'qiao2', 'cui4'),
  ('憔顇', 'qiao2', 'cui4'),
  ('蕉悴', 'qiao2', 'cui4'),
  ('蕉萃', 'qiao2', 'cui4')},
 {('盤薄', 'pan2', 'bo2'), ('磐礡', 'pan2', 'bo2'), ('磐薄', 'pan2', 'bo2')},
 {('彯搖', 'piao4', 'yao2'), ('票姚', 'piao4', 'yao2')},
 {('葡桃', 'pu2', 'tao2'), ('葡萄', 'pu2', 'tao2')},
 {('芊眠', 'qian1', 'mian2'),
  ('芊綿', 'qian1', 'mian2'),
  ('阡眠', 'qian1', 'mian2'),
  ('阡綿', 'qian1', 'mian2')},
 {('驌驦', 'su4', 'shuang1'), ('鷫鷞', 'su4', 'shuang1')},
 {('儴佯', 'xiang1', 'yang2'),
  ('忀徉', 'xiang1', 'yang2'),
  ('相羊', 'xiang1', 'yang2')}]

In [274]:
cluster_idx_to_prons_cleaned = defaultdict(list)
for idx, st in enumerate(clusters_manually_cleaned):
    for (binome, first, second) in st:
        cluster_idx_to_prons_cleaned[idx].append((first, second))

for i in range(221):
    for (first, second) in cluster_idx_to_prons_cleaned[i]:
        if any(((first, second) in cluster_idx_to_prons_cleaned[j]) or ((second, first) in cluster_idx_to_prons_cleaned[j]) for j in range(i)):
            print(i, first, second)

In [275]:
CLUSTERS_PARSED = [{y for y in x} for x in clusters_manually_cleaned]

# merge with CHIDEOD

In [276]:
binomes_df = pd.read_csv('binomes_df.tsv', sep='\t')

In [277]:
len(binomes_df)

927

In [278]:
for i, row in binomes_df.iterrows():
    traditional, pinyin_tonenumber, orthographic_variants = row
    first, second = pinyin_tonenumber.split('~')
    
    found_idx_in_CLUSTER = None
    
    for i, cluster in enumerate(CLUSTERS_PARSED):
        if any(traditional == binome_ for (binome_, first_, second_) in cluster):
            found_idx_in_CLUSTER = i
            break
#     if not found_idx_in_CLUSTER:
#         for i, cluster in enumerate(CLUSTERS_PARSED):
#             if any((first == first_ and second_ == second_) or (second_ == first and first_ == second) for (binome_, first_, second_) in cluster):
#                 found_idx_in_CLUSTER = i
#                 break

    if found_idx_in_CLUSTER and found_idx_in_CLUSTER >= 0:
        if traditional[1] != '然' and traditional[0] != traditional[1]:
            CLUSTERS_PARSED[found_idx_in_CLUSTER].add((traditional, first, second))
    else:
        if traditional[1] != '然' and traditional[0] != traditional[1]:
            CLUSTERS_PARSED.append({(traditional, first, second)})
            found_idx_in_CLUSTER = -1
            print('added', traditional)
    if found_idx_in_CLUSTER and str(orthographic_variants) != 'nan':
        for variant in orthographic_variants.strip().replace(' ', '').split(','):
            if variant[1] != '然' and variant[0] != variant[1]:
                CLUSTERS_PARSED[found_idx_in_CLUSTER].add((variant, first, second))

added 欸乃
added 薆薱
added 啽囈
added 啽哢
added 啽默
added 黯慘
added 黯黮
added 案衍
added 昂藏
added 囂藹
added 翱翔
added 懊憦
added 懊惱
added 斑剝
added 苯䔿
added 崩騰
added 菶茸
added 逼迸
added 觱發
added 觱沸
added 蔽芾
added 襞襀
added 赑屓
added 辟易
added 髟鼬
added 蹩躠
added 玢豳
added 繽紛
added 驞駍
added 屏營
added 撥剌
added 勃窣
added 布濩
added 驂驔
added 參潭
added 慘悴
added 燦爛
added 蒼黃
added 滄浪
added 蒼茫
added 蒼莽
added 傖獰
added 藏催
added 嘈啐
added 嘈雜
added 嘈囋
added 草次
added 惻愴
added 參差
added 岑崟
added 蹭蹬
added 查牙
added 差池
added 侘傺
added 瀺灂
added 蟾蜍
added 纏綿
added 澶湉
added 嬋媛
added 潺湲
added 猖狂
added 敞怳
added 惝怳
added 悵望
added 怊悵
added 呫囓
added 趻踔
added 噌吰
added 搶攘
added 彳亍
added 佁儗
added 充斥
added 憧憬
added 舂容
added 衝蓯
added 惆悵
added 躊躇
added 愁疾
added 綢繆
added 儲與
added 惴耎
added 愴怳
added 炊累
added 逴躒
added 綽約
added 汋約
added 淖約
added 茈虒
added 蔥蘢
added 從容
added 叢殘
added 踧踖
added 巑岏
added 攢雜
added 漼溰
added 璀璨
added 璀錯
added 翠微
added 蹉跌
added 蹉跎
added 嵯峨
added 駘蕩
added 澶漫
added 澹泊
added 澹淡
added 澹蕩
added 淡沱
added 澹灧
added 宕冥
added 碭突
added 忉怛
a

In [279]:
CLUSTERS_PARSED

[{('俾倪', 'pi4', 'ni4'),
  ('俾睨', 'pi4', 'ni4'),
  ('埤坭', 'pi4', 'ni4'),
  ('埤堄', 'pi4', 'ni4'),
  ('睥睨', 'pi4', 'ni4')},
 {('篳篥', 'bi4', 'li4'), ('觱篥', 'bi4', 'li4')},
 {('倉庚', 'cang1', 'geng1'), ('鶬鶊', 'cang1', 'geng1')},
 {('孱顏', 'chan2', 'yan2'),
  ('嶄巖', 'chan2', 'yan2'),
  ('嶄巗', 'chan2', 'yan2'),
  ('巉巖', 'chan2', 'yan2')},
 {('硨磲', 'ju1', 'qu2'), ('車渠', 'ju1', 'qu2')},
 {('岨峿', 'ju3', 'yu3'), ('鉏鋙', 'ju3', 'yu3'), ('齟齬', 'ju3', 'yu3')},
 {('蹢躅', 'zhi2', 'zhu2'), ('躑躅', 'zhi2', 'zhu2')},
 {('勺藥', 'zhuo2', 'lüe4'), ('芍藥', 'zhuo2', 'lüe4')},
 {('毒冒', 'dai4', 'mao4'), ('瑇瑁', 'dai4', 'mao4')},
 {('仿佛', 'fang3', 'fu2'), ('彷彿', 'fang3', 'fu2')},
 {('仿偟', 'pang2', 'huang2'),
  ('傍偟', 'pang2', 'huang2'),
  ('彷徨', 'pang2', 'huang2'),
  ('徬徨', 'pang2', 'huang2'),
  ('房皇', 'pang2', 'huang2')},
 {('氤氳', 'yin1', 'yun1'),
  ('絪縕', 'yin1', 'yun1'),
  ('茵蒀', 'yin1', 'yun1'),
  ('葐蒀', 'fen2', 'yun1')},
 {('匍匐', 'pu2', 'fu2'),
  ('扶伏', 'pu2', 'fu2'),
  ('蒲伏', 'pu2', 'fu2'),
  ('蒲服', 'pu2', 'fu2')}

In [280]:
import pprint
with open('20220428-combined-binome-clusters.txt', 'w') as f:
    f.write(pprint.pformat(CLUSTERS_PARSED))

In [281]:
cluster_idx_to_prons_cleaned = defaultdict(list)
for idx, st in enumerate(CLUSTERS_PARSED):
    for (binome, first, second) in st:
        cluster_idx_to_prons_cleaned[idx].append((first, second))

for i in range(len(CLUSTERS_PARSED)):
    for (first, second) in cluster_idx_to_prons_cleaned[i]:
        if any(((first, second) in cluster_idx_to_prons_cleaned[j]) or ((second, first) in cluster_idx_to_prons_cleaned[j]) for j in range(i)):
            print(i, first, second)

21 hun2 lun2
21 hun2 lun2
69 cui4 cai4
159 tong2 meng2
159 tong2 meng2
163 xi1 xie4
168 pang2 yang2
186 zhou1 jiu1
186 zhou1 jiao1
186 zhou1 jiao1
214 bi4 bo1
216 bi4 fei4
233 cang1 huang2
241 cao2 za2
255 chan2 yuan2
258 chang3 huang3
258 chang3 huang3
281 chuo4 yue1
282 chuo4 yue1
284 cong1 long3
310 di4 li4
313 die2 xie4
313 die2 xie4
325 fei4 wei4
328 fen1 yun1
330 fen2 yun1
334 fu2 yu4
344 han4 dan4
355 hong2 hong2
357 hong4 dong4
372 huo4 luo4
373 huo4 luo4
401 ju3 lü3
419 kong1 tong2
419 kong1 tong2
419 kong1 tong2
423 kou4 mao4
424 kou4 mao4
457 lian2 quan2
466 liao2 li4
467 liao2 li4
468 liao2 li4
469 liao2 li4
469 liao2 li4
471 liao2 liang4
480 lin2 li2
482 lin2 li4
485 lin4 li4
485 lin4 li4
485 lin4 li4
488 ling2 hong1
490 ling2 li4
498 liu2 qiu2
502 long3 cong1
502 long3 cong1
514 mai4 mu4
529 mi2 man4
537 min3 mian3
547 nie4 wu4
547 nie4 wu4
547 nie4 wu4
553 pan2 bo2
554 pan2 huan2
556 pan4 huan4
558 pang2 bo2
558 pang2 bo2
559 pang2 bo2
564 pen1 bo2
564 pen1 bo2
564 pen1 

In [331]:
with open('20220428-combined-binome-clusters-cleaned.txt') as f:
    clusters_eval = eval(''.join(f.readlines()))
    
cluster_idx_to_binomes_cleaned = defaultdict(list)
for idx, st in enumerate(clusters_eval):
    for (binome, first, second) in st:
        cluster_idx_to_binomes_cleaned[idx].append(binome)

for i in range(len(cluster_idx_to_binomes_cleaned)):
    if any(b in cluster_idx_to_binomes_cleaned[j] for j in range(i) for b in cluster_idx_to_binomes_cleaned[i]):
        print(cluster_idx_to_binomes_cleaned[i])

In [334]:
import pprint
with open('20220428-combined-binome-clusters-FINAL.txt', 'w') as f:
    f.write(pprint.pformat(clusters_eval))

In [343]:
binomes_only = [{triplet[0] for triplet in st} for st in clusters_eval]
variants_only = [st for st in binomes_only if len(st) > 1]

In [345]:
for i, cluster in enumerate(variants_only):
    for binome in cluster:
        print(f'{i}\t{binome}')

0	埤坭
0	睥睨
0	俾睨
0	俾倪
0	埤堄
1	篳篥
1	觱篥
2	倉庚
2	鶬鶊
3	嶄巗
3	巉巖
3	孱顏
3	嶄巖
4	硨磲
4	車渠
5	鉏鋙
5	岨峿
5	齟齬
6	躑躅
6	蹢躅
7	芍藥
7	勺藥
8	毒冒
8	瑇瑁
9	彷彿
9	仿佛
10	徬徨
10	仿偟
10	彷徨
10	房皇
10	傍偟
11	氤氳
11	茵蒀
11	葐蒀
11	絪縕
12	匍匐
12	蒲伏
12	葡伏
12	葡服
12	蒲服
12	扶伏
13	滈瀚
13	浩瀚
13	滈汗
13	浩汗
14	澒洞
14	虹洞
15	荒忽
15	芒忽
15	慌惚
15	惚恍
15	恍惚
15	怳惚
15	恍忽
15	惚怳
16	迦逅
16	邂逅
17	糾蓼
17	摎蓼
18	君遷
18	桾櫏
19	轗軻
19	坎軻
19	坎坷
20	崑崙
20	昆侖
20	渾淪
21	昆吾
21	琨珸
22	藍褸
22	襤縷
22	襤褸
22	藍縷
23	褵褷
23	褷褵
23	離褷
23	蘺褷
24	零落
24	苓落
25	瑪瑙
25	馬腦
26	僶俛
26	僶勉
26	閔勉
27	酩酊
27	茗艼
28	兠鍪
28	兠鞪
29	阢隉
29	蘖卼
29	杌隉
29	臲卼
30	媻娑
30	婆娑
31	咆哮
31	炰烋
32	裴徊
32	徘徊
33	魌丑
33	僛丑
34	蘧蒢
34	籧篨
35	劬錄
35	軥錄
36	鸜鵒
36	鴝鵒
37	茹蘆
37	蕠蘆
38	舑舕
38	舔舕
39	碭突
39	樘突
39	唐突
40	姚冶
40	窕冶
41	滂陁
41	滂陀
42	瀇漾
42	汪漾
42	汪翔
42	汪洋
42	瀇瀁
42	望洋
43	魍魎
43	罔閬
44	齷齱
44	齷齪
44	握齱
45	纖趨
45	孅趨
46	䆗窱
46	窈窕
47	殷勤
47	慇勤
47	慇懃
48	逍遙
48	招搖
49	侜張
49	譸張
50	侏儒
50	朱儒
51	倉猝
51	倉卒
52	蟕蠵
52	觜蠵
53	倉皇
53	倉煌
54	蠆芥
54	蠆介
55	蟬聯
55	蟬連
56	猖蹶
56	猖獗
57	蘢葱
57	葱蘢
58	綷粲
58	綷縩
59	跌蕩
59	跌宕
60	娥媓
60	娥皇
61	婀娜
61	娜婀
62	茀鬱
62	鬱岪
62	岪鬱
63	芙蓉
63	芙蕖
64	鈷䥈
64	鈷鉧
65	蒪苴
