In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from collections import defaultdict

In [2]:
# Load the dictionary spreadsheet. Later cells read its `data_source`,
# `traditional`, `pinyin_tonenumber` and `orthographic_variants` columns.
df = pd.read_excel('chideod.xlsx')

In [3]:
def filter_for_gestalt_binomes(row):
    """Row predicate: keep two-character Kroll entries whose characters differ.

    Parameters
    ----------
    row : object exposing ``data_source`` and ``traditional`` attributes
        In this notebook, one row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    bool
        True only when ``row.data_source == 'Kroll'``, ``row.traditional`` is a
        string of exactly two characters, and those two characters are not
        identical. False in every other case, including a non-string
        ``traditional`` value.
    """
    if row.data_source != 'Kroll':
        return False

    headword = row.traditional
    # Blank spreadsheet cells are loaded by pandas as float NaN; calling len()
    # on them would raise TypeError, so treat any non-string as "not a binome".
    if not isinstance(headword, str):
        return False

    return len(headword) == 2 and headword[0] != headword[1]

In [4]:
# Evaluate the row predicate once per dictionary entry, then keep only the
# rows it accepts.
is_gestalt_binome = df.apply(filter_for_gestalt_binomes, axis=1)
binomes_df = df[is_gestalt_binome]

In [5]:
# Keep only the three columns used downstream, drop rows that are exact
# duplicates across those columns, and order entries by tone-numbered pinyin.
binomes_df_clean = (
    binomes_df[['traditional', 'pinyin_tonenumber', 'orthographic_variants']]
    .drop_duplicates()
    .sort_values('pinyin_tonenumber')
)

# Write the table as TSV without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
binomes_df_clean.to_csv('binomes_df.tsv', sep='\t', index=False)

In [6]:
# Report how many rows remain after filtering and de-duplication, then show
# the first few rows as the cell's rich output.
print(f'{len(binomes_df_clean)} total Gestalt binome entries found.')
binomes_df_clean.head()

927 total Gestalt binome entries found.


Unnamed: 0,traditional,pinyin_tonenumber,orthographic_variants
9494,欸乃,ai3~nai3,
10164,靉靆,ai4~dai4,
9941,薆薱,ai4~dui4,
10162,靉靅,ai4~pei4,
6759,啽囈,an1~yi4,


In [7]:
# Group written forms by pronunciation: each tone-numbered pinyin key maps to
# the set of its headwords plus every listed orthographic variant.
pinyin_to_binomes = defaultdict(set)
for entry in binomes_df_clean.itertuples(index=False):
    written_forms = pinyin_to_binomes[entry.pinyin_tonenumber]
    written_forms.add(entry.traditional)

    raw_variants = entry.orthographic_variants
    if not isinstance(raw_variants, str):
        # Non-string cells carry no variant list; nothing more to add.
        continue

    # Normalise every separator seen in the variants column to '|'. The
    # comma-plus-space form must be replaced before the bare comma, and any
    # remaining spaces are stripped afterwards.
    normalized = raw_variants
    for separator in (', ', '、', ','):
        normalized = normalized.replace(separator, '|')
    variants = normalized.replace(' ', '').split('|')

    # Sanity check: every variant must itself be exactly two characters long.
    assert all(len(variant) == 2 for variant in variants), raw_variants
    written_forms.update(variants)

In [8]:
# Restrict the mapping to pronunciations that have at least two written forms.
words_with_multiple_forms = {
    pinyin: forms
    for pinyin, forms in pinyin_to_binomes.items()
    if len(forms) >= 2
}
print(f'{len(words_with_multiple_forms)} distinct 2-syllable pronunciations have multiple written forms.')

209 distinct 2-syllable pronunciations have multiple written forms.


In [9]:
# Display the full pronunciation -> written-forms mapping for manual inspection.
words_with_multiple_forms

{'an4~tan3': {'黮䨴', '黮暗', '黮黮', '黯黮'},
 'bi4~fei4': {'蔽芾', '觱沸'},
 'bi4~yi4': {'辟易', '避易'},
 'bo2~ran2': {'勃如', '勃然'},
 'can1~tan2': {'參潭', '參驔'},
 'cang1~cu4': {'倉卒', '倉猝'},
 'cang1~huang2': {'倉煌', '倉皇', '蒼黃'},
 'cang2~cui1': {'摧藏', '藏催'},
 'cao2~za2': {'嘈囋', '嘈雜'},
 'cha1~ya2': {'杈枒', '枒杈'},
 'chai4~jie4': {'蠆介', '蠆芥'},
 'chan2~juan1': {'嬋娟', '蟬娟'},
 'chan2~lian2': {'蟬聯', '蟬連'},
 'chan2~yan2': {'孱顏', '嶄巖', '嶄巗', '巉巖'},
 'chan2~yuan2': {'嬋媛', '潺湲', '蟬媛'},
 'chang1~jue2': {'猖獗', '猖蹶'},
 'chang2~yang2': {'倘佯', '彷徉', '徜徉'},
 'chang3~huang3': {'惝怳', '惝恍', '敞怳'},
 'chen3~chuo1': {'趻踔', '踸踔'},
 'chuo4~yue1': {'汋約', '淖約', '綽約'},
 'ci1~chi2': {'差池', '柴池'},
 'cu4~ran2': {'蹙然', '蹴然'},
 'cui4~cai4': {'綷粲', '綷縩'},
 'cuo2~e2': {'嵯峨', '嵳峩'},
 'dan4~man4': {'壇曼', '壇漫'},
 'dang4~tu2': {'唐突', '碭突'},
 'dao4~ao4': {'燾傲', '燾奡'},
 'di4~li4': {'玓瓑', '的皪'},
 'diao4~ao2': {'稠嶅', '稠磝'},
 'die1~dang4': {'跌宕', '跌蕩'},
 'die2~xie4': {'蹀躞', '躞蹀'},
 'e1~nuo3': {'娜婀', '婀娜'},
 'e2~ran2': {'峨然', '峩然'},
 'e3~nuo2': {'猗

In [10]:
# Load the pickled n-gram table; later cells read its `term` attribute.
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# only load 'df_full_5y.pkl' when it comes from a trusted source.
with open('df_full_5y.pkl', mode='rb') as pickle_file:
    ngram_df = pkl.load(pickle_file)

In [11]:
# Distinct values of the n-gram table's `term` column.
all_recorded_terms = set(ngram_df.term)

# Flatten the per-pronunciation sets into a single set of written forms.
all_binomes = set().union(*pinyin_to_binomes.values())

# NOTE(review): the binome forms come from the `traditional` column. Every
# match in this notebook's saved output appears to be written identically in
# simplified script, so if the n-gram terms are simplified Chinese this
# exact-string overlap would undercount. Confirm the script of `ngram_df.term`.
print(f'Out of {len(all_binomes)} binomes and {len(all_recorded_terms)} terms recorded in Google Ngrams, {len(all_binomes & all_recorded_terms)} of the binomes appear in Google Ngrams.')

Out of 981 binomes and 95220 terms recorded in Google Ngrams, 65 of the binomes appear in Google Ngrams.


In [12]:
# Show the written forms that occur verbatim among the recorded n-gram terms.
all_binomes & all_recorded_terms

{'仿佛',
 '侏儒',
 '依稀',
 '充斥',
 '勃然',
 '匍匐',
 '呻吟',
 '咀嚼',
 '咆哮',
 '咫尺',
 '唐突',
 '因循',
 '坎坷',
 '妥帖',
 '孟浪',
 '局促',
 '巍峨',
 '差池',
 '幡然',
 '徘徊',
 '徜徉',
 '恍惚',
 '慷慨',
 '憔悴',
 '憧憬',
 '披靡',
 '拮据',
 '揶揄',
 '放浪',
 '斟酌',
 '旖旎',
 '梗概',
 '模糊',
 '殷勤',
 '汪洋',
 '油然',
 '浩瀚',
 '浸淫',
 '混沌',
 '渺茫',
 '滑稽',
 '潦倒',
 '澎湃',
 '猖狂',
 '猖獗',
 '璀璨',
 '磅礴',
 '磊落',
 '穹隆',
 '空洞',
 '突兀',
 '索然',
 '翠微',
 '翱翔',
 '芳菲',
 '茫然',
 '荒唐',
 '蓬勃',
 '蜿蜒',
 '跌宕',
 '蹉跎',
 '邂逅',
 '雍容',
 '零落',
 '鞠躬'}