In [6]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import unicodedata as unic
import pickle as pkl
from disjoint_set import DisjointSet

# how many binome entries from Kroll/CHIDEOD

In [7]:
chideod_kroll_df = pd.read_csv('binomes_df.tsv', sep='\t')
print(len(chideod_kroll_df))

all_chideod_binomes = set()
for i, row in chideod_kroll_df.iterrows():
    all_chideod_binomes.add(row.traditional)
    variants = row.orthographic_variants
    if str(variants) != 'nan':
        variants_list = variants.strip().replace(' ', '').split(',')
        for v in variants_list:
            all_chideod_binomes.add(v)
print(len(all_chideod_binomes))

927
981


# how many "nontrivial" binomes entries from Kroll/CHIDEOD (reversals allowed)

In [8]:
all_nontrivial_chideod_binomes = {bn for bn in all_chideod_binomes if not (bn[1] == '然' or bn[0] == bn[1])}
len(all_nontrivial_chideod_binomes)

912

In [9]:
{bn for bn in all_chideod_binomes if (bn[1] == '然' or bn[0] == bn[1])}

{'俞俞',
 '俞然',
 '倜然',
 '具然',
 '凜然',
 '劃然',
 '勃然',
 '听然',
 '哆然',
 '喁喁',
 '嗑然',
 '嗒然',
 '嘸然',
 '塊然',
 '奭然',
 '嫣然',
 '嬪然',
 '岧岧',
 '岹岹',
 '峨然',
 '峩然',
 '嵬嵬',
 '嶄然',
 '嶪嶪',
 '嶫嶫',
 '巍巍',
 '幡然',
 '怫然',
 '恓恓',
 '戄然',
 '旛然',
 '曈曈',
 '橈橈',
 '油然',
 '洗然',
 '涓然',
 '渫渫',
 '滉然',
 '漂然',
 '潢然',
 '灼然',
 '率然',
 '皛皛',
 '祺然',
 '窅然',
 '索然',
 '翕然',
 '翛然',
 '肸肸',
 '脫然',
 '芊芊',
 '茫然',
 '荅然',
 '蔚蔚',
 '蘧然',
 '蜿蜿',
 '褏然',
 '譙譙',
 '贄然',
 '蹍然',
 '蹙然',
 '蹴然',
 '酌然',
 '閹然',
 '闞然',
 '顒然',
 '顒顒',
 '鬱鬱',
 '黮黮'}

# how many binomes from Kroll/CHIDEOD with true orthographic variants

true orthographic variant:

- not just duplication
- not just ~然
- not just a reversal

In [10]:
clusters = DisjointSet()

for i, row in chideod_kroll_df.iterrows():
    main_binome = row.traditional
    variants = row.orthographic_variants
    
    if str(variants) == 'nan':
        forms = []
    else:
        forms = variants.strip().replace(' ', '').split(',')
    
    if main_binome in all_nontrivial_chideod_binomes:
        forms.append(main_binome)
    
    if forms:
        for form in forms:
            clusters.find(form)
        for form in forms[1:]:
            clusters.union(form, forms[0])

In [11]:
print(f'num clusters {len(list(clusters.itersets()))}')
print(f'num clusters with >= binome {len(list(c for c in clusters.itersets() if len(c) > 1))}')

num clusters 748
num clusters with >= binome 135


In [12]:
list(s for s in clusters.itersets() if len(s) > 1)

[{'黮䨴', '黮暗', '黮黮', '黯黮'},
 {'辟易', '避易'},
 {'參潭', '參驔'},
 {'倉卒', '倉猝'},
 {'倉煌', '倉皇'},
 {'摧藏', '藏催'},
 {'杈枒', '枒杈'},
 {'差池', '柴池'},
 {'蠆介', '蠆芥'},
 {'嬋娟', '蟬娟'},
 {'蟬聯', '蟬連'},
 {'孱顏', '嶄巖', '嶄巗', '巉巖'},
 {'嬋媛', '蟬媛'},
 {'猖獗', '猖蹶'},
 {'倘佯', '彷徉', '徜徉'},
 {'惝怳', '惝恍'},
 {'趻踔', '踸踔'},
 {'蔥蘢', '蘢蔥'},
 {'嵯峨', '嵳峩'},
 {'唐突', '樘突', '碭突'},
 {'燾傲', '燾奡'},
 {'稠嶅', '稠磝'},
 {'跌宕', '跌蕩'},
 {'蹀躞', '躞蹀'},
 {'猗儺', '猗狔'},
 {'娜婀', '婀娜'},
 {'氾瀾', '汎瀾', '泛瀾', '范瀾'},
 {'仿佛', '彷彿'},
 {'氛氳', '雰氳'},
 {'岪鬱', '茀鬱'},
 {'汩稽', '滑稽'},
 {'浩汗', '浩瀚', '滈瀚'},
 {'浩溔', '灝溔', '皓溔'},
 {'怳惚', '恍惚', '惚怳', '惚恍', '芒忽', '荒忽'},
 {'昆侖', '渾淪'},
 {'混沌', '渾沌'},
 {'嶕嶢', '嶢嶭'},
 {'摎蓼', '糾蓼'},
 {'叫條', '窌條'},
 {'侷促', '局促', '跼蹙'},
 {'鉏鋙', '齟齬'},
 {'坎坷', '坎軻', '轗軻'},
 {'忼慨', '慷慨'},
 {'崆峒', '空同', '空洞'},
 {'倥偬', '倥傯'},
 {'蘭單', '闌單'},
 {'欄杆', '闌干'},
 {'藍縷', '藍褸', '襤褸'},
 {'啷噹', '琅當'},
 {'壘塊', '磊塊'},
 {'褵褷', '褷褵'},
 {'迤邐', '邐迤'},
 {'戀眷', '眷戀'},
 {'踉蹌', '踉蹡'},
 {'寥唳', '寥戾'},
 {'列缺', '裂缺'},
 {'璘彬', '瞵彬'},
 {'蹸轢', '躪轢', '轥轢'},
 {'苓落', '零落'},
 

# how many binomes from parse

In [8]:
with open('BINOMES_PARSED_LIST.pkl', 'rb') as f:
    binomes_parsed_list = pkl.load(f)
len(set(binomes_parsed_list))

1116

In [9]:
list(binomes_parsed_list)[:10]

[('螃蟹', 'pángxiè'),
 ('呻吟', 'shēnyín'),
 ('檮杌', 'táowù'),
 ('崒兀', 'zúwù'),
 ('岨峿', 'jǔyǔ'),
 ('蜻蛉', 'qīnglíng'),
 ('莫錯', 'mòcuò'),
 ('輧闐', 'piántián'),
 ('盤薄', 'pánbó'),
 ('俇攘', 'kuángráng')]

# how many nontrivial binomes

In [10]:
binomes_parsed_list_nontrivial = {bn for (bn, pron) in binomes_parsed_list if not (bn[1] == '然' or bn[0] == bn[1])}
len(binomes_parsed_list_nontrivial)

1088

# how many binomes from parse up to reversal

In [11]:
_tmp = set()
for binome in binomes_parsed_list_nontrivial:
    if binome[::-1] not in _tmp:
        _tmp.add(binome[0])
len(_tmp)

917

# how many binome clusters from COMBINED

each 'cluster' contains some b1 and b2 such that b1 and b2 are not simply reversals of one another

In [12]:
with open('20220428-combined-binome-clusters-FINAL.txt') as f:
    combined_binome_clusters = eval(''.join(f.readlines()))

combined_binome_clusters = [set(bn for (bn, _, _) in cluster) for cluster in combined_binome_clusters]
combined_binome_clusters = [c for c in combined_binome_clusters if len(c) > 1]

In [13]:
len(combined_binome_clusters)

178

In [14]:
[st for st in combined_binome_clusters if any(bn[::-1] in st for bn in st)]

[{'怳惚', '恍忽', '恍惚', '惚怳', '惚恍', '慌惚', '芒忽', '荒忽'},
 {'蘺褷', '褵褷', '褷褵', '離褷'},
 {'葱蘢', '蘢葱'},
 {'娜婀', '婀娜'},
 {'岪鬱', '茀鬱', '鬱岪'},
 {'苴蒪', '蒪苴'},
 {'斒斕', '斕斑', '斕斒'},
 {'迤邐', '邐迤'},
 {'伶俜', '俜伶'},
 {'㱥殑', '殑㱥'},
 {'玲瓏', '瓏玲'},
 {'澒濛', '濛澒'},
 {'蠓蠛', '蠛蠓'},
 {'涬溟', '溟涬'},
 {'莫錯', '錯莫'},
 {'苒荏', '荏苒'},
 {'巇險', '險巇'},
 {'杈枒', '枒杈'},
 {'迍邅', '邅迍'},
 {'蔥蘢', '蘢蔥'},
 {'蹀躞', '躞蹀'},
 {'戀眷', '眷戀'},
 {'穹隆', '隆穹'}]

# out of binomes with true orthographic variants: how many have a form that occurs at least once in Sinica corpus?

In [15]:
sinica_occs = pd.read_csv('20220429-binomes-sinica-occs.tsv', sep='\t')
sinica_occs = sinica_occs.fillna(0)

def cluster_is_not_simply_reversal(row):
    cluster_idx = row.cluster
    only_cluster = sinica_occs[sinica_occs.cluster == cluster_idx]
    variants = list(set(only_cluster.binome))
    if len(variants) == 2 and variants[0] == variants[1][::-1]:
        return False
    return True

In [16]:
print('num binome clusters with true orthographic variants')
sinica_occs = sinica_occs[sinica_occs.apply(cluster_is_not_simply_reversal, axis=1)]
print(len(set(sinica_occs.cluster)))

num binome clusters with true orthographic variants
159


In [17]:
def cluster_has_at_least_one_form_in_sinica(row):
    cluster_idx = row.cluster
    only_cluster = sinica_occs[sinica_occs.cluster == cluster_idx]
    n_occs = [0 if np.isnan(n) else int(n) for n in list(only_cluster.sinica_n_occs)]
    return sum(n_occs) > 0

In [18]:
sinica_occs_in_sinica = sinica_occs[sinica_occs.apply(cluster_has_at_least_one_form_in_sinica, axis=1)]
len(set(sinica_occs_in_sinica.cluster))

80

In [19]:
sinica_occs_in_sinica.tail()

Unnamed: 0,cluster,binome,sinica_n_occs
412,168,蕭索,7.0
413,169,蕭條,81.0
414,169,瀟條,0.0
426,175,輾轉,85.0
427,175,展轉,1.0


In [20]:
for i in set(sinica_occs_in_sinica_mix.cluster):
    cluster = sinica_occs_in_sinica_mix[sinica_occs_in_sinica_mix.cluster == i]

NameError: name 'sinica_occs_in_sinica_mix' is not defined

# out of these, how many only have shared radicals

In [21]:
from database import load_database, Node

In [22]:
db = load_database()

In [23]:
def have_shared_radical(c1, c2):
    if not db[c1] or not db[c2]: #atmomic
        return False
    c1_first_comp_child = db[c1][0][1]
    c1_grandchildren = c1_first_comp_child.children
    c1_g_first = c1_grandchildren[0]
    c1_g_last = c1_grandchildren[-1]
    
    c2_first_comp_child = db[c2][0][1]
    c2_grandchildren = c2_first_comp_child.children
    c2_g_first = c2_grandchildren[0]
    c2_g_last = c2_grandchildren[-1]
    
#     firsts are glyphs and match, or lasts are glyphs and match
    first_are_chars_and_match = (c1_g_first.type_ == c2_g_first.type_ == 'char' and 
                                 c1_g_first.glyph == c2_g_first.glyph)
    last_are_chars_and_match = (c1_g_last.type_ == c2_g_last.type_ == 'char' and 
                                 c1_g_last.glyph == c2_g_last.glyph)
    return first_are_chars_and_match or last_are_chars_and_match

In [24]:
def binome_has_shared_radical(bn):
    return have_shared_radical(bn[0], bn[1])

In [25]:
def cluster_has_only_shared_radical(row):
    cluster_idx = row.cluster
    only_cluster = sinica_occs_in_sinica[sinica_occs_in_sinica.cluster == cluster_idx]
    return all(binome_has_shared_radical(bn) for bn in only_cluster.binome)

In [26]:
sinica_occs_in_sinica_only_shared_radical = (
    sinica_occs_in_sinica[sinica_occs_in_sinica.apply(cluster_has_only_shared_radical, axis=1)])

In [27]:
print('number of clusters whose members only contain those with shared radicals')
len(set(sinica_occs_in_sinica_only_shared_radical.cluster))

number of clusters whose members only contain those with shared radicals


25

# out of these, how many do not have any forms with shared radicals

In [17]:
def cluster_has_no_forms_with_shared_radical(row):
    cluster_idx = row.cluster
    only_cluster = sinica_occs_in_sinica[sinica_occs_in_sinica.cluster == cluster_idx]
    return all(not binome_has_shared_radical(bn) for bn in only_cluster.binome)

In [29]:
sinica_occs_sinica_no_shared_radical = (
    sinica_occs_in_sinica[sinica_occs_in_sinica.apply(cluster_has_no_forms_with_shared_radical, axis=1)]
)
print('number of clusters whose members do not contain those with shared radicals')
len(set(sinica_occs_sinica_no_shared_radical.cluster))

number of clusters whose members do not contain those with shared radicals


16

# how many clusters have a mix of the two

In [18]:
def cluster_has_mix(row):
    cluster_idx = row.cluster
    return ((cluster_idx not in set(sinica_occs_sinica_no_shared_radical.cluster)) and
            (cluster_idx not in set(sinica_occs_in_sinica_only_shared_radical.cluster)))

In [31]:
sinica_occs_in_sinica_mix = sinica_occs_in_sinica[sinica_occs_in_sinica.apply(cluster_has_mix, axis=1)]
len(set(sinica_occs_in_sinica_mix.cluster))

39

In [32]:
sinica_occs_in_sinica_mix.tail(30)

Unnamed: 0,cluster,binome,sinica_n_occs
273,109,憔萃,0.0
274,109,蕉萃,0.0
275,109,憔顇,0.0
281,112,葡桃,0.0
282,112,葡萄,104.0
336,136,跼蹙,0.0
337,136,局促,2.0
338,136,侷促,14.0
341,138,空洞,79.0
342,138,空同,0.0


In [33]:
db['闌'], db['干']

([('⿵門柬', <Node '⿵' with 2 children>)], [('⿱一十', <Node '⿱' with 2 children>)])

In [34]:
sinica_occs_in_sinica_mix['shared_radical'] = sinica_occs_in_sinica.binome.apply(binome_has_shared_radical)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sinica_occs_in_sinica_mix['shared_radical'] = sinica_occs_in_sinica.binome.apply(binome_has_shared_radical)


In [35]:
sinica_occs_in_sinica_mix.to_csv('tmp_sinica_occs_in_sinica_mix.tsv', sep='\t')

In [36]:
sinica_occs_in_sinica_mix

Unnamed: 0,cluster,binome,sinica_n_occs,shared_radical
0,0,埤坭,0.0,True
1,0,睥睨,9.0,True
2,0,俾睨,0.0,False
3,0,俾倪,0.0,True
4,0,埤堄,0.0,True
...,...,...,...,...
400,162,烏乎,0.0,False
403,164,瀟灑,73.0,True
404,164,蕭灑,0.0,False
426,175,輾轉,85.0,True


In [41]:
print(f'out of 39 binomes, 7 of them are such that the more common form in sinica is the one with non-matching radicals')

out of 39 binomes, 7 of them are such that the more common form in sinica is the one with non-matching radicals


In [84]:
k = 1
pct_shared_radical_dict = dict()
for cluster_idx in set(sinica_occs_in_sinica_mix.cluster):
    cluster_df = sinica_occs_in_sinica_mix[sinica_occs_in_sinica_mix.cluster == cluster_idx]
    most_row = cluster_df.iloc[np.argmax(cluster_df.sinica_n_occs)]
    if most_row.shared_radical == True:
        num = most_row.sinica_n_occs
        den = sum(cluster_df.sinica_n_occs)
        pct_shared_radical_dict[cluster_idx] = num / den
        if list(cluster_df.sinica_n_occs).count(0) == len(cluster_df) - 1:
            print(k)
            k += 1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26


In [69]:
np.mean(list(pct_shared_radical_dict.values()))

0.9925192191886458

In [48]:
good_binome_cluster_idxs = set()
for cluster_idx in set(sinica_occs_in_sinica_mix.cluster):
    cluster_slice = sinica_occs_in_sinica_mix[sinica_occs_in_sinica_mix.cluster == cluster_idx]
    argmax_row = cluster_slice.iloc[np.argmax(cluster_slice.sinica_n_occs)]
    if argmax_row.shared_radical:
        good_binome_cluster_idxs.add(cluster_idx)

In [52]:
sinica_occs_in_sinica_mix

Unnamed: 0,cluster,binome,sinica_n_occs,shared_radical
0,0,埤坭,0.0,True
1,0,睥睨,9.0,True
2,0,俾睨,0.0,False
3,0,俾倪,0.0,True
4,0,埤堄,0.0,True
...,...,...,...,...
400,162,烏乎,0.0,False
403,164,瀟灑,73.0,True
404,164,蕭灑,0.0,False
426,175,輾轉,85.0,True


In [60]:
to_export = sinica_occs_in_sinica_mix[sinica_occs_in_sinica_mix.apply(lambda row: row.cluster in good_binome_cluster_idxs, axis=1)]
to_export.to_csv('20220522_for_appendix_b.tsv', sep='\t', index=False)

In [38]:
sinica_occs_in_sinica_mix.iloc[np.argmax(sinica_occs_in_sinica_mix.sinica_n_occs)]

cluster              32
binome               徘徊
sinica_n_occs     108.0
shared_radical     True
Name: 97, dtype: object

# in general, out of combined nontrivial binomes (reversals count as a single binome), how many have shared radicals?

In [42]:
combined_nontrivial_binome_list = binomes_parsed_list_nontrivial | all_nontrivial_chideod_binomes
len(combined_nontrivial_binome_list)

1335

In [43]:
print('counting reversals once:')
_tmp = set()
for bn in combined_nontrivial_binome_list:
    if bn[::-1] not in _tmp:
        _tmp.add(bn)
len(_tmp)

counting reversals once:


1305

In [275]:
len({bn for bn in _tmp if binome_has_shared_radical(bn)})

680

In [27]:
contains_at_one_matching_form = 0
for s in clusters.itersets():
    if any(binome_has_shared_radical(bn) for bn in s):
        contains_at_one_matching_form += 1
contains_at_one_matching_form

389

In [15]:
len(list(clusters.itersets()))

748