In [1]:
% run utils.ipynb

In [2]:
import json

In [3]:
ety = pd.read_csv('../data/etymwn.tsv', sep='\t', header=None)
ety.columns = ['src', 'rel', 'to']

ety = ety.assign(
    src_lang=ety.src.apply(lambda x: x.split(':')[0].strip()),
    src_word=ety.src.apply(lambda x: x.split(':')[1].strip()),
    to_lang=ety.to.apply(lambda x: x.split(':')[0].strip()),
    to_word=ety.to.apply(lambda x: x.split(':')[1].strip()),
)

ety.head()

Unnamed: 0,src,rel,to,src_lang,src_word,to_lang,to_word
0,aaq: Pawanobskewi,rel:etymological_origin_of,eng: Penobscot,aaq,Pawanobskewi,eng,Penobscot
1,aaq: senabe,rel:etymological_origin_of,eng: sannup,aaq,senabe,eng,sannup
2,abe: waniigan,rel:etymological_origin_of,eng: wangan,abe,waniigan,eng,wangan
3,abe: waniigan,rel:etymological_origin_of,eng: wannigan,abe,waniigan,eng,wannigan
4,abs: beta,rel:etymological_origin_of,zsm: beta,abs,beta,zsm,beta


In [31]:
langs = ety.src_lang.unique()
langs.sort()
langs.shape

(397,)

In [17]:
words = ety.src_word.unique()
words.sort()
words = set(words)
len(words)

2743415

In [28]:
word_lang = {}

for i, row in tqdm(ety.iterrows()):
    w = row.src_word.lower()
    if w not in word_lang:
        word_lang[w] = []
    if row.src_lang not in word_lang[w]:
        word_lang[w].append(row.src_lang)

6031431it [09:05, 11062.54it/s]


In [45]:
with open('../data/word_lang.json', 'w') as f:
    json.dump(word_lang, f, ensure_ascii=False)

In [73]:
network_to = {}

for l in tqdm(langs):
    cnt = ety[ety.src_lang == l].groupby('to_lang').to_lang.count()
    network_to[l] = list(zip(cnt.index, map(int, cnt.values)))

100%|██████████| 397/397 [03:12<00:00,  2.06it/s]


In [75]:
with open('../data/network_to.json', 'w') as f:
    json.dump(network_to, f, ensure_ascii=False)

In [76]:
network_from = {}

for l in tqdm(langs):
    cnt = ety[ety.to_lang == l].groupby('src_lang').src_lang.count()
    network_from[l] = list(zip(cnt.index, map(int, cnt.values)))

100%|██████████| 397/397 [03:20<00:00,  1.98it/s]


In [77]:
with open('../data/network_from.json', 'w') as f:
    json.dump(network_from, f, ensure_ascii=False)

In [4]:
synonyms = pd.read_csv('../data/uwn.tsv', sep='\t', header=None)
synonyms.columns = ['src', 'rel', 'to', 'weight']
synonyms = synonyms[synonyms.rel != 'rel:means']
synonyms.drop(['weight', 'rel'], axis=1, inplace=True)
synonyms.head()

Unnamed: 0,src,to
13,s/n9002814,t/tha/รัสเซีย
14,s/n9002814,t/tur/Rusya Federasyonu
15,s/n9002814,t/arb/روسيا
16,s/n9002814,t/ara/روسيا
17,s/n9002814,t/vol/Rusän


In [18]:
synonymDict = dict()

for i, (src, to) in tqdm(synonyms.iterrows()):
    # some have multiple forms, keep first
    _, lang, word = to.split('/')[:3]
    if word in words:
        syns = synonymDict.get(src, [])
        syns.append((lang, word))
        synonymDict[src] = syns


0it [00:00, ?it/s][A
839it [00:00, 8343.62it/s][A
2096it [00:00, 10439.70it/s][A
3407it [00:00, 11334.42it/s][A
4792it [00:00, 11958.18it/s][A
5754it [00:00, 7073.50it/s] [A
7056it [00:00, 7724.12it/s][A
8493it [00:01, 8379.91it/s][A
9597it [00:01, 8615.39it/s][A
10676it [00:01, 8245.99it/s][A
11611it [00:01, 8137.68it/s][A
12478it [00:01, 8106.82it/s][A
13529it [00:01, 8252.27it/s][A
14881it [00:01, 8556.01it/s][A
16232it [00:02, 7632.41it/s][A
17291it [00:02, 7765.03it/s][A
18566it [00:02, 7978.64it/s][A
19999it [00:02, 8240.39it/s][A
21442it [00:02, 8486.12it/s][A
22931it [00:02, 8729.09it/s][A
24458it [00:02, 8969.51it/s][A
25980it [00:02, 9190.42it/s][A
27500it [00:02, 9395.82it/s][A
28975it [00:03, 9572.63it/s][A
30525it [00:03, 9761.68it/s][A
32010it [00:03, 9913.09it/s][A
33525it [00:03, 10071.43it/s][A
35031it [00:03, 10217.04it/s][A
36556it [00:03, 10360.04it/s][A
38059it [00:03, 10481.95it/s][A
39553it [00:03, 10551.90it/s][A
40981it [00:03, 1

In [19]:
word_syns = {}

for key, value in tqdm(synonymDict.items()):
    for synTuple in value:
        syns = synonymDict[key].copy()
        syns.remove(synTuple)
        lang, word = synTuple
        word_syns[word] = syns

100%|██████████| 66484/66484 [00:03<00:00, 18158.13it/s]


In [23]:
with open('../data/syns.json', 'w') as f:
    json.dump(word_syns, f, ensure_ascii=False)