In [None]:
% run utils.ipynb

In [None]:
import json

In [None]:
def save(data, name):
    """Serialize ``data`` as JSON into ``../data/out.<name>.json``.

    Parameters
    ----------
    data : object
        Anything ``json.dump`` can serialize.
    name : str
        Inserted into the output file name.
    """
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned instead of relying on the platform default.
    with open('../data/out.{}.json'.format(name), 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)

In [None]:
# Load the language coordinates and discard the columns not used below.
lang_locations = pd.read_csv('../data/languages_coordinates.csv')
# The axis is given by keyword (`columns=`): pandas 2.0 no longer accepts
# the positional form `drop(labels, 1)`.
lang_locations = lang_locations.drop(columns=['glottocode', 'macroarea'])
lang_locations.head()

In [None]:
lang_locations.shape

In [None]:
# Load the etymological relations as (src, rel, to) triples.
relations = pd.read_csv('../data/etymwn.tsv', sep='\t', header=None)
relations.columns = ['src', 'rel', 'to']


def _is_plain_term(terms):
    """Return a boolean mask selecting the entries of ``terms`` to keep.

    An entry is rejected when it has more than 4 space-separated tokens,
    more than one ':' or contains '-' or '['.
    """
    too_many_tokens = terms.apply(lambda x: len(x.split(' ')) > 4)
    extra_colons = terms.apply(lambda x: len(x.split(':')) > 2)
    # Literal (non-regex) matching avoids the invalid '\[' escape sequence.
    has_hyphen = terms.str.contains('-', regex=False)
    has_bracket = terms.str.contains('[', regex=False)
    return ~(too_many_tokens | extra_colons | has_hyphen | has_bracket)


relations = relations[
    _is_plain_term(relations.src) &
    _is_plain_term(relations.to) &
    ~relations.rel.isin(['rel:is_derived_from', 'rel:etymologically_related', 'derived'])
]

# Split "<lang>:<word>" into separate language / lower-cased word columns.
relations = relations.assign(
    src_lang=relations.src.apply(lambda x: x.split(':')[0].strip()),
    src_word=relations.src.apply(lambda x: x.split(':')[1].strip().lower()),
    to_lang=relations.to.apply(lambda x: x.split(':')[0].strip()),
    to_word=relations.to.apply(lambda x: x.split(':')[1].strip().lower()),
)
# Drop relations between identical words, then exact duplicate rows.
relations = relations[relations.to_word != relations.src_word]
# Reassign instead of `inplace=True`: `relations` is a filtered slice here.
relations = relations.drop_duplicates()
relations.head()

In [None]:
relations.shape

In [None]:
# Number of relation rows per target language, plotted on a log scale.
words_per_lang = relations.groupby('to_lang')['to_word'].count()
words_per_lang.sort_values(ascending=False).plot(logy=True);

In [None]:
# Keep only languages with strictly more than `min_word` target-side rows.
min_word = 20
langs = pd.Series(words_per_lang[words_per_lang > min_word].index)
# No sort_values() call here: without an assignment its result is discarded
# and `langs` is left unchanged.
langs.shape

In [None]:
lang_locations[lang_locations.isocode.isin(langs)].shape

In [None]:
# Build a macrolanguage -> individual-language fallback table.
macrolangs = pd.read_csv('../data/macrolanguages.tsv', sep='\t')
# `columns=` replaces the positional axis argument removed in pandas 2.0.
macrolangs = macrolangs.drop(columns=['I_Status'])
# Candidates: individual languages not already selected but with a known location.
macrolangs = macrolangs[~macrolangs.I_Id.isin(langs) & macrolangs.I_Id.isin(lang_locations.isocode)]
# Map each M_Id to the first I_Id listed for it; selecting the column
# explicitly does not depend on how many other columns the file has.
macrolangs = macrolangs.groupby('M_Id')['I_Id'].first().to_dict()
len(macrolangs)

In [None]:
# Languages without a known location are replaced through the macrolanguage
# table; codes with no replacement become missing values.
unknown_lang = ~langs.isin(lang_locations.isocode)
langs[unknown_lang] = langs[unknown_lang].apply(macrolangs.get)
# notna() drops both None and NaN, whichever pandas stored for the misses.
langs = langs[langs.notna()]
langs.shape

In [None]:
# Hand-entered [latitude, longitude] pairs used to fill in missing coordinates.
# The rows are assigned positionally to the selected languages whose latitude
# is null, so their order and count must match that selection exactly.
# NOTE(review): which language each row belongs to is not recorded here — confirm
# against the data before editing.
lang_locations_patch = np.array([
    [34.5, 41],
    [37.1, -3.5],
    [51, 0],
    [40.3, 45],
    [28, 84.5],
    [52, 5],
    [52, -1],
    [48, 2],
    [48.649, 11.4676],
    [48.649, 13.4676],
    [59.92, 10.71],
    [52, 5],
    [52, 0],
    [47, 2],
    [53.3, 6.3],
    [47.649, 12.4676],
    [53.2, -7.5],
    [55.7, 12],
    [32, 50],
    [44.3, 4],
    [56, 37],
    [51.152, 12.692],
    [40.4414, -1.11788],
    [39.8667, 32.8667],
    [52, -4],
    [32, 50],
    [52, 14]
])
lang_locations_patch.shape

In [None]:
# Selected languages that still lack a latitude receive the patch rows, in order.
missing_coords = lang_locations.isocode.isin(langs) & lang_locations.latitude.isnull()
lang_locations.loc[missing_coords, ['latitude', 'longitude']] = lang_locations_patch

In [None]:
lang_locations[lang_locations.isocode.isin(langs) & lang_locations.latitude.isnull()]

In [None]:
lang_locations = lang_locations[lang_locations.isocode.isin(langs)]
lang_locations.shape

In [None]:
relations = relations[relations.src_lang.isin(langs) & relations.to_lang.isin(langs)]
relations.shape

In [None]:
parents_rel = relations[relations.rel != 'rel:etymology']
parents_rel.shape

In [None]:
# Every distinct word appearing on either side of a relation.
words = set(relations.src_word).union(relations.to_word)
len(words)

In [None]:
# Stack source-side and target-side (word, lang) pairs into one long table.
word_per_lang = pd.DataFrame({
    'word': np.concatenate([relations.src_word.values, relations.to_word.values]),
    'lang': np.concatenate([relations.src_lang.values, relations.to_lang.values]),
})
word_per_lang.shape

In [None]:
# word -> sorted list of the distinct languages it occurs in.
langs_by_word = word_per_lang.groupby('word')['lang'].apply(lambda x: list(np.unique(x)))
word_langs = langs_by_word.to_dict()
len(word_langs)

In [None]:
save(word_langs, 'word_langs')

In [None]:
word_per_lang.head()

In [None]:
lang_cases = word_per_lang.groupby(word_per_lang.lang).word.apply(list)
lang_cases.head()

In [None]:
lang_count = word_per_lang.groupby(word_per_lang.lang).word.count()
lang_count.head()

In [None]:
lang_len_means = lang_cases.apply(lambda w: float(np.mean([len(x) for x in w])))
lang_len_means.head()

In [None]:
lang_len_percentiles = lang_cases.apply(lambda w: np.percentile([len(x) for x in w], [25, 50, 75]))
lang_len_percentiles.head()

In [None]:
lang_len_std = lang_cases.apply(lambda w: float(np.std([len(x) for x in w])))
lang_len_std.head()

In [None]:
lang_cases_letters = lang_cases.apply(lambda w: [x for xx in w for x in xx])
lang_cases_letters.head()

In [None]:
lang_letters = lang_cases_letters.apply(lambda w: [(l, int(c)) for l, c in zip(*np.unique(w, return_counts=True))])
lang_letters.head()

In [None]:
# Assemble the per-language statistics, one column per measure.
stat_columns = {
    'count': lang_count,
    'mean': lang_len_means,
    'std': lang_len_std,
}
# lang_len_percentiles holds [25th, 50th, 75th] per language; split it into columns.
for position, label in enumerate(['percentile25', 'percentile50', 'percentile75']):
    stat_columns[label] = lang_len_percentiles.apply(lambda q, i=position: float(q[i]))
stat_columns['histogram'] = lang_letters

lang_stats = pd.DataFrame(stat_columns)
lang_stats.head()

In [None]:
src_to_count = relations.groupby(relations.to_lang).to_word.count()
src_to_count.head()

In [None]:
src_to = parents_rel.groupby([parents_rel.src_lang, parents_rel.to_lang]).count().rel
src_to.shape

In [None]:
src_to.head()

In [None]:
# source language -> list of [target language, relation count].
network_to = {}

for (src, to), count in src_to.items():
    # Raw counts are stored; normalising by src_to_count is not applied.
    network_to.setdefault(src, []).append([to, count])

In [None]:
to_src_count = relations.groupby(relations.src_lang).src_word.count()
to_src_count.head()

In [None]:
to_src = parents_rel.groupby([parents_rel.to_lang, parents_rel.src_lang]).count().rel
to_src.shape

In [None]:
to_src.head()

In [None]:
# target language -> list of [source language, relation count].
network_from = {}

for (to, src), count in to_src.items():
    # Raw counts are stored; normalising by to_src_count is not applied.
    network_from.setdefault(to, []).append([src, count])

In [None]:
save({
    'to': network_to,
    'from': network_from,
    'locations': lang_locations.set_index('isocode').to_dict('index'),
    'stats': lang_stats.to_dict('index')
}, 'lang_network')    

In [None]:
# Load the UWN table and keep every relation except 'rel:means'.
mappings = pd.read_csv('../data/uwn.tsv', sep='\t', header=None)
mappings.columns = ['src', 'rel', 'to', 'weight']
mappings = mappings[mappings.rel != 'rel:means']

# The 'to' field is '/'-separated: element 1 is the language, element 2 the word.
mappings = mappings.assign(
    lang=mappings.to.apply(lambda x: x.split('/')[1].strip()),
    word=mappings.to.apply(lambda x: x.split('/')[2].strip().lower()),
)

# Restrict to words and languages already present in the etymology data,
# then index the remaining rows by their 'src' identifier.
is_known = mappings.word.isin(words) & mappings.lang.isin(langs)
mappings = mappings[is_known].drop(['weight', 'rel'], axis=1).set_index('src')
mappings.head()

In [None]:
mappings.shape

In [None]:
def _cluster_members(group):
    """Return the 'lang:word' labels of all rows sharing one 'src' identifier."""
    return list(group.lang.str.cat(':' + group.word))


clusters = mappings.groupby(mappings.index).apply(_cluster_members)
clusters.head()

In [None]:
# 'lang:word' -> every other 'lang:word' that shares at least one cluster with it.
meanings = {}

for _, cluster in tqdm(clusters.items()):
    for lang_word in cluster:
        synonyms = meanings.setdefault(lang_word, set())
        synonyms.update(cluster)
        # A word is not its own synonym.
        synonyms.discard(lang_word)

# Sets are not JSON-serializable; convert each one to a list in place.
for key in meanings:
    meanings[key] = list(meanings[key])

In [None]:
len(meanings)

In [None]:
save(meanings, 'word_meanings')

In [None]:
# For every target 'lang:word', concatenate its source 'lang:word' labels,
# each followed by a comma.
parent_edges = pd.DataFrame({
    'src': parents_rel.src_lang + ':' + parents_rel.src_word + ',',
    'to': parents_rel.to_lang + ':' + parents_rel.to_word,
})
parents = parent_edges.groupby('to')['src'].sum()
parents.head()

In [None]:
# target 'lang:word' -> list of its source 'lang:word' labels.
# The lists are collected directly from parents_rel rather than by splitting a
# comma-joined string, so a word that itself contains ',' stays intact.
parents_map = (
    (parents_rel.src_lang + ':' + parents_rel.src_word)
    .groupby(parents_rel.to_lang + ':' + parents_rel.to_word)
    .apply(list)
    .to_dict()
)
len(parents_map)

In [None]:
save(parents_map, 'word_parents')

In [None]:
# For every source 'lang:word', concatenate its target 'lang:word' labels,
# each followed by a comma.
child_edges = pd.DataFrame({
    'src': parents_rel.src_lang + ':' + parents_rel.src_word,
    'to': parents_rel.to_lang + ':' + parents_rel.to_word + ',',
})
children = child_edges.groupby('src')['to'].sum()
children.head()

In [None]:
# source 'lang:word' -> list of its target 'lang:word' labels.
# The lists are collected directly from parents_rel rather than by splitting a
# comma-joined string, so a word that itself contains ',' stays intact.
children_map = (
    (parents_rel.to_lang + ':' + parents_rel.to_word)
    .groupby(parents_rel.src_lang + ':' + parents_rel.src_word)
    .apply(list)
    .to_dict()
)
len(children_map)

In [None]:
save(children_map, 'word_children')

In [None]:
def recurse(lang_word, mapping, seen=None):
    """Expand ``lang_word`` into a nested tree by following ``mapping``.

    Parameters
    ----------
    lang_word : hashable
        Node to expand.
    mapping : dict
        Node -> list of neighbouring nodes.
    seen : set, optional
        Nodes already on the current path; ``lang_word`` is added to it.

    Returns
    -------
    list of (node, subtree) tuples; empty when ``lang_word`` is already in
    ``seen`` or has no entry in ``mapping``.
    """
    visited = set() if seen is None else seen
    if lang_word in visited:
        return []

    visited.add(lang_word)
    subtree = []
    for neighbour in mapping.get(lang_word, []):
        # Each branch gets its own copy, so only nodes on the same path block each other.
        subtree.append((neighbour, recurse(neighbour, mapping, visited.copy())))
    return subtree

In [None]:
recurse('eng:dog', parents_map)

In [None]:
recurse('eng:dog', children_map)

In [None]:
def recurse_unfold(lang_word, mapping):
    """Enumerate every path that starts at ``lang_word`` and follows ``mapping``.

    Parameters
    ----------
    lang_word : hashable
        Starting node.
    mapping : dict
        Node -> list of neighbouring nodes.

    Returns
    -------
    list of lists
        One list of nodes per path. A path ends either at a node with no
        entry (or an empty entry) in ``mapping``, which is included, or just
        before a node already on the path, which is excluded.
    """
    edges = []

    def edgify(lang_word, history=None, seen=None):
        # None sentinels instead of a mutable default argument.
        if history is None:
            history = []
        if seen is None:
            seen = set()

        ps = mapping.get(lang_word, [])

        # Cycle: record the path walked so far, without the repeated node.
        if lang_word in seen:
            edges.append(history)
            return

        # Leaf: record the path including this node.
        if not len(ps):
            edges.append(history + [lang_word])
            return

        seen.add(lang_word)
        for p in ps:
            # Fresh list and set per branch so sibling paths never share state.
            edgify(p, history + [lang_word], seen.copy())

    edgify(lang_word)
    return edges

In [None]:
recurse_unfold('eng:dog', parents_map)

In [None]:
recurse_unfold('eng:dog', children_map)

In [None]:
lang_words = (parents_rel.src_lang + ':' + parents_rel.src_word).values
lang_words.shape

In [None]:
# language -> every parent/child path that starts at one of its words.
lang_influences = {}

for lang_word in tqdm(lang_words):
    upstream = recurse_unfold(lang_word, parents_map)
    downstream = recurse_unfold(lang_word, children_map)

    for edge in upstream + downstream:
        # The language is the prefix of the path's first 'lang:word' node.
        lang = edge[0].split(':')[0]
        lang_influences.setdefault(lang, []).append(edge)

In [None]:
lang_influences_ord = { k: sorted(v, key=len, reverse=True) for k, v in tqdm(lang_influences.items()) }

In [None]:
n_samples = 50

In [None]:
# language -> up to n_samples distinct starting words, in the order of
# lang_influences_ord.
lang_samples = {}

for lang in tqdm(langs):
    if lang not in lang_influences:
        continue

    # Word part of the first node of every path.
    starters = [path[0].split(':')[1] for path in lang_influences_ord[lang]]
    # np.unique reports first-occurrence indices; sorting them keeps the
    # original ranking while removing repeats.
    first_seen = sorted(np.unique(starters, return_index=True)[1])
    lang_samples[lang] = [starters[i] for i in first_seen][:n_samples]

In [None]:
save(lang_samples, 'lang_samples')

In [None]:
relation_groups = relations.groupby(relations.src_lang).apply(lambda x: x.groupby(x.to_lang).src_word.apply(list))
relation_groups.shape

In [None]:
# '<src_lang><to_lang>' -> up to n_samples source words drawn at random from that pair.
relation_samples = {}

# The loop variable is `src_words`, not `words`, so the notebook-level `words`
# set stays intact if earlier cells are re-run afterwards.
for _, (src, to, src_words) in tqdm(relation_groups.reset_index().iterrows()):
    relation = '{}{}'.format(src, to)
    # The sample size is capped at the group size, so draw without replacement;
    # the default (replace=True) can return the same list entry several times.
    relation_samples[relation] = np.random.choice(
        src_words, min(n_samples, len(src_words)), replace=False
    ).tolist()

In [None]:
save(relation_samples, 'relation_samples')