In [4]:
import pandas as pd
import languagecodes
from collections import Counter
import os

In [5]:
LANGCODE_MAPPING = {
    'sw': 'swh',
    'ar': 'arb',
    'zh': 'cmn'
}

def get_iso_639_langcode(lang_code):
    if lang_code in LANGCODE_MAPPING:
        return LANGCODE_MAPPING[lang_code]
    elif len(lang_code) == 2:
        return languagecodes.iso_639_alpha3(lang_code)
    else:
        return lang_code

In [6]:
class PanLex:
    def __init__(self, panlex_dir, allow_space=False, min_quality=5):
        self.panlex_dir = panlex_dir
        self.allow_space = allow_space
        self.min_quality = min_quality
        self.cache_word = dict()

        print('loadding langvar csv')
        self.langvar = pd.read_csv(f'{panlex_dir}/langvar.csv')

        print('loadding source csv')
        source = pd.read_csv(f'{panlex_dir}/source.csv')
        source['source'] = source.id
        source = source[['source', 'quality']]
        self.source = source

        print('loadding expr csv')
        self.expr = pd.read_csv(f'{panlex_dir}/expr.csv', usecols=['id', 'langvar', 'txt'])
        
        print('loadding meaning csv')
        self.meaning = pd.read_csv(f'{panlex_dir}/meaning.csv')
        
        print('loadding denotation csv')
        denotation = pd.read_csv(f'{panlex_dir}/denotation.csv', usecols=['meaning', 'expr'])
        denotation['id'] = denotation.expr
        del denotation['expr']
        self.denotation = denotation
        
        print('finish setting up')

    def get_langvar(self, lang_code, var_code=0):
        langvar = self.langvar[(self.langvar.lang_code == lang_code) & (self.langvar.var_code == var_code)]
        langvar = list(langvar.id)
        assert len(langvar) == 1
        return langvar[0]

    def get_word(self, lang_code, var_code=0):
        expr = self.expr
        lang_code = get_iso_639_langcode(lang_code)
        langvar = self.get_langvar(lang_code, var_code=var_code)
        if langvar in self.cache_word:
            return self.cache_word[langvar]
        
        print(f'loadding {lang_code} word')
        word = expr[expr.langvar == langvar]
        if not self.allow_space:
            word = word[word.txt.str.contains(' ') == False]
        word = pd.merge(word, self.meaning, how='inner', on='id')
        word = pd.merge(word, self.source, how='inner', on='source')
        del word['source']
        word = pd.merge(word, self.denotation, how='inner', on='id')
        
        print(f'loaded {len(word)} words')
        print('quality distribution:', Counter(word.quality))
        word = word[word.quality >= self.min_quality]
        self.cache_word[langvar] = word
        return word

    def get_dictionary(self, lang1, lang2):
        word1 = self.get_word(lang1)
        word2 = self.get_word(lang2)
        print(f'building {lang1} & {lang2} dictionary')
        # words with same meaning are translation (distance-1 translations)
        dictionary = pd.merge(word1, word2, how='inner', on='meaning').dropna()
        # slightly differ to how panlex measure quality (`tr1q`)
        # https://dev.panlex.org/translation-evaluation/
        dictionary['quality'] = dictionary.quality_x + dictionary.quality_y
        dictionary = dictionary[['txt_x', 'txt_y', 'quality']]
        dictionary = dictionary.groupby(['txt_x', 'txt_y']).agg({'quality': 'sum'}).reset_index()
        print(f'total entry {lang1} & {lang2} = {len(dictionary)}')
        return dictionary

In [7]:
def write_dictionary(dictionary, filename):
    with open(filename, 'w') as fp:
        for l in dictionary.iterrows():
            print(l[1].txt_x, l[1].txt_y, l[1].quality, sep='\t', file=fp)

In [8]:
# If False, remove entry with space
allow_space=False
# Filter source with quality less than min_quality, scale is 0-9
min_quality=5

In [11]:
panlex = PanLex('./panlex-20211201-csv/',
                allow_space=allow_space, min_quality=min_quality)
directory = './data/'
os.makedirs(directory, exist_ok=True)

loadding langvar csv
loadding source csv
loadding expr csv
loadding meaning csv
loadding denotation csv
finish setting up


In [15]:
langs = 'de,es,fr,pl,pt,da,it,en'.split(',')
for lang in langs:
    print(lang)
    _ = panlex.get_word(lang)

de
es
fr
pl
pt
loadding por word
loaded 673896 words
quality distribution: Counter({5: 508784, 2: 133769, 3: 12459, 4: 8601, 7: 4912, 6: 2219, 8: 1990, 1: 565, 9: 534, 0: 63})
da
loadding dan word
loaded 215636 words
quality distribution: Counter({5: 155812, 3: 37262, 4: 12386, 7: 5663, 6: 1500, 9: 1405, 8: 567, 2: 555, 1: 373, 0: 113})
it
loadding ita word
loaded 2056936 words
quality distribution: Counter({5: 1078883, 7: 778773, 3: 103099, 2: 45578, 4: 27638, 1: 20789, 6: 1613, 9: 290, 8: 214, 0: 59})
en
loadding eng word
loaded 10933993 words
quality distribution: Counter({5: 10236105, 3: 393475, 4: 145939, 2: 100626, 7: 44989, 8: 5015, 6: 3773, 1: 2430, 9: 1380, 0: 261})


In [19]:
trg = 'en'
for src in langs:
    print(src)
    if src == 'en':
        continue
    dictionary = panlex.get_dictionary(src, trg)
    filename = f'{directory}/{src}-{trg}.txt'
    write_dictionary(dictionary, filename)

de
building de & en dictionary
total entry de & en = 186704
es
building es & en dictionary
total entry es & en = 179417
fr
building fr & en dictionary
total entry fr & en = 335991
pl
building pl & en dictionary
total entry pl & en = 99400
pt
building pt & en dictionary
total entry pt & en = 199576
da
building da & en dictionary
total entry da & en = 58524
it
building it & en dictionary
total entry it & en = 315851
en
