# Augment Entity DataFrame with Translation Columns

## If using the paid translation service from Google Cloud Translate
Put the path to the Google application credentials file here.

In [17]:
# %env GOOGLE_APPLICATION_CREDENTIALS=../GOOGLE_APPLICATION_CREDENTIALS/AIDA-0cf5a4cb6e90.json

In [4]:
import csv
import pandas as pd
import html
import sys
from ast import literal_eval as make_tuple
from langdetect import detect

## Original Entity Dataframe

### Params

In [5]:
# Run parameters: the repository name and the version tag used in file names.
repo = 'jchen-test-ta1'
version = '001'
# Input and output files for this repo are read from / written to this directory.
store_data_dir = '/'.join(['store_data', repo])

In [6]:
# Load the entity table for the configured repo and version.
df_entity = pd.read_hdf('{}/entity_all_{}.h5'.format(store_data_dir, version))
# Swap missing cells for None so the `if row.<col>:` checks further down
# treat them as absent.
df_entity = df_entity.where(df_entity.notnull(), None)
# The 'originLabel' column is only present in some tables; later cells branch on this flag.
has_origin = 'originLabel' in df_entity.columns
df_entity.head()

Unnamed: 0,e,type,name,source,targets,target_scores,fbid,fbid_score_avg,fbid_score_max,wikidata,wiki_label_en,wiki_label_ru,wiki_label_uk,wiki_alias_en,wiki_alias_ru,wiki_alias_uk,lang,label
0,http://www.isi.edu/gaia/entities/2d26abd5-94c7...,ldcOnt:TTL,"(contractor, contractor)",HC00017P3,,,,,,,,,,,,,en,
1,http://www.isi.edu/gaia/entities/ee5a9d74-0942...,ldcOnt:PER.Politician.HeadOfGovernment,"(Jayashree Lakhan,)",HC00017P3,,,,,,,,,,,,,en,
2,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,"(India, India, India, India, India's, India's,...",HC00017P3,"(LDC2019E43:4699848, LDC2019E43:1545739, LDC20...","(0.001, 1.0, 1.0, 0.0438)","(m.03rk0, m.03rz4)","(0.469724013, 0.6634847224)","(0.9556255937, 0.6634847224)","(http://www.wikidata.org/entity/Q668, http://w...","(India, Indian Ocean)","(Индия, Индийский океан)","(Індія, Індійський океан)","((IND, in, Bharat, Bharatvarsh, Hindustan, IN,...","((Республика Индия,), ())","((Бгарат, Республіка Індія), ())",en,
2,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,"(India, India, India, India, India's, India's,...",HC00017P3,"(LDC2019E43:4699848, LDC2019E43:1545739, LDC20...","(0.001, 1.0, 1.0, 0.0438)","(m.03rk0,)","(0.6578779092,)","(1.0,)","(http://www.wikidata.org/entity/Q668,)","(India,)","(Индия,)","(Індія,)","((IND, in, Bharat, Bharatvarsh, Hindustan, IN,...","((Республика Индия,),)","((Бгарат, Республіка Індія),)",en,
10,http://www.isi.edu/gaia/entities/6329ec5d-f1a0...,ldcOnt:PER.MilitaryPersonnel,"(director, director)",HC00017P3,,,,,,,,,,,,,en,


## Get the dictionaries
Load previously generated Russian and Ukrainian dictionaries

In [7]:
def get_dictionary(file_path):
    """Load a translation dictionary from a CSV file.

    The CSV must have a header row containing the columns 'RU/UK' (source
    word) and 'EN' (English translation).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        dict mapping the lower-cased 'RU/UK' value of every row to its 'EN'
        value. If a key occurs more than once, the last row wins.
    """
    # The file holds Cyrillic text, so the encoding is pinned instead of
    # relying on the platform default; newline='' is what the csv module
    # expects so that quoted fields containing line breaks are parsed correctly.
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        return {row['RU/UK'].lower(): row['EN'] for row in reader}

In [8]:
# Locations of the persisted RU->EN and UK->EN dictionaries; the same paths
# are reused when the dictionaries are written back with save_dictionary().
dict_ru_path = 'dictionaries/dict_ru_en.csv'
dict_uk_path = 'dictionaries/dict_uk_en.csv'
# In-memory dictionaries (lower-cased source word -> English). They are
# module-level state that translate_words() reads and extends.
dict_ru = get_dictionary(dict_ru_path)
dict_uk = get_dictionary(dict_uk_path)

## Translation Function
Translate a list of words. The dictionaries are consulted before calling the Google Translate API (because it's fee-based), and new translations are saved in the dictionary.

In [13]:
def is_en(text):
    """Return True unless `text` is detected as Russian or Ukrainian.

    Despite the name this is not an English check: any text whose detected
    language is neither 'ru' nor 'uk' counts as "no translation needed".

    Args:
        text: The string to classify.

    Returns:
        False if the detected language is 'ru' or 'uk'; True otherwise,
        including when language detection raises.
    """
    try:
        # Detect once. A second call doubles the cost and, because langdetect
        # is not deterministic unless seeded, may disagree with the first.
        return detect(text) not in ('ru', 'uk')
    except Exception:
        # Best effort: text that cannot be classified is left untranslated.
        # `Exception` (not a bare except) so Ctrl-C still interrupts a long run.
        return True

def translate_words(word_list, source_lang):
    """Translate each word of `word_list` into English.

    For every word, in order:
      1. if is_en(word) is true, the word is kept unchanged;
      2. else if its lower-cased form is in the dictionary, the cached
         translation is used;
      3. otherwise free_google_translate() is called and a successful result
         is cached in the dictionary under the lower-cased word.

    Args:
        word_list: Iterable of words to translate.
        source_lang: Language code; 'ru' (any case) selects dict_ru, every
            other value selects dict_uk. It is also passed on unchanged to
            free_google_translate().

    Returns:
        list with one entry per input word, in the same order. An entry is
        None when the online translation failed.

    Side effects:
        Adds new translations to dict_ru / dict_uk and prints each word that
        is sent to the online translator.
    """
    dict_a = dict_ru if source_lang.lower() == 'ru' else dict_uk

    translations = []
    for word in word_list:
        if is_en(word):
            translations.append(word)
            continue
        key = word.lower()
        if key in dict_a:
            translations.append(dict_a[key])
        else:  # fall back to the online translator and remember the answer
            print('\r', word, end='')
            translated = free_google_translate(word, source_lang)
            translations.append(translated)
            # Do not cache failures: a None entry would be returned as the
            # "translation" on every later lookup and be written to the CSV.
            if translated is not None:
                dict_a[key] = translated
    return translations

# free google translator
from googletrans import Translator
free_translator = Translator()
def free_google_translate(text, source_lang):
    """Translate one string to English with the module-level `free_translator`.

    Args:
        text: The string to translate.
        source_lang: Source language code, passed as `src` to the translator.

    Returns:
        The translated text, or None if the translator raised an exception.
    """
    try:
        translation = free_translator.translate(text, src=source_lang, dest='en')
    except Exception:
        # Best effort: a failed request yields None. Catching `Exception`
        # rather than everything keeps KeyboardInterrupt working, so a long
        # per-word translation loop can still be stopped from the notebook.
        return None
    return translation.text
    
def free_google_translate_bulk(words, source_lang):
    """Translate many strings to English in batches of 100.

    Args:
        words: List of strings to translate.
        source_lang: Source language code, passed as `src` to the translator.

    Returns:
        dict mapping each result's `origin` to its translated `text`.
        Words from a batch that failed are missing from the result.
    """
    res = {}
    for request in partition_list(words, 100):
        try:
            translations = free_translator.translate(request, src=source_lang, dest='en')
            for translation in translations:
                res[translation.origin] = translation.text
        except Exception as err:
            # Still best effort (the remaining batches are attempted), but
            # report the loss instead of dropping up to 100 words silently.
            print('translation batch of {} word(s) failed: {!r}'.format(len(request), err),
                  file=sys.stderr)
    return res

# # paid google cloud translator
# from google.cloud import translate
# google_translate_client = translate.Client()

# def google_translate(text, source_lang):
#     translation = google_translate_client.translate(text, source_language=source_lang, target_language='EN')
#     return html.unescape(translation['translatedText'])
# The paid client above is disabled; `google_translate` is an alias of the
# free single-string translator.
google_translate = free_google_translate

# def google_translate_bulk(words, source_lang):
#     requests = list(partition_list(words, 100))
#     dict_a = {}
#     for request in requests:
#         translations = google_translate_client.translate(request, source_language=source_lang, target_language='EN')
#         for translation in translations:
#             dict_a[translation['input'].lower()] = html.unescape(translation['translatedText'])
#     return dict_a
# Same for the bulk variant. NOTE(review): the disabled paid version lower-cased
# its keys, while free_google_translate_bulk keys by the original-case word.
google_translate_bulk = free_google_translate_bulk

def partition_list(lines: list, size: int):
    """Lazily yield consecutive slices of `lines`, each at most `size` long.

    The last slice holds whatever remains and may be shorter than `size`.
    """
    yield from (lines[start:start + size] for start in range(0, len(lines), size))

## Get a list of strings from dataframe not in the dictionary
Avoid translating word by word while iterating over the dataframe; it is very slow.

In [14]:
def get_all_ru_uk(df):
    """Collect the Russian / Ukrainian strings of `df` that still need translating.

    Strings are gathered from these columns of every row:
      * 'name' and (when the global `has_origin` is true) 'originLabel',
        assigned to RU or UK according to the first two letters of 'lang';
      * 'wiki_label_ru' / 'wiki_label_uk';
      * 'wiki_alias_ru' / 'wiki_alias_uk', which hold groups of aliases;
        empty groups are skipped.
    A string is kept only if is_en() is false for it and its lower-cased form
    is not already a (lower-cased) key of dict_ru / dict_uk.

    Args:
        df: Entity DataFrame with the columns listed above.

    Returns:
        dict with keys 'ru' and 'uk', each a list of unique strings in
        arbitrary order.
    """
    ru = []
    uk = []

    def bucket_for(lang):
        """Return the list matching the row language, or None for other languages."""
        if not lang:
            return None
        prefix = lang.lower()[:2]
        if prefix == 'ru':
            return ru
        if prefix == 'uk':
            return uk
        return None

    for _, row in df.iterrows():
        own_lang = bucket_for(row.lang)
        # extend() appends in place; `ru = ru + [...]` would copy the whole
        # accumulated list on every row.
        if row['name'] and own_lang is not None:
            own_lang.extend(row['name'])
        if row.wiki_label_ru:
            ru.extend(row.wiki_label_ru)
        if row.wiki_label_uk:
            uk.extend(row.wiki_label_uk)
        if row.wiki_alias_ru:
            for group in row.wiki_alias_ru:
                if group:
                    ru.extend(group)
        if row.wiki_alias_uk:
            for group in row.wiki_alias_uk:
                if group:
                    uk.extend(group)
        if has_origin and row.originLabel and own_lang is not None:
            own_lang.extend(row.originLabel)

    # Lower-case the dictionary keys once instead of once per candidate word.
    known_ru = {key.lower() for key in dict_ru}
    known_uk = {key.lower() for key in dict_uk}

    # set() removes duplicates before the (slow) language detection runs.
    return {
        'ru': [w for w in set(ru) if not is_en(w) and w.lower() not in known_ru],
        'uk': [w for w in set(uk) if not is_en(w) and w.lower() not in known_uk],
    }
        
# Strings from the entity table that are not yet in the dictionaries,
# grouped under the keys 'ru' and 'uk'.
need_transl = get_all_ru_uk(df_entity)

### Translate those not in the dictionary in bulk

In [15]:
# Translate the missing strings in bulk and merge them into the dictionaries.
# The keys are lower-cased on merge because translate_words() looks words up
# via word.lower(); bulk results are keyed by the original-case string, so
# without this every capitalised word would miss the cache and be translated
# again one request at a time.
dict_ru_ext = free_google_translate_bulk(need_transl['ru'], 'RU')
dict_ru.update({word.lower(): text for word, text in dict_ru_ext.items()})
dict_uk_ext = free_google_translate_bulk(need_transl['uk'], 'UK')
dict_uk.update({word.lower(): text for word, text in dict_uk_ext.items()})

## Save dictionaries

In [16]:
def save_dictionary(dict_a, file):
    """Write a translation dictionary to a CSV file, replacing its contents.

    The file gets a header row 'RU/UK,EN' followed by one row per entry.

    Args:
        dict_a: dict mapping source word -> English translation.
        file: Path of the CSV file to (over)write.
    """
    # utf-8 is pinned because the keys are Cyrillic and the platform default
    # encoding may not be able to represent them; newline='' is what the csv
    # module expects so it controls line endings itself.
    with open(file, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['RU/UK', 'EN'])
        writer.writeheader()
        # Entries whose translation is None (a failed lookup) are skipped:
        # they would be written as an empty string and come back from
        # get_dictionary() as a valid-looking empty translation.
        writer.writerows(
            {'RU/UK': source, 'EN': english}
            for source, english in dict_a.items()
            if english is not None
        )

# Persist the dictionaries right after the bulk translation so the new
# entries are on disk before the per-row translation pass starts.
save_dictionary(dict_ru, dict_ru_path)
save_dictionary(dict_uk, dict_uk_path)

## The core: Adding translation columns to entity table

In [18]:
def add_translation_cols(table):
    """Add the translation columns to `table` in place and return it.

    get_translation_cols() is applied to every row (restricted to the source
    columns) and the resulting values are assigned to the new 'transl_*'
    columns. The origin-label pair is included only when the global
    `has_origin` is true.
    """
    source_cols = ['name', 'lang', 'wiki_label_ru', 'wiki_label_uk', 'wiki_alias_ru', 'wiki_alias_uk']
    target_cols = ['transl_name', 'transl_label_ru', 'transl_label_uk', 'transl_alias_ru', 'transl_alias_uk']
    if has_origin:
        source_cols.append('originLabel')
        target_cols.append('transl_origin_label')

    table[target_cols] = table[source_cols].apply(get_translation_cols, axis='columns')
    return table
   
def get_translation_cols(row):
    """Build the translation values for one entity row.

    Returns a pd.Series with the entries 'transl_name', 'transl_label_ru',
    'transl_label_uk', 'transl_alias_ru', 'transl_alias_uk' and, when the
    global `has_origin` is true, 'transl_origin_label'.

    * name / origin label: translated with the first two letters of
      row.lang; None when the value or the language is missing or the
      language starts with 'en'.
    * wiki labels: translated as 'RU' / 'UK'; None when missing.
    * wiki aliases: a tuple with one entry per alias group - the translated
      list, or None for an empty group; None when the column is missing.
    """
    def in_row_language(words):
        # Values written in the document's own language.
        if not words or not row.lang or row.lang.lower().startswith('en'):
            return None
        return translate_words(list(words), row.lang[:2])

    def labels(words, lang_code):
        if not words:
            return None
        return translate_words(list(words), lang_code)

    def alias_groups(groups, lang_code):
        if not groups:
            return None
        return tuple(
            translate_words(list(group), lang_code) if group else None
            for group in groups
        )

    # Evaluated in the same order as the columns are listed, because
    # translate_words() updates the shared dictionaries as it goes.
    result = {}
    result['transl_name'] = in_row_language(row['name'])
    result['transl_label_ru'] = labels(row.wiki_label_ru, 'RU')
    result['transl_label_uk'] = labels(row.wiki_label_uk, 'UK')
    result['transl_alias_ru'] = alias_groups(row.wiki_alias_ru, 'RU')
    result['transl_alias_uk'] = alias_groups(row.wiki_alias_uk, 'UK')
    if has_origin:
        result['transl_origin_label'] = in_row_language(row.originLabel)
    return pd.Series(result)

# add_translation_cols() assigns the new columns on the frame it is given and
# returns that same frame, so df_trans and df_entity are the same object here.
df_trans = add_translation_cols(df_entity)


 Провінція Нахон-Ратчасіма

## Write out new entity dataframe

In [19]:
# write out dataframe
# `key` is passed by keyword: newer pandas deprecates positional arguments to
# to_hdf() other than the path.
df_trans.to_hdf(store_data_dir + '/entity_trans_all_' + version + '.h5', key='entity', mode='w', format='fixed')
# Read the file back and discard the result (presumably a check that it loads).
_ = pd.read_hdf(store_data_dir + '/entity_trans_all_' + version + '.h5')

# write out dataframe filtered
# df_trans_filtered = df_trans[(~df_trans['debug'])]
# The 'debug' filter above is disabled, so the "filtered" outputs currently
# contain the same rows as the unfiltered file.
df_trans_filtered = df_trans
df_trans_filtered.to_hdf(store_data_dir + '/entity_trans_all_filtered_' + version + '.h5', key='entity', mode='w', format='fixed')
df_trans_filtered.to_csv(store_data_dir + '/entity_trans_all_filtered_' + version + '.csv')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['e', 'type', 'name', 'source', 'targets', 'target_scores', 'fbid', 'fbid_score_avg', 'fbid_score_max', 'wikidata', 'wiki_label_en', 'wiki_label_ru', 'wiki_label_uk', 'wiki_alias_en', 'wiki_alias_ru', 'wiki_alias_uk', 'lang', 'label', 'transl_name', 'transl_label_ru', 'transl_label_uk', 'transl_alias_ru', 'transl_alias_uk']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [20]:
# Reload the filtered output from disk and preview it; note that this rebinds
# df_trans to the freshly loaded frame.
df_trans = pd.read_hdf('{}/entity_trans_all_filtered_{}.h5'.format(store_data_dir, version))
df_trans.head()

Unnamed: 0,e,type,name,source,targets,target_scores,fbid,fbid_score_avg,fbid_score_max,wikidata,...,wiki_alias_en,wiki_alias_ru,wiki_alias_uk,lang,label,transl_name,transl_label_ru,transl_label_uk,transl_alias_ru,transl_alias_uk
0,http://www.isi.edu/gaia/entities/2d26abd5-94c7...,ldcOnt:TTL,"(contractor, contractor)",HC00017P3,,,,,,,...,,,,en,,,,,,
1,http://www.isi.edu/gaia/entities/ee5a9d74-0942...,ldcOnt:PER.Politician.HeadOfGovernment,"(Jayashree Lakhan,)",HC00017P3,,,,,,,...,,,,en,,,,,,
2,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,"(India, India, India, India, India's, India's,...",HC00017P3,"(LDC2019E43:4699848, LDC2019E43:1545739, LDC20...","(0.001, 1.0, 1.0, 0.0438)","(m.03rk0, m.03rz4)","(0.469724013, 0.6634847224)","(0.9556255937, 0.6634847224)","(http://www.wikidata.org/entity/Q668, http://w...",...,"((IND, in, Bharat, Bharatvarsh, Hindustan, IN,...","((Республика Индия,), ())","((Бгарат, Республіка Індія), ())",en,,,"[Индия, indian ocean]","[india, indian ocean]","([Республика Индия], None)","([Бгарат, republic of india], None)"
2,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,"(India, India, India, India, India's, India's,...",HC00017P3,"(LDC2019E43:4699848, LDC2019E43:1545739, LDC20...","(0.001, 1.0, 1.0, 0.0438)","(m.03rk0,)","(0.6578779092,)","(1.0,)","(http://www.wikidata.org/entity/Q668,)",...,"((IND, in, Bharat, Bharatvarsh, Hindustan, IN,...","((Республика Индия,),)","((Бгарат, Республіка Індія),)",en,,,[Индия],[india],"([Республика Индия],)","([Бгарат, republic of india],)"
10,http://www.isi.edu/gaia/entities/6329ec5d-f1a0...,ldcOnt:PER.MilitaryPersonnel,"(director, director)",HC00017P3,,,,,,,...,,,,en,,,,,,


## Save dictionary

In [21]:
# Save again: translate_words() adds entries to the dictionaries while the
# translation columns are being built, after the earlier save.
save_dictionary(dict_ru, dict_ru_path)
save_dictionary(dict_uk, dict_uk_path)