In [1]:
from vocabulary_grouping_tools.ingestion.api_ingest import get_english_translation
import pandas as pd
import yaml

In [2]:
from vocabulary_grouping_tools.grouping.grouping_tools import make_all_subseqs

In [3]:
secrets = yaml.safe_load(open('../metadata/secrets.yml', 'r'))

In [4]:
vocab = pd.read_csv('../input_files_from_output/vocab_ak_1_2_ttmik_1_2_w_hanja_refined.csv')

In [5]:
vocab

Unnamed: 0.1,Unnamed: 0,hangul_word,english_phrase,hanja_word
0,0,휴대,being portable,携帶
1,1,형,elder brother from male pov,
2,2,피곤하다,tired,
3,3,포도,grape,葡萄
4,4,팔,arm | eight,
...,...,...,...,...
554,554,가족,family,家族
555,555,가방,bag,街坊
556,556,가르치다,to teach,
557,557,가다,to go,


In [6]:
hanja_words = [_word for _word in vocab['hanja_word'] if str(_word) != 'nan']

In [7]:
def cutoff_ending(hanja_word):
    if hanja_word.endswith('하다'):
        return hanja_word[:-2]
    elif hanja_word.endswith('다'):
        return hanja_word[:-1]
    else:
        return hanja_word

In [8]:
subseq_set = set()
for hangul_wrd in list(hanja_words):
    subseq_set.update(make_all_subseqs(cutoff_ending(hangul_wrd)))

In [9]:
subseq_set

{'一',
 '一月',
 '三',
 '三月',
 '下',
 '下鐵',
 '下鐵驛',
 '中',
 '中國',
 '中國語',
 '中學',
 '中學生',
 '主',
 '九',
 '九月',
 '乳',
 '事',
 '事務',
 '事務室',
 '二',
 '二月',
 '五',
 '五月',
 '仁',
 '仁川',
 '仁川空',
 '仁川空港',
 '今',
 '他',
 '休',
 '休暇',
 '住',
 '住所',
 '來',
 '來日',
 '便',
 '便紙',
 '健',
 '健康',
 '傘',
 '內',
 '公',
 '公園',
 '其',
 '其他',
 '典',
 '冊',
 '冊床',
 '冊房',
 '冷',
 '冷藏',
 '冷藏庫',
 '冷麵',
 '出',
 '出發',
 '分',
 '初',
 '初等',
 '初等學',
 '初等學生',
 '利',
 '利用',
 '前',
 '劇',
 '劇場',
 '動',
 '務',
 '務室',
 '包',
 '化',
 '化粧',
 '化粧室',
 '匣',
 '十',
 '十一',
 '十一月',
 '十二',
 '十二月',
 '十月',
 '十萬',
 '午',
 '午前',
 '午後',
 '口',
 '口頭',
 '只',
 '只今',
 '同',
 '同生',
 '員',
 '問',
 '四',
 '四月',
 '國',
 '國籍',
 '國語',
 '園',
 '圖',
 '圖書',
 '圖書館',
 '土',
 '土曜',
 '土曜日',
 '地',
 '地下',
 '地下鐵',
 '地下鐵驛',
 '坊',
 '堂',
 '場',
 '夕',
 '外',
 '外國',
 '大',
 '大學',
 '大學生',
 '夫',
 '女',
 '女同',
 '女同生',
 '女子',
 '姓',
 '姓銜',
 '婚',
 '子',
 '學',
 '學校',
 '學生',
 '室',
 '家',
 '家族',
 '宿',
 '宿舍',
 '宿題',
 '寄',
 '寄宿',
 '寄宿舍',
 '寢',
 '寢臺',
 '寫',
 '寫眞',
 '小',
 '小包',
 '局',
 '山',
 '島',
 '川',
 '川空',
 '川空港',
 '州',


In [10]:
len(subseq_set)

502

In [11]:
secrets

{'client_id': 'WyPCRsYaxeKJ748sfFdV', 'client_secret': 'EFpX_fxYpo'}

In [12]:
translation_for_subseq = {}
for subseq in subseq_set:
    print('Now getting translation for ' + subseq)
    english_trans = get_english_translation(subseq, secrets['client_id'],
                                            secrets['client_secret'],
                                            source_language='zh-TW')
    print(f'Translation is {english_trans}')
    print()
    translation_for_subseq[subseq] = english_trans

Now getting translation for 初
Translation is at first time

Now getting translation for 生鮮膾
Translation is raw and fresh meat and vegetable dish

Now getting translation for 冊
Translation is book

Now getting translation for 秋
Translation is autumn

Now getting translation for 獨
Translation is alone

Now getting translation for 冊床
Translation is a bookmaking

Now getting translation for 金
Translation is gold

Now getting translation for 火
Translation is fire

Now getting translation for 萄
Translation is grape

Now getting translation for 寫
Translation is write

Now getting translation for 傘
Translation is umbrella

Now getting translation for 寄宿舍
Translation is boarding house

Now getting translation for 曜
Translation is burn

Now getting translation for 主
Translation is master

Now getting translation for 一月
Translation is January

Now getting translation for 中國
Translation is China

Now getting translation for 看
Translation is Look.

Now getting translation for 火曜日
Translation is obs

In [13]:
len(translation_for_subseq)

502

In [14]:
df_of_seq_translation = pd.DataFrame(translation_for_subseq.items()).rename(columns={
    0: 'hanja_subsequence',
    1: 'english_translation'
})

In [15]:

df_of_seq_translation.to_csv('../metadata/hanja_subsequence_translation.csv')