In [18]:
from vocabulary_grouping_tools.ingestion.api_ingest import get_english_translation
import pandas as pd
import yaml

In [19]:
from vocabulary_grouping_tools.grouping.grouping_tools import make_all_subseqs

In [20]:
secrets = yaml.safe_load(open('../metadata/secrets.yml', 'r'))

In [21]:
vocab = pd.read_csv('../input_files_from_output/vocab_ak_1_2_ttmik_1_2_w_hanja_refined.csv')

In [22]:
vocab

Unnamed: 0.1,Unnamed: 0,hangul_word,english_phrase,hanja_word
0,0,휴대,being portable,携帶
1,1,형,elder brother from male pov,
2,2,피곤하다,tired,
3,3,포도,grape,葡萄
4,4,팔,arm | eight,
...,...,...,...,...
554,554,가족,family,家族
555,555,가방,bag,街坊
556,556,가르치다,to teach,
557,557,가다,to go,


In [23]:
hangul_words = vocab['hangul_word']

In [33]:
max([len(_word) for _word in hangul_words])

5

In [24]:
def cutoff_ending(hangul_word):
    if hangul_word.endswith('하다'):
        return hangul_word[:-2]
    elif hangul_word.endswith('다'):
        return hangul_word[:-1]
    else:
        return hangul_word

In [25]:
subseq_set = set()
for hangul_wrd in list(hangul_words):
    subseq_set.update(make_all_subseqs(cutoff_ending(hangul_wrd)))

In [26]:
subseq_set

{'정',
 '키',
 '콕',
 '지우',
 '학교',
 '던',
 '책방',
 '커피숍',
 '구',
 '의자',
 '숟가락',
 '런',
 '미있',
 '곤',
 '끝',
 '아이스크',
 '창',
 '청역',
 '걸',
 '스어',
 '런던',
 '레비전',
 '아이스크림',
 '세탁',
 '약',
 '비행기',
 '건강',
 '한국어',
 '야',
 '끄럽',
 '데',
 '람',
 '방학',
 '읽',
 '맞',
 '랑',
 '운전',
 '멋있',
 '넘어지',
 '오전',
 '혀',
 '퓨',
 '속',
 '아내',
 '불',
 '교실',
 '텔',
 '루',
 '레이시',
 '메일',
 '이메일',
 '엽',
 '라질',
 '호',
 '만색',
 '거',
 '유럽',
 '여기',
 '친구',
 '오',
 '간호사',
 '니',
 '온돌',
 '브라',
 '방콕',
 '팔',
 '자주',
 '런데',
 '악수',
 '가족',
 '생선회',
 '베트남',
 '프랑스어',
 '기숙',
 '영',
 '휴가',
 '아버지',
 '특히',
 '항',
 '방',
 '색',
 '형',
 '시월',
 '아까',
 '그런',
 '리드',
 '한번',
 '노란',
 '별',
 '투',
 '떤',
 '와주',
 '선생',
 '장미',
 '매',
 '나나',
 '요',
 '조',
 '토요',
 '시',
 '달리',
 '어머니',
 '햄버거',
 '카',
 '카메',
 '언니',
 '이렇게',
 '렌',
 '눈',
 '인천공',
 '영어',
 '뉴스',
 '넘어',
 '장고',
 '적',
 '갈아',
 '랑스어',
 '스크',
 '회',
 '바나',
 '파리',
 '미국',
 '렇지',
 '네',
 '간호',
 '요일',
 '왜',
 '식당',
 '울',
 '만나',
 '타',
 '셔',
 '트남',
 '인천',
 '드리',
 '그',
 '아프리',
 '살',
 '주소',
 '십만',
 '비',
 '기차표',
 '소포',
 '할',
 '전',
 '이징',
 '분',
 '

In [27]:
len(subseq_set)

1039

In [28]:
secrets

{'client_id': 'WyPCRsYaxeKJ748sfFdV', 'client_secret': 'EFpX_fxYpo'}

In [29]:
translation_for_subseq = {}
for subseq in subseq_set:
    print('Now getting translation for ' + subseq)
    english_trans = get_english_translation(subseq, secrets['client_id'], secrets['client_secret'])
    print(f'Translation is {english_trans}')
    print()
    translation_for_subseq[subseq] = english_trans

Now getting translation for 정
Translation is Attachment

Now getting translation for 키
Translation is Height

Now getting translation for 콕
Translation is Poke

Now getting translation for 지우
Translation is Jiwoo

Now getting translation for 학교
Translation is School

Now getting translation for 던
Translation is Dunn

Now getting translation for 책방
Translation is Bookstore

Now getting translation for 커피숍
Translation is Coffee shop

Now getting translation for 구
Translation is District

Now getting translation for 의자
Translation is chair

Now getting translation for 숟가락
Translation is Spoon

Now getting translation for 런
Translation is Run

Now getting translation for 미있
Translation is It's meaningful

Now getting translation for 곤
Translation is gon

Now getting translation for 끝
Translation is The end

Now getting translation for 아이스크
Translation is Ice cubes

Now getting translation for 창
Translation is Window

Now getting translation for 청역
Translation is Cheong Station

Now getting

In [30]:
len(translation_for_subseq)

1039

In [31]:
df_of_seq_translation = pd.DataFrame(translation_for_subseq.items()).rename(columns={
    0: 'hangul_syllable',
    1: 'english_translation'
})

In [32]:

df_of_seq_translation.to_csv('../metadata/hangul_subsequence_translation.csv')