In [56]:
import pandas as pd
import itertools

vocab = pd.read_csv('../input_files_from_output/vocab_ak_1_2_ttmik_1_2_w_hanja_refined.csv')

translation = pd.read_csv('../metadata/hangul_subsequence_translation.csv')

In [57]:
def cutoff_ending(hangul_word):
    if hangul_word.endswith('하다'):
        return hangul_word[:-2]
    elif hangul_word.endswith('다'):
        return hangul_word[:-1]
    else:
        return hangul_word

In [58]:
translation

Unnamed: 0.1,Unnamed: 0,hangul_syllable,english_translation
0,0,정,Attachment
1,1,키,Height
2,2,콕,Poke
3,3,지우,Jiwoo
4,4,학교,School
...,...,...,...
1034,1034,빨래,Laundry
1035,1035,브,V
1036,1036,바,Bar
1037,1037,집안일,household chores


In [59]:
english_trans_for_syll = {
    _dict['hangul_syllable']: _dict['english_translation']
    for _dict in translation.to_dict(orient='records')
}


In [60]:
vocab

Unnamed: 0.1,Unnamed: 0,hangul_word,english_phrase,hanja_word
0,0,휴대,being portable,携帶
1,1,형,elder brother from male pov,
2,2,피곤하다,tired,
3,3,포도,grape,葡萄
4,4,팔,arm | eight,
...,...,...,...,...
554,554,가족,family,家族
555,555,가방,bag,街坊
556,556,가르치다,to teach,
557,557,가다,to go,


In [66]:
divider_to_use = ['|', '/']
def get_word_break_lists(_word_0):
    _word = cutoff_ending(_word_0)
    if len(_word) < 2:
        return [_word]
    word_break_lists = []
    num_dividers = len(_word) - 1
    for divider_set in itertools.product(*([divider_to_use] * num_dividers)):
        current_word = ''
        for _seq, _div in zip(_word[:-1], divider_set):
            current_word += ''.join([_seq, _div])
        current_word += _word[-1]
        word_to_split = current_word.replace('/', '')
        word_break_lists.append(word_to_split.split('|'))
    return word_break_lists

IndexError: string index out of range

In [67]:
vocab['hangul_syll_chains'] = vocab['hangul_word'].apply(get_word_break_lists)

In [68]:
vocab2 = vocab.explode('hangul_syll_chains').rename(columns = {
    'hangul_syll_chains': 'hangul_syllable_chain'
})

In [69]:
vocab2

Unnamed: 0.1,Unnamed: 0,hangul_word,english_phrase,hanja_word,hangul_syllable_chain
0,0,휴대,being portable,携帶,"[휴, 대]"
0,0,휴대,being portable,携帶,[휴대]
1,1,형,elder brother from male pov,,형
2,2,피곤하다,tired,,"[피, 곤]"
2,2,피곤하다,tired,,[피곤]
...,...,...,...,...,...
556,556,가르치다,to teach,,"[가르, 치]"
556,556,가르치다,to teach,,[가르치]
557,557,가다,to go,,가
558,558,가게,store / shop,,"[가, 게]"


In [70]:
def _get_chain(word_chain):
    return ' + '.join([
        f'{hangul_syll} ({english_trans_for_syll[hangul_syll]})'
        for hangul_syll in word_chain
    ])

vocab2['chain_with_trans'] = vocab2['hangul_syllable_chain'].apply(_get_chain)

In [71]:
vocab2

Unnamed: 0.1,Unnamed: 0,hangul_word,english_phrase,hanja_word,hangul_syllable_chain,chain_with_trans
0,0,휴대,being portable,携帶,"[휴, 대]",휴 (Phew) + 대 (Versus)
0,0,휴대,being portable,携帶,[휴대],휴대 (Carrying)
1,1,형,elder brother from male pov,,형,형 (Brother)
2,2,피곤하다,tired,,"[피, 곤]",피 (Blood) + 곤 (gon)
2,2,피곤하다,tired,,[피곤],피곤 (fatigue)
...,...,...,...,...,...,...
556,556,가르치다,to teach,,"[가르, 치]",가르 (Gar) + 치 (phooey)
556,556,가르치다,to teach,,[가르치],가르치 (teaching)
557,557,가다,to go,,가,가 (go)
558,558,가게,store / shop,,"[가, 게]",가 (go) + 게 (Crab)


In [72]:
vocab2.to_csv('../output/starter_vocab_with_extensive_chain.csv')