In [1]:
import pandas as pd
import itertools

vocab = pd.read_csv('../input_files_from_output/vocab_ak_1_2_ttmik_1_2_w_hanja_refined.csv')

hangeul_translation = pd.read_csv('../metadata/hangul_subsequence_translation.csv')

english_trans_for_hangeul = {
    _dict['hangeul_subseq']: _dict['english_translation']
    for _dict in hangeul_translation.to_dict(orient='records')
}


hanja_translation = pd.read_csv('../metadata/hanja_subsequence_translation.csv')

english_trans_for_hanja = {
    _dict['hanja_subsequence']: _dict['english_translation']
    for _dict in hanja_translation.to_dict(orient='records')
}

In [2]:
def cutoff_ending(_word):
    if _word.endswith('하다'):
        return _word[:-2]
    elif _word.endswith('다'):
        return _word[:-1]
    else:
        return _word





In [3]:
vocab

Unnamed: 0.1,Unnamed: 0,hangul_word,english_phrase,hanja_word
0,0,휴대,being portable,携帶
1,1,형,elder brother from male pov,
2,2,피곤하다,tired,
3,3,포도,grape,葡萄
4,4,팔,arm | eight,
...,...,...,...,...
554,554,가족,family,家族
555,555,가방,bag,街坊
556,556,가르치다,to teach,
557,557,가다,to go,


In [4]:
divider_to_use = ['|', '/']
def get_word_break_lists(_word_0):
    if str(_word_0).lower() == 'nan':
        return []
    _word = cutoff_ending(_word_0)
    if len(_word) < 2:
        return [_word]
    word_break_lists = []
    num_dividers = len(_word) - 1
    for divider_set in itertools.product(*([divider_to_use] * num_dividers)):
        current_word = ''
        for _seq, _div in zip(_word[:-1], divider_set):
            current_word += ''.join([_seq, _div])
        current_word += _word[-1]
        word_to_split = current_word.replace('/', '')
        word_break_lists.append(word_to_split.split('|'))
    return word_break_lists

In [5]:
vocab['hangul_syll_chains'] = vocab['hangul_word'].apply(get_word_break_lists)
vocab['hanja_syll_chains'] = vocab['hanja_word'].apply(get_word_break_lists)

def _get_hangul_hanja_chains(hangul_chains, hanja_chains):
    if len(hanja_chains) > 0:
        return list(zip(hangul_chains, hanja_chains))
    else:
        return [(_chain, []) for _chain in hangul_chains]

vocab['hangul_hanja_chains'] = vocab.apply(lambda row: _get_hangul_hanja_chains(
    row['hangul_syll_chains'], row['hanja_syll_chains']), axis=1)

In [6]:
vocab.head(50)

Unnamed: 0.1,Unnamed: 0,hangul_word,english_phrase,hanja_word,hangul_syll_chains,hanja_syll_chains,hangul_hanja_chains
0,0,휴대,being portable,携帶,"[[휴, 대], [휴대]]","[[携, 帶], [携帶]]","[([휴, 대], [携, 帶]), ([휴대], [携帶])]"
1,1,형,elder brother from male pov,,[형],[],"[(형, [])]"
2,2,피곤하다,tired,,"[[피, 곤], [피곤]]",[],"[([피, 곤], []), ([피곤], [])]"
3,3,포도,grape,葡萄,"[[포, 도], [포도]]","[[葡, 萄], [葡萄]]","[([포, 도], [葡, 萄]), ([포도], [葡萄])]"
4,4,팔,arm | eight,,[팔],[],"[(팔, [])]"
5,5,크다,big,,[크],[],"[(크, [])]"
6,6,춥다,cold,,[춥],[],"[(춥, [])]"
7,7,착하다,to be kind / good-natured,,[착],[],"[(착, [])]"
8,8,차,car | tea,,[차],[],"[(차, [])]"
9,9,집,house,,[집],[],"[(집, [])]"


In [7]:
vocab2 = vocab.explode('hangul_hanja_chains').rename(columns = {
    'hangul_hanja_chains': 'hangul_hanja_subseq_chain'
})

In [8]:
vocab2

Unnamed: 0.1,Unnamed: 0,hangul_word,english_phrase,hanja_word,hangul_syll_chains,hanja_syll_chains,hangul_hanja_subseq_chain
0,0,휴대,being portable,携帶,"[[휴, 대], [휴대]]","[[携, 帶], [携帶]]","([휴, 대], [携, 帶])"
0,0,휴대,being portable,携帶,"[[휴, 대], [휴대]]","[[携, 帶], [携帶]]","([휴대], [携帶])"
1,1,형,elder brother from male pov,,[형],[],"(형, [])"
2,2,피곤하다,tired,,"[[피, 곤], [피곤]]",[],"([피, 곤], [])"
2,2,피곤하다,tired,,"[[피, 곤], [피곤]]",[],"([피곤], [])"
...,...,...,...,...,...,...,...
556,556,가르치다,to teach,,"[[가, 르, 치], [가, 르치], [가르, 치], [가르치]]",[],"([가르, 치], [])"
556,556,가르치다,to teach,,"[[가, 르, 치], [가, 르치], [가르, 치], [가르치]]",[],"([가르치], [])"
557,557,가다,to go,,[가],[],"(가, [])"
558,558,가게,store / shop,,"[[가, 게], [가게]]",[],"([가, 게], [])"


In [9]:
def _get_chain(word_chain, engish_trans):
    return ' + '.join([
        f'{hangul_syll} ({engish_trans[hangul_syll]})'
        for hangul_syll in word_chain
    ])

vocab2['hangul_chain_with_trans'] = vocab2['hangul_hanja_subseq_chain'].apply(
    lambda x: _get_chain(x[0], english_trans_for_hangeul))

In [11]:
vocab2['hanja_chain_with_trans'] = vocab2['hangul_hanja_subseq_chain'].apply(
    lambda x: _get_chain(x[1], english_trans_for_hanja))

In [12]:
vocab2.to_csv('../output/starter_vocab_with_extensive_chain_2.csv')