In [65]:
import pandas as pd
import os

from vocabulary_grouping_tools.util.constants import HANGUL_COL_NM, HANJA_COL_NM, ENGLISH_COL_NM

# Common English function words that are discarded before two definitions are
# compared word-by-word; sharing one of these says nothing about shared meaning.
STOP_WORDS = {
    'a', 'and', 'at', 'be', 'do',
    'for', 'from', 'in', 'of', 'or',
    'that', 'the', 'this', 'to', 'when',
}


# Input/output locations. All paths below are relative, so they are resolved
# against the kernel's current working directory.
FILE_NAME = 'vocab_active_korean_1_2_ttmik_1_2_refined'
EXTENSION = 'csv'
FILE_PATH = 'input_files_from_output'
OUTPUT_PATH = f'output/{FILE_NAME}'

refined_file_nm = f'../{FILE_PATH}/{FILE_NAME}.{EXTENSION}'
print(refined_file_nm)
# Vocabulary list. 'english_phrase' is renamed so that it stays distinguishable
# from the 'hanja_english_translation' column loaded further down.
eng_korean_df = pd.read_csv(refined_file_nm).rename(columns={
    'english_phrase': 'hangeul_english_translation'
})

# Lookup table that is later indexed by HANGUL_COL_NM to attach hanja spellings.
hangul_hanja_df = pd.read_csv('../metadata/hanja_hangul_pairs.csv')

# Flag words ending in '하다'.
# NOTE(review): x.endswith raises AttributeError on a missing (NaN) word —
# assumes the hangul column has no missing values; confirm.
eng_korean_df['does_end_in_hada'] = eng_korean_df[HANGUL_COL_NM].apply(lambda x: x.endswith('하다'))

# English translations keyed by 'hanja_subsequence' (used as the join index in
# later cells); the translation column is renamed to mark its hanja origin.
hanja_translation = pd.read_csv('../metadata/hanja_subsequence_translation.csv').rename(
    columns={
        'english_translation': 'hanja_english_translation'
    }
)

# Keep only translations containing more than one alphanumeric character.
# NOTE(review): iterating x raises TypeError on a missing (NaN) translation —
# assumes the column is fully populated; confirm.
hanja_translation = hanja_translation[hanja_translation['hanja_english_translation'].apply(
    lambda x: sum([_char.isalnum() for _char in x]) > 1)]

def remove_hada_if_there(row):
    """Return the row's hangul word with a trailing '하다' removed.

    Args:
        row: Mapping-like row exposing the boolean flag 'does_end_in_hada'
            and the hangul word under HANGUL_COL_NM.

    Returns:
        The word without its last two characters when the flag is set,
        otherwise the word unchanged.
    """
    if not row['does_end_in_hada']:
        return row[HANGUL_COL_NM]
    # '하다' is two characters long, so this slice is equivalent to [:-2].
    return row[HANGUL_COL_NM][:-len('하다')]


# Row-wise: store the hangul word with any trailing '하다' stripped. This column
# is the key used for the hanja lookup join in a later cell.
eng_korean_df['hangul_word_wo_hada'] = eng_korean_df.apply(remove_hada_if_there, axis=1)

../input_files_from_output/vocab_active_korean_1_2_ttmik_1_2_refined.csv


In [66]:
# Preview the first rows to check the renamed and derived columns.
eng_korean_df.head()

Unnamed: 0,hangul_word,hangeul_english_translation,does_end_in_hada,hangul_word_wo_hada
0,가게,store / shop,False,가게
1,가다,to go,False,가다
2,가르치다,to teach,False,가르치다
3,가방,bag,False,가방
4,가족,family,False,가족


In [67]:
# Left join on the '하다'-stripped word: every vocabulary row is kept, and words
# without an entry in the lookup get missing values in the hanja column(s).
# If the lookup holds several entries for one hangul word, the join yields one
# row per entry.
eng_korean_df_with_hanja = eng_korean_df.join(
    hangul_hanja_df.set_index(HANGUL_COL_NM), on='hangul_word_wo_hada', how='left')

# True where the join found a hanja spelling for the word.
eng_korean_df_with_hanja['hanja_exists'] = eng_korean_df_with_hanja[HANJA_COL_NM].notna()

In [68]:
# Attach the English translation of each hanja word (left join, so rows whose
# hanja has no translation — or that have no hanja at all — are kept with
# missing values).
eng_korean_df_with_hanja_trans = eng_korean_df_with_hanja.join(
    hanja_translation.set_index('hanja_subsequence'),
    on='hanja_word', how='left')

In [69]:
def _significant_words(definition, stop_words):
    """Return the set of lower-cased, non-stop-word tokens in ``definition``."""
    # Missing values reach us as None or float NaN. str() would turn them into
    # the literal word 'None'/'nan', so treat them as containing no words.
    # (NaN is the only float that is not equal to itself.)
    if definition is None or (isinstance(definition, float) and definition != definition):
        return set()
    # Replace every non-alphanumeric character with a space so separators such
    # as '/' or ',' neither become tokens themselves nor stay glued to a word.
    normalized = ''.join(
        char if char.isalnum() else ' ' for char in str(definition).lower())
    return set(normalized.split()).difference(stop_words)


def do_english_defs_possibly_match(eng_def_1, eng_def_2, stop_words=None):
    """Tell whether two English definitions share at least one meaningful word.

    Both definitions are lower-cased and split into alphanumeric tokens; tokens
    listed in ``stop_words`` are ignored. Punctuation is never treated as a
    word, and a missing definition (None or NaN) matches nothing.

    Args:
        eng_def_1: First definition (any value; converted with ``str``).
        eng_def_2: Second definition (any value; converted with ``str``).
        stop_words: Optional collection of lower-case words to ignore.
            Defaults to the module-level ``STOP_WORDS``.

    Returns:
        True if the two definitions have a non-stop-word token in common,
        otherwise False.
    """
    if stop_words is None:
        stop_words = STOP_WORDS
    words_1 = _significant_words(eng_def_1, stop_words)
    words_2 = _significant_words(eng_def_2, stop_words)
    return not words_1.isdisjoint(words_2)

# Row-wise: does the hanja's English translation share a meaningful word with
# the vocabulary entry's own English translation?
eng_korean_df_with_hanja_trans['eng_defs_possible_match'] = eng_korean_df_with_hanja_trans.apply(
    lambda row: do_english_defs_possibly_match(
        row['hanja_english_translation'],
        row['hangeul_english_translation']),
    axis=1)

In [70]:
# Inspect which columns the two joins produced.
eng_korean_df_with_hanja_trans.columns

Index(['hangul_word', 'hangeul_english_translation', 'does_end_in_hada',
       'hangul_word_wo_hada', 'hanja_word', 'hanja_exists', 'Unnamed: 0',
       'hanja_english_translation', 'eng_defs_possible_match'],
      dtype='object')

In [71]:
eng_korean_df_with_trans_info = eng_korean_df_with_hanja_trans.groupby([
    'hangul_word',
    'hangeul_english_translation',
    'does_end_in_hada',
    'hangul_word_wo_hada'
]).agg({'hanja_exists': list,
        'hanja_english_translation': list,
        'hanja_word': list,
        'eng_defs_possible_match': list})

In [72]:
def _get_matching_indices_is_good(row):
    if not row['hanja_exists'][0]:
        return [-1], False
    elif sum(row['eng_defs_possible_match']) < 1:
        if len(row['eng_defs_possible_match']) > 2:
            return [-1], False
        else:
            return list(range(len(row['eng_defs_possible_match']))), False
    out_indices = [index for index, _val in enumerate(row['eng_defs_possible_match']) if _val]
    return out_indices, True

# Each cell holds an (indices, is_good) tuple; it is unpacked in the cells below.
eng_korean_df_with_trans_info['out_indices_is_good'] = eng_korean_df_with_trans_info.apply(
    _get_matching_indices_is_good, axis=1)

In [73]:
# Second element of the (indices, is_good) tuple: whether a matching hanja
# translation was found.
eng_korean_df_with_trans_info['is_good_translation'] = \
    eng_korean_df_with_trans_info['out_indices_is_good'].apply(lambda x: x[1])

In [74]:
def _get_pairs(row):
    out_inds = row['out_indices_is_good'][0]
    if out_inds[0] == -1:
        return []
    out_pairs = []
    for _ind in out_inds:
        out_pairs.append((row['hanja_word'][_ind], row['hanja_english_translation'][_ind]))
    return out_pairs

# Row-wise: list of (hanja word, English translation) tuples for the selected
# candidates; empty when none were selected.
eng_korean_df_with_trans_info['hanja_english_pairs'] = eng_korean_df_with_trans_info.apply(_get_pairs, axis=1)

In [77]:

# Turn the group keys back into ordinary columns and keep only the columns
# that are exported; the intermediate list columns are left out.
eng_korean_df_with_hanja_info = eng_korean_df_with_trans_info.reset_index()[[
    'hangul_word',
    'hangeul_english_translation',
    'does_end_in_hada',
    'hangul_word_wo_hada',
    'is_good_translation',
    'hanja_english_pairs'
]]

In [78]:
# Write the result to disk.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column, which reads back as 'Unnamed: 0' — confirm consumers
# expect that.
eng_korean_df_with_hanja_info.to_csv('../output/temp_info2.csv')

In [None]:
'''
TODO: UPDATE THE BOTTOM ONES HERE
'''
def add_hada_to_hanja_words(row):
    """Return the row's hanja word, with '하다' appended where applicable.

    Args:
        row: Mapping-like row exposing 'does_end_in_hada', 'hanja_exists'
            and 'hanja_word'.

    Returns:
        ``hanja_word + '하다'`` when both flags are truthy; otherwise the
        'hanja_word' value unchanged.
    """
    if not (row['does_end_in_hada'] and row['hanja_exists']):
        return row['hanja_word']
    return row['hanja_word'] + '하다'


# Re-attach '하다' to the hanja spelling of words that originally ended in it.
eng_korean_df_with_hanja['hanja_word_with_hada'] = eng_korean_df_with_hanja.apply(add_hada_to_hanja_words, axis=1)

# Per hangul word, count the rows with a non-missing HANJA_COL_NM value
# ('count' skips missing values), stored under the name 'num_hangul_words'.
num_hangul_words = eng_korean_df_with_hanja.groupby(HANGUL_COL_NM).agg('count')[HANJA_COL_NM].rename('num_hangul_words')
joined_df_2 = eng_korean_df_with_hanja.join(num_hangul_words, on=HANGUL_COL_NM)

# Words with the most hanja candidates first; ties in reverse hangul order
# (ascending=False applies to both sort keys).
joined_df_2 = joined_df_2.reset_index().sort_values(['num_hangul_words', HANGUL_COL_NM], ascending=False)

# Attach the English translation of each hanja word (left join keeps all rows).
joined_df_3 = joined_df_2.join(hanja_translation.set_index('hanja_subsequence'), on='hanja_word', how='left')




#
# joined_df_3 = joined_df_3[
#
# os.makedirs(OUTPUT_PATH, exist_ok=True)
# joined_df_3.to_csv(f'{OUTPUT_PATH}/{FILE_NAME}_with_hanja_v5.csv')
