In [2]:
import json
import re
from contextlib import ExitStack
from pathlib import Path

from tqdm import tqdm

# Build task-specific JSONL files from the preprocessed dictionary dump.
#
# For every split, each source file under <preprocessed_dir>/preprocessed/<split>
# is read as JSON lines. Entries passing the word filter are collected in
# `word_definitions`, entries passing the proverb/idiom filter in
# `proverb_definitions`, and three files are then written under
# <preprocessed_dir>/preprocessed_task/<task>/<split>.jsonl, one per task.

# The dataset location is derived from the directory this script runs in:
# <preprocessed_data_path>/<parent directory name>/<current directory name>.
settings = json.loads(Path('../../../settings.json').read_text(encoding='utf-8'))
preprocessed_data_path = Path(settings['preprocessed_data_path'])
data_path = Path('.').resolve()
data_name = data_path.name
source_name = data_path.parent.name
preprocessed_dir = preprocessed_data_path/source_name/data_name

source_data_dir = preprocessed_dir/'preprocessed'
splits = ['train']
tasks = ['단어 설명', '문장 내 단어 뜻 질문', '속담 설명']
task_data_dir = preprocessed_dir/'preprocessed_task'
for task in tasks:
    task_path = task_data_dir/task
    # parents=True also creates task_data_dir (and anything above it) on a
    # first run, instead of failing when the directory tree is missing.
    task_path.mkdir(parents=True, exist_ok=True)

#### prepare for task preprocess
# word -> {sense_no -> word record}; proverb -> {sense_no -> proverb record}.
# NOTE(review): both dicts are created once, outside the split loop, so with
# more than one split the later split files would also contain the entries
# collected from earlier splits. Harmless for the single 'train' split used
# here -- confirm the intent before adding more splits.
word_definitions = {}
proverb_definitions = {}

#### prepare for task preprocess end

#### task preprocess
for split in splits:
    source_data_dir_split = source_data_dir/split
    # ExitStack closes every task file when the split is done, including when
    # an exception interrupts processing.
    with ExitStack() as stack:
        task_files = [
            stack.enter_context(
                (task_data_dir/task/f'{split}.jsonl').open('w', encoding='utf-8')
            )
            for task in tasks
        ]

        # Sorted so that the order in which senses are inserted (and therefore
        # serialized) does not depend on the OS directory listing order.
        for source_path in tqdm(sorted(source_data_dir_split.iterdir()), desc=split):
            with source_path.open(encoding='utf-8') as source_data:
                for raw_line in source_data:
                    if not raw_line.strip():
                        # Tolerate blank lines instead of failing in json.loads.
                        continue
                    line = json.loads(raw_line)
                    wordinfo = line['wordinfo']
                    senseinfo = line['senseinfo']

                    #### data preprocess
                    if wordinfo['word_unit'] in ['어휘', '구'] and \
                        wordinfo['word_type'] in ['고유어', '한자어', '혼종어'] and \
                        senseinfo['type'] in ['일반어'] and \
                        senseinfo.get('pos') in ['동사', '형용사', '보조 동사', '명사']:

                        word = wordinfo['word']
                        sense_no = senseinfo['sense_no']

                        definition = senseinfo['definition']
                        if '<img style' in definition:
                            # The definition embeds image markup; use the
                            # 'definition_original' field instead and strip
                            # its <sense_no> tags.
                            definition = senseinfo['definition_original']
                            definition = re.sub(r'<sense_no>\d*</sense_no>', '', definition)

                        relation_info = senseinfo.get('relation_info', None)
                        if relation_info:
                            # Drop relations of the excluded types and omit the
                            # 'link' field; relations without a 'link' are kept
                            # as they are rather than raising KeyError.
                            relation_info = [
                                {key: value for key, value in relation.items() if key != 'link'}
                                for relation in relation_info
                                if relation['type'] not in ['옛말', '북한어', '방언']
                            ]

                        example_info = senseinfo.get('example_info', None)
                        if example_info:
                            example_info = [example['example'] for example in example_info]

                        cat_info = senseinfo.get('cat_info', None)
                        if cat_info:
                            cat_info = [cat['cat'] for cat in cat_info]

                        new_word = {
                            'word': word,
                            'original_language_info': wordinfo.get('original_language_info', None),
                            'cat_info': cat_info,
                            'word_type': wordinfo['word_type'],
                            'sense_no': sense_no,
                            'definition': definition,
                            'translation_info': senseinfo.get('translation_info', None),
                            'relation_info': relation_info,
                            'example_info': example_info,
                            'pos': senseinfo['pos'],
                            'proverb_info': senseinfo.get('proverb_info', None),
                        }
                        word_definitions.setdefault(word, {})[sense_no] = new_word

                    elif wordinfo['word_unit'] in ['속담', '관용구'] and \
                        senseinfo['type'] not in ['북한어']:

                        proverb = wordinfo['word']
                        sense_no = senseinfo['sense_no']
                        definition = senseinfo['definition']
                        synonyms = None
                        if marker := re.search(r'<동의 .{2,3}>', definition):
                            # Text before the first marker is the definition;
                            # text after it is a quoted list of synonyms.
                            # Slicing at the first match cannot fail when the
                            # marker occurs more than once.
                            synonyms = definition[marker.end():]
                            definition = definition[:marker.start()].strip()
                            synonyms = re.sub(r'‘|’', '\'', synonyms)
                            synonyms = [
                                synonym.strip()
                                for synonym in synonyms.split('\'')
                                if synonym.strip()
                            ]

                        example_info = senseinfo.get('example_info', None)
                        if example_info:
                            example_info = [example['example'] for example in example_info]

                        new_proverb = {
                            'proverb': proverb,
                            'sense_no': sense_no,
                            'definition': definition,
                            'example_info': example_info,
                            'synonyms': synonyms,
                        }
                        proverb_definitions.setdefault(proverb, {})[sense_no] = new_proverb
                    #### data preprocess end

        for word, definitions in sorted(word_definitions.items()):
            #### tasks[0]: input is the word with '-' removed, output is all
            #### of its collected senses
            data = {'input': word.replace('-', ''), 'output': definitions}
            task_files[0].write(json.dumps(data, ensure_ascii=False)+'\n')
            #### tasks[0] end

            #### tasks[1]: one record per example sentence; input is the
            #### sentence, output is the sense record it belongs to
            for sense_no, definition in definitions.items():
                if definition['example_info']:
                    for example in definition['example_info']:
                        data = {'input': example, 'output': definition}
                        task_files[1].write(json.dumps(data, ensure_ascii=False)+'\n')
            #### tasks[1] end

        for proverb, definitions in sorted(proverb_definitions.items()):
            #### tasks[2]: input is the proverb/idiom, output is all of its
            #### collected senses
            data = {'input': proverb, 'output': definitions}
            task_files[2].write(json.dumps(data, ensure_ascii=False)+'\n')
            #### tasks[2] end

#### task preprocess end

train: 100%|██████████| 1/1 [00:17<00:00, 17.33s/it]
