In [1]:
from pathlib import Path
import json
import re
from tqdm import tqdm

# Resolve every input/output location from the shared settings file and from
# the position of this script in the directory tree.
settings = json.loads(Path('../../../settings.json').read_text(encoding='utf-8'))
preprocessed_data_path = Path(settings['preprocessed_data_path'])

# The current directory name and its parent's name select the sub-tree of
# `preprocessed_data_path` that belongs to this dataset.
data_path = Path('.').resolve()
data_name = data_path.name
source_name = data_path.parent.name
preprocessed_dir = preprocessed_data_path/source_name/data_name
source_data_dir = preprocessed_dir/'preprocessed'

splits = ['train', 'valid']
tasks = ['LM(한국어)', 'dialog', '대화 기반 페르소나 예측']

# One output directory per task; `parents=True` keeps a fresh checkout from
# failing when intermediate directories do not exist yet.
task_data_dir = preprocessed_dir/'preprocessed_task'
task_data_dir.mkdir(parents=True, exist_ok=True)
for task in tasks:
    task_path = task_data_dir/task
    task_path.mkdir(parents=True, exist_ok=True)
    
#### prepare for task preprocess
# Lookup table: persona code -> Korean label. The A.. entries are age groups
# and the G.. entries are genders.
speaker_persona_code = {
    'A01': '청소년',
    'A02': '청년',
    'A03': '중년',
    'A04': '노년',
    'G01': '남성',
    'G02': '여성',
}

# Lookup table for the '_'-separated parts of an `emotion-id`.
# Judging by the labels: S.. entries are situation/topic categories, D..
# entries say whether a chronic disease is present, and E.. entries are
# emotions (E10, E20, ... E60 look like the headline emotion of each decade
# -- presumably a category/sub-category scheme; verify against the dataset spec).
speaker_emotion_code = {
    'S01': '가족관계',
    'S02': '학업 및 진로',
    'S03': '학교폭력/따돌림',
    'S04': '대인관계',
    'S05': '연애,결혼,출산',
    'S06': '진로,취업,직장',
    'S07': '대인관계(부부, 자녀)',
    'S08': '재정,은퇴,노후준비',
    'S09': '건강',
    'S10': '직장, 업무 스트레스',
    'S11': '건강,죽음',
    'S12': '대인관계(노년)',
    'S13': '재정',
    'D01': '만성질환 유',
    'D02': '만성질환 무',
    'E10': '분노',
    'E11': '툴툴대는',
    'E12': '좌절한',
    'E13': '짜증내는',
    'E14': '방어적인',
    'E15': '악의적인',
    'E16': '안달하는',
    'E17': '구역질 나는',
    'E18': '노여워하는',
    'E19': '성가신',
    'E20': '슬픔',
    'E21': '실망한',
    'E22': '비통한',
    'E23': '후회되는',
    'E24': '우울한',
    'E25': '마비된',
    'E26': '염세적인',
    'E27': '눈물이 나는',
    'E28': '낙담한',
    'E29': '환멸을 느끼는',
    'E30': '불안',
    'E31': '두려운',
    'E32': '스트레스 받는',
    'E33': '취약한',
    'E34': '혼란스러운',
    'E35': '당혹스러운',
    'E36': '회의적인',
    'E37': '걱정스러운',
    'E38': '조심스러운',
    'E39': '초조한',
    'E40': '상처',
    'E41': '질투하는',
    'E42': '배신당한',
    'E43': '고립된',
    'E44': '충격 받은',
    'E45': '가난한, 불우한',
    'E46': '희생된',
    'E47': '억울한',
    'E48': '괴로워하는',
    'E49': '버려진',
    'E50': '당황',
    'E51': '고립된(당황한)',
    'E52': '남의 시선을 의식하는',
    'E53': '외로운',
    'E54': '열등감',
    'E55': '죄책감의',
    'E56': '부끄러운',
    'E57': '혐오스러운',
    'E58': '한심한',
    'E59': '혼란스러운(당황한)',
    'E60': '기쁨',
    'E61': '감사하는',
    'E62': '신뢰하는',
    'E63': '편안한',
    'E64': '만족스러운',
    'E65': '흥분',
    'E66': '느긋',
    'E67': '안도',
    'E68': '신이 난',
    'E69': '자신하는',
}


def get_speaker_persona(profile: dict) -> list:
    """Return the persona labels for a dialogue profile.

    Args:
        profile: Mapping whose ``profile['persona']['human']`` is an iterable
            of persona codes (keys of ``speaker_persona_code``).

    Returns:
        The labels for those codes, in the same order.

    Raises:
        KeyError: If a code is not present in ``speaker_persona_code``.
    """
    return [speaker_persona_code[code] for code in profile['persona']['human']]


def get_speaker_emotion(profile: dict) -> list:
    """Return the labels encoded in a profile's emotion id.

    Args:
        profile: Mapping whose ``profile['emotion']['emotion-id']`` is a
            string of codes joined by ``'_'``.

    Returns:
        One label per code, in the order the codes appear in the id.

    Raises:
        KeyError: If a code is not present in ``speaker_emotion_code``.
    """
    codes = profile['emotion']['emotion-id'].split('_')
    return [speaker_emotion_code[code] for code in codes]


# The helper was originally published under this misspelled name; keep it so
# existing call sites continue to work.
get_spaker_emotion = get_speaker_emotion

#### prepare for task preprocess end

#### task preprocess
def _build_task_records(line):
    """Build the three task records for one parsed source record.

    Args:
        line: Parsed JSON object. The code reads ``line['profile']``,
            ``line['talk']['content']`` (speaker key -> utterance) and
            ``line['talk']['id']['profile-id']``.

    Returns:
        A tuple ``(lm_record, dialog_record, persona_record)`` in the same
        order as the entries of ``tasks``.
    """
    profile = line['profile']
    dialog = line['talk']['content']
    dialog_id = line['talk']['id']['profile-id']
    speaker_persona = get_speaker_persona(profile)
    speaker_emotion = get_spaker_emotion(profile)

    # LM (Korean): a single plain-text document. Speaker keys that start with
    # 'H' are rendered as the human side, everything else as the system side;
    # empty utterances are dropped.
    turns = [
        ('Human : ' if speaker.startswith('H') else 'System : ') + utt
        for speaker, utt in dialog.items() if utt
    ]
    lm_record = {
        'text': (
            '##화자정보 : ' + str(speaker_persona) + '\n'
            + '##감정정보 : ' + str(speaker_emotion) + '\n'
            + '##대화 :\n' + '\n'.join(turns)
        )
    }

    # dialog: structured record holding the labels and the non-empty utterances.
    utterances = [utt for utt in dialog.values() if utt]
    labels = {
        'persona': speaker_persona,
        'persona_code': profile['persona']['human'],
        'emotion': speaker_emotion,
        'emotion_code': profile['emotion']['emotion-id'].split('_'),
    }
    dialog_record = {'text': {'dialog_id': dialog_id, **labels, 'dialog': utterances}}

    # Dialogue-based persona prediction: the dialogue is the input, the labels
    # are the output.
    # NOTE(review): the labels are nested under a 'text' key inside 'output'.
    # That is the shape the original code produced, so it is preserved here;
    # confirm with the consumers whether a flat 'output' was intended.
    persona_record = {
        'input': {'dialog_id': dialog_id, 'dialog': utterances},
        'output': {'text': labels},
    }
    return lm_record, dialog_record, persona_record


# For every split, stream each preprocessed JSONL file and append one record per
# dialogue to each task's `<split>.jsonl`.
for split in splits:
    source_data_dir_split = source_data_dir/split
    task_files = []
    try:
        # Opened inside the `try` so that a failure part-way through still
        # closes the handles that were already opened.
        for task in tasks:
            task_files.append((task_data_dir/task/f'{split}.jsonl').open('w', encoding='utf-8'))

        # Sorted for a reproducible output order; non-files (e.g. notebook
        # checkpoint directories) are skipped because they cannot be opened.
        source_paths = sorted(p for p in source_data_dir_split.iterdir() if p.is_file())
        for source_path in tqdm(source_paths, desc=split):
            # Read as UTF-8 to match the encoding used for the outputs instead
            # of relying on the platform default.
            with source_path.open(encoding='utf-8') as source_data:
                for raw_line in source_data:
                    if not raw_line.strip():
                        continue  # tolerate blank lines in the JSONL input
                    records = _build_task_records(json.loads(raw_line))
                    for task_file, record in zip(task_files, records):
                        task_file.write(json.dumps(record, ensure_ascii=False) + '\n')
    finally:
        for task_file in task_files:
            task_file.close()

#### task preprocess end

train: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it]
valid: 100%|██████████| 1/1 [00:00<00:00,  4.89it/s]
