In [2]:
from pathlib import Path
import json
import re
from tqdm import tqdm

# Project-wide settings live three levels above this notebook. JSON text is
# UTF-8, so decode it explicitly instead of relying on the platform's locale
# encoding (the rest of this script also reads/writes UTF-8 explicitly).
settings = json.loads(Path('../../../settings.json').read_text(encoding='utf-8'))
preprocessed_data_path = Path(settings['preprocessed_data_path'])

# The working directory's name and its parent's name select the sub-folder of
# `preprocessed_data_path` that belongs to this dataset.
data_path = Path('.').resolve()
data_name = data_path.name
source_name = data_path.parent.name
preprocessed_dir = preprocessed_data_path/source_name/data_name

# Input: one sub-directory per split under `preprocessed`.
source_data_dir = preprocessed_dir/'preprocessed'
splits = ['train', 'valid']
# Output: one sub-directory per task under `preprocessed_task`. The order of
# this list is significant: the per-task output files are addressed by
# position further down in the script.
tasks = ['LM(한국어)', '화자 인식', '화자 관계 식별', '화자 나이대 예측', 'dialog']
task_data_dir = preprocessed_dir/'preprocessed_task'
# `parents` is deliberately left off: a missing `preprocessed_dir` should fail
# here rather than silently create an empty tree.
task_data_dir.mkdir(exist_ok=True)
for task in tasks:
    task_path = task_data_dir/task
    task_path.mkdir(exist_ok=True)
    
#### prepare for task preprocess
# Pattern for the bracketed spans that are deleted from every utterance's
# `original_form` before the task records are built. At each position the
# alternatives are tried in this order:
#   \{[^}]*\}      - a `{...}` span whose content contains no `}`
#   \(\([^)]*\)\)  - a `((...))` span whose content contains no `)`
#   \([^)]*\)      - a `(...)` span whose content contains no `)`
# NOTE(review): presumably these spans are transcription annotations in the
# source corpus - confirm against the corpus documentation.
regex_pattern = re.compile(r'\{[^}]*\}|\(\([^)]*\)\)|\([^)]*\)')
#### prepare for task preprocess end

#### start task preprocess
# Runs of two or more spaces, e.g. those left behind once annotations are removed.
_MULTI_SPACE = re.compile(r' {2,}')


def _clean_utterance_text(text):
    """Return `text` with bracketed spans and stray brackets removed.

    Spans matched by the module-level `regex_pattern` are deleted first, then
    any unbalanced `(`, `)`, `{` or `}` that the pattern could not pair up.
    Whitespace is normalised last, so that none of the removals can leave a
    run of spaces, padding at the ends, or a whitespace-only string behind.
    """
    text = regex_pattern.sub('', text)
    for bracket in '(){}':
        text = text.replace(bracket, '')
    return _MULTI_SPACE.sub(' ', text).strip()


def _build_task_records(line):
    """Build one output record per task from a parsed source dialogue.

    `line` is the dict decoded from one source JSONL line; it is read through
    its 'metadata', 'speaker' and 'utterance' keys. The returned list is
    ordered like the module-level `tasks` list:
    LM text, speaker recognition, speaker relation, speaker age, dialog.

    Raises KeyError if a speaker entry lacks 'role' or 'age', or an utterance
    lacks 'speaker_id' or 'original_form'.
    """
    metadata = line['metadata']
    speakers = line['speaker']

    # Cleaned utterances; those that end up empty are dropped for every task.
    dialog = []
    for utt in line['utterance']:
        speaker_id = utt['speaker_id']
        text = _clean_utterance_text(utt['original_form'])
        if text:
            dialog.append({'speaker_id': speaker_id, 'original_form': text})

    #### LM(한국어)
    # Each utterance is already stripped and free of space runs, so a plain
    # join needs no further whitespace clean-up.
    lm_record = {'text': ' '.join(utt['original_form'] for utt in dialog)}

    #### 화자 인식
    # Input is the bare utterance texts; output is the speaker of each one.
    speaker_record = {
        'input': {
            'metadata': metadata,
            'speaker': speakers,
            'dialog': [utt['original_form'] for utt in dialog],
        },
        'output': {'speaker_id': [utt['speaker_id'] for utt in dialog]},
    }

    #### 화자 관계 식별
    # Speaker entries are shown without 'role'; the distinct roles are the
    # target. dict.fromkeys de-duplicates while keeping first-seen order, so
    # the output does not vary with string hash randomisation between runs.
    roles = list(dict.fromkeys(speaker['role'] for speaker in speakers))
    relation_record = {
        'input': {
            'metadata': metadata,
            'speaker': [
                {key: value for key, value in speaker.items() if key != 'role'}
                for speaker in speakers
            ],
            'dialog': dialog,
        },
        'output': {'role': roles},
    }

    #### 화자 나이대 예측
    # Speaker entries are shown without 'age'; the ages, in speaker order,
    # are the target.
    ages = [speaker['age'] for speaker in speakers]
    age_record = {
        'input': {
            'metadata': metadata,
            'speaker': [
                {key: value for key, value in speaker.items() if key != 'age'}
                for speaker in speakers
            ],
            'dialog': dialog,
        },
        'output': {'age': ages},
    }

    #### dialog
    dialog_record = {
        'text': {'metadata': metadata, 'speaker': speakers, 'dialog': dialog},
    }

    return [lm_record, speaker_record, relation_record, age_record, dialog_record]


for split in splits:
    source_data_dir_split = source_data_dir/split
    task_files = []
    try:
        # Opened inside the `try` so that a failure part-way through still
        # closes the files that were already opened.
        for task in tasks:
            task_files.append((task_data_dir/task/f'{split}.jsonl').open('w', encoding='utf-8'))

        # Sorted so the order of output lines is reproducible; `iterdir`
        # yields entries in arbitrary order. Non-file entries are skipped.
        source_paths = sorted(path for path in source_data_dir_split.iterdir() if path.is_file())
        for source_path in tqdm(source_paths, desc=split):
            # Read as UTF-8 to match the outputs instead of depending on the
            # platform default; `with` closes each source file when done.
            with source_path.open(encoding='utf-8') as source_file:
                for raw_line in source_file:
                    if not raw_line.strip():
                        continue  # tolerate blank lines in the JSONL input
                    records = _build_task_records(json.loads(raw_line))
                    for task_file, record in zip(task_files, records):
                        task_file.write(json.dumps(record, ensure_ascii=False)+'\n')
    finally:
        for task_file in task_files:
            task_file.close()

train: 100%|██████████| 10/10 [00:50<00:00,  5.03s/it]
valid: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]
