In [4]:
from pathlib import Path
import json
import re
from tqdm import tqdm

# Project-wide settings live in settings.json, three directories above the
# current working directory.
settings = json.loads(Path('../../../settings.json').read_text(encoding='utf-8'))
preprocessed_data_path = Path(settings['preprocessed_data_path'])

# The working directory identifies the dataset: its own name is the data
# name and its parent directory's name is the source name.
data_path = Path('.').resolve()
data_name = data_path.name
source_name = data_path.parent.name

# <preprocessed_data_path>/<source_name>/<data_name>/preprocessed holds the
# per-split input files read by the task conversion below.
preprocessed_dir = preprocessed_data_path/source_name/data_name
source_data_dir = preprocessed_dir/'preprocessed'
# Dataset splits to convert; each one is a sub-directory of the source data.
splits = ['train', 'valid']

# Downstream tasks to generate data for. The order matters: the conversion
# loop addresses the per-task output files by their position in this list.
tasks = ['LM(한국어)', '대화 주제 식별', '화자 인식', '문장 정규화', 'dialog']

# Every task gets its own output directory under
# <preprocessed_dir>/preprocessed_task/.
task_data_dir = preprocessed_dir/'preprocessed_task'
task_data_dir.mkdir(exist_ok=True)
for task in tasks:
    (task_data_dir/task).mkdir(exist_ok=True)
    
#### prepare for task preprocess
# Matches bracketed spans that are removed from utterance text. The three
# alternatives are tried in this order; adjacent raw strings are concatenated
# by the parser, so the compiled pattern is a single alternation.
regex_pattern = re.compile(
    r'\{[^}]*\}'         # {...}   brace-delimited span
    r'|\(\([^)]*\)\)'    # ((...)) double-parenthesised span
    r'|\([^)]*\)'        # (...)   single-parenthesised span
)
#### prepare for task preprocess end

#### task preprocess
# Converts every dialogue record found under <source_data_dir>/<split>/ into
# one JSONL file per task: <task_data_dir>/<task>/<split>.jsonl.

# Deletes bracket characters that are still present after the annotation
# regex has been applied (e.g. unbalanced ones).
_BRACKET_REMOVAL = str.maketrans('', '', '(){}')


def _clean_utterance_text(raw_text):
    """Return *raw_text* without bracketed spans and leftover brackets."""
    text = regex_pattern.sub('', raw_text)
    text = text.replace('  ', ' ').strip()
    return text.translate(_BRACKET_REMOVAL)


def _preprocess_dialog(utterances):
    """Return the cleaned utterances, dropping those whose text becomes empty.

    Each returned dict has the keys 'speaker_id', 'original_form' (cleaned
    text) and 'hangeulToNumber' (copied unchanged from the input), in that
    order.
    """
    dialog = []
    for utt in utterances:
        text = _clean_utterance_text(utt['original_form'])
        if text:
            dialog.append({
                'speaker_id': utt['speaker_id'],
                'original_form': text,
                'hangeulToNumber': utt['hangeulToNumber'],
            })
    return dialog


def _replace_number_spans(text, number_spans):
    """Return *text* with every span replaced by that span's 'number' string.

    The slicing arithmetic treats 'begin' and 'end' as 1-based, inclusive
    character offsets: the text kept before a span ends at ``begin - 1`` and
    the text kept after it starts at ``end``.
    """
    # NOTE(review): the offsets are applied to the cleaned text. If they were
    # computed against the raw 'original_form' (before bracketed spans were
    # stripped) they can be shifted -- confirm against the corpus.
    pieces = []
    cursor = 0
    for span in number_spans:
        # Clamp at 0: a 'begin' of 0 would otherwise yield -1, which as a
        # slice bound silently means "all but the last character".
        prefix_end = max(span['begin'] - 1, 0)
        pieces.append(text[cursor:prefix_end])
        pieces.append(span['number'])
        cursor = span['end']
    pieces.append(text[cursor:])
    return ''.join(pieces)


def _write_record(out_file, record):
    """Append *record* to *out_file* as one JSON line, keeping non-ASCII text."""
    out_file.write(json.dumps(record, ensure_ascii=False) + '\n')


for split in splits:
    source_data_dir_split = source_data_dir/split
    task_files = []
    try:
        # One output file per task, in the same order as `tasks`; opened
        # inside the try so a failure part-way still closes the earlier ones.
        for task in tasks:
            task_files.append(
                (task_data_dir/task/f'{split}.jsonl').open('w', encoding='utf-8')
            )

        # Sorted so the output line order is reproducible between runs
        # (iterdir() yields entries in arbitrary order).
        for source_path in tqdm(sorted(source_data_dir_split.iterdir()), desc=split):
            # Read as UTF-8 to match the encoding used for the output files
            # instead of depending on the platform's default encoding.
            with source_path.open(encoding='utf-8') as source_file:
                for raw_line in source_file:
                    if not raw_line.strip():
                        continue  # tolerate blank lines in the JSONL input
                    line = json.loads(raw_line)

                    #### data preprocess
                    dialog = _preprocess_dialog(line['utterance'])
                    # The same utterances without the number annotations.
                    plain_dialog = [
                        {key: value for key, value in utt.items()
                         if key != 'hangeulToNumber'}
                        for utt in dialog
                    ]
                    #### data preprocess end

                    #### tasks[0]: language modelling -- the whole dialogue
                    #### as one space-joined text
                    lm_text = ' '.join(utt['original_form'] for utt in dialog)
                    _write_record(task_files[0], {
                        'text': lm_text.replace('  ', ' ').strip(),
                    })

                    #### tasks[1]: dialogue topic identification -- the topic
                    #### is moved out of the input metadata into the output
                    topic_metadata = line['metadata'].copy()
                    topic = topic_metadata.pop('topic')
                    _write_record(task_files[1], {
                        'input': {
                            'metadata': topic_metadata,
                            'speaker': line['speaker'],
                            'dialog': plain_dialog,
                        },
                        'output': {'topic': topic},
                    })

                    #### tasks[2]: speaker recognition -- utterance texts in,
                    #### the matching speaker ids out
                    _write_record(task_files[2], {
                        'input': {
                            'metadata': line['metadata'],
                            'speaker': line['speaker'],
                            'dialog': [utt['original_form'] for utt in dialog],
                        },
                        'output': {
                            'speaker_id': [utt['speaker_id'] for utt in dialog],
                        },
                    })

                    #### tasks[3]: sentence normalization -- one record per
                    #### utterance that carries number annotations
                    for utt in dialog:
                        number_spans = utt['hangeulToNumber']
                        if number_spans is None:
                            continue
                        _write_record(task_files[3], {
                            'input': utt['original_form'],
                            'output': _replace_number_spans(
                                utt['original_form'], number_spans
                            ),
                        })

                    #### tasks[4]: dialog -- the cleaned dialogue with its
                    #### metadata and speaker information
                    _write_record(task_files[4], {
                        'text': {
                            'metadata': line['metadata'],
                            'speaker': line['speaker'],
                            'dialog': plain_dialog,
                        },
                    })
    finally:
        # Always flush and release the output files, even after an error.
        for task_file in task_files:
            task_file.close()

#### task preprocess end

train: 100%|██████████| 7/7 [00:30<00:00,  4.31s/it]
valid: 100%|██████████| 1/1 [00:04<00:00,  4.48s/it]
