In [1]:
from pathlib import Path
import json
import re
from tqdm import tqdm

# Resolve every directory this script reads from / writes to.
# settings.json is looked up three levels above the current working directory
# and must provide the 'preprocessed_data_path' key.
settings = json.loads(Path('../../../settings.json').read_text())
preprocessed_data_path = Path(settings['preprocessed_data_path'])

# The dataset name and its source name are taken from the working directory
# layout: <...>/<source_name>/<data_name>/ (the directory this runs in).
data_path = Path('.').resolve()
data_name = data_path.name
source_name = data_path.parent.name

# Input lives in <preprocessed_dir>/preprocessed, output goes to
# <preprocessed_dir>/preprocessed_task/<task>/.
preprocessed_dir = preprocessed_data_path/source_name/data_name
source_data_dir = preprocessed_dir/'preprocessed'

splits = ['train', 'valid']
# Order matters: downstream code addresses the per-task output files by
# their index in this list.
tasks = ['LM(한국어)', '유해 질의 식별', '문서 내용 QA']

# Create one output directory per task (no-op when it already exists).
task_data_dir = preprocessed_dir/'preprocessed_task'
task_data_dir.mkdir(exist_ok=True)
for task in tasks:
    task_path = task_data_dir/task
    task_path.mkdir(exist_ok=True)
    
#### prepare for task preprocess
label_level_map = {
    1: "비난/혐오/차별",
        101: "인종 및 민족",
        102: "성별",
        103: "성적지향",
        104: "종교",
        105: "나이",
        106: "신체 및 정신 장애",
        107: "체형 및 외모",
        108: "지역 및 사회계층",
        109: "직업 및 업무수행",
        110: "정치적/사회적 신념",
    2: "선정",
        201: "경험",
        202: "행동",
        203: "매체",
    3: "욕설",
        301: "인신공격",
        302: "비속어",
        303: "모욕적 발언",
    4: "폭력",
        401: "정서적 폭력",
        402: "사이버 폭력",
    5: "범죄",
        501: "폭력",
        502: "재산",
        503: "경제",
        504: "공무원",
        505: "마약",
        506: "사이버",
        507: "개인정보 침해",
    6: "허위정보및루머",
        601: "조작된 내용(연예인,정치인,집단등)가짜뉴스",
        602: "편견이 심한 뉴스",
    7: "스팸 및 광고",
        701: "제품/서비스/이벤트 광고",
        702: "정치광고",
        703: "상업적 스팸",
        704: "대량 메일 스팸",
        705: "낚시 스팸",
        706: "악성코드 스팸"
}
persona_map = {
    1: '존댓말+문어체',
    2: '존댓말+구어체',
    3: '반말+문어체',
    4: '반말+구어체'
}

#### prepare for task preprocess end

#### task preprocess
def _lm_records(line):
    """Yield the single LM(한국어) record built from one parsed corpus line.

    The record text is the title, a '/'-joined info string (publisher,
    main/middle category, collection name, issue date) and the corpus body.
    """
    title = line['title']
    info = '/'.join([
        line['publisher_company'],
        line['category_main'],
        line['category_middle'],
        line['collection_name'],
        line['issue_date'],
    ])
    text = line['corpus']
    yield {'text': f'title: {title}\ninfo: {info}\ncontent: {text}'}


def _harmful_query_records(line):
    """Yield the single 유해 질의 식별 record built from one parsed line.

    Only the first entry of line['labels'] is used; its numeric codes are
    translated to names through label_level_map and persona_map.
    """
    labels = line['labels'][0]
    yield {
        'text': line['instruct_text'],
        'label': {
            'level1_type': label_level_map[labels['level1_type']],
            'level2_type': label_level_map[labels['level2_type']],
            'persona': persona_map[labels['persona']],
        },
    }


def _document_qa_records(line):
    """Yield one 문서 내용 QA record per entry in line['labels'].

    Every record repeats the document metadata and context; only the
    instruction texts and the response differ between records.
    """
    for label in line['labels']:
        yield {
            'input': {
                'publisher': line['publisher'],
                'date': line['date'],
                'type': line['type'],
                'title': line['title'],
                'context': line['context'],
                'instructions': [i['text'] for i in label['instructs']],
            },
            'output': label['response'],
        }


def _select_task(file_stem):
    """Return (index into `tasks`, record builder) for a source file stem.

    Patterns are tried in a fixed order and the first match wins; None is
    returned when the stem matches no known pattern.
    """
    if 'S_1. 말뭉치' in file_stem:
        return 0, _lm_records
    if '_2. 유해질의' in file_stem:
        return 1, _harmful_query_records
    if 'L_1. 질의응답' in file_stem or 'L_1. 말뭉치' in file_stem:
        return 2, _document_qa_records
    return None


for split in splits:
    source_data_dir_split = source_data_dir/split
    task_files = []
    try:
        # One output file per task, in the same order as `tasks`.
        for task in tasks:
            task_files.append((task_data_dir/task/f'{split}.jsonl').open('w', encoding='utf-8'))

        # list() lets tqdm know the total so it can render a full progress bar.
        for source_path in tqdm(list(source_data_dir_split.iterdir()), desc=split):
            if source_path.stat().st_size == 0:
                continue
            selected = _select_task(source_path.stem)
            if selected is None:
                continue
            task_index, build_records = selected
            task_file = task_files[task_index]

            # Read as UTF-8 explicitly (the outputs are written as UTF-8 too)
            # so the result does not depend on the platform's default locale,
            # and stream line by line instead of loading the whole file.
            with source_path.open(encoding='utf-8') as source_file:
                for raw_line in source_file:
                    if not raw_line.strip():
                        continue  # tolerate blank lines in the JSONL input
                    for data in build_records(json.loads(raw_line)):
                        task_file.write(json.dumps(data, ensure_ascii=False)+'\n')
    finally:
        # Close the outputs even when a source line fails to parse.
        for task_file in task_files:
            task_file.close()

#### task preprocess end

train: 100%|██████████| 13/13 [00:27<00:00,  2.11s/it]
valid: 100%|██████████| 16/16 [00:03<00:00,  4.87it/s]
