# Notebook cell In [1]
import json
import re
from contextlib import ExitStack
from pathlib import Path

from tqdm import tqdm

# Project-wide settings live in settings.json three directories above this one.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
settings = json.loads(Path('../../../settings.json').read_text(encoding='utf-8'))
preprocessed_data_path = Path(settings['preprocessed_data_path'])

# The dataset name and its source name are taken from the current working
# directory and its parent directory, respectively.
data_path = Path('.').resolve()
data_name = data_path.name
source_name = data_path.parent.name

# Input: <preprocessed_data_path>/<source_name>/<data_name>/preprocessed/<split>/...
preprocessed_dir = preprocessed_data_path/source_name/data_name
source_data_dir = preprocessed_dir/'preprocessed'

splits = ['train', 'valid']
tasks = ['LM(한국어)']

# Output: one sub-directory per task under .../preprocessed_task/.
task_data_dir = preprocessed_dir/'preprocessed_task'
task_data_dir.mkdir(exist_ok=True)
for task in tasks:
    task_path = task_data_dir/task
    task_path.mkdir(exist_ok=True)
    
#### prepare for task preprocess
# Maps raw JSON field names to the Korean labels that replace them in the
# output (looked up by replace_keys_values; unknown keys are kept unchanged).
# Each key appears exactly once: a repeated key in a dict literal silently
# overwrites the earlier entry, so 'relateLaword' is listed only here.
key_mapper = {
    # basic case information
    'info': '기초정보',
    'caseField': '사건유형',
    'detailField': '세부유형',
    'trailField': '심급유형',
    'caseNm': '사건명',
    'courtNm': '법원명',
    'judmnAdjuDe': '판결선고일',
    'caseNo': '사건번호',
    'relateLaword': '참조조문',
    'qotatPrcdnt': '참조판례',
    # parties
    'concerned': '사건관계자',
    'acusr': '원고',
    'dedat': '피고',
    # original (lower-court) judgment
    'org': '원심판결',
    'orgJdgmnCourtN': '원심법원명',
    'orgJdgmnAdjuDe': '원심선고일',
    'orgJdgmnCaseNo': '원심사건번호',
    # disposition
    'disposal': '처분',
    'disposalform': '처분종류',
    'disposalcontent': '처분내용',
    # claims, assertions, facts, reasoning and conclusion
    'mentionedItems': '취지',
    'rqestObjet': '청구취지',
    'assrs': '주장',
    'acusrAssrs': '원고의 주장',
    'dedatAssrs': '피고의 주장',
    'facts': '사실',
    'bsisFacts': '기초사실',
    'dcss': '판단',
    'courtDcss': '재판부의 판단',
    'close': '결론',
    'cnclsns': '재판의 결론',
    # contract-clause fields
    'clauseField': '약관분야',
    'ftcCnclsns': '공정위 심결례',
    'clauseArticle': '약관조항',
    'dvAntageous': '유불리판단',
    'comProvision': '비교근거',
    'illdcssBasiss': '위법성 판단근거',
    'unfavorableProvision': '불리한 조항 유형',
}
# Labels for the 'caseField' codes; codes are the strings '1', '2', '3'
# assigned in the order the labels are listed.
caseField_mapper = {
    str(code): label
    for code, label in enumerate(('민사', '형사', '행정'), start=1)
}
# Labels for the 'detailField' codes; codes are the strings '1'..'6'
# assigned in the order the labels are listed.
detailField_mapper = {
    str(code): label
    for code, label in enumerate(
        ('민사', '신청', '가사', '특허', '행정', '형사'),
        start=1,
    )
}
# Labels for the 'trailField' codes: '1' -> first instance, '2' -> second instance.
trailField_mapper = {
    str(code): label
    for code, label in enumerate(('1심', '2심'), start=1)
}
# Labels for the 'acusr' (plaintiff) codes; codes are the strings '1'..'5'
# assigned in the order the labels are listed.
acusr_mapper = {
    str(code): label
    for code, label in enumerate(
        ('자연인', '법인', '국가', '검사', '기타'),
        start=1,
    )
}
# Labels for the 'dedat' (defendant) codes; codes are the strings '1'..'5'
# assigned in the order the labels are listed.
dedat_mapper = {
    str(code): label
    for code, label in enumerate(
        ('자연인', '법인', '국가', '검사', '기타'),
        start=1,
    )
}
# Labels for the 'disposalform' codes; codes are the strings '1'..'10'
# assigned in the order the labels are listed.
disposalform_mapper = {
    str(code): label
    for code, label in enumerate(
        (
            '손해배상금',
            '손실보상금',
            '재산분할액',
            '위자료',
            '양육비',
            '징역',
            '금고',
            '집행유예',
            '벌금',
            '취소',
        ),
        start=1,
    )
}
# Labels for the 'clauseField' codes; codes are the strings '1'..'43'
# assigned in the order the labels are listed (code noted beside each label).
clauseField_mapper = {
    str(code): label
    for code, label in enumerate(
        (
            '가맹계약',  # 1
            '공급계약',  # 2
            '분양계약',  # 3
            '신탁계약',  # 4
            '임대차계약',  # 5
            '입소, 입주, 입점계약',  # 6
            '신용카드',  # 7
            '은행여신',  # 8
            '은행전자금융서비스',  # 9
            '전자결제수단',  # 10
            '전자금융거래',  # 11
            '상해보험',  # 12
            '손해보험',  # 13
            '질병보험',  # 14
            '연금보험',  # 15
            '자동차보험',  # 16
            '책임보험',  # 17
            '화재보험',  # 18
            '증권사1',  # 19
            '증권사2',  # 20
            '증권사3',  # 21
            '여객운송',  # 22
            '화물운송',  # 23
            '개인정보취급방침',  # 24
            '게임',  # 25
            '국내·외 여행',  # 26
            '결혼정보서비스',  # 27
            '렌트(자동차 이외)',  # 28
            '마일리지/포인트',  # 29
            '보증',  # 30
            '사이버',  # 31
            '산후조리원',  # 32
            '상조서비스',  # 33
            '상품권',  # 34
            '생명보험',  # 35
            '예식업',  # 36
            '온라인서비스',  # 37
            '자동차 리스 및 렌트',  # 38
            '체육시설',  # 39
            '택배',  # 40
            '통신, 방송서비스',  # 41
            '교육',  # 42
            '매매계약',  # 43
        ),
        start=1,
    )
}
# Labels for the 'ftcCnclsns' codes: '1' -> applicable, '2' -> not applicable.
ftcCnclsns_mapper = {
    str(code): label
    for code, label in enumerate(('해당', '비해당'), start=1)
}
# Labels for the 'dvAntageous' codes: '1' -> advantageous, '2' -> disadvantageous.
dvAntageous_mapper = {
    str(code): label
    for code, label in enumerate(('유리', '불리'), start=1)
}
# Labels for the 'unfavorableProvision' codes; codes are the strings '1' and
# '2' assigned in the order the labels are listed.
unfavorableProvision_mapper = {
    str(code): label
    for code, label in enumerate(
        ('신의성실의 원칙 위반', '개별금지 조항의 위반'),
        start=1,
    )
}
# For each raw field name whose value is a code, the table used to turn that
# code into its label. Fields not listed here keep their values unchanged.
value_mapper = dict(
    caseField=caseField_mapper,
    detailField=detailField_mapper,
    trailField=trailField_mapper,
    acusr=acusr_mapper,
    dedat=dedat_mapper,
    disposalform=disposalform_mapper,
    clauseField=clauseField_mapper,
    ftcCnclsns=ftcCnclsns_mapper,
    dvAntageous=dvAntageous_mapper,
    unfavorableProvision=unfavorableProvision_mapper,
)
def replace_keys_values(data):
    """Return a copy of *data* with field names and coded values translated.

    Dicts and lists are walked recursively. Every dict key found in
    ``key_mapper`` is replaced by its mapped label; other keys are kept.
    When the original key is listed in ``value_mapper``, the (already
    converted) value is looked up by its ``str()`` form in that field's code
    table and replaced on a hit; on a miss the value is kept as is.
    Anything that is neither a dict nor a list is returned unchanged.
    """
    if isinstance(data, list):
        return [replace_keys_values(element) for element in data]
    if not isinstance(data, dict):
        return data

    translated = {}
    for raw_key, raw_value in data.items():
        converted = replace_keys_values(raw_value)
        # The code table is selected by the ORIGINAL key, not the translated one.
        code_table = value_mapper.get(raw_key)
        if code_table is not None:
            converted = code_table.get(str(converted), converted)
        translated[key_mapper.get(raw_key, raw_key)] = converted
    return translated
#### prepare for task preprocess end

#### task preprocess
# For every split, read each preprocessed source file line by line (one JSON
# record per line), translate field names/codes, and write one JSONL record
# per input record to <task_data_dir>/<task>/<split>.jsonl.
for split in splits:
    source_data_dir_split = source_data_dir/split
    # sorted() makes the output order reproducible; iterdir() order is
    # arbitrary. A concrete list also lets tqdm show a total.
    source_paths = sorted(source_data_dir_split.iterdir())

    # ExitStack closes every task output file even if processing raises.
    with ExitStack() as stack:
        task_files = [
            stack.enter_context(
                (task_data_dir/task/f'{split}.jsonl').open('w', encoding='utf-8')
            )
            for task in tasks
        ]

        for source_path in tqdm(source_paths, desc=split):
            # Explicit UTF-8 keeps reading independent of the platform default;
            # the with-block closes each source file once it has been consumed.
            with source_path.open(encoding='utf-8') as source_file:
                # Iterate lazily instead of loading the whole file via readlines().
                for raw_line in source_file:
                    if not raw_line.strip():
                        # Blank lines carry no record and would fail json.loads.
                        continue
                    line = json.loads(raw_line)

                    #### data preprocess
                    line = replace_keys_values(line)
                    # Drop the 'zip_filename' field from the record before it
                    # is serialized; raises KeyError if it is missing.
                    line.pop('zip_filename')
                    #### data preprocess end

                    #### LM (Korean)
                    # The whole translated record, pretty-printed as JSON,
                    # becomes the 'text' of one language-modeling sample.
                    data = {
                        'text': json.dumps(line, indent=4, ensure_ascii=False),
                    }
                    task_files[0].write(json.dumps(data, ensure_ascii=False)+'\n')
                    #### LM (Korean) end
        
#### task preprocess end      

# Output:
# train: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]
# valid: 100%|██████████| 2/2 [00:00<00:00,  8.55it/s]
