In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
import os
import zipfile
import json
import pandas as pd
from functools import reduce

## 압축해제, 데이터 불러오기

In [2]:
def unzip(dir = '.'):
  """Extract every ``.zip`` archive found under ``dir``.

  Each archive is unpacked into a sibling folder named after the archive
  without its ``.zip`` suffix (created if missing). Progress is printed per
  archive, followed by a summary line. Returns ``None``.

  Args:
    dir: Root directory to walk recursively. Defaults to the current directory.
  """
  extracted = 0
  for current_dir, _subdirs, filenames in os.walk(dir):
    for name in filenames:
      if not name.endswith(".zip"):
        continue
      extracted += 1
      archive_path = os.path.join(current_dir, name)
      # Strip the 4-character ".zip" suffix to get the destination folder name.
      target_dir = os.path.join(current_dir, name[:-4])
      os.makedirs(target_dir, exist_ok=True)
      print(f"Extracting {extracted}: {archive_path} → {target_dir}")
      with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(target_dir)

  if extracted:
    print(f'총 {extracted}개의 파일을 해제했음 ,')
  else:
    print('No Zip')

In [3]:
def load_concat(dir = '.'):
  """Load every JSON document and CSV table found under ``dir``.

  Later cells in this notebook call ``load_concat('.')``, so the function
  has to be defined (it was previously only present as commented-out code).

  Args:
    dir: Root directory to walk recursively. Defaults to the current directory.

  Returns:
    A ``(total_json, total_csv)`` tuple: a list with one parsed JSON object
    per ``.json`` file, and a list with one DataFrame per ``.csv`` file.
  """
  total_json = []
  total_csv = []
  for root, dirs, files in os.walk(dir):
    for file in files:
      file_path = os.path.join(root, file)
      if file.endswith('.json'):
        # utf-8-sig also accepts files that start with a byte-order mark.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
          total_json.append(json.load(f))
      elif file.endswith('.csv'):
        total_csv.append(pd.read_csv(file_path, encoding='utf-8-sig'))

  return total_json, total_csv

## 금융법률문서기계독해_데이터

In [13]:
# Unzip: move to the dataset root, then extract every .zip archive found beneath it.
%cd /content/drive/MyDrive/데이콘/dataset/금융법롤문서기계독해_데이터

unzip('.')

/content/drive/MyDrive/데이콘/dataset/금융법롤문서기계독해_데이터
Extracting 1: ./Validation/02.라벨링데이터/VL_2.YesNo 단문형.zip → ./Validation/02.라벨링데이터/VL_2.YesNo 단문형
Extracting 2: ./Validation/02.라벨링데이터/VL_4.다지선다형.zip → ./Validation/02.라벨링데이터/VL_4.다지선다형
Extracting 3: ./Validation/02.라벨링데이터/VL_5.절차(방법)형.zip → ./Validation/02.라벨링데이터/VL_5.절차(방법)형
Extracting 4: ./Validation/02.라벨링데이터/VL_1.정답경계 추출형.zip → ./Validation/02.라벨링데이터/VL_1.정답경계 추출형
Extracting 5: ./Validation/01.원천데이터/VS_4.다지선다형.zip → ./Validation/01.원천데이터/VS_4.다지선다형
Extracting 6: ./Validation/01.원천데이터/VS_2.YesNo 단문형.zip → ./Validation/01.원천데이터/VS_2.YesNo 단문형
Extracting 7: ./Validation/01.원천데이터/VS_5.절차(방법)형.zip → ./Validation/01.원천데이터/VS_5.절차(방법)형
Extracting 8: ./Validation/01.원천데이터/VS_1.정답경계 추출형.

In [152]:
# 원천데이터 전처리
def source_data_preprocessing(dir = '.'):
  """Flatten the source JSON files under ``dir`` into one row per paragraph.

  Every ``.json`` file found while walking ``dir`` is read (utf-8-sig) and
  its ``data`` -> ``paragraphs`` nesting is unrolled. Each row carries the
  document-level fields, the paragraph's ``context_id`` / ``context`` and a
  ``category`` column holding the folder the file was found in.

  Args:
    dir: Root directory to walk recursively. Defaults to the current directory.

  Returns:
    A DataFrame with one row per paragraph (empty if no JSON file is found).
  """
  rows = []
  for folder, _subdirs, filenames in os.walk(dir):
    for filename in filenames:
      if not filename.endswith('.json'):
        continue
      with open(os.path.join(folder, filename), 'r', encoding='utf-8-sig') as handle:
        payload = json.load(handle)

        for document in payload['data']:
          for paragraph in document['paragraphs']:
            passage = paragraph['context']
            row = dict(
              doc_id=document['doc_id'],
              doc_title=document['doc_title'],
              doc_source=document['doc_source'],
              doc_published=document['doc_published'],
              doc_created=document['created'],
              doc_class=document['doc_class'].get('class',''),
              context_id=paragraph['context_id'],
              context=passage,
              category=folder,
            )
            rows.append(row)

  return pd.DataFrame(rows)

In [153]:
# Training source data: flatten every JSON file under the training source folder.
%cd /content/drive/MyDrive/데이콘/dataset/금융법롤문서기계독해_데이터/Training/01.원천데이터

금융법률문서_training_source_data = source_data_preprocessing('.')
# Tag each row with its split so it stays identifiable once combined with the validation rows.
금융법률문서_training_source_data['class'] = 'train'
# Row count per source folder (the 'category' column holds the folder each file was found in).
print(금융법률문서_training_source_data['category'].value_counts())

/content/drive/MyDrive/데이콘/dataset/금융법롤문서기계독해_데이터/Training/01.원천데이터
category
./TS_1.정답경계 추출형    45418
./TS_5.절차(방법)형        27883
./TS_4.다지선다형            14679
./TS_2.YesNo 단문형         14563
Name: count, dtype: int64


In [154]:
# Validation source data: flatten every JSON file under the validation source folder.
%cd /content/drive/MyDrive/데이콘/dataset/금융법롤문서기계독해_데이터/Validation/01.원천데이터

금융법률문서_valid_source_data = source_data_preprocessing('.')
# Tag each row with its split so it stays identifiable once combined with the training rows.
금융법률문서_valid_source_data['class'] = 'valid'
# Row count per source folder (the 'category' column holds the folder each file was found in).
print(금융법률문서_valid_source_data['category'].value_counts())

/content/drive/MyDrive/데이콘/dataset/금융법롤문서기계독해_데이터/Validation/01.원천데이터
category
./VS_1.정답경계 추출형    5678
./VS_5.절차(방법)형        3485
./VS_4.다지선다형            1835
./VS_2.YesNo 단문형         1820
Name: count, dtype: int64


In [165]:
# Combine the training and validation source tables and save them as one file.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each split's own row labels.
금융법률문서_source_data = pd.concat([금융법률문서_training_source_data, 금융법률문서_valid_source_data],
                            ignore_index=True)
# NOTE(review): the output path has no file extension — confirm whether '.csv' is intended.
금융법률문서_source_data.to_csv('/content/drive/MyDrive/데이콘/dataset/금융법롤문서기계독해_데이터/금융법률문서_source_data',
                          encoding='utf-8-sig',
                          index=False)

In [159]:
# labeling 데이터 전처리
def labeling_data_preprocessing(dir = '.'):
  """Flatten the labeling JSON files under ``dir`` into one row per question.

  Every ``.json`` file found while walking ``dir`` is read (utf-8-sig) and
  its ``data`` -> ``paragraphs`` -> ``qas`` nesting is unrolled. Each row
  carries the document-level fields, the paragraph's context, the question,
  the answer text, the optional ``clue_text`` (``''`` when absent) and a
  ``category`` column holding the folder the file was found in.

  Args:
    dir: Root directory to walk recursively. Defaults to the current directory.

  Returns:
    A DataFrame with one row per question/answer pair (empty if no JSON
    file is found).
  """
  rows = []
  for folder, _subdirs, filenames in os.walk(dir):
    for filename in filenames:
      if not filename.endswith('.json'):
        continue
      with open(os.path.join(folder, filename), 'r', encoding='utf-8-sig') as handle:
        payload = json.load(handle)

        for document in payload['data']:
          for paragraph in document['paragraphs']:
            passage = paragraph['context']
            for qa_item in paragraph['qas']:
              question_text = qa_item['question']
              answer_text = qa_item['answer']['text']
              clue_text = qa_item['answer'].get('clue_text', '')
              rows.append(dict(
                doc_id=document['doc_id'],
                doc_title=document['doc_title'],
                doc_source=document['doc_source'],
                doc_published=document['doc_published'],
                doc_created=document['created'],
                doc_class=document['doc_class'].get('class',''),
                context_id=paragraph['context_id'],
                context=passage,
                question=question_text,
                answer=answer_text,
                clue_text=clue_text,
                category=folder,
              ))

  return pd.DataFrame(rows)

In [160]:
# Training labeling data: flatten every JSON file under the training labeling folder.
%cd /content/drive/MyDrive/데이콘/dataset/금융법롤문서기계독해_데이터/Training/02.라벨링데이터

금융법률문서_training_labeling_data = labeling_data_preprocessing('.')
# Tag each row with its split so it stays identifiable once combined with the validation rows.
금융법률문서_training_labeling_data['class'] = 'train'
# Row count per source folder (the 'category' column holds the folder each file was found in).
print(금융법률문서_training_labeling_data['category'].value_counts())

/content/drive/MyDrive/데이콘/dataset/금융법롤문서기계독해_데이터/Training/02.라벨링데이터
category
./TL_1.정답경계 추출형    93828
./TL_5.절차(방법)형        61746
./TL_2.YesNo 단문형         32750
./TL_4.다지선다형            32092
Name: count, dtype: int64


In [162]:
# valid 라벨링데이터
%cd /content/drive/MyDrive/데이콘/dataset/금융법롤문서기계독해_데이터/Validation/02.라벨링데이터

금융법률문서_valid_labeling_data = labeling_data_preprocessing('.')
금융법률문서_valid_labeling_data['class'] = 'valid'
print(금융법률문서_training_labeling_data['category'].value_counts())

/content/drive/MyDrive/데이콘/dataset/금융법롤문서기계독해_데이터/Validation/02.라벨링데이터
category
./TL_1.정답경계 추출형    93828
./TL_5.절차(방법)형        61746
./TL_2.YesNo 단문형         32750
./TL_4.다지선다형            32092
Name: count, dtype: int64


In [166]:
# Combine the training and validation labeling tables and save them as one file.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each split's own row labels.
금융법률문서_labeling_data = pd.concat([금융법률문서_training_labeling_data, 금융법률문서_valid_labeling_data],
                              ignore_index=True)
# NOTE(review): the output path has no file extension — confirm whether '.csv' is intended.
금융법률문서_labeling_data.to_csv('/content/drive/MyDrive/데이콘/dataset/금융법롤문서기계독해_데이터/금융법률문서_labeling_data',
                          encoding='utf-8-sig',
                          index=False)


## 민사법 데이터

In [None]:
# Move to the civil-law (민사법) dataset root.
# NOTE(review): this path is rooted at '/content/drive/MyDrive/1데이콘/2025금융AIChallenge금융AI모델경쟁/',
# unlike the '/content/drive/MyDrive/데이콘/' root used by the finance and criminal-law cells — confirm.
%cd /content/drive/MyDrive/1데이콘/2025금융AIChallenge금융AI모델경쟁/dataset/민사법_데이터/
# print(os.getcwd())

# Archive extraction is left commented out in this cell.
# unzip('.')

/content/drive/MyDrive/1데이콘/2025금융AIChallenge금융AI모델경쟁/dataset/민사법_데이터


In [None]:
# Load the civil-law data under the current directory (requires load_concat to be defined).
민사법_json, 민사법_csv = load_concat('.')

In [None]:
# Flatten the nested civil-law JSON records into columns, then tag every row
# with its legal domain ('민사법' = civil law).
민사법_flatten = pd.json_normalize(민사법_json)
민사법_flatten['category'] = '민사법'

In [None]:
# 민사법_flatten.to_csv('/content/drive/MyDrive/1데이콘/2025금융AIChallenge금융AI모델경쟁/dataset/민사법_flatten.csv', index=False)

## 형사법 데이터

In [134]:
# Move to the criminal-law (형사법) dataset root, then extract every .zip archive found beneath it.
%cd /content/drive/MyDrive/데이콘/dataset/형사법_데이터/1.데이터

unzip('.')

/content/drive/MyDrive/데이콘/dataset/형사법_데이터/1.데이터
Extracting 1: ./Training/02.라벨링데이터/TL_법령_QA.zip → ./Training/02.라벨링데이터/TL_법령_QA
Extracting 2: ./Training/02.라벨링데이터/TL_해석례_SUM.zip → ./Training/02.라벨링데이터/TL_해석례_SUM
Extracting 3: ./Training/02.라벨링데이터/TL_해석례_QA.zip → ./Training/02.라벨링데이터/TL_해석례_QA
Extracting 4: ./Training/01.원천데이터/TS_법령.zip → ./Training/01.원천데이터/TS_법령
Extracting 5: ./Training/01.원천데이터/TS_해석례.zip → ./Training/01.원천데이터/TS_해석례
Extracting 6: ./Validation/01.원천데이터/VS_해석례.zip → ./Validation/01.원천데이터/VS_해석례
Extracting 7: ./Validation/01.원천데이터/VS_법령.zip → ./Validation/01.원천데이터/VS_법령
Extracting 8: ./Validation/02.라벨링데이터/VL_법령_QA.zip → ./Validation/02.라벨링데이터/VL_법령_QA
Extracting 9: ./Validation/02.라벨링데이터/VL_해석례_SUM.zip → ./Validation/02.라벨링데이터/VL_해석례_SUM
Extracting 10

In [178]:
# Inspect a single CSV file from the criminal-law training source data (TS_법령 folder).
path = '/content/drive/MyDrive/데이콘/dataset/형사법_데이터/1.데이터/Training/01.원천데이터/TS_법령/HS_B_000006.csv'
# The file is a CSV, so it is read with pandas rather than parsed as JSON.
# NOTE(review): the rendered table has an 'Unnamed: 0' column, which usually means the
# file's first column is a saved index — consider index_col=0 if that is the case.
data = pd.read_csv(path)
data

Unnamed: 0,법령일련번호,MST,구분,문장번호,내용
0,6,253317,조문,1,제1장 총칙
1,6,253317,조문,2,"제1조(목적) 이 법은 대한민국 국적 및 신분을 증명하는 여권(旅券)의 발급, 효력..."
2,6,253317,조문,3,제2조(여권의 소지) 외국을 여행하려는 국민은 이 법에 따라 발급된 여권을 소지하여...
3,6,253317,조문,4,제3조(발급권자) 여권은 외교부장관이 발급한다. <개정 2013.3.23>
4,6,253317,조문,5,제2장 여권의 종류 및 유효기간
...,...,...,...,...,...
197,6,253317,조문,198,제26조(벌칙) 다음 각 호의 어느 하나에 해당하는 사람은 1년 이하의 징역 또는 ...
198,6,253317,항,199,
199,6,253317,호,200,1. 제16조제4호(제14조제3항에 따라 준용되는 경우를 포함한다)를 위반하여 사용...
200,6,253317,호,201,2. 제16조제5호(제14조제3항에 따라 준용되는 경우를 포함한다)를 위반하여 채무...


In [205]:
# def pre_trained_source_data_preprocessing(dir = ','):
#   data_list = []
#   for root, dirs, files in os.walk(dir):
#     for file in files:
#       if file.endswith('.csv'):
#         df = pd.read_csv(os.path.join(root, file), encoding='utf-8-sig')

#         # A 그룹: 해석례일련번호 기준 (질의요지 / 회답 / 이유)
#         # df_a = df[df['법령일련번호'].notna() & df['해석례일련번호'].isna() & df['구분'].isin(['질의요지', '회답', '이유'])]

#         results_a = []

#         for key, group in df.groupby('MST'):
#           entry = {'MST': key}
#           for kind in group['구분'].unique():
#             texts = group[group['구분'] == kind].sort_values('법령일련번호')['내용'].dropna().tolist()
#             entry[kind] = "\n".join(texts)

#           results_a.append(entry)

#         grouped_df = (
#     data.dropna().groupby(['MST', '구분'])['내용']
#     .apply(lambda x: '\n'.join(x))
#     .reset_index()
#     .pivot(index='MST', columns='구분', values='내용')
#     .reset_index()
# )
# grouped_df.head()

#   return pd.DataFrame(results_a)

In [206]:
# grouped_df = (
#     data.dropna().groupby(['MST', '구분'])['내용']
#     .apply(lambda x: '\n'.join(x))
#     .reset_index()
#     .pivot(index='MST', columns='구분', values='내용')
#     .reset_index()
# )
# grouped_df.head()

In [208]:
# %cd /content/drive/MyDrive/데이콘/dataset/형사법_데이터/1.데이터/Training/01.원천데이터/TS_법령

# aa = pre_trained_source_data_preprocessing('.')
# aa

In [209]:
# labeling 데이터 전처리
def pre_trained_labeling_data_preprocessing(dir = '.'):
  """Flatten labeling JSON files under ``dir`` into one row per question.

  Walks ``dir``, reads every ``.json`` file (utf-8-sig) and unrolls its
  ``data`` -> ``paragraphs`` -> ``qas`` nesting. Each row holds the
  document-level fields, the paragraph context, the question, the answer
  text, the optional ``clue_text`` (``''`` when absent) and a ``category``
  column with the folder the file was found in.

  NOTE(review): the sample record printed after this cell in the notebook has
  top-level ``info`` / ``label`` keys rather than ``data``; a file with that
  shape would raise ``KeyError`` here — confirm which schema this function
  is meant to handle.

  Args:
    dir: Root directory to walk recursively. Defaults to the current directory.

  Returns:
    A DataFrame with one row per question/answer pair (empty if no JSON
    file is found).
  """
  collected = []
  for base_dir, _subdirs, names in os.walk(dir):
    for name in names:
      if not name.endswith('.json'):
        continue
      json_path = os.path.join(base_dir, name)
      with open(json_path, 'r', encoding='utf-8-sig') as stream:
        content = json.load(stream)

        for doc in content['data']:
          for para in doc['paragraphs']:
            para_context = para['context']
            for qa in para['qas']:
              qa_question = qa['question']
              qa_answer = qa['answer']['text']
              qa_clue = qa['answer'].get('clue_text', '')
              record = {
                'doc_id': doc['doc_id'],
                'doc_title': doc['doc_title'],
                'doc_source': doc['doc_source'],
                'doc_published': doc['doc_published'],
                'doc_created': doc['created'],
                'doc_class': doc['doc_class'].get('class',''),
                'context_id': para['context_id'],
                'context': para_context,
                'question': qa_question,
                'answer': qa_answer,
                'clue_text': qa_clue,
                'category': base_dir,
              }
              collected.append(record)

  return pd.DataFrame(collected)

{'info': {'lawClass': '02',
  'DocuType': '03',
  'interpreId': '311551',
  'agenda': '경찰청 - 「도로교통법」 제87조(개정된 정기적성검사기간의 소급적용 여부) 관련',
  'agendaNum': '07-0479',
  'interpreDate': '2008.02.01',
  'interpreMinCode': '1170000',
  'interpreMinName': '법제처',
  'questionMinCode': '1320000',
  'questionMinName': '경찰청 교통관리관 교통기획담당관',
  'fullText': 'N',
  'smClass': '',
  'sentenceType': '서술형'},
 'label': {'instruction': '질의에 대한 응답은 ‘해석 법령’을 포함하여 15어절 이상의 서술형으로 생성하시오.',
  'input': '정기적성검사기간과 운전면허증 갱신기간이 3개월에서 6개월로 연장되었을 때, 법 시행일 이전에 갱신기간이 만료된 사람에게도 개정된 법이 적용될 수 있나요?',
  'output': '개정된 도로교통법 제87조제1항 및 제3항에 따라 정기적성검사기간 또는 운전면허증 갱신기간이 연장되었는데, 해당 개정 사항은 법 시행일 이전에 갱신기간이 만료된 사람에게도 소급적용될 수 있습니다. 법령의 소급적용은 일반적으로 허용되지 않지만, 이번 개정은 국민의 불이익이나 고통을 완화하는 특별한 사정이 있는 경우에 해당하여 소급적용이 인정됩니다. 따라서 2008년 6월 22일 이전에 갱신기간이 이미 만료된 사람도 이 법의 혜택을 받을 수 있습니다.',
  'originwordCnt': '50',
  'labelwordCnt': '56'}}

In [None]:
# NOTE(review): unfinished draft. The body below has inconsistent indentation, so this
# cell does not parse as written, and it uses `df` and `results_a` without defining them.
# It also reuses the name of a function defined earlier in this notebook, so a working
# version would replace that earlier definition. The default `dir = ','` looks like a
# typo for '.' — confirm.
def pre_trained_labeling_data_preprocessing(dir = ','):

            df_grouped_a = pd.DataFrame(results_a)
        # Group B: keyed by 법령일련번호 (statute serial number); kinds 조문 / 항 / 호 (article / paragraph / item)
        df_b = df[df['해석례일련번호'].isna() & df['법령일련번호'].notna() & df['구분'].isin(['조문', '항', '호'])]

        results_b = []
        for key, group in df_b.groupby('법령일련번호'):
          entry = {'그룹기준': key}
          for kind in ['조문', '항', '호']:
            texts = group[group['구분'] == kind]['내용'].tolist()
            entry[kind] = "\n".join(texts)
            # NOTE(review): this append sits inside the `kind` loop, so the same entry
            # object is appended once per kind — confirm whether once per group was intended.
            results_b.append(entry)

            df_grouped_b = pd.DataFrame(results_b)

In [None]:
# Flatten the nested criminal-law JSON records into columns, then tag every row
# with its legal domain ('형사법' = criminal law).
# NOTE(review): 형사법_json is not assigned anywhere in the visible notebook — confirm where it comes from.
형사법_flatten = pd.json_normalize(형사법_json)
형사법_flatten['category'] = '형사법'

## 행정법

In [None]:
# Move to the administrative-law (행정법) dataset root.
%cd /content/drive/MyDrive/1데이콘/2025금융AIChallenge금융AI모델경쟁/dataset/행정법_데이터/

# Archive extraction is left commented out in this cell.
# unzip('.')

In [None]:
# Load the administrative-law data under the current directory (requires load_concat to be defined).
행정법_json, 행정법_csv = load_concat('.')

In [None]:
# Flatten the nested administrative-law JSON records into columns, then tag every
# row with its legal domain ('행정법' = administrative law).
행정법_flatten = pd.json_normalize(행정법_json)
행정법_flatten['category'] = '행정법'

### 행정법 CSV 데이터 그룹화

In [None]:
# Stack the objects held in 행정법_csv into a single frame with a fresh 0..n-1 index.
df = pd.concat(행정법_csv, ignore_index=True)

In [None]:
def _join_texts_by_kind(frame, key_col, kinds):
    """Build one record per ``key_col`` value with the 내용 texts joined per kind.

    Args:
        frame: DataFrame with ``key_col``, '구분' and '내용' columns.
        key_col: Column whose values define the groups.
        kinds: '구분' values to collect; each becomes a key of the record.

    Returns:
        A list of dicts shaped ``{'그룹기준': key, kind: "line1\\nline2", ...}``.
        A kind with no text in a group maps to ``''``.
    """
    records = []
    for key, group in frame.groupby(key_col):
        entry = {'그룹기준': key}
        for kind in kinds:
            # Blank 내용 cells are read from CSV as NaN (a float), which str.join
            # rejects with TypeError; drop them and coerce the rest to str.
            texts = group.loc[group['구분'] == kind, '내용'].dropna().astype(str).tolist()
            entry[kind] = "\n".join(texts)
        records.append(entry)
    return records


# Group A: keyed by 해석례일련번호 (interpretation-case serial number);
# kinds 질의요지 / 회답 / 이유 (query summary / answer / reasoning).
df_a = df[df['해석례일련번호'].notna() & df['구분'].isin(['질의요지', '회답', '이유'])]

results_a = _join_texts_by_kind(df_a, '해석례일련번호', ['질의요지', '회답', '이유'])
df_grouped_a = pd.DataFrame(results_a)


# Group B: keyed by 법령일련번호 (statute serial number), restricted to rows that
# have no 해석례일련번호; kinds 조문 / 항 / 호 (article / paragraph / item).
df_b = df[df['해석례일련번호'].isna() & df['법령일련번호'].notna() & df['구분'].isin(['조문', '항', '호'])]

results_b = _join_texts_by_kind(df_b, '법령일련번호', ['조문', '항', '호'])
df_grouped_b = pd.DataFrame(results_b)

In [None]:
# Stack the group-A and group-B tables into one frame with a fresh index. The two
# tables share only the '그룹기준' column, so each row is NaN in the other group's columns.
df_final = pd.concat([df_grouped_a, df_grouped_b], ignore_index=True)

## 데이터 결합

In [None]:
# Combine the flattened JSON tables of all four legal domains in a single concat
# (one pass instead of re-copying the accumulated frame for every pairwise step).
# NOTE(review): 금융법률_flatten, 금융법률_csv and 형사법_csv are not assigned anywhere in
# the visible notebook — confirm where they come from.
json_flatten = [금융법률_flatten, 민사법_flatten, 형사법_flatten, 행정법_flatten]
json_all = pd.concat(json_flatten, ignore_index=False)

# load_concat returns a *list* of DataFrames per domain, and pd.concat rejects a
# list nested inside its input, so flatten one level first. A bare DataFrame
# entry is accepted as well.
csv = [금융법률_csv, 민사법_csv, 형사법_csv, 행정법_csv]
csv_frames = []
for domain_frames in csv:
    if isinstance(domain_frames, (list, tuple)):
        csv_frames.extend(domain_frames)
    else:
        csv_frames.append(domain_frames)
csv_all = pd.concat(csv_frames, ignore_index=False)