In [1]:
# 기본 경로 설정
# ===============================================================
import os
import subprocess

PROJECT_NAME = "rag"

# Start from the kernel's working directory; local mode moves this up one level.
ROOT_DIR = os.getcwd()

# If google.colab can be imported we run in Colab mode, otherwise in local mode.
try:
    from google.colab import drive, userdata
except ModuleNotFoundError:
    IS_COLAB_MODE = False

    # Local run: the project root is the parent of the working directory.
    ROOT_DIR = os.path.abspath(os.path.join(ROOT_DIR, ".."))

    # Library switches that are only set for local runs.
    os.environ.update({
        "KMP_DUPLICATE_LIB_OK": "True",
        "TOKENIZERS_PARALLELISM": "false",
    })
    print("로컬 모드")

    # Local data layout: <ROOT_DIR>/data/{raw,pdf}; only DATA_DIR is created here.
    # NOTE(review): SAVE_DIR is only defined in Colab mode — confirm no later
    # cell needs it for local runs.
    DATA_DIR = os.path.join(ROOT_DIR, "data")
    RAW_DIR, PDF_DIR = (os.path.join(DATA_DIR, sub_dir) for sub_dir in ("raw", "pdf"))
    os.makedirs(DATA_DIR, exist_ok=True)
else:
    IS_COLAB_MODE = True
    print("코랩 모드")


# Secret / environment-variable lookup
def get_secret(key_name: str):
    """Return the secret stored under ``key_name``.

    In Colab mode the value is read with ``userdata.get``. Otherwise the
    ``.env`` file located in ``ROOT_DIR`` is loaded with python-dotenv and the
    value is read from the process environment.

    Args:
        key_name: Name of the secret / environment variable to look up.

    Returns:
        The stored value. In local mode this is ``None`` when the variable is
        not set.
    """
    if not IS_COLAB_MODE:
        # Imported lazily so python-dotenv is only required for local runs.
        from dotenv import load_dotenv

        env_file = os.path.join(ROOT_DIR, ".env")
        load_dotenv(dotenv_path=env_file)
        return os.getenv(key_name)

    return userdata.get(key_name)


if IS_COLAB_MODE:
    # Mount Google Drive and derive every data path from that same mount
    # point. Building the paths from the current working directory only works
    # while the working directory happens to be /content; from anywhere else
    # the paths would point at a plain local folder instead of the Drive.
    DRIVE_MOUNT_DIR = "/content/drive"
    drive.mount(DRIVE_MOUNT_DIR)

    # Shared team folder on the Drive: .../data/{raw,pdf}
    DATA_DIR = os.path.join(DRIVE_MOUNT_DIR, "MyDrive", "codeit-part3-team4", "data")
    RAW_DIR = os.path.join(DATA_DIR, "raw")
    PDF_DIR = os.path.join(DATA_DIR, "pdf")
    os.makedirs(DATA_DIR, exist_ok=True)

    # Per-project output directory: .../data/runs/<PROJECT_NAME>
    SAVE_DIR = os.path.join(DATA_DIR, "runs", PROJECT_NAME)
    os.makedirs(SAVE_DIR, exist_ok=True)


# Delete .DS_Store files under DATA_DIR.
# Implemented with os.walk instead of shelling out to `find`, so the cell also
# works on platforms where a POSIX `find` is not available (e.g. Windows).
def _remove_ds_store_files(root_dir):
    """Recursively delete every regular file named ``.DS_Store`` below ``root_dir``.

    Symbolic links are left untouched, and symlinked directories are not
    followed (the os.walk default).

    Args:
        root_dir: Directory whose tree is cleaned.

    Raises:
        OSError: If a directory cannot be listed or a file cannot be removed.
    """
    def _fail(error):
        # os.walk ignores listing errors by default; re-raise so the cell
        # still fails loudly when the tree cannot be traversed.
        raise error

    for dir_path, _dir_names, file_names in os.walk(root_dir, onerror=_fail):
        if ".DS_Store" not in file_names:
            continue
        target = os.path.join(dir_path, ".DS_Store")
        # Only plain files are removed; a symlink with that name is skipped.
        if os.path.isfile(target) and not os.path.islink(target):
            os.remove(target)


_remove_ds_store_files(DATA_DIR)

로컬 모드


In [2]:
import preprocess_v3 as pp
from preprocess_v3 import all_data_path, ALL_DATA

# data_list.csv & data_list.xlsx

In [3]:
from glob import glob

# Locate the CSV and XLSX listing files inside RAW_DIR.
# glob() returns its matches in arbitrary order, so the files are selected by
# extension rather than by their position in the result. Other entries in
# RAW_DIR are ignored.
_raw_entries = sorted(glob(os.path.join(RAW_DIR, "*")))
_csv_paths = [path for path in _raw_entries if path.lower().endswith(".csv")]
_xlsx_paths = [path for path in _raw_entries if path.lower().endswith(".xlsx")]

if len(_csv_paths) != 1 or len(_xlsx_paths) != 1:
    # Fail with an explicit message instead of silently picking a wrong file.
    raise ValueError(
        f"Expected exactly one .csv and one .xlsx file in {RAW_DIR!r}, "
        f"found csv={_csv_paths} xlsx={_xlsx_paths}"
    )

DATA_CSV_PATH, DATA_XLSX_PATH = _csv_paths[0], _xlsx_paths[0]

In [3]:
import pandas as pd

# Read the listing from both files (Excel first, then CSV).
xlsx_df, csv_df = pd.read_excel(DATA_XLSX_PATH), pd.read_csv(DATA_CSV_PATH)

# Element-wise equality mask: True where the CSV cell equals the XLSX cell.
# NOTE(review): with `==`, cells that are NaN in both frames count as
# different — keep that in mind when reading the match counts.
compare_df = (csv_df == xlsx_df)

### data_list.csv와 data_list.xlsx의 정보에 차이가 좀 있습니다.

### 나중에 제대로 들여다보겠습니다.

In [4]:
# For every column, print how many rows hold equal values in both files.
for column, is_equal in compare_df.items():
    print(column, is_equal.sum())

공고 번호 9
공고 차수 82
사업명 99
사업 금액 99
발주 기관 100
공개 일자 100
입찰 참여 시작일 74
입찰 참여 마감일 92
사업 요약 100
파일형식 100
파일명 100
텍스트 5


### xlsx, csv 파일 모두 중복 행이 없습니다.

In [5]:
# Number of fully duplicated rows, as (xlsx, csv).
tuple(int(frame.duplicated().sum()) for frame in (xlsx_df, csv_df))

(0, 0)

# pdf 파일들

In [4]:
import pdfplumber

from img2table.document import PDF
from img2table.ocr import TesseractOCR

from langchain_community.document_loaders import PyPDFLoader

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# All entries directly inside PDF_DIR. Sorted because glob() returns matches
# in arbitrary order; a fixed order keeps downstream processing reproducible.
PDF_PATH_LIST = sorted(glob(os.path.join(PDF_DIR, "*")))

### 표 데이터 수집

In [None]:
# tables_by_page = PDF(sample_pdf).extract_tables(
#     ocr=TesseractOCR(n_threads=1, lang="eng"),
#     min_confidence=1
# )

In [None]:
# TABLE_DICT = dict()

# for page_num, tables in tables_by_page.items():
#     if tables:
#         for i, table in enumerate(tables):
#             TABLE_DICT[page_num] = {
#                 "table": table.df,
#                 "table_title": table.title,
#                 "table_num": float(f"{page_num}.{i}")
#             }

### 텍스트 데이터 수집

In [6]:
import json

import re
from langchain_text_splitters import RecursiveCharacterTextSplitter

### 문서마다 I, 1 같은 번호 체계가 불규칙하니

### 목차 페이지에서 얻은 정보로 청킹하려 합니다.

### 목차 없는 문서는 잠시 보류하겠습니다.

In [8]:
index_pages_json_path = os.path.join(DATA_DIR, "01_index_pages.json")

# Load the index-page info; it is looked up by file name further below.
with open(index_pages_json_path, mode="r", encoding="utf-8") as index_pages_file:
    index_pages_dict = json.loads(index_pages_file.read())

In [19]:
section_words_json_path = os.path.join(DATA_DIR, "02_section_words.json")

# Load the table-of-contents entries; iterated as file name -> list of entries below.
with open(section_words_json_path, mode="r", encoding="utf-8") as section_words_file:
    section_words_dict = json.loads(section_words_file.read())

In [20]:
# section_words.json -> text_metadata.json
#
# For each document: parse its table-of-contents entries, map every entry to a
# page of the loaded PDF, and record which section(s) each page belongs to.

# One TOC entry: "<section title> <page number>" (title, whitespace, trailing digits).
pattern = re.compile(r'^(?P<section>.+?)\s+(?P<page_label>\d+)$')

metadata_dict = dict()

for file_name, section_word_list in section_words_dict.items():
    # index_pages = index_pages_dict[file_name]["index_page_label"]
    # presumably the 1-based PDF page on which printed page 1 falls — TODO confirm
    start_page_label = index_pages_dict[file_name]["start_page_label"]

    results = []

    for section_word in section_word_list:
        m = pattern.match(section_word.strip())
        if not m:
            continue  # skip entries that do not end in a page number

        section = m.group('section')
        page_label = int(m.group('page_label'))

        # Remove dot leaders (runs of two or more "·" or "."), if present.
        section = re.sub(r'[·\.]{2,}', '', section).strip()

        results.append({
            "section": section,
            # Shift the printed page number by the document's start offset.
            "page_label": page_label + start_page_label - 1
            })


    loader = PyPDFLoader(os.path.join(RAW_DIR, "files", file_name))
    docs = loader.load()
    # One (initially empty) section list per loaded Document, keyed by its
    # 0-based position (presumably one Document per PDF page — confirm for the
    # installed loader version).
    doc_metadata_dict = {i: [] for i in range(len(docs))}

    for dict_ in results:
        section = dict_["section"]
        page_label = dict_["page_label"]

        page_index = page_label - 1
        if page_index not in doc_metadata_dict:
            # A mis-parsed or out-of-range TOC page number would otherwise
            # raise KeyError and abort the run for every remaining document.
            print(
                f"[skip] {file_name}: section {section!r} points to page "
                f"{page_label}, but only {len(docs)} pages were loaded"
            )
            continue

        doc_metadata_dict[page_index].append(section)

    # Forward-fill: a page without its own TOC entry inherits the sections of
    # the closest preceding page that has one.
    # NOTE(review): pages before the first section receive "" (a str) while all
    # other pages hold a list — kept as-is so the written JSON does not change;
    # confirm that consumers handle both types.
    carried_sections = ""
    for page_num, page_sections in doc_metadata_dict.items():
        if page_sections:
            carried_sections = page_sections
        else:
            doc_metadata_dict[page_num] = carried_sections

    metadata_dict[file_name] = doc_metadata_dict

Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 227 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)


In [21]:
text_metadata_path = os.path.join(DATA_DIR, "03_text_metadata.json")

# Persist the per-page section metadata as indented UTF-8 JSON;
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open(text_metadata_path, mode="w", encoding="utf-8") as metadata_file:
    json.dump(metadata_dict, fp=metadata_file, indent=2, ensure_ascii=False)