In [1]:
# 기본 경로 설정
# ===============================================================
import os
import subprocess

PROJECT_NAME = "rag"

# Start from the kernel's working directory; local mode moves this up one level below.
ROOT_DIR = os.getcwd()

# The Colab-only package doubles as the environment probe:
# if it imports we are on Colab, otherwise we are running locally.
try:
    from google.colab import drive, userdata
except ModuleNotFoundError:
    IS_COLAB_MODE = False
else:
    IS_COLAB_MODE = True

if IS_COLAB_MODE:
    print("코랩 모드")
else:
    # Local run: the project root is the parent of the working directory.
    ROOT_DIR = os.path.abspath(os.path.join(ROOT_DIR, ".."))

    # Process-wide environment settings that are applied only in local mode.
    os.environ.update({
        "KMP_DUPLICATE_LIB_OK": "True",
        "TOKENIZERS_PARALLELISM": "false",
    })
    print("로컬 모드")

    # Local data layout: <ROOT_DIR>/data with "raw" and "pdf" sub-directories.
    # Only DATA_DIR itself is created here.
    DATA_DIR = os.path.join(ROOT_DIR, "data")
    RAW_DIR, PDF_DIR = (os.path.join(DATA_DIR, leaf) for leaf in ("raw", "pdf"))
    os.makedirs(DATA_DIR, exist_ok=True)


# Secret / environment-variable lookup
def get_secret(key_name: str):
    """Return the secret stored under ``key_name`` for the current runtime.

    In Colab mode the value is read with ``userdata.get``. Otherwise the
    ``.env`` file located in ``ROOT_DIR`` is loaded with python-dotenv and the
    value is read from the process environment.

    Args:
        key_name: Name of the secret / environment variable to look up.

    Returns:
        Whatever ``userdata.get`` returns in Colab mode; otherwise the result
        of ``os.getenv`` (``None`` when the variable is not set).
    """
    if not IS_COLAB_MODE:
        # Imported lazily so python-dotenv is only required outside Colab.
        from dotenv import load_dotenv

        env_file = os.path.join(ROOT_DIR, ".env")
        load_dotenv(dotenv_path=env_file)
        return os.getenv(key_name)

    return userdata.get(key_name)


if IS_COLAB_MODE:
    # Mount Google Drive and anchor every data path to that same mount point.
    # Building the paths from os.getcwd() would only point into Drive when the
    # kernel's working directory happens to be /content.
    DRIVE_MOUNT_DIR = "/content/drive"
    drive.mount(DRIVE_MOUNT_DIR)

    DATA_DIR = os.path.join(DRIVE_MOUNT_DIR, "MyDrive", "codeit-part3-team4", "data")
    RAW_DIR = os.path.join(DATA_DIR, "raw")
    PDF_DIR = os.path.join(DATA_DIR, "pdf")
    os.makedirs(DATA_DIR, exist_ok=True)

    # Per-project output directory. Within this cell SAVE_DIR is defined only
    # in this Colab branch.
    SAVE_DIR = os.path.join(DATA_DIR, "runs", PROJECT_NAME)
    os.makedirs(SAVE_DIR, exist_ok=True)


# Delete .DS_Store files anywhere under DATA_DIR.
# Implemented with os.walk rather than shelling out to `find`, so the cell does
# not depend on a Unix `find` binary being available on PATH.
for _dirpath, _dirnames, _filenames in os.walk(DATA_DIR):
    if ".DS_Store" not in _filenames:
        continue

    _ds_store_path = os.path.join(_dirpath, ".DS_Store")
    # Same selection as `find -type f`: regular files only, never symlinks.
    if os.path.isfile(_ds_store_path) and not os.path.islink(_ds_store_path):
        os.remove(_ds_store_path)

로컬 모드


In [28]:
import json
from glob import glob

# 표
from img2table.document import PDF
from img2table.ocr import TesseractOCR
import pandas as pd

# 텍스트
from langchain_community.document_loaders import PyPDFLoader

In [3]:
# Load metadata.json from DATA_DIR into metadata_dict.
metadata_json_path = os.path.join(DATA_DIR, "metadata.json")

with open(metadata_json_path, "r", encoding="utf-8") as f:
    metadata_dict = json.load(f)

# glob() returns matches in arbitrary order, so sort the result to make
# PDF_PATH_LIST (and therefore the sample picked below) the same on every run.
PDF_PATH_LIST = sorted(glob(os.path.join(PDF_DIR, "*")))

# Fail with an explicit message instead of a bare IndexError on an empty folder.
if not PDF_PATH_LIST:
    raise FileNotFoundError(f"No files found in PDF_DIR: {PDF_DIR}")

sample_pdf = PDF_PATH_LIST[0]

## 표

In [33]:
def trim_table(df: pd.DataFrame) -> pd.DataFrame:
    """Promote the first row of ``df`` to the column header.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Table whose first row holds the column names.

    Returns:
        A new DataFrame containing every row after the first, re-indexed from
        0, with the first row's values as its columns. A frame with no rows is
        returned as an unchanged copy.
    """
    # Nothing to promote when there are no rows (df.iloc[0] would raise).
    if len(df) == 0:
        return df.copy()

    header_row = df.iloc[0]
    # reset_index returns a new object, so assigning columns below cannot
    # write through to the caller's frame.
    body = df.iloc[1:].reset_index(drop=True)
    body.columns = header_row

    return body

In [34]:
# Extract tables from the sample PDF with img2table, using Tesseract for OCR.
# NOTE(review): the OCR language is "eng" although the sample document shown in
# this notebook contains Korean text - confirm this is intended.
ocr_engine = TesseractOCR(n_threads=1, lang="eng")

tables_by_page = PDF(sample_pdf).extract_tables(ocr=ocr_engine, min_confidence=1)

tesseract 5.5.2
 leptonica-1.87.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.1.3) : libpng 1.6.54 : libtiff 4.7.1 : zlib 1.2.12 : libwebp 1.6.0 : libopenjp2 2.5.4
 Found NEON
 Found libarchive 3.8.5 zlib/1.2.12 liblzma/5.8.2 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.7 expat/expat_2.7.1 CommonCrypto/system libb2/system
 Found libcurl/8.7.1 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.67.1


In [None]:
# Collect the extracted tables under the same keys as tables_by_page.
# Keys whose table list is empty get no entry at all.
TABLE_DICT = {}

for page_num, tables in tables_by_page.items():
    if tables:
        page_entries = []

        for table_idx, table in enumerate(tables):
            # NOTE: the id is a float parsed from "<page>.<index>", so indexes
            # 1 and 10 under the same key both parse to the same value.
            table_num = float(f"{page_num}.{table_idx}")

            # First row becomes the header; the row index is not kept.
            content = trim_table(table.df).to_dict(orient="split")
            content.pop("index", None)

            page_entries.append({
                "table_title": table.title,
                "table_num": table_num,
                "table_content": content,
            })

        TABLE_DICT[page_num] = page_entries

  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).to_dict(orient="split")
  "table_content": trim_table(table.df).

## 텍스트

In [6]:
# Load the sample PDF with LangChain's PyPDFLoader.
# `docs` is whatever loader.load() returns; later cells iterate over it and
# read/replace each item's `metadata`.
loader = PyPDFLoader(sample_pdf)
docs = loader.load()

Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 372 0 (offset 0)


In [None]:
# Replace each document's loader metadata with the project's own schema:
# page, page_label, title (file name), section and the tables found on the page.
for doc in docs:
    page = doc.metadata["page"]
    page_label = doc.metadata["page_label"]

    # On the first pass the file name comes from the loader's "source" path.
    # That key is dropped by the rewrite below, so when this cell is re-run,
    # reuse the "title" stored by the earlier pass instead of raising KeyError.
    if "source" in doc.metadata:
        title = os.path.basename(doc.metadata["source"])
    else:
        title = doc.metadata["title"]

    # metadata_dict is indexed by file name, then by the page number as a string.
    section = metadata_dict[title][str(page)]

    # None when TABLE_DICT has no entry for this page.
    table = TABLE_DICT.get(page)

    doc.metadata = {
        "page": page,
        "page_label": page_label,
        "title": title,
        "section": section,
        "table": table
    }

In [8]:
# Preview only the first few documents; displaying the whole list floods the
# notebook output with every page's full text.
docs[:3]

[Document(metadata={'page': 0, 'page_label': '1', 'title': '사단법인 보험개발원_실손보험 청구 전산화 시스템 구축 사업.pdf', 'section': '', 'table': None}, page_content='실손보험 청구 전산화 시스템 구축제안요청서>\n2024. 3'),
 Document(metadata={'page': 1, 'page_label': '2', 'title': '사단법인 보험개발원_실손보험 청구 전산화 시스템 구축 사업.pdf', 'section': '', 'table': None}, page_content='목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   차차차차차차차차차차차차차차차차차차차차차차차차차차차차차차차차차목   차Ⅰ. 사업 개요   1. 개요 ···························································································· 2   2. 추진배경 및 필요성··································································· 2   3. 실손보험 청구 전산화 시스템 개요······································· 3   4. 사업 수행 대상·········································································· 4   5. 기대효과································································