In [41]:
# 기본 경로 설정
# ===============================================================
import os
import subprocess

PROJECT_NAME = "rag"

# Start from the kernel's working directory; the local branch below moves it up one level.
ROOT_DIR = os.getcwd()

try:
    # A failed import of google.colab is treated as "running locally".
    from google.colab import drive, userdata
except ModuleNotFoundError:
    IS_COLAB_MODE = False

    # Local run: the project root is the parent of the working directory.
    ROOT_DIR = os.path.abspath(os.path.join(ROOT_DIR, ".."))
    os.environ.update(
        {
            "KMP_DUPLICATE_LIB_OK": "True",
            "TOKENIZERS_PARALLELISM": "false",
        }
    )
    print("로컬 모드")

    # Data folders live under <ROOT_DIR>/data; only the top folder is created here.
    DATA_DIR = os.path.join(ROOT_DIR, "data")
    RAW_DIR, PDF_DIR = (os.path.join(DATA_DIR, sub) for sub in ("raw", "pdf"))
    os.makedirs(DATA_DIR, exist_ok=True)
else:
    IS_COLAB_MODE = True
    print("코랩 모드")


# Secret / environment-variable lookup
def get_secret(key_name: str):
    """Return the secret stored under ``key_name``.

    In Colab mode the value is fetched with ``userdata.get``. Otherwise the
    ``.env`` file located in ``ROOT_DIR`` is loaded with python-dotenv and the
    value is read from the process environment (``None`` when it is not set).
    """
    if not IS_COLAB_MODE:
        # Imported here rather than at the top, so it is only needed on the local path.
        from dotenv import load_dotenv

        env_file = os.path.join(ROOT_DIR, ".env")
        load_dotenv(dotenv_path=env_file)
        return os.getenv(key_name)

    return userdata.get(key_name)


if IS_COLAB_MODE:
    # Mount Google Drive. The paths below assume ROOT_DIR is the directory
    # that contains the "drive" mount point — TODO confirm.
    drive.mount('/content/drive')

    DATA_DIR = os.path.join(ROOT_DIR, "drive", "MyDrive", "codeit-part3-team4", "data")
    RAW_DIR, PDF_DIR = (os.path.join(DATA_DIR, sub) for sub in ("raw", "pdf"))
    SAVE_DIR = os.path.join(DATA_DIR, "runs", PROJECT_NAME)

    # Create the data folder and the per-project run folder if they are missing.
    for directory in (DATA_DIR, SAVE_DIR):
        os.makedirs(directory, exist_ok=True)


# Delete .DS_Store files (macOS Finder metadata) anywhere under DATA_DIR.
# Done with os.walk instead of the Unix `find` command so the cell also runs
# on systems where `find` is missing or is a different program (e.g. Windows).
for current_dir, _subdirs, file_names in os.walk(DATA_DIR):
    if ".DS_Store" not in file_names:
        continue
    ds_store_path = os.path.join(current_dir, ".DS_Store")
    # Mirror `find -type f`: only remove regular files, never symlinks.
    if os.path.isfile(ds_store_path) and not os.path.islink(ds_store_path):
        os.remove(ds_store_path)

로컬 모드


In [42]:
from glob import glob

# glob() returns paths in arbitrary, filesystem-dependent order, so pick the
# raw files by extension instead of by their position in the listing (the
# positional unpack also broke whenever RAW_DIR did not hold exactly 3 entries).
_raw_csv_paths = sorted(glob(os.path.join(RAW_DIR, "*.csv")))
_raw_xlsx_paths = sorted(glob(os.path.join(RAW_DIR, "*.xlsx")))
if not _raw_csv_paths or not _raw_xlsx_paths:
    raise FileNotFoundError(f"Expected a .csv and an .xlsx file in {RAW_DIR}")

# If several files share an extension, the alphabetically first one is used.
DATA_CSV_PATH = _raw_csv_paths[0]
DATA_XLSX_PATH = _raw_xlsx_paths[0]

In [43]:
import pandas as pd

# Load the two raw files (presumably two exports of the same table — the
# comparison below requires identical row and column labels).
xlsx_df = pd.read_excel(DATA_XLSX_PATH)
csv_df = pd.read_csv(DATA_CSV_PATH)

# Cell-by-cell equality. `==` alone evaluates NaN == NaN as False, so a cell
# that is missing in both files would be counted as a mismatch; treat two
# missing values in the same position as equal.
both_missing = csv_df.isna() & xlsx_df.isna()
compare_df = (csv_df == xlsx_df) | both_missing

In [44]:
# Number of fully duplicated rows in each frame, shown as (xlsx, csv).
int(xlsx_df.duplicated().sum()), int(csv_df.duplicated().sum())

(0, 0)

In [45]:
# For every column, print its name and the number of True cells in compare_df.
for column_name, matches in compare_df.items():
    print(column_name, matches.sum())

공고 번호 9
공고 차수 82
사업명 99
사업 금액 99
발주 기관 100
공개 일자 100
입찰 참여 시작일 74
입찰 참여 마감일 92
사업 요약 100
파일형식 100
파일명 100
텍스트 5


In [46]:
# glob() yields paths in arbitrary order; sort so that PDF_PATH_LIST[0] (used
# as the sample document) is the same file on every machine and every run.
PDF_PATH_LIST = sorted(glob(os.path.join(PDF_DIR, "*")))

In [47]:
# Display the first collected path.
PDF_PATH_LIST[0]

'/Users/won/dev/00_codeit/0_mission/200_DL_RAG/data/pdf/사단법인 보험개발원_실손보험 청구 전산화 시스템 구축 사업.pdf'

In [48]:
import pdfplumber

from img2table.document import PDF
from img2table.ocr import TesseractOCR

In [49]:
from langchain_community.document_loaders import PyPDFLoader

In [50]:
# Use the first path in PDF_PATH_LIST as the sample document.
sample_pdf = PDF_PATH_LIST[0]
sample_pdf  # bare expression: the notebook displays the chosen path

'/Users/won/dev/00_codeit/0_mission/200_DL_RAG/data/pdf/사단법인 보험개발원_실손보험 청구 전산화 시스템 구축 사업.pdf'

In [53]:
# Extract tables from the sample PDF with img2table, using Tesseract for OCR.
# NOTE(review): the OCR language is "eng" although the sample file name is
# Korean — confirm that this is intended.
sample_document = PDF(sample_pdf)
ocr_engine = TesseractOCR(n_threads=1, lang="eng")
tables_by_page = sample_document.extract_tables(ocr=ocr_engine, min_confidence=1)

tesseract 5.5.2
 leptonica-1.87.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.1.3) : libpng 1.6.54 : libtiff 4.7.1 : zlib 1.2.12 : libwebp 1.6.0 : libopenjp2 2.5.4
 Found NEON
 Found libarchive 3.8.5 zlib/1.2.12 liblzma/5.8.2 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.7 expat/expat_2.7.1 CommonCrypto/system libb2/system
 Found libcurl/8.7.1 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.67.1


In [86]:
# Collect the extracted tables per page.
#
# Each page key now maps to a LIST of table records. Previously the record was
# assigned to dict_[page_num] inside the inner loop, so every table on a page
# overwrote the one before it and only the last table survived.
dict_ = {}

for page_num, tables in tables_by_page.items():
    # Pages without any detected table get no entry.
    if not tables:
        continue

    dict_[page_num] = [
        {
            "table": table.df,
            "table_title": table.title,
            # "<page>.<index>" parsed as a float.
            # NOTE(review): as a float, index 10 collapses onto index 1
            # (e.g. 3.10 == 3.1); switch to a string id if a page can hold
            # ten or more tables.
            "table_num": float(f"{page_num}.{i}"),
        }
        for i, table in enumerate(tables)
    ]

## 텍스트 데이터

In [147]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import Optional, Tuple
import json
import re

In [110]:
# Only the sample PDF is loaded here. Processing every file would mean looping
# over PDF_PATH_LIST and creating one PyPDFLoader per path.
# (The loop is intentionally not run in this cell.)

loader = PyPDFLoader(sample_pdf)
docs = loader.load()

Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 372 0 (offset 0)


In [180]:
# Location of the JSON index inside the data directory.
index_json_path = os.path.join(DATA_DIR, "index.json")

# Regular-expression separators handed to the splitter: a newline, or a run of
# whitespace at least two characters long.
chapter_separators = [r"\n", r"\s+\s+"]

# Splitter with chunk_size=30 and no overlap. keep_separator="start" should
# keep each matched separator at the start of the piece that follows it —
# confirm against the installed langchain-text-splitters version.
chapter_splitter = RecursiveCharacterTextSplitter(
    separators=chapter_separators,
    is_separator_regex=True,
    keep_separator="start",
    chunk_size=30,
    chunk_overlap=0,
)

In [181]:
# Regular expressions that mark a (stripped) line as noise.
NOISE_PATTERNS = [
    r"^목\s*$",                        # a lone "목"
    r"^목(\s+목)+$",                   # "목" repeated, separated by whitespace
    r"^차(\s*차)+$",                   # two or more "차", optionally spaced
    r"^[-=]{5,}$",                     # a rule made of 5+ "-" / "=" characters
    r"Ignoring wrong pointing object", # PDF parser warning text
]

def is_noise(line: str) -> bool:
    """Tell whether ``line`` should be discarded as noise.

    The line is stripped first. An empty result is noise; otherwise the line
    is noise when any entry of ``NOISE_PATTERNS`` is found in it.
    """
    stripped = line.strip()
    if stripped == "":
        return True

    for pattern in NOISE_PATTERNS:
        if re.search(pattern, stripped) is not None:
            return True
    return False

In [179]:
# Read the JSON file at index_json_path into index_dict.
with open(index_json_path, mode="r", encoding="utf-8") as index_file:
    index_dict = json.load(index_file)

In [186]:
# Reload the sample PDF; the loaded result is indexed by position below.
loader = PyPDFLoader(sample_pdf)
docs = loader.load()

# Show the text of the entry at index 10, split into one list element per line.
docs[10].page_content.splitlines()

Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 372 0 (offset 0)


['- 9 -',
 ' <그림3>모바일 앱을 이용한 보험금 청구 개요    ※ 환자는 요양기관으로부터 의료비 증명서를 발급받은 후 해당 서류를 스마트폰 카메라로 촬영하여 이미지 파일로 저장   o 요양기관 단말기를 이용하는 방식은 보험가입자가 요양기관 단말에서청구 필요서류를 선택하여 청구할 보험사에 바로 서류를 전송함   - 요양기관과 보험사 정보시스템을 서로 연결하는 무인단말기를 설치하고, 환자가 해당 단말기를 이용해 자신의 의료비 증명서류 등을 발급받거나, 전자문서로 보험사에 송부할 수 있음   - 대형병원 및 무인단말기 사업자와 제휴하여 청구 서비스를 제공 중',
 '<그림4>무인단말기를 이용한 보험금 청구 개요']

In [None]:
# Preview the splitter output for the first five files whose page list is non-empty.
count = 0

for file_name, index_page_list in index_dict.items():
    # Files with an empty page list are skipped and do not count towards the limit.
    if not index_page_list:
        continue

    # First and last entries bound the page slice (end is inclusive).
    start, end = index_page_list[0], index_page_list[-1]

    pdf_path = os.path.join(PDF_DIR, file_name)
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()[start : end + 1]

    index = chapter_splitter.split_documents(docs)

    # Print every chunk, its noise verdict, and a separator line.
    for chunk in index:
        chunk_text = chunk.page_content.strip()
        print(chunk_text)
        print(is_noise(chunk_text))
        print("-"*50)

    print("=" * 50)

    count += 1
    if count == 5:
        break

Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 372 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 106 0 (offset 0)


목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   목   차차차차차차차차차차차차차차차차차차차차차차차차차차차차차차차차차목   차Ⅰ. 사업 개요   1. 개요 ···························································································· 2   2. 추진배경 및 필요성··································································· 2   3. 실손보험 청구 전산화 시스템 개요······································· 3   4. 사업 수행 대상·········································································· 4   5. 기대효과······················································································ 5Ⅱ. 업무 현황   1. 실손보험 청구 현황··································································· 6   2. 문제점··························································································10Ⅲ. 사업추진체계 및 일정   1. 추진 방향····················································································11   2. 사업추진체계································································

Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 92 0 (offset 0)


목   차 Ⅰ. 사업 개요······································································ 1 Ⅱ. 사업추진 방안······························································ 3 Ⅲ. 제안 요청내용······························································ 7   󰊱 요구사항 구성···································································· 7      󰊲 요구사항 목록···································································· 8      󰊳 요구사항 상세·····································································10        · 기능 요구사항 ·········································································· 10             · 성능 요구사항 ············································································ 18            · 인터페이스 요구사항 ······························································· 20             · 데이터 요구사항 ······································································ 22             · 테스트 요구사항 ······································································ 27             · 보안 요구사항 ······

Ignoring wrong pointing object 182 0 (offset 0)
Ignoring wrong pointing object 215 0 (offset 0)
Ignoring wrong pointing object 236 0 (offset 0)


목    차1. 사업 개요···················································································································· 11.1. 개 요 ························································································································································ 11.2. 추진배경 및 필요성 ······························································································································· 11.3. 사업범위 ··················································································································································· 21.4. 주요사업내용 ··········································································································································· 21.5. 시스템 구성현황 ····································································································································· 31.6. 기대효과 ·························································································

Ignoring wrong pointing object 25 0 (offset 0)


-목   차 -Ⅰ. 사업개요     1. 개    요 --------------------------------------------------------------------1    2. 사업목적 --------------------------------------------------------------------1    3. 사업범위 --------------------------------------------------------------------1    4. 기대효과 --------------------------------------------------------------------1Ⅱ. 사업 추진방향     1. 추진방향 --------------------------------------------------------------------2    2. 추진체계 --------------------------------------------------------------------2    3. 소프트웨어 개발 사업의 적정 사업기간 종합산정서 --------------------4Ⅲ. 시스템현황     1. 경영정보시스템현황 --------------------------------------------------------5    2. 시스템 구성도 --------------------------------------------------------------5    3. 관련 시스템 정보 ----------------------------------------------------------6Ⅳ. 제안요청 내용     1. 요구사항 목록 --------------------------------------------------------------7    2. 요구사항 세부내용 ---------------------------------------------------------8Ⅴ. 제안일반사항     1. 제안일반사항 -----

In [None]:
# Replace each document's metadata with a compact {"page", "title"} record.
for i, doc in enumerate(docs):
    # Already rewritten by an earlier run of this cell ("source" is gone and
    # "title" is present): skip, so re-running the cell does not raise KeyError.
    if "source" not in doc.metadata and "title" in doc.metadata:
        continue

    # Title = file name without its directory and without the final extension.
    # basename/splitext keep dots that are part of the name
    # ("v1.2 spec.pdf" -> "v1.2 spec") and use the OS path separator instead
    # of assuming "/".
    source_name = os.path.basename(doc.metadata["source"])
    doc.metadata = {
        "page": i,  # position of the document within `docs`
        "title": os.path.splitext(source_name)[0],
    }