In [None]:
# Add PyMuPDF (imported below as `fitz`) to the uv-managed project environment.
!uv add PyMuPDF

In [10]:
import json
import os
import re
from pathlib import PurePath
from pydoc import doc
from turtle import title
from typing import Literal, Optional, Union
from xml.dom.minidom import Document

import fitz # PyMuPDF
from langchain_community.document_loaders import PyPDFLoader
# NOTE: must stay below the xml.dom.minidom import so that `Document` resolves
# to the LangChain document class the loader is expected to return.
from langchain_core.documents import Document
from langchain_text_splitters import CharacterTextSplitter
from regex import D
from sqlalchemy import false

# Path of the sample PDF that the demo cell below passes to HiPDFLoader.
file_path = "./docs/국가계약법_시행규칙.pdf"

class HiPDFLoader(PyPDFLoader):
    """PDF loader that drops page headers/footers before building documents.

    Text is extracted with PyMuPDF (``fitz``) from a clip rectangle that
    excludes a fixed band at the top and at the bottom of every page. Each
    page becomes one ``Document``; ``load(split=True)`` additionally cuts
    every page into overlapping character chunks.
    """

    # Default heading patterns: chapter lines ("제N장 ...") and
    # article lines ("제N조(...)"). Kept as a tuple so the default can never
    # be mutated through an instance.
    _DEFAULT_SEPARATORS = (r'^제[1-9][0-9]*장 .+$', r'^제[1-9][0-9]*조\(.+\)$')

    def __init__(
        self,
        file_path: Union[str, PurePath],
        separators: Optional[list[str]] = None,
        header_margin=50,
        footer_margin=50) -> None:
        """Create a loader for ``file_path``.

        Args:
            file_path: Location of the PDF file.
            separators: Regular expressions describing heading lines. ``None``
                selects the built-in chapter/article patterns. The list is
                copied, so later changes by the caller do not leak in.
            header_margin: Height of the band removed from the top of each
                page, in PyMuPDF page coordinates.
            footer_margin: Height of the band removed from the bottom of each
                page, in PyMuPDF page coordinates.
        """
        super().__init__(file_path=file_path)
        self.file_path = file_path
        self.header_margin = header_margin
        self.footer_margin = footer_margin
        self.separators = list(
            self._DEFAULT_SEPARATORS if separators is None else separators
        )

    def load(self, split: bool = False) -> list[Document]:
        """Return one document per page, or per chunk when ``split`` is true."""
        page_docs = self.trim_header_footer()
        return self.split(page_docs) if split else page_docs

    def split(self, docs: list[Document]) -> list[Document]:
        """Cut every document into newline-separated chunks.

        Args:
            docs: Page documents whose metadata holds ``source``, ``page``
                and ``total_pages`` (as produced by ``trim_header_footer``).

        Returns:
            A flat list of chunk documents in page order. Each chunk carries
            the page's ``source``/``page``/``total_pages`` plus ``chunk``,
            its 0-based index within that page.

        Raises:
            KeyError: If a document lacks one of the required metadata keys.
        """
        # The splitter is stateless with respect to its input, so build it once.
        splitter = CharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separator="\n",
        )
        chunks = []
        for doc in docs:
            for index, piece in enumerate(splitter.split_text(doc.page_content)):
                chunks.append(
                    Document(
                        page_content=piece,
                        metadata={
                            "source": doc.metadata['source'],
                            "page": doc.metadata['page'],
                            "total_pages": doc.metadata['total_pages'],
                            "chunk": index,
                        },
                    )
                )
        return chunks

    def trim_header_footer(self) -> list[Document]:
        """Extract the body text of every page, without header/footer bands.

        Returns:
            One document per page. ``metadata`` contains the 0-based ``page``
            index, ``source`` (the configured file path), ``title`` (first
            text line of the first page, or the file name when that page has
            no text), ``total_pages`` and ``jang``.
        """
        docs = []
        # Fallback title; replaced below when the first page has body text.
        title = os.path.basename(self.file_path)

        # The context manager closes the PDF handle even if extraction fails.
        # os.fspath() turns a PurePath into the plain string fitz expects.
        with fitz.open(os.fspath(self.file_path)) as pdf:
            total_pages = len(pdf)
            for page in pdf:
                page_rect = page.rect
                # Body area = full page width, minus the header band at the
                # top and the footer band at the bottom.
                # fitz.Rect(x0, y0, x1, y1): top-left (x0, y0), bottom-right (x1, y1).
                content_box = fitz.Rect(
                    0,
                    self.header_margin,
                    page_rect.width,
                    page_rect.height - self.footer_margin,
                )
                # Only text inside the clip rectangle is returned.
                text = page.get_text('text', clip=content_box).strip()

                # Guard on `text`: ''.split('\n') yields [''], which would
                # otherwise overwrite the file-name fallback with ''.
                if page.number == 0 and text:
                    title = text.split('\n', 1)[0]

                docs.append(
                    Document(
                        page_content=text,
                        metadata={
                            "page": page.number,
                            'source': self.file_path,
                            'title': title,
                            'total_pages': total_pages,
                            # NOTE(review): heading detection based on
                            # self.separators is not wired in yet; the key is
                            # kept as an empty placeholder for consumers.
                            'jang': [],
                        },
                    )
                )

        return docs


In [None]:
# Build the loader for the sample PDF and load one document per page
# (load() defaults to split=False, so no chunking happens here).
loader = HiPDFLoader(file_path)
docs = loader.load()
# docs
# # --- Logic for extracting the first line of the body (first body) ---
# full_text = '\n\n'.join([d.page_content for d in docs]) if docs else ""

# --- end ---
# Inspect the first page; a notebook cell only echoes its last expression.
docs[0].metadata
docs[0].page_content

{'page': 0,
 'source': './docs/국가계약법_시행규칙.pdf',
 'title': '국가를 당사자로 하는 계약에 관한 법률 시행규칙 ( 약칭: 국가계약법 시행규칙 )',
 'total_pages': 24,
 'jang': []}