In [1]:
from glob import glob

# Show every PDF matched under data/ so the inputs used later are visible.
for pdf_path in glob('data/*.pdf'):
    print(pdf_path)

data\국방전력발전업무훈령(국방부훈령)(제3007호)(20250111).pdf
data\방위사업법(법률)(제20807호)(20250619).pdf


In [2]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def read_pdf_and_split_text(pdf_path, chunk_size=1000, chunk_overlap=100):
    """
    Load a PDF file and split its contents into text chunks.

    Args:
        pdf_path (str): Path of the PDF file to read.
        chunk_size (int, optional): Size of each text chunk. Defaults to 1000.
        chunk_overlap (int, optional): Overlap between consecutive chunks.
            Defaults to 100.

    Returns:
        list: The split chunks, as returned by
        RecursiveCharacterTextSplitter.split_documents.
    """
    # Banner so the per-file output is easy to tell apart in the cell log.
    print(f"PDF: {pdf_path}---------------------------------------------")

    # Load first, then build the splitter, in the same order as before.
    loaded_docs = PyMuPDFLoader(pdf_path).load()

    chunks = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(loaded_docs)

    print(f"Number of splits:{len(chunks)}\n")
    return chunks

In [None]:
import os

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Embedding model used both to build the store and to embed queries against it.
# Runs on CPU and asks the encoder for normalized embeddings.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# Directory where the Chroma store is persisted; its existence decides whether
# we reload the existing index or build a new one from the PDFs.
persist_directory = 'chroma_store'

if os.path.exists(persist_directory):
    print("Loading existing Chroma store")
    vectorStore = Chroma(
        persist_directory=persist_directory,
        # Pass the Embeddings object itself, not its bound `embed_documents`
        # method: Chroma calls `.embed_documents` / `.embed_query` on whatever
        # it receives, so a bare function raises
        # "AttributeError: 'function' object has no attribute 'embed_documents'".
        embedding_function=embeddings,
    )
else:
    print("Creating new Chroma store")
    all_splits = []
    for g in glob('data/*.pdf'):
        all_splits.extend(read_pdf_and_split_text(g))

    print(f"Total number of splits: {len(all_splits)}")

    # Bind the same name as the load branch (`vectorStore`). The previous
    # spelling `vectorstore` left `vectorStore` undefined on a first run, so
    # the retriever cell failed with NameError.
    vectorStore = Chroma.from_documents(
        documents=all_splits,
        embedding=embeddings,
        persist_directory=persist_directory
    )

  from .autonotebook import tqdm as notebook_tqdm


Creating new Chroma store
PDF: data\국방전력발전업무훈령(국방부훈령)(제3007호)(20250111).pdf---------------------------------------------
Number of splits:251

PDF: data\방위사업법(법률)(제20807호)(20250619).pdf---------------------------------------------
Number of splits:66

Total number of splits: 317


AttributeError: 'function' object has no attribute 'embed_documents'

In [None]:
# Retrieve the 5 best-matching chunks for a sample query and print each one's
# metadata followed by its text.
query = "서울시 온실가스 저감 정책"
retriever = vectorStore.as_retriever(search_kwargs={"k": 5})
chunks = retriever.invoke(query)

for chunk in chunks:
    print(chunk.metadata, chunk.page_content, sep="\n")
    