In [1]:
# Configuration step: manage API keys as environment variables rather than
# hard-coding them in the notebook.
from dotenv import load_dotenv

# Load the API key information.
load_dotenv()

True

In [2]:
from langchain_teddynote import logging

# Enter the project name. The recorded cell output reports that LangSmith
# tracing starts under this project name.
logging.langsmith("FAQ DEMO")

LangSmith 추적을 시작합니다.
[프로젝트명]
FAQ DEMO


In [3]:
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

In [4]:
# Step 1: Load Documents
# Point a PDFPlumber-based loader at the housing-subscription FAQ PDF and
# read it into `docs`.
loader = PDFPlumberLoader(
    file_path="data/2024 주택청약 FAQ-removed.pdf",
)
docs = loader.load()

In [5]:
# Step 2: Split Documents
# Break the loaded documents into overlapping chunks
# (chunk_size=300, chunk_overlap=25).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=25,
)
split_documents = text_splitter.split_documents(docs)

In [6]:
# Step 3: Create the embedding model
# Instantiated with no arguments, i.e. whatever defaults OpenAIEmbeddings uses.
embedding = OpenAIEmbeddings()

In [7]:
# Step 4: Create the vector DB and persist it to disk
DB_PATH = "./chroma_db"

# Embed the chunks and store them in the "my_db" collection; passing
# persist_directory makes Chroma write the collection under DB_PATH.
# Reuse the `embedding` object created in step 3 instead of constructing a
# second, identical OpenAIEmbeddings client just for this call, so the same
# embedding object is used both when writing and when the store is reopened.
# NOTE(review): no explicit ids are passed, so re-running this cell against an
# existing ./chroma_db presumably appends the same chunks again under new ids
# -- confirm, and remove the directory first when rebuilding from scratch.
persist_db = Chroma.from_documents(
    documents=split_documents,
    embedding=embedding,
    persist_directory=DB_PATH,
    collection_name="my_db",
)

In [8]:
# Inspect the data stored in the collection.
# NOTE(review): get() is called without arguments and the recorded output is a
# very long id listing; consider bounding the result (e.g. a `limit` argument)
# -- confirm against the Chroma wrapper's API before changing.
persist_db.get()

{'ids': ['0003df1a-995c-45d5-91ad-e85a1e7b3371',
  '000d176f-9735-45b5-b5da-339343fdd10f',
  '008fe94b-ef94-42e5-ab53-209589118470',
  '00c9a02e-456e-4680-beed-7e060528cdc8',
  '0124bc53-94de-4862-a9d9-039cba8c829d',
  '01d5ad52-a132-44e1-93d3-26ac5e67e0ab',
  '02d19abd-a65c-491c-b777-15b5adaf20ab',
  '02ea25e4-d7c9-4766-9ae3-83861fe951a6',
  '032342ed-1713-44e9-9cce-8a435cde3536',
  '04411bcc-6f38-42ea-8330-bf81d4910f51',
  '044a62cc-50ca-4f16-ae88-b3baa02c4a50',
  '047aab20-54be-4218-bf9a-18936e65cec5',
  '0497d429-3443-4fcc-b139-02056b1db96c',
  '04f56457-b386-4e5c-baf2-018eb3363559',
  '05281451-cf5d-44d0-96df-88abd741cfa1',
  '05349db9-44c5-4ee5-8ba7-a4e1eb8ab59b',
  '05d5f76e-c6a5-4e51-a78a-935111adf9f1',
  '05d93a83-be32-4215-bc30-646d89a79066',
  '05f574c9-3213-4de3-b1af-260e1987db25',
  '067a90ca-38f8-4b02-80d4-86169e1b4ee6',
  '06e2e99c-01ff-421c-847a-46575a702ff4',
  '06e8d13c-d757-4e8c-8c61-f9dc4665be52',
  '0717810e-5dc5-4aff-a0ae-75b934af77c3',
  '072be5e9-0134-4a1c-ad35-

In [9]:
# Step 1 (additional document): Load Documents
# Rebinds `loader` and `docs` to the second PDF that will be appended to the
# existing vector store.
loader = PDFPlumberLoader(
    file_path="data/질문 1.pdf",
)
docs = loader.load()

In [10]:
# Step 2 (additional document): Split Documents
# This pass uses larger chunks than the first one
# (chunk_size=1500, chunk_overlap=400) and rebinds `split_documents`.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=400,
)
split_documents = text_splitter.split_documents(docs)

In [11]:
# Step 3 (additional document): Create the embedding model
# Rebinds `embedding` to a fresh OpenAIEmbeddings instance built with no
# arguments, same as the earlier step 3 cell.
embedding = OpenAIEmbeddings()

In [14]:
# Reopen the existing vector database: the "my_db" collection persisted
# under DB_PATH, queried through the current `embedding` object.
persist_db = Chroma(
    collection_name="my_db",
    embedding_function=embedding,
    persist_directory=DB_PATH,
)

# Add the newly split documents to the vector database. The call is left as
# the final expression so the cell displays its return value, which the
# recorded output shows to be the list of ids for the added chunks.
persist_db.add_documents(split_documents)

['33ba7f61-f37c-4c3a-919b-c88ce3bc0fae',
 'a8800bdd-684a-4cf5-9835-5fab03f83b76',
 '38b346b5-66e6-4ddb-b170-05039d513504',
 '3e6b5963-b789-4c90-a3d7-c8dc4f363302',
 '38d5d131-b2c3-456b-8684-fb63f2295666',
 '3c2a3242-4dd6-4b15-9224-62fd1eb3cdb2',
 '79531a89-4834-4653-84c2-8ae0304fb34a',
 '9152c086-7a00-4c9d-8455-cc2a696b8440',
 'f728dc21-5ecd-4405-9de0-1ab1c976ba90',
 'b346c400-3f3e-4efd-8a99-c403959faafd',
 'a84a02e9-c0d5-49f9-86dd-d3f2df60b18b',
 '9f66027b-4d35-46f3-a705-8001df797286',
 'a4d2fb8c-3013-4918-a730-ed805d03f525',
 '43e3e193-fd81-4b7b-aee5-5602723fff93',
 'c6504026-777c-41df-b03f-ceca7c95ed34',
 'c25037b2-09c3-4081-9362-1ea524b1473d',
 '41450f29-15bf-44f8-897f-485a77901e19',
 'bd31525d-2112-4dbb-8804-4cbc0484fe0b',
 '356b89a5-7289-47cd-99bf-0057f6f2f9ea',
 '9f9545ac-aee1-4ebf-bf5d-dc63d2bf72ea',
 '0d3d68a6-bd76-4957-9645-5b2f179e6032',
 'bbbb0cdc-6c7e-4869-8ba5-bd1cb4dade64',
 '06c36805-088d-4cf1-b052-9d942adb7891',
 '427bd7ad-d4cf-4445-86e9-e2fa34f85789',
 '7691a033-29f1-