In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os
import getpass

# Read a local .env file first so that keys stored there are honoured before
# anything is requested interactively.
load_dotenv()

# OpenAI credential: prompt (input hidden) only when the variable is unset or empty.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ.update(OPENAI_API_KEY=getpass.getpass("Enter your OpenAI API key: "))

# Turn on LangSmith tracing for this kernel session.
# NOTE(review): LangSmith's docs name this variable LANGSMITH_TRACING (legacy
# LANGCHAIN_TRACING_V2) - confirm the installed SDK honours this spelling.
os.environ.update(LANGSMITH_TRACING_V2="true")

# LangSmith credential: same prompt-if-missing rule as above.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ.update(LANGSMITH_API_KEY=getpass.getpass("Enter your LangSmith API key: "))
    
# 1. Loader: read the source PDF into a list of Document objects.
loader = PyPDFLoader("./mh_docs/masterigrandview.pdf")
pages = loader.load()

# 2. Splitter: break on newlines and pack the pieces into chunks of at most
#    1000 characters, with 150 characters of overlap between neighbours.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len,
)
docs = text_splitter.split_documents(pages)

# 3. Embedding model used both to index the chunks and to embed queries.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# 4. Persistent vector store.
persist_directory = './docs/chroma/'

# Give every chunk a deterministic id (source, page, running index). Without
# explicit ids each run of this cell stores the whole corpus again under fresh
# random ids in the same persist_directory, and similarity_search then returns
# the same chunk several times. With stable ids a re-run overwrites the
# existing entries instead of duplicating them.
# NOTE: entries written by earlier runs under random ids are NOT removed by
# this; delete persist_directory once to start from a clean index. The same
# applies after changing the splitter settings or the embedding model.
doc_ids = [
    f"{doc.metadata.get('source', '')}:{doc.metadata.get('page', '')}:{index}"
    for index, doc in enumerate(docs)
]

vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    ids=doc_ids,
    persist_directory=persist_directory,
)

# Only the last expression of a cell is displayed: the number of chunks indexed.
len(docs)

62

In [2]:
# Retrieval smoke test against the vector store built in the first cell.
# Observation: the phrasing "Thời gian bàn giao dự kiến" (expected handover
# time) retrieves the right passage, whereas asking about the handover time
# with the wording below does not return accurate results.
query = "Dự án khi nào bàn giao"
retrieved_docs = vectordb.similarity_search(query, k=2)

# Flatten the hits into one text block: metadata first, then the chunk text,
# with a blank line between hits.
serialized = "\n\n".join(
    (f"Source: {doc.metadata}\n" f"Content: {doc.page_content}")
    for doc in retrieved_docs
)

In [3]:
# Dump the raw list of retrieved Document objects (metadata and page_content) for inspection.
print(retrieved_docs)

[Document(metadata={'page': 4, 'source': './mh_docs/masterigrandview.pdf'}, page_content='HẠ TẦNG ĐỒNG BỘ - HOÀN CHỈNH\n'), Document(metadata={'page': 4, 'source': './mh_docs/masterigrandview.pdf'}, page_content='HẠ TẦNG ĐỒNG BỘ - HOÀN CHỈNH\n')]


In [4]:
# Show the same hits as the "Source: ... / Content: ..." text assembled in the query cell.
print(serialized)

Source: {'page': 4, 'source': './mh_docs/masterigrandview.pdf'}
Content: HẠ TẦNG ĐỒNG BỘ - HOÀN CHỈNH


Source: {'page': 4, 'source': './mh_docs/masterigrandview.pdf'}
Content: HẠ TẦNG ĐỒNG BỘ - HOÀN CHỈNH

