In [28]:
# Imported from langchain_community, matching the other integration imports
# in this notebook (the `langchain.document_loaders` path is deprecated).
from langchain_community.document_loaders import PyPDFLoader

# PDF files to index, read from local storage.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
pdf_paths = [
    "C:/db/rest.pdf",
]

# One loader per PDF file.
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Gather whatever each loader returns into a single flat list of documents.
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [29]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters handed to the splitter below.
chunk_size, chunk_overlap = 150, 50

# Build the recursive character splitter from those parameters, then cut the
# loaded documents (`docs`) into smaller pieces (`splits`).
r_splitter = RecursiveCharacterTextSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)
splits = r_splitter.split_documents(docs)

In [30]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Embedding configuration: model id, the device it runs on, and the options
# forwarded to the encoder.
model_name = "BAAI/bge-small-en"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

# Hugging Face BGE embedding model used to turn text into vectors.
embedding = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

In [31]:
# Imported from langchain_community, matching the other integration imports
# in this notebook (the `langchain.vectorstores` path is deprecated).
from langchain_community.vectorstores import Chroma

persist_directory = 'docs/chroma'

# Embed every chunk in `splits` with `embedding` and store the vectors in a
# Chroma database kept under `persist_directory`.
# NOTE(review): presumably re-running this cell against an existing
# `persist_directory` adds the same chunks again — confirm, and clear the
# directory (or load the existing store) before re-indexing if so.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory,
)

# Quick retrieval check: fetch the 3 chunks most similar to the question.
# Stored under its own name so the `docs` list loaded from the PDF is not
# overwritten; otherwise re-running the splitting cell would split these
# search hits instead of the original documents.
similar_docs = vectordb.similarity_search("what is the closing time", k=3)

In [32]:
from langchain_community.llms import HuggingFaceHub

import os

# The Hugging Face Hub API token is read from the environment so that no
# credential is ever written into (or committed with) the notebook.
hf_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN", "")
if not hf_api_token:
    # Fail early with an actionable message instead of a late authentication error.
    raise EnvironmentError(
        "Set the HUGGINGFACEHUB_API_TOKEN environment variable before running this cell."
    )

# Initialize the hosted LLM that produces the final answer from the
# retrieved context.
llm = HuggingFaceHub(
    repo_id='google/flan-t5-xxl',
    model_kwargs={'temperature': 0.5, 'max_length': 500},
    huggingfacehub_api_token=hf_api_token,
)


In [33]:
from langchain.chains import RetrievalQA
# Question-answering chain that combines `llm` with the vector store's retriever.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [36]:
# Run the QA chain on the question. `invoke` is used instead of calling the
# chain object directly, since direct calls are deprecated in recent LangChain.
result = qa_chain.invoke({'query': "what is the closing time"})

In [37]:
# Display only the answer stored under the 'result' key of the chain output.
# NOTE(review): the recorded output '9:00 AM' reads more like an opening time
# than a closing time — verify the answer against the source PDF.
result["result"]

'9:00 AM'