In [16]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.llms import Ollama
import streamlit as st
import warnings
import pprint
warnings.filterwarnings("ignore")

In [5]:
def replace_t_with_space(documents):
    """Swap every tab character in each document's text for a single space.

    Each document is modified in place (its ``page_content`` attribute is
    reassigned); no copies of the documents are made.

    Args:
        documents: Iterable of objects exposing a string ``page_content``
            attribute.

    Returns:
        A new list holding the same document objects, in their original order.
    """
    def _untab(entry):
        # Rewrite the text on the object itself, then hand the object back.
        entry.page_content = entry.page_content.replace('\t', ' ')
        return entry

    return [_untab(entry) for entry in documents]

In [6]:
def encode_pdf(path, chunk_size=2000, chunk_overlap=200, embedding_model="nomic-embed-text:latest"):
    """Chunk a PDF, embed the chunks, and store them in a FAISS vector store.

    Args:
        path: Path to the PDF file.
        chunk_size: Maximum length of each chunk, measured with ``len``
            (i.e. in characters).
        chunk_overlap: Maximum number of characters shared by consecutive
            chunks.
        embedding_model: Name of the Ollama embedding model used to encode
            the chunks. The default is the model that was previously
            hard-coded, so existing callers are unaffected.

    Returns:
        A FAISS vector store containing the encoded PDF content.

    Raises:
        ValueError: If splitting the PDF produced no chunks at all.
    """
    # Load the PDF with the loader's 'page' mode.
    loader = PyPDFLoader(path, mode='page')
    pages = loader.load()

    # Split the loaded documents into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = text_splitter.split_documents(pages)

    # Fail early with a clear message rather than handing an empty list to
    # the embedding model and the vector store.
    if not chunks:
        raise ValueError(f"No text could be extracted from {path!r}; nothing to embed.")

    # Normalise tab characters to spaces before embedding.
    chunks = replace_t_with_space(chunks)

    embeddings = OllamaEmbeddings(model=embedding_model)
    return FAISS.from_documents(chunks, embeddings)





In [7]:
# PDF to index, given as a relative path.
doc_path = "harry_potter_1.pdf"
# Build the vector store with encode_pdf's default chunk size and overlap.
vectorstore = encode_pdf(doc_path)

  embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")


In [8]:
# Expose the vector store as a retriever; "k": 2 requests two documents per query.
# NOTE: the name is spelled 'retriver' (sic) and later cells reference that spelling.
retriver = vectorstore.as_retriever(search_kwargs={"k":2})

In [None]:
# Query the retriever directly to inspect which documents it returns.
query = "Where does harry potter lives?"
docs=retriver.invoke(query)

In [None]:
# Print every retrieved document, preceded by its position in the result list.
for idx,doc in enumerate(docs):
    print(f"doc no {idx}")
    print(doc)

doc no 0
page_content='a Black boy even taller than Ron, joined Harry at the Gryffindor table.
"Turpin, Lisa," became a Ravenclaw and then it was Ron's turn. He was
pale green by now. Harry crossed his fingers under the table and a
second later the hat had shouted, "GRYFFINDOR!"
Harry clapped loudly with the rest as Ron collapsed into the chair next
to him.' metadata={'producer': 'Acrobat Distiller 4.0 for Windows', 'creator': 'Microsoft Word 8.0', 'creationdate': '2001-02-13T16:47:14+00:00', 'subject': 'Harry Potter', 'author': 'J.K. Rowling', 'moddate': '2005-11-26T18:01:39+02:00', 'title': "Harry Potter, Book 1; The Sorcerer's Stone", 'source': 'harry_potter_1.pdf', 'total_pages': 250, 'page': 97, 'page_label': '98'}
doc no 1
page_content='1
Harry Potter and the Sorcerer's Stone
CHAPTER ONE
THE BOY WHO LIVED
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last
people you'd expect to be involved

In [9]:
# Prompt text with two placeholders, {context} and {question}.
prompt_template = """Use the following context to answer the question. If you don’t know, say so.
Context: {context}
Question: {question}
Answer: """

# Ollama-served model that generates the answers.
llm = Ollama(model="qwen2.5:3b")
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
# Question-answering chain built from the LLM, the retriever and the custom prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriver,  # NOTE: the variable created earlier in this notebook is spelled 'retriver'
    chain_type_kwargs={"prompt": prompt}  # Pass the custom prompt to the chain
)

  llm = Ollama(model="qwen2.5:3b")


In [10]:
def chat(query):
    """Answer a question with the retrieval QA chain.

    Args:
        query: The question to ask.

    Returns:
        The value stored under the "result" key of the chain's output.
    """
    # Use invoke() instead of calling the chain object directly: direct calls
    # are deprecated in LangChain, and invoke() matches how the retriever is
    # queried elsewhere in this notebook.
    result = qa_chain.invoke({"query": query})
    return result["result"]

In [17]:
# Pretty-printer limited to 80 columns, used to display long answers.
pp = pprint.PrettyPrinter(width=80)

In [20]:
# Send a question through chat() and pretty-print the returned answer.
query = "Describe the train in which harry potter travelling"
output=chat(query)
pp.pprint(output)

('Based on the context provided, Harry Potter is traveling on a Hogwarts '
 'Express train. The passage describes various carriages of the train where '
 'students are gathered either to say goodbye or simply waiting for their '
 'journey to begin. It mentions that many compartments are already packed with '
 'students who are talking to their families through open windows and fighting '
 'over seats inside. Some students have animals like toads in boxes, which '
 'causes excitement among others due to the leg poking out of one of the '
 "animal's boxes.\n"
 '\n'
 'The train is described as being filled with a crowd of people, including '
 'those waiting for platforms nine and ten. The atmosphere on board the train '
 'appears lively with conversations, laughter, and even some unpleasant sounds '
 'from owls. Smoke drifts over the heads of people as they walk around the '
 'crowded platform. \n'
 '\n'
 'Harry Potter finally finds an empty compartment to place his owl Hedwig '
 'inside 