In [1]:
import PyPDF2

from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Ingesting PDF

In [2]:
# Path to the PDF to ingest, relative to the notebook's working directory.
pdf_path = "Artificial_Intelligence.pdf"

# Extract the text of every page. The extraction stays inside the `with`
# block so the file handle is still open while the pages are being read.
with open(pdf_path, "rb") as file:
    reader = PyPDF2.PdfReader(file)
    # Iterate the pages directly and join once instead of growing a string
    # with `+=`. `or ""` is a defensive guard in case a page yields no text.
    text = "".join(page.extract_text() or "" for page in reader.pages)

# Fail early with a clear message: an empty text would only surface later as
# an obscure error (or an empty index) in the embedding step.
if not text.strip():
    raise ValueError(f"No extractable text found in {pdf_path!r}")

In [3]:
# Convert text to a document object
class Document:
    """Lightweight container pairing a text payload with optional metadata.

    Attributes:
        page_content: The text passed to the constructor, stored as-is.
        metadata: The mapping passed to the constructor, or a new empty dict
            when the argument is omitted or falsy.
    """

    def __init__(self, page_content, metadata=None):
        # A falsy `metadata` (None, or an empty mapping) is replaced by a fresh
        # dict, so instances never share a default metadata object.
        self.metadata = metadata if metadata else {}
        self.page_content = page_content

# The whole extracted text is wrapped in a single Document; splitting into
# smaller pieces happens in a later step.
documents = [
    Document(page_content=text),
]

# Vector Embedding

In [4]:
# Break the document(s) into overlapping chunks that will each be embedded.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,  # amount of text repeated between neighbouring chunks
    chunk_size=7500,    # upper bound on the length of a single chunk
)
chunks = text_splitter.split_documents(documents)

In [5]:
# Embed every chunk with the "nomic-embed-text" Ollama model and store the
# vectors in a Chroma collection named "local-rag".
vector_db = Chroma.from_documents(
    collection_name="local-rag",
    embedding=OllamaEmbeddings(show_progress=True, model="nomic-embed-text"),
    documents=chunks,
)

OllamaEmbeddings: 100%|██████████| 1/1 [00:05<00:00,  5.16s/it]


# Retrieval

In [6]:
# Chat model provided through Ollama; change `local_model` to try another one.
local_model = "mistral"
llm = ChatOllama(
    model=local_model,
)

In [7]:
# Prompt that asks the LLM to rephrase the user's question into five variants,
# one per line. `{question}` is substituted with the original question.
# The wording is what the model actually reads, so spelling matters here.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""
    You are an AI language model assistant. Your task is to generate five different versions of the given
    user question to retrieve relevant documents from a vector database. By generating multiple
    perspectives on the user question, your goal is to help the user overcome some of the limitations of
    the distance-based similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}
    """,
)

In [8]:
# Build a multi-query retriever on top of the vector store: the LLM and
# QUERY_PROMPT are handed to it together with the store's base retriever.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

In [9]:
# RAG prompt: instructs the model to answer strictly from the supplied context.
# `{context}` and `{question}` are the two placeholders filled in by the chain.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [10]:
# End-to-end pipeline: the incoming question is sent to the retriever (as
# "context") and passed through unchanged (as "question"); the filled prompt
# goes to the LLM and the reply is parsed into a plain string.
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser()

In [13]:
# The question is passed as a literal rather than read with `input()`, so the
# cell runs non-interactively (Restart Kernel -> Run All) and the chain always
# receives exactly this question.
question = "What is this document about?"
chain.invoke(question)

In [None]:
# Clean up: delete the collection backing `vector_db` (created earlier in this
# notebook with collection_name="local-rag").
# NOTE(review): this appears to remove only this store's collection, not every
# collection in the database; confirm against the Chroma documentation.
# After this call `vector_db` must be rebuilt before the chain can be used again.
vector_db.delete_collection()