# RAG: Data Ingestion and Retrieval for complex documents



In [2]:
import time
from typing import List, Optional, Union

from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from rag_101.retriever import (
    create_parent_retriever,
    load_embedding_model,
    load_pdf,
    load_reranker_model,
    retrieve_context,
)
from rich import print

Load the PDF documents, create the retriever, and load the reranker model.

In [3]:
# PDFs to ingest.
# NOTE(review): this is an absolute, machine-specific path; adjust it (or make
# it configurable) before running the notebook anywhere else.
files = [
    # NOTE(review): this entry was previously labelled "DocLLM paper", but the
    # file name and the camera-related sample queries later in this notebook
    # suggest a camera manual -- confirm which document this actually is.
    "/teamspace/studios/this_studio/example_data/x-t30.pdf",
]

# Read the listed PDFs via the rag_101.retriever helper.
docs = load_pdf(files=files)

# Build the retriever from the loaded docs and the embedding model. The
# reranker model is loaded separately; later cells pass it, together with the
# retriever, to retrieve_context.
embedding_model = load_embedding_model()
retriever = create_parent_retriever(docs, embedding_model)
reranker_model = load_reranker_model()

The PDF <_io.BufferedReader name='/teamspace/studios/this_studio/example_data/x-t30.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Initialize LangChain's Ollama chat integration with the `mistral` model, then build the chain from the prompt template, the model, and a string output parser.

In [4]:
# Chat model: the `mistral` model through LangChain's Ollama integration.
llm = ChatOllama(model="mistral")

# Prompt with two placeholders, {question} and {context}. The adjacent string
# literals are concatenated into a single template string.
prompt_template = ChatPromptTemplate.from_template(
    "Please answer the following question based on the provided `context` that follows the question.\n"
    "If you do not know the answer then just say 'I do not know'\n"
    "question: {question}\n"
    "context: ```{context}```\n"
)

# Pipeline: fill the prompt -> call the model -> parse the reply to a string.
chain = prompt_template | llm | StrOutputParser()

Retrieve the first document returned for the query, then run the chain with that document's text as `context` and the query as `question`.

In [5]:
# Question to ask about the ingested document.
# NOTE(review): this question is about a model's training data, while the
# sample questions in the later cell are about camera operation -- confirm it
# matches the PDF that was loaded.
query = "What is the source of the dataset the model was trained on?"

# The result is indexed and unpacked below as a sequence of
# (document, score) pairs.
ranked_results = retrieve_context(query, retriever, reranker_model)
if len(ranked_results) == 0:
    # Same exception type that `[0]` would raise on an empty result, but with
    # a message that says what actually went wrong.
    raise IndexError(f"retrieve_context returned no results for query: {query!r}")

# Keep the document object and its text under separate names so `context`
# only ever holds the string that is fed to the prompt.
top_document, similarity_score = ranked_results[0]
context = top_document.page_content

output = chain.invoke({"context": context, "question": query})
# NOTE(review): `print` is rich's print (see the import cell); bracketed text
# in the model output may be interpreted as console markup -- confirm, and
# consider escaping the output if responses look truncated.
print("LLM Response:", output)

In [13]:
# Run through some sample queries
samples = [
    "How do I configure my camera to take photos automatically at a preset interval",
    "How do I adjust camera's sensitivity to light?",
    "My pictures are turning out darker than I want, what should I do"
]

In [14]:
# Answer every sample question: retrieve the first document for the question,
# pass its text to the chain as `context`, and print the model's reply.
for query in samples:
    print(query)
    ranked_results = retrieve_context(
        query, retriever=retriever, reranker_model=reranker_model
    )
    if len(ranked_results) == 0:
        # Report the miss and move on, so one empty retrieval does not abort
        # the remaining sample questions.
        print("No context retrieved for this query; skipping.")
    else:
        # Each result is a (document, score) pair; only the document's text is
        # needed here. `context` holds the page text, as in the single-query
        # cell earlier in the notebook.
        top_document = ranked_results[0][0]
        context = top_document.page_content
        output = chain.invoke({"context": context, "question": query})
        print("LLM Response:", output)

    # Visual separator between questions.
    print("\n", "=" * 100, "\n\n")