## RAG pipeline with ChromaDB and Hugging Face embeddings

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
import numpy as np
from typing import List

In [8]:
# Load the source PDF into a list of LangChain documents.
loader = PyMuPDFLoader("data/pdf/attention_is_all_you_need.pdf")
documents = loader.load()

# Sanity check: how many documents came back, and what the first one starts with.
print(f"Loaded {len(documents)} pages")

first_page_preview = documents[0].page_content[:200]
print(f"First page preview: {first_page_preview}...")

Loaded 15 pages
First page preview: Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
...


In [9]:
# Cut the loaded documents into small, slightly overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=30,
    length_function=len,  # lengths are measured with the built-in len()
)

splits = text_splitter.split_documents(documents)

# Sanity check: chunk count and the start of the first chunk.
print(f"Split into {len(splits)} chunks")

first_chunk_preview = splits[0].page_content[:200]
print(f"First chunk: {first_chunk_preview}...")


Split into 162 chunks
First chunk: Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
...


In [10]:
# Embedding model handed to the vector store below.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [17]:
# Give every chunk a deterministic id. Without explicit ids each run of this
# cell stores the chunks under fresh random ids, so re-running it against the
# persisted ./chroma_db directory keeps appending duplicate copies, which then
# crowd the top-k retrieval results. With stable ids a re-run overwrites the
# existing entries instead.
# NOTE(review): assumes chunk metadata carries a "source" entry; the fallback
# keeps the ids valid if it does not. Duplicates written by earlier runs are not
# removed by this change -- delete ./chroma_db once to start clean.
chunk_ids = [
    f"{chunk.metadata.get('source', 'document')}:{position}"
    for position, chunk in enumerate(splits)
]

vector_store = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)

In [22]:
# Top-5 similarity search over the vector store.
retriever = vector_store.as_retriever(
    search_type="similarity",  # "mmr" (Maximum Marginal Relevance) is the alternative
    search_kwargs={"k": 5},
)

query = "what is the main topic?"
retrieved_docs = retriever.invoke(query)

# Show the first 200 characters of every hit, numbered from 1.
print(f"Retrieved {len(retrieved_docs)} documents")
for doc_number, doc in enumerate(retrieved_docs, start=1):
    preview = doc.page_content[:200]
    print(f"\n--- Document {doc_number} ---")
    print(preview)

Retrieved 5 documents

--- Document 1 ---
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of 

--- Document 2 ---
[28] Romain Paulus, Caiming Xiong, and Richard Socher. A deep reinforced model for abstractive
summarization. arXiv preprint arXiv:1705.04304, 2017.
[29] Slav Petrov, Leon Barrett, Romain Thibaux, and

--- Document 3 ---
Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need


--- Document 4 ---
heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic
and semantic structure of the sentences.
5
Training
This section describes the training regime 

--- Document 5 ---
Proceedings of the Human Language Technology Conference of the NAACL, Main

In [24]:
# MMR balances relevance with diversity.
# Reuses `query` from the similarity-search cell so both result lists are comparable.
mmr_search_kwargs = {"k": 5, "fetch_k": 20, "lambda_mult": 0.5}
retriever_mmr = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_search_kwargs,
)

results_mmr = retriever_mmr.invoke(query)

# Show the first 200 characters of every hit, numbered from 1.
print("MMR Results (more diverse):")
for rank, doc in enumerate(results_mmr, start=1):
    preview = doc.page_content[:200]
    print(f"\n--- Result {rank} ---")
    print(preview)


MMR Results (more diverse):

--- Result 1 ---
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of 

--- Result 2 ---
[28] Romain Paulus, Caiming Xiong, and Richard Socher. A deep reinforced model for abstractive
summarization. arXiv preprint arXiv:1705.04304, 2017.
[29] Slav Petrov, Leon Barrett, Romain Thibaux, and

--- Result 3 ---
1: Long Papers), pages 434–443. ACL, August 2013.
12

--- Result 4 ---
architectures [38, 24, 15].
Recurrent models typically factor computation along the symbol positions of the input and output
sequences. Aligning the positions to steps in computation time, they genera

--- Result 5 ---
,
in
my
opinion
.
<EOS>
<pad>
Figure 4: Two attention heads, also in layer 5 of 6, apparently involved in anaphora resolution. Top:
Full attentions for head 5. Bottom: Isolated attentions from just th


### Modern RAG


In [42]:
from langchain_groq import ChatGroq
import os

# The API key is read from the environment so it never appears in the notebook.
groq_key = os.getenv("GROQ_API_KEY")

# Chat model used to generate the final answers.
llm = ChatGroq(
    groq_api_key=groq_key,
    model="llama-3.3-70b-versatile",
    temperature=0.7,
)


In [43]:
# Rebind `retriever` for the RAG chain: five results per query, search type left at its default.
retriever = vector_store.as_retriever(search_kwargs=dict(k=5))
retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x11d7199a0>, search_kwargs={'k': 5})

In [44]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt for the RAG chain: instructions, then the question, then the retrieved context.
# The text is assembled from explicit pieces so the trailing spaces on the first
# three lines stay visible; the resulting template string is unchanged.
rag_template_text = (
    "You are an assistant for question-answering tasks. \n"
    "Use the following pieces of retrieved context to answer the question. \n"
    "If you don't know the answer, just say that you don't know. \n"
    "Keep the answer concise.\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Context: {context}\n"
    "\n"
    "Answer:"
)

prompt_template = ChatPromptTemplate.from_template(rag_template_text)

print("Prompt template created")


Prompt template created


In [None]:
# RAG LCEL
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Format retrieved docs into one context string, echoing them for debugging
def format_docs(docs) -> str:
    """Print a preview of each retrieved document and return the joined context.

    Args:
        docs: Iterable of objects exposing ``page_content`` (a string) and
            ``metadata``; in this notebook it receives the retriever's output.

    Returns:
        The full ``page_content`` of every document, in input order, joined by
        blank lines. An empty input yields an empty string.
    """
    preview_chars = 300
    # Materialize once: the input is walked twice (preview loop, then join),
    # which would silently drop the context for a one-shot iterator.
    docs = list(docs)
    rule = "=" * 80

    print("\n" + rule)
    print("RETRIEVED DOCUMENTS:")
    print(rule)
    for i, doc in enumerate(docs, 1):
        text = doc.page_content
        # Mark the preview as truncated only when something was actually cut
        # off; otherwise every short chunk looks like it had more text.
        preview = text if len(text) <= preview_chars else text[:preview_chars] + "..."
        print(f"\n--- Document {i} ---")
        print(preview)
        print(f"Metadata: {doc.metadata}")
    print("\n" + rule)

    return "\n\n".join(doc.page_content for doc in docs)

# Assemble the RAG chain. The debugging aid is format_docs, which prints what
# was retrieved each time the chain runs.
chain_inputs = {
    "context": retriever | format_docs,   # retrieve, then flatten to one string
    "question": RunnablePassthrough(),    # supplies the prompt's {question} slot
}
rag_chain = chain_inputs | prompt_template | llm | StrOutputParser()

print("RAG chain created successfully")


RAG chain created successfully


In [53]:
# Step-by-step with full visibility: runs the same stages as the RAG chain by
# hand (retrieve -> build prompt -> call the LLM), printing each intermediate.
question = "What is the transformer architecture?"


def _print_banner(title: str) -> None:
    """Print a section title framed by two 80-character rules."""
    rule = "=" * 80
    print("\n" + rule)
    print(title)
    print(rule)


# Step 1: Show the question
_print_banner("STEP 1: QUESTION")
print(question)

# Step 2: Retrieve documents
_print_banner("STEP 2: RETRIEVING DOCUMENTS")
retrieved_docs = retriever.invoke(question)
print(f"Retrieved {len(retrieved_docs)} documents\n")
for i, doc in enumerate(retrieved_docs, 1):
    text = doc.page_content
    print(f"--- Document {i} ---")
    # Add the ellipsis only when the preview really is shorter than the chunk.
    print(text if len(text) <= 200 else text[:200] + "...")
    print()

# Step 3: Format the context and build the prompt
context = "\n\n".join(doc.page_content for doc in retrieved_docs)
_print_banner("STEP 3: FORMATTED PROMPT")
formatted_prompt = prompt_template.invoke({"question": question, "context": context})
# to_string() renders the prompt as readable text instead of the object repr.
print(formatted_prompt.to_string())

# Step 4: Get the LLM response
_print_banner("STEP 4: FINAL ANSWER")
response = llm.invoke(formatted_prompt)
print(response.content)
print("\n" + "=" * 80)



STEP 1: QUESTION
What is the transformer architecture?

STEP 2: RETRIEVING DOCUMENTS
Retrieved 5 documents

--- Document 1 ---
Figure 1: The Transformer - model architecture.
The Transformer follows this overall architecture using stacked self-attention and point-wise, fully
connected layers for both the encoder and decoder, ...

--- Document 2 ---
∗Equal contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started
the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Tra...

--- Document 3 ---
relying entirely on an attention mechanism to draw global dependencies between input and output.
The Transformer allows for significantly more parallelization and can reach a new state of the art in
t...

--- Document 4 ---
in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes
it more difficult to learn dependencies between distant positions [12]. In the Transformer this is
r