# Retrieval Augmented Generation (RAG)

Importing necessary libraries and installing required packages

In [1]:
from dotenv import load_dotenv
import pandas as pd
from pathlib import Path
import json
from dotenv import load_dotenv
import os 
import shutil 
from IPython.display import display, Markdown
import pprint

In [None]:
# ❌ DEPRECATED IMPORTS (commented out):
# from langchain.document_loaders.pdf import PyPDFDirectoryLoader 
# from langchain.document_loaders import PyPDFLoader

# ✅ UPDATED IMPORTS:
from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
# from langchain_openai import OpenAIEmbeddings  # ‚ùå Antiguo: requiere OpenAI API key
from langchain_huggingface import HuggingFaceEmbeddings  # ‚úÖ Nuevo: embeddings locales gratuitos
from langchain.schema import Document 
from langchain.vectorstores.chroma import Chroma # This is a Chroma wrapper from Langchain
# from langchain_openai import ChatOpenAI  # ‚ùå Antiguo: OpenAI LLM
from langchain_groq import ChatGroq  # ‚úÖ Nuevo: Groq LLM gratuito
from langchain_core.vectorstores import InMemoryVectorStore
from langchain import hub
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

In [3]:
# %pip install pypdf langchain-huggingface sentence-transformers

In [3]:
# Load environment variables from .env file.
# The call is the last expression of the cell, so its return value is shown as the cell output.
load_dotenv()

True

In [None]:
# Configure the local HuggingFace embedding model
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Report the chosen model; one print call, one message per line
print(
    f"‚úÖ Embedding model: {EMBEDDING_MODEL} (local, free)",
    "üì¶ Will download model on first use (~90MB)",
    sep="\n",
)

## Leveraging Semantic Search (with movies)

In [None]:
# Get the same dataset as in the other notebook.
# The file is read from the notebook's own directory; the former
# "../semantic-search/dataset.json" path was incorrect.
input_datapath = "dataset.json"

# Pass the encoding explicitly so reading the JSON text does not depend on the
# platform's default encoding.
with open(input_datapath, 'r', encoding='utf-8') as f:
    movie_data = json.load(f)

# One row per movie record; show the shape and a preview of the first rows
df = pd.DataFrame(movie_data)
print(df.shape)
df.head()

We will create one document per movie

In [5]:
import ast

# Build one Document per movie row: "<title> - <overview>" is the text and a few
# columns are carried along as metadata.
documents = []
for index, row in df.iterrows():
    # The 'genres' column is parsed with ast.literal_eval (string form of a Python list)
    genres = ast.literal_eval(row['genres'])
    md_dict = {
        "language": row['original_language'],
        # Keep only the first listed genre; a row with an empty genre list gets
        # "Unknown" instead of raising IndexError
        "genre": genres[0] if genres else "Unknown",
        "release_date": row['release_date'],
        "source": index
    }
    doc = Document(
        # Document ids are strings (the displayed documents show id='0', ...), so
        # convert explicitly instead of relying on implicit coercion
        id=str(index),
        page_content=row['title']+" - "+row['overview'],
        metadata=md_dict
    )
    documents.append(doc)
print(len(documents), "documents")

10 documents


In [6]:
# Inspect the generated documents (the bare expression is rendered as the cell output)
documents

[Document(id='0', metadata={'language': 'English', 'genre': 'Horror', 'release_date': '2023-04-05', 'source': 0}, page_content="The Pope's Exorcist - Father Gabriele Amorth, Chief Exorcist of the Vatican, investigates a young boy's terrifying possession and ends up uncovering a centuries-old conspiracy the Vatican has desperately tried to keep hidden."),
 Document(id='1', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-02-15', 'source': 1}, page_content="Ant-Man and the Wasp: Quantumania - Super-Hero partners Scott Lang and Hope van Dyne, along with with Hope's parents Janet van Dyne and Hank Pym, and Scott's daughter Cassie Lang, find themselves exploring the Quantum Realm, interacting with strange new creatures and embarking on an adventure that will push them beyond the limits of what they thought possible."),
 Document(id='2', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-04-18', 'source': 2}, page_content='Ghosted - Salt-of-the

We store all the movies into an in-memory vector store for simplicity (it could be any other kind of vector store)

In [None]:
# Index the movie documents in an in-memory vector store that uses the local embeddings
# (formerly: InMemoryVectorStore(OpenAIEmbeddings()))
vectorstore = InMemoryVectorStore(embeddings)
_ = vectorstore.add_documents(documents)

In [8]:
def _filter_function(doc: Document) -> bool:
    """Return True when the document's "genre" metadata equals 'Horror'."""
    genre = doc.metadata.get("genre")
    return genre == 'Horror'


# Alternative ways of performing a semantic search (only one is active)
query = "Something about religion"
# results = vectorstore.similarity_search(query, k=2)
# results = vectorstore.similarity_search_with_score(query, k=2, filter=_filter_function)
results = vectorstore.similarity_search_with_score(query, k=2)

# Print every returned (document, score) pair on its own line
for scored_doc in results:
    print(scored_doc)

(Document(id='0', metadata={'language': 'English', 'genre': 'Horror', 'release_date': '2023-04-05', 'source': 0}, page_content="The Pope's Exorcist - Father Gabriele Amorth, Chief Exorcist of the Vatican, investigates a young boy's terrifying possession and ends up uncovering a centuries-old conspiracy the Vatican has desperately tried to keep hidden."), 0.7919395004819728)
(Document(id='3', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-03-15', 'source': 3}, page_content='Shazam! Fury of the Gods - Billy Batson and his foster siblings, who transform into superheroes by saying "Shazam!", are forced to get back into action and fight the Daughters of Atlas, who they must stop from using a weapon that could destroy the world.'), 0.7548893852785512)


In LangChain, we often use a *retriever* on top of the vector store

In [9]:
# Wrap the vector store in a retriever configured with k=3
retriever = vectorstore.as_retriever(search_kwargs={'k': 3})

# Run the current query through the retriever and display the returned documents
retriever.invoke(query)

[Document(id='0', metadata={'language': 'English', 'genre': 'Horror', 'release_date': '2023-04-05', 'source': 0}, page_content="The Pope's Exorcist - Father Gabriele Amorth, Chief Exorcist of the Vatican, investigates a young boy's terrifying possession and ends up uncovering a centuries-old conspiracy the Vatican has desperately tried to keep hidden."),
 Document(id='3', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-03-15', 'source': 3}, page_content='Shazam! Fury of the Gods - Billy Batson and his foster siblings, who transform into superheroes by saying "Shazam!", are forced to get back into action and fight the Daughters of Atlas, who they must stop from using a weapon that could destroy the world.'),
 Document(id='8', metadata={'language': 'English', 'genre': 'Adventure', 'release_date': '2023-03-23', 'source': 8}, page_content='Dungeons & Dragons: Honor Among Thieves - A charming thief and a band of unlikely adventurers undertake an epic heist to retri

Let's create an LLM for the RAG chain

In [None]:
# The model name is read from the OPENAI_MODEL environment variable
# (per the original note, this is Groq's llama-3.1-8b-instant)
llm_model = os.environ["OPENAI_MODEL"]
print(f"ü§ñ LLM model: {llm_model}")

# Groq chat model (formerly: ChatOpenAI(model=llm_model, temperature=0.1))
llm = ChatGroq(temperature=0.1, model=llm_model)

The typical RAG prompt considers the *context* and the *question*

In [11]:
# Example for a public prompt (https://smith.langchain.com/hub/rlm/rag-prompt)
# Pulled from the LangChain Hub by its identifier.
rag_prompt = hub.pull("rlm/rag-prompt", include_model=True)
# Display the prompt template of the first message; it takes 'context' and 'question'.
rag_prompt.messages[0].prompt



PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:")

A basic chain that connects to the retriever

In [12]:
# The prompt is predefined, but other prompts could be used
rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | rag_prompt 
    | llm
    | StrOutputParser()
)

query = "I want to get a movie about religion"
result = rag_chain.invoke(query)
#¬†pprint.pprint(result)
display(Markdown(result))

You might consider watching "The Pope's Exorcist," which involves themes of religion as it follows Father Gabriele Amorth investigating a young boy's possession and uncovering a Vatican conspiracy.

## Ingestion (chunks) and RAG

In [13]:
# We consider a large PDF file
pdf_path = "./data/Understanding_Climate_Change.pdf"

loader = PyPDFLoader(pdf_path)
pdf_documents = loader.load() # Each document corresponds actually to a page
print(len(pdf_documents), "loaded")

33 loaded


In [14]:
def replace_t_with_space(list_of_documents):
    """Replace every tab in each document's page_content with a single space.

    The documents are modified in place and the same list object is returned.
    """
    for document in list_of_documents:
        # Splitting on tabs and re-joining with spaces swaps each tab for one space
        document.page_content = " ".join(document.page_content.split("\t"))
    return list_of_documents


# Split the loaded pages into chunks of up to 1000 characters with a 100-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
texts = text_splitter.split_documents(pdf_documents)

# Replace tabs in every chunk; the helper edits the chunks in place and returns the
# same list, so `cleaned_texts` and `texts` refer to the same objects
cleaned_texts = replace_t_with_space(texts)
print(len(cleaned_texts), "chunks")

97 chunks


In [None]:
# Store the chunks in a Chroma vector store built with the local embeddings
# (formerly: Chroma.from_documents(cleaned_texts, OpenAIEmbeddings())).
# Note: this rebinds `vectorstore`, which previously referred to the in-memory movie store.
vectorstore = Chroma.from_documents(documents=cleaned_texts, embedding=embeddings)

# Retriever over the chunk store, configured with k=5
my_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

In [16]:
# Helper for printing retrieved documents
def pretty_print_docs(docs):
    """Print each document's page_content under a "Document N:" heading.

    Consecutive documents are separated by a line of 100 dashes.
    """
    separator = f"\n{'-' * 100}\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(separator.join(sections))

# Retrieve the chunks for a sample question and print them
test_query = "What is the main cause of climate change?"
context_docs = my_retriever.invoke(input=test_query)
pretty_print_docs(context_docs)

Document 1:

change the amount of solar energy our planet receives. During the Holocene epoch, which 
began at the end of the last ice age, human societies flourished, but the industrial era has seen 
unprecedented changes. 
Modern Observations 
Modern scientific observations indicate a rapid increase in global temperatures, sea levels, 
and extreme weather events. The Intergovernmental Panel on Climate Change (IPCC) has 
documented these changes extensively. Ice core samples, tree rings, and ocean sediments 
provide a historical record that scientists use to understand past climate conditions and 
predict future trends. The evidence overwhelmingly shows that recent changes are primarily 
driven by human activities, particularly the emission of greenhouse gases. 
Chapter 2: Causes of Climate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increase in greenhouse gases in the 
atmosphere. Greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitro

In [17]:
# Then, we apply the RAG chain, this time with the PDF chunk retriever as the context source
rag_chain = (
    {"question": RunnablePassthrough(), "context": my_retriever}
    | rag_prompt
    | llm
    | StrOutputParser()
)

# Ask a question and render the answer as Markdown
test_query = "What was the latest storm on Earth?"
result = rag_chain.invoke(test_query)
# pprint.pprint(result)
display(Markdown(result))

I don't know.

## Re-Ranking

In this example, we use a cross-encoding strategy from HuggingFace, but other strategies can be applied

In [18]:
# Cross-encoder model used for reranking
model = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2")

# Reranking compressor built on that model, configured with top_n=3
compressor = CrossEncoderReranker(top_n=3, model=model)

# Compression retriever: the base retriever's results go through the compressor
compression_retriever = ContextualCompressionRetriever(
    base_retriever=my_retriever,
    base_compressor=compressor,
)

README.md: 0.00B [00:00, ?B/s]

In [19]:
# Query through the compression retriever and print what it returns
compressed_docs = compression_retriever.invoke(input=test_query)
pretty_print_docs(compressed_docs)

Document 1:

Climate change is linked to an increase in the frequency and severity of extreme weather 
events, such as hurricanes, heatwaves, droughts, and heavy rainfall. These events can have 
devastating impacts on communities, economies, and ecosystems. 
Hurricanes and Typhoons 
Warmer ocean temperatures can intensify hurricanes and typhoons, leading to more 
destructive storms. Coastal regions are at heightened risk of storm surge and flooding. Early 
Droughts 
Increased temperatures and changing precipitation patterns are contributing to more frequent 
and severe droughts. This affects agriculture, water supply, and ecosystems, particularly in 
arid and semi-arid regions. Droughts can lead to food and water shortages and exacerbate 
conflicts. 
Flooding 
Heavy rainfall events are becoming more common, leading to increased flooding. Urban
----------------------------------------------------------------------------------------------------
Document 2:

The Arctic is warming at more 

In [20]:
# RAG chain whose context comes from the reranking (compression) retriever
rag_chain1 = (
    {"context": compression_retriever,  "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)

# Invoke the reranked chain. The cell previously called `rag_chain` (the chain built
# on `my_retriever`), so `rag_chain1` was defined but never exercised.
result = rag_chain1.invoke(test_query)
# pprint.pprint(result)
display(Markdown(result))

I don't know.

## Bonus: Visualization of Chunks and Query

https://github.com/gabrielchua/RAGxplorer

In [21]:
# %pip install ragxplorer nbformat

In [21]:
from ragxplorer import RAGxplorer

# os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

# Create the RAGxplorer client and load the same PDF, using the same chunk size
# and overlap as the text splitter earlier in the notebook
client_openai = RAGxplorer(embedding_model="text-embedding-3-small")
client_openai.load_pdf(
    verbose=True,
    document_path=pdf_path,
    chunk_overlap=100,
    chunk_size=1000,
)

 ~ Building the vector database...


README.md: 0.00B [00:00, ?B/s]

Completed Building Vector Database ‚úì
 ~ Reducing the dimensionality of embeddings...


OMP: Info #270: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 97/97 [00:03<00:00, 29.93it/s]

Completed reducing dimensionality of embeddings ‚úì





In [22]:
# Visualize the current test query with RAGxplorer, using the "HyDE" retrieval
# method and the 6 top results
client_openai.visualize_query(
    query=test_query,
    top_k=6,
    retrieval_method="HyDE",
    query_shape_size=10,
)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 16.56it/s]


---