# Retrieval Augmented Generation (RAG)

Importing necessary libraries and installing required packages

In [1]:
from dotenv import load_dotenv
import pandas as pd
from pathlib import Path
import json
from dotenv import load_dotenv
import os 
import shutil 
from IPython.display import display, Markdown
import pprint

In [None]:
# Updated imports for LangChain 1.0+

# Document loaders
from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader

# Text splitters
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Embeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Core components
from langchain_core.documents import Document
from langchain_core.vectorstores import InMemoryVectorStore  
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Vector stores
from langchain_chroma import Chroma

# LLM
from langchain_groq import ChatGroq

# Prompt hub access via the LangSmith client.
# hub_client is used further down to pull the shared "rlm/rag-prompt" prompt.
from langsmith import Client as LangSmithClient
hub_client = LangSmithClient()

# Cross encoders
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

In [3]:
# %pip install pypdf langchain-huggingface sentence-transformers

In [3]:
# Load environment variables from .env file.
# A later cell reads OPENAI_MODEL from the environment to choose the LLM.
load_dotenv()

True

In [None]:
# Local HuggingFace embedding model; the same `embeddings` object is passed to
# both vector stores created later in this notebook.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

print(f"‚úÖ Embedding model: {EMBEDDING_MODEL} (local, free)")
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
print("üì¶ Will download model on first use (~90MB)")

## Leveraging Semantic Search (with movies)

In [None]:
# Get the same dataset as in the other (semantic-search) notebook.
# A copy of dataset.json is expected in the same directory as this notebook.
input_datapath = "dataset.json"

# JSON text is UTF-8; state the encoding explicitly so the platform's default
# locale encoding cannot garble non-ASCII characters in the movie data.
with open(input_datapath, 'r', encoding="utf-8") as f:
    movie_data = json.load(f)

df = pd.DataFrame(movie_data)
print(df.shape)
df.head()

We will create one document per movie

In [5]:
import ast

# Build one Document per movie. The text is "<title> - <overview>" and a few
# columns are copied into the metadata.
documents = []
for index, row in df.iterrows():
    # literal_eval parses the 'genres' cell into a Python object
    # (presumably a stringified list of genre names — verify against dataset.json).
    genres = ast.literal_eval(row['genres'])
    md_dict = {
        "language": row['original_language'],
        # Only the first genre is kept; a movie with an empty genre list gets
        # None here instead of aborting the whole loop with an IndexError.
        "genre": genres[0] if genres else None,
        "release_date": row['release_date'],
        "source": index
    }
    doc = Document(id=index, page_content=row['title']+" - "+row['overview'], metadata=md_dict)
    documents.append(doc)
print(len(documents), "documents")

10 documents


In [6]:
documents

[Document(id='0', metadata={'language': 'English', 'genre': 'Horror', 'release_date': '2023-04-05', 'source': 0}, page_content="The Pope's Exorcist - Father Gabriele Amorth, Chief Exorcist of the Vatican, investigates a young boy's terrifying possession and ends up uncovering a centuries-old conspiracy the Vatican has desperately tried to keep hidden."),
 Document(id='1', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-02-15', 'source': 1}, page_content="Ant-Man and the Wasp: Quantumania - Super-Hero partners Scott Lang and Hope van Dyne, along with with Hope's parents Janet van Dyne and Hank Pym, and Scott's daughter Cassie Lang, find themselves exploring the Quantum Realm, interacting with strange new creatures and embarking on an adventure that will push them beyond the limits of what they thought possible."),
 Document(id='2', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-04-18', 'source': 2}, page_content='Ghosted - Salt-of-the

We store all the movies into an in-memory vector store for simplicity (it could be any other kind of vector store)

In [None]:
# Previously this store was built with OpenAIEmbeddings(); it now uses the local
# HuggingFace `embeddings` object configured earlier in the notebook.
vectorstore = InMemoryVectorStore(embeddings)
# The return value of add_documents is assigned to `_` so the cell displays no output.
_ = vectorstore.add_documents(documents=documents)

In [8]:
def _filter_function(doc: Document) -> bool:
    """Return True when the document's "genre" metadata equals 'Horror'.

    A document without a "genre" key yields False.
    """
    genre = doc.metadata.get("genre")
    return genre == 'Horror'

# Alternative ways of performing a semantic search.
# Only one `results = ...` line should be active at a time.

query = "Something about religion"
# Variant 1: documents only, no scores
# results = vectorstore.similarity_search(query, k=2)
# Variant 2 (active): the loop below prints each result as a (Document, score) tuple
results = vectorstore.similarity_search_with_score(query, k=2)
# Variant 3: same as variant 2, restricted by _filter_function (genre == 'Horror')
# results = vectorstore.similarity_search_with_score(query, k=2, filter=_filter_function)

for r in results:
    print(r)

(Document(id='0', metadata={'language': 'English', 'genre': 'Horror', 'release_date': '2023-04-05', 'source': 0}, page_content="The Pope's Exorcist - Father Gabriele Amorth, Chief Exorcist of the Vatican, investigates a young boy's terrifying possession and ends up uncovering a centuries-old conspiracy the Vatican has desperately tried to keep hidden."), 0.7919395004819728)
(Document(id='3', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-03-15', 'source': 3}, page_content='Shazam! Fury of the Gods - Billy Batson and his foster siblings, who transform into superheroes by saying "Shazam!", are forced to get back into action and fight the Daughters of Atlas, who they must stop from using a weapon that could destroy the world.'), 0.7548893852785512)


In LangChain, a *retriever* is often used on top of the vector store

In [9]:
# Wrap the movie vector store in a retriever configured with k=3
# (three documents requested per query).
retriever = vectorstore.as_retriever(
    search_kwargs={
        'k': 3
    }
)

# Quick check: run the retriever on the query defined in the previous cell.
retriever.invoke(input=query)

[Document(id='0', metadata={'language': 'English', 'genre': 'Horror', 'release_date': '2023-04-05', 'source': 0}, page_content="The Pope's Exorcist - Father Gabriele Amorth, Chief Exorcist of the Vatican, investigates a young boy's terrifying possession and ends up uncovering a centuries-old conspiracy the Vatican has desperately tried to keep hidden."),
 Document(id='3', metadata={'language': 'English', 'genre': 'Action', 'release_date': '2023-03-15', 'source': 3}, page_content='Shazam! Fury of the Gods - Billy Batson and his foster siblings, who transform into superheroes by saying "Shazam!", are forced to get back into action and fight the Daughters of Atlas, who they must stop from using a weapon that could destroy the world.'),
 Document(id='8', metadata={'language': 'English', 'genre': 'Adventure', 'release_date': '2023-03-23', 'source': 8}, page_content='Dungeons & Dragons: Honor Among Thieves - A charming thief and a band of unlikely adventurers undertake an epic heist to retri

Let's create an LLM for the RAG chain

In [None]:
# The model name is read from the OPENAI_MODEL environment variable (the name is a
# leftover from the earlier ChatOpenAI version of this cell). When it is unset or
# empty, fall back to the Groq model this notebook was written for rather than
# failing with a bare KeyError.
llm_model = os.environ.get("OPENAI_MODEL") or "llama-3.1-8b-instant"
print(f"ü§ñ LLM model: {llm_model}")

# Groq-hosted chat model; a low temperature keeps the answers close to deterministic.
llm = ChatGroq(model=llm_model, temperature=0.1)

The typical RAG prompt considers the *context* and the *question*

In [None]:
# Example for a public prompt (https://smith.langchain.com/hub/rlm/rag-prompt)
# Previously pulled with: hub.pull("rlm/rag-prompt", include_model=True)

# Now pulled through the LangSmith client's pull_prompt()
rag_prompt = hub_client.pull_prompt("rlm/rag-prompt")

# Display the prompt template of the first message
rag_prompt.messages[0].prompt

A basic chain that connects to the retriever

In [12]:
# The prompt is predefined, but other prompts could be used.
# The movie retriever supplies "context"; the incoming query is passed through
# unchanged as "question".
rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | rag_prompt 
    | llm
    | StrOutputParser()
)

query = "I want to get a movie about religion"
result = rag_chain.invoke(query)
# pprint.pprint(result)
# The chain ends with StrOutputParser, and the result is rendered as Markdown.
display(Markdown(result))

You might consider watching "The Pope's Exorcist," which involves themes of religion as it follows Father Gabriele Amorth investigating a young boy's possession and uncovering a Vatican conspiracy.

## Ingestion (chunks) and RAG

In [13]:
# We consider a large PDF file (path is relative to the notebook's working directory)
pdf_path = "./data/Understanding_Climate_Change.pdf"

loader = PyPDFLoader(pdf_path)
pdf_documents = loader.load() # Each loaded document actually corresponds to one page
print(len(pdf_documents), "loaded")

33 loaded


In [14]:
def replace_t_with_space(list_of_documents):
    """Replace every tab character in each document's ``page_content`` with a space.

    The documents are modified in place, and the same list object that was
    passed in is returned.
    """
    for document in list_of_documents:
        # Splitting on tabs and re-joining with spaces swaps each tab for one space.
        untabbed = " ".join(document.page_content.split("\t"))
        document.page_content = untabbed
    return list_of_documents


# Split documents into chunks (chunk_size=1000, chunk_overlap=100)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100)

texts = text_splitter.split_documents(pdf_documents)
# replace_t_with_space edits the chunks in place and returns the same list,
# so `texts` and `cleaned_texts` refer to the same (already cleaned) objects.
cleaned_texts = replace_t_with_space(texts)
print(len(cleaned_texts), "chunks")

97 chunks


In [None]:
# We use a vector store for the chunks, embedded with the local HuggingFace model
# (an earlier version of this cell used OpenAIEmbeddings()).
# NOTE: this rebinds `vectorstore`, which until now held the in-memory movie store.

# Stable, position-based ids keep this cell safe to re-run. Without explicit ids
# every run stores the chunks again under fresh random ids, so a second run would
# leave duplicate chunks in the collection and duplicate hits in the retriever.
# Caveat: if the chunking changes and yields fewer chunks, stale ids from an
# earlier run remain until the kernel is restarted.
chunk_ids = [f"chunk-{position}" for position in range(len(cleaned_texts))]
vectorstore = Chroma.from_documents(cleaned_texts, embeddings, ids=chunk_ids)
my_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

In [16]:
# Helper function for printing docs
def pretty_print_docs(docs):
    """Print each document's text under a 1-based "Document N:" heading.

    Consecutive documents are separated by a line of 100 dashes. Everything is
    emitted with a single print call.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append("Document " + str(position) + ":\n\n" + doc.page_content)
    print(separator.join(sections))

# Sanity check of the chunk retriever on a question about the PDF's topic.
test_query = "What is the main cause of climate change?"
context_docs =  my_retriever.invoke(test_query)
pretty_print_docs(context_docs)

Document 1:

change the amount of solar energy our planet receives. During the Holocene epoch, which 
began at the end of the last ice age, human societies flourished, but the industrial era has seen 
unprecedented changes. 
Modern Observations 
Modern scientific observations indicate a rapid increase in global temperatures, sea levels, 
and extreme weather events. The Intergovernmental Panel on Climate Change (IPCC) has 
documented these changes extensively. Ice core samples, tree rings, and ocean sediments 
provide a historical record that scientists use to understand past climate conditions and 
predict future trends. The evidence overwhelmingly shows that recent changes are primarily 
driven by human activities, particularly the emission of greenhouse gases. 
Chapter 2: Causes of Climate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increase in greenhouse gases in the 
atmosphere. Greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitro

In [17]:
# Then, we apply the RAG chain.
# This rebinds `rag_chain`: same prompt and LLM as before, but the context now
# comes from the PDF chunk retriever instead of the movie retriever.
rag_chain = (
    {"context": my_retriever,  "question": RunnablePassthrough()} 
    | rag_prompt 
    | llm
    | StrOutputParser()
)

# Note: `test_query` is reassigned here, and later cells reuse this new value.
test_query = "What was the latest storm on Earth?"
result = rag_chain.invoke(test_query)
# pprint.pprint(result)
display(Markdown(result))

I don't know.

## Re-Ranking

In this example, we use a cross-encoding strategy from HuggingFace, but other strategies can be applied

In [None]:
# Manual re-ranking with a cross-encoder.
# The original note states that ContextualCompressionRetriever is not available in
# LangChain 1.0+ (NOTE(review): it may simply live in a different package now — confirm),
# so re-ranking is implemented by hand using the cross-encoder directly.

from sentence_transformers import CrossEncoder

# Load the cross-encoder model used for re-ranking
cross_encoder_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_documents(query: str, documents: list, top_n: int = 3):
    """
    Re-rank documents against a query using the module-level cross-encoder.

    Args:
        query: The user's question.
        documents: Retrieved documents; each must expose ``page_content``.
        top_n: Maximum number of documents to return.

    Returns:
        The highest-scoring documents (at most ``top_n``), best first.
        An empty list when ``documents`` is empty.
    """
    # Nothing to score: return early instead of calling the model with no pairs.
    if not documents:
        return []

    # Build (query, document text) pairs for the cross-encoder
    pairs = [[query, doc.page_content] for doc in documents]

    # One relevance score per pair
    scores = cross_encoder_model.predict(pairs)

    # Sort by score only, highest first; the sort is stable, so documents with
    # equal scores keep their original retrieval order.
    scored_docs = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
    top_scored = scored_docs[:top_n]

    # Print the scores for debugging
    print(f"Re-ranking results (top {top_n}):")
    for i, (doc, score) in enumerate(top_scored, 1):
        preview = doc.page_content[:100].replace('\n', ' ')
        print(f"  {i}. Score: {score:.4f} - {preview}...")

    return [doc for doc, _ in top_scored]

print("‚úÖ Re-ranking function implemented with cross-encoder/ms-marco-MiniLM-L-6-v2")

In [None]:
# Try the re-ranking on documents returned by the retriever.
# my_retriever is configured with k=5, so 5 candidates are retrieved and then
# re-ranked down to the top 3. Raise k on the retriever to give the re-ranker a
# wider pool of candidates.

# Fetch the candidate documents from the retriever (no re-ranking yet).
# `test_query` still holds the value assigned in the RAG-chain cell above.
initial_docs = my_retriever.invoke(test_query)
print(f"\nInitial retrieval: {len(initial_docs)} documents")

# Apply the manual re-ranking
reranked_docs = rerank_documents(test_query, initial_docs, top_n=3)

print(f"\n{'-'*100}\nRe-ranked documents:\n{'-'*100}")
pretty_print_docs(reranked_docs)

In [None]:
# RAG chain with integrated re-ranking:
# a small custom retriever that applies the re-ranking step on every call.

class RerankedRetriever:
    """Retriever wrapper that re-ranks the base retriever's results on every call."""

    def __init__(self, base_retriever, rerank_function, top_n=3):
        # base_retriever: object exposing invoke(query)
        # rerank_function: callable invoked as rerank_function(query, docs, top_n)
        # top_n: forwarded to rerank_function as its third positional argument
        self.top_n = top_n
        self.rerank_function = rerank_function
        self.base_retriever = base_retriever

    def invoke(self, query: str):
        """Fetch candidates from the base retriever and return them re-ranked."""
        candidates = self.base_retriever.invoke(query)
        return self.rerank_function(query, candidates, self.top_n)

# Create the retriever with re-ranking: candidates come from my_retriever and are
# narrowed to the top 3 by rerank_documents.
reranked_retriever = RerankedRetriever(
    base_retriever=my_retriever,
    rerank_function=rerank_documents,
    top_n=3
)

# Create the RAG chain with re-ranking.
# RerankedRetriever is a plain class, so its bound `invoke` method is passed as a
# callable in the "context" slot rather than the object itself.
rag_chain_reranked = (
    {"context": reranked_retriever.invoke,  "question": RunnablePassthrough()} 
    | rag_prompt 
    | llm
    | StrOutputParser()
)

# Try the chain with re-ranking (reuses the current value of `test_query`)
print("\n" + "="*100)
print("RAG CHAIN WITH RE-RANKING")
print("="*100)
result = rag_chain_reranked.invoke(test_query)
print("\nAnswer:")
display(Markdown(result))

In [None]:
# Comparison: without re-ranking vs. with re-ranking

print("="*100)
print("COMPARISON: WITHOUT vs WITH RE-RANKING")
print("="*100)

# Test query
comparison_query = "What causes climate change?"
print(f"\nQuery: {comparison_query}\n")

# Retrieve the candidates once, so both sides of the comparison work on exactly
# the same documents and the query is not embedded and searched twice.
candidate_docs = my_retriever.invoke(comparison_query)

# 1. Without re-ranking: the first 3 candidates in vector-search order
print("\n[1] WITHOUT RE-RANKING (top 3 from vector search):")
print("-"*100)
docs_without = candidate_docs[:3]
for i, doc in enumerate(docs_without, 1):
    preview = doc.page_content[:120].replace('\n', ' ')
    print(f"  {i}. {preview}...")

# 2. With re-ranking: rerank_documents prints the scores and previews itself
print("\n[2] WITH RE-RANKING (cross-encoder scores):")
print("-"*100)
docs_with = rerank_documents(comparison_query, candidate_docs, top_n=3)

print("\n‚úÖ Re-ranking mejora la relevancia de los documentos recuperados")
print("   (documentos con mayor score sem√°ntico seg√∫n el cross-encoder)")

## Bonus: Visualization of Chunks and Query

https://github.com/gabrielchua/RAGxplorer

In [None]:
# Already installed in .venv
# %pip install ragxplorer nbformat

In [None]:
from ragxplorer import RAGxplorer

# Use the local all-MiniLM-L6-v2 model, the same name as EMBEDDING_MODEL earlier in the notebook.
# NOTE(review): the original comment says this is also RAGxplorer's default when
# no model is specified — confirm against the RAGxplorer documentation.
client = RAGxplorer(embedding_model="all-MiniLM-L6-v2")

# Load the PDF (per the original note, this also builds the vector database).
# chunk_size and chunk_overlap match the values given to the text splitter above.
client.load_pdf(
    document_path=pdf_path, 
    chunk_size=1000,
    chunk_overlap=100,
    verbose=True
)

In [None]:
# Use the "naive" (basic) retrieval method instead of "HyDE".
# The original note reports that HyDE has a bug with local embeddings in the
# RAGxplorer version in use (NOTE(review): re-check after upgrading RAGxplorer).
client.visualize_query(
    query=test_query, 
    retrieval_method="naive",  # Options: "naive", "HyDE", "multi_qns"
    top_k=6, 
    query_shape_size=10
)

---