In [1]:
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
import os

load_dotenv()

os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

llm = ChatGoogleGenerativeAI(
        model = "gemini-2.5-flash",
    )

E0000 00:00:1761435268.455832   11828 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


**Table of contents**<a id='toc0_'></a>    
- [Zero-shot Query Rewriting](#toc1_)    
- [Few-shot Query Rewriting](#toc2_)    
- [Sub-queries](#toc3_)    
- [Step-back prompt](#toc4_)    
- [HyDE](#toc5_)    
- [Ejemplo: RAG sin Query Rewriting vs RAG con Query Rewriting](#toc6_)    
  - [RAG sin Query Rewriting](#toc6_1_)    
  - [Con Query Rewriting](#toc6_2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings_model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

embeddings_model = HuggingFaceEmbeddings(model_name = embeddings_model_name, model_kwargs = model_kwargs, encode_kwargs = encode_kwargs)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# <a id='toc1_'></a>[Zero-shot Query Rewriting](#toc0_)


In [3]:
from langchain_core.prompts import ChatPromptTemplate

system_rewrite = """You are a helpful assistant that generates multiple search queries based on a single input query.

Perform query expansion. If there are multiple common ways of phrasing a user question
or common synonyms for key words in the question, make sure to return multiple versions
of the query with the different phrasings.

If there are acronyms or words you are not familiar with, do not try to rephrase them.

Return 3 different versions of the question."""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_rewrite),
        ("human", "{question}")
        ]
)

chain = prompt | llm

In [4]:
response = chain.invoke({
    "question": "Which food items does this recipe need?"
})

response

AIMessage(content='1.  What ingredients does this recipe need?\n2.  What food items are required for this recipe?\n3.  Which ingredients are used in this recipe?', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run--a1d4730b-5454-477f-bed5-5c44bbb0a4f0-0', usage_metadata={'input_tokens': 102, 'output_tokens': 35, 'total_tokens': 380, 'input_token_details': {'cache_read': 0}})

# <a id='toc2_'></a>[Few-shot Query Rewriting](#toc0_)

In [5]:
from langchain_core.prompts import FewShotChatMessagePromptTemplate

examples = [
    {
        "question": "How tall is the Eiffel Tower? It looked so high when I was there last year",
        "answer": "What is the height of the Eiffel Tower?"
    },
    {
        "question": "1 oz is 28 grams, how many cm is 1 inch?",
        "answer": "Convert 1 inch to cm."
    },
    {
        "question": "What's the main point of the article? What did the author try to convey?",
        "answer": "What is the main key point of this article?"
    }
]

example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{question}"),
        ("ai", "{answer}"),
    ]
)

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt = example_prompt,
    examples = examples
)

In [6]:
final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_rewrite),
        few_shot_prompt,
        ("human", "{question}"),
    ]
)

chain = final_prompt | llm

In [7]:
response = chain.invoke({
    "question": "Which food items does this recipe need?"
})

response

AIMessage(content='1.  What ingredients are required for this recipe?\n2.  What food items are needed for this recipe?\n3.  Which ingredients does this recipe call for?', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run--00cb70bb-ea36-49c6-a438-30bc62fed14a-0', usage_metadata={'input_tokens': 185, 'output_tokens': 36, 'total_tokens': 344, 'input_token_details': {'cache_read': 0}})

# <a id='toc3_'></a>[Sub-queries](#toc0_)

In [8]:
system_decompose = """You are a helpful assistant that generates search queries based on a single input query.

Perform query decomposition. Given a user question, break it down into distinct sub questions that
you need to answer in order to answer the original question.

If there are acronyms or words you are not familiar with, do not try to rephrase them."""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_decompose),
        ("human", "{question}"),
    ]
)

chain = prompt | llm

In [9]:
response = chain.invoke({
    "question": """Which is the most popular programming language for machine learning and
is it the most popular programming language overall?"""
})

response

AIMessage(content='Here are the sub-questions and search queries to answer your request:\n\n**Sub-questions:**\n\n1.  What is the most popular programming language used in machine learning?\n2.  Is that same language the most popular programming language across all domains?\n\n**Search Queries:**\n\n*   "most popular programming language machine learning"\n*   "top programming languages for AI"\n*   "most used programming language overall"\n*   "ranking of programming languages by popularity"', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run--73d23705-0355-4c8e-928e-78ea85fccc42-0', usage_metadata={'input_tokens': 91, 'output_tokens': 100, 'total_tokens': 245, 'input_token_details': {'cache_read': 0}})

# <a id='toc4_'></a>[Step-back prompt](#toc0_)

In [11]:
system_step_back = """You are an expert at taking a specific question and extracting a more generic question that gets at
the underlying principles needed to answer the specific question.

Given a specific user question, write a more generic question that needs to be answered in order to answer the specific question.

If you don't recognize a word or acronym to not try to rewrite it.

Write concise questions."""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_step_back),
        ("human", "{question}"),
    ]
)

chain = prompt | llm

In [12]:
response = chain.invoke({
    "question": """Which is the most popular programming language for machine learning?"""
})

response

AIMessage(content='How does one determine the most popular programming language for a given domain?', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run--0d0d1e8d-3d5f-496d-bb59-fff532eba30f-0', usage_metadata={'input_tokens': 89, 'output_tokens': 14, 'total_tokens': 501, 'input_token_details': {'cache_read': 0}})

# <a id='toc5_'></a>[HyDE](#toc0_)

In [13]:
actual_document = """
Berkson's paradox, also known as Berkson's bias, collider bias, or Berkson's fallacy, is a result in conditional probability
and statistics which is often found to be counterintuitive, and hence a veridical paradox. It is a complicating factor arising in
statistical tests of proportions. Specifically, it arises when there is an ascertainment bias inherent in a study design. The effect is
related to the explaining away phenomenon in Bayesian networks, and conditioning on a collider in graphical models.

It is often described in the fields of medical statistics or biostatistics, as in the original description of the problem by Joseph Berkson.
"""

In [14]:
actual_document_emb = embeddings_model.embed_documents([actual_document])

system_hyde = """You are an expert at using a question to generate a document useful for answering the question.

Given a question, generate a paragraph of text that answers the question.
"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_hyde),
        ("human", "{question}"),
    ]
)

chain = prompt | llm

In [15]:
hypothetical_document = chain.invoke({
    "question": """What does Berkson's paradox consist on?"""
})

hypothetical_document

AIMessage(content='Berkson\'s paradox, also known as Berkson\'s bias or collider bias, is a conditional probability paradox where two independent (or even positively correlated) events appear to be negatively correlated within a *selected* subpopulation. It arises when the selection into the observed group is dependent on *both* events, specifically when the observed group consists of individuals where at least one of the events has occurred. For instance, if you only observe people admitted to a hospital (the selected group), and admission is more likely if you have either disease A or disease B (or both), you might observe a negative correlation between diseases A and B *within the hospital population*, even if they are independent in the general population. This is because if a patient *doesn\'t* have disease A, they *must* be more likely to have disease B (to explain their hospital admission), creating an artificial inverse relationship. The paradox highlights how conditioning on a

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

question_embeddings = embeddings_model.embed_documents(["What does Berkson's paradox consist on?"])
hypothetical_document_emb = embeddings_model.embed_documents([hypothetical_document.content])

print(f"Similarity without HyDE: {cosine_similarity(question_embeddings, actual_document_emb)}")
print(f"Similarity with HyDE: {cosine_similarity(hypothetical_document_emb, actual_document_emb)}")

Similarity without HyDE: [[0.86675569]]
Similarity with HyDE: [[0.96079076]]


# <a id='toc6_'></a>[Ejemplo: RAG sin Query Rewriting vs RAG con Query Rewriting](#toc0_)

## <a id='toc6_1_'></a>[RAG sin Query Rewriting](#toc0_)

In [22]:
from langchain_community.document_loaders import PyPDFLoader

document_url = "https://arxiv.org/pdf/2312.10997.pdf"
loader = PyPDFLoader(document_url)
pages = loader.load()

In [23]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 400,
    chunk_overlap = 40,
    length_function = len,
    is_separator_regex = False,
)
chunks = text_splitter.split_documents(pages)

chunk_texts = list(map(lambda d: d.page_content, chunks))
embeddings = embeddings_model.embed_documents(chunk_texts)

In [24]:
from langchain_community.vectorstores import FAISS

text_embedding_pairs = zip(chunk_texts, embeddings)
db = FAISS.from_embeddings(text_embedding_pairs, embeddings_model)

In [25]:
query = "Which evaluation tools are useful for evaluating a RAG pipeline?"

contexts = db.similarity_search(query, k=1)

print(contexts[0])

page_content='D. Evaluation Benchmarks and Tools
A series of benchmark tests and tools have been proposed
to facilitate the evaluation of RAG.These instruments furnish
quantitative metrics that not only gauge RAG model perfor-
mance but also enhance comprehension of the model’s capabil-
ities across various evaluation aspects. Prominent benchmarks
such as RGB, RECALL and CRUD [167]–[169] focus on'


In [26]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert at answering questions based on a context extracted from a document. The context extracted from the document is: {context}"),
        ("human", "{question}"),
    ]
)

chain = prompt | llm

In [27]:
response = chain.invoke({
    "context": '\n\n'.join(list(map(lambda c: c.page_content, contexts))),
    "question": query
})

response

AIMessage(content='The context mentions the following prominent benchmarks that serve as evaluation tools for RAG pipelines:\n*   RGB\n*   RECALL\n*   CRUD', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run--1717c3a9-d216-4630-83b6-c19f96c8d785-0', usage_metadata={'input_tokens': 126, 'output_tokens': 30, 'total_tokens': 518, 'input_token_details': {'cache_read': 0}})

## <a id='toc6_2_'></a>[Con Query Rewriting](#toc0_)

In [31]:
from langchain.output_parsers import PydanticToolsParser
from pydantic import BaseModel, Field

class ParaphrasedQuery(BaseModel):
    
    """You have performed query expansion to generate a paraphrasing of a question."""
    paraphrased_query: str = Field(
        description = "A unique paraphrasing of the original question.",
    )

In [32]:
rewrite_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_rewrite),
        ("human", "{question}"),
    ]
)

llm_with_tools = llm.bind_tools([ParaphrasedQuery])
query_analyzer = rewrite_prompt | llm_with_tools | PydanticToolsParser(tools = [ParaphrasedQuery])

In [33]:
queries = query_analyzer.invoke({
    "question": query
})

queries

[ParaphrasedQuery(paraphrased_query='What are some effective evaluation tools for RAG pipelines?'),
 ParaphrasedQuery(paraphrased_query='What tools can be used to evaluate a Retrieval Augmented Generation (RAG) pipeline?'),
 ParaphrasedQuery(paraphrased_query='Recommended evaluation frameworks for RAG systems?')]

In [34]:
contexts = []
for query in queries:
    contexts = contexts + db.similarity_search(query.paraphrased_query, k = 1)

contexts = contexts + db.similarity_search("Which evaluation tools are useful for evaluating a RAG pipeline?", k =1)

contexts

[Document(id='1bc98a4c-2e63-4bd2-a875-88a979025cc5', metadata={}, page_content='appraising the essential abilities of RAG models. Concur-\nrently, state-of-the-art automated tools like RAGAS [164],\nARES [165], and TruLens 8 employ LLMs to adjudicate the\nquality scores. These tools and benchmarks collectively form\na robust framework for the systematic evaluation of RAG\nmodels, as summarized in Table IV.\nVII. D ISCUSSION AND FUTURE PROSPECTS'),
 Document(id='123d6a43-eefc-45b9-8654-869f176abc75', metadata={}, page_content='LLM-auto-eval-best-practices-RAG, 2023.\n[164] S. Es, J. James, L. Espinosa-Anke, and S. Schockaert, “Ragas: Au-\ntomated evaluation of retrieval augmented generation,” arXiv preprint\narXiv:2309.15217, 2023.\n[165] J. Saad-Falcon, O. Khattab, C. Potts, and M. Zaharia, “Ares: An\nautomated evaluation framework for retrieval-augmented generation\nsystems,” arXiv preprint arXiv:2311.09476 , 2023.'),
 Document(id='1bc98a4c-2e63-4bd2-a875-88a979025cc5', metadata={}, p

In [35]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert at answering questions based on a context extracted from a document. The context extracted from the document is: {context}"),
        ("human", "{question}"),
    ]
)

chain = prompt | llm

In [36]:
response = chain.invoke({
    "context": '\n\n'.join(list(map(lambda c: c.page_content, contexts))),
    "question": query
})

response

AIMessage(content='A robust framework for the systematic evaluation of RAG models is formed by a combination of state-of-the-art automated tools and prominent benchmarks.\n\nRecommended evaluation frameworks and tools include:\n*   **Automated Tools:** RAGAS [164], ARES [165], and TruLens 8. These tools utilize LLMs to assess quality scores.\n*   **Prominent Benchmarks:** RGB, RECALL, and CRUD [167]–[169].\n\nThese instruments provide quantitative metrics to gauge RAG model performance and enhance understanding of their capabilities across various evaluation aspects.', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run--144f31a4-af0f-42ce-b415-e5e57de20d56-0', usage_metadata={'input_tokens': 474, 'output_tokens': 127, 'total_tokens': 1118, 'input_token_details': {'cache_read': 0}})