<a href="https://colab.research.google.com/github/towardsai/ragbook-notebooks/blob/main/notebooks/Chapter%2008%20-%20RAG_Metrics%26Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q llama-index==0.12.43 deeplake==4.2.10 openai==1.92.0 llama-index-vector-stores-deeplake==0.3.3 llama-index-llms-openai==0.4.7 llama-index-readers-web==0.4.1 \
                html2text==2024.2.26 ragas==0.2.15 jedi==0.19.2

In [None]:
import os

# os.environ["OPENAI_API_KEY"] = "<YOUR_OPENAI_KEY>"
# os.environ["ACTIVELOOP_TOKEN"] = "<YOUR_ACTIVELOOP_TOKEN>"

from google.colab import userdata

# Copy each credential from the Colab secret store into the process
# environment under the same name.
for secret_name in ("OPENAI_API_KEY", "ACTIVELOOP_TOKEN"):
    os.environ[secret_name] = userdata.get(secret_name)

In [None]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Notebook-wide LlamaIndex defaults (the Settings object replaces ServiceContext).
# Tunable values are named here so they are easy to find and change.
LLM_MODEL = "gpt-4.1-mini"
EMBEDDING_MODEL = "text-embedding-3-small"
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

Settings.llm = OpenAI(model=LLM_MODEL, temperature=0.0)
Settings.embed_model = OpenAIEmbedding(model=EMBEDDING_MODEL)
Settings.chunk_size = CHUNK_SIZE
Settings.chunk_overlap = CHUNK_OVERLAP

In [None]:
# necessary Imports
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    CorrectnessEvaluator,
    RetrieverEvaluator,
    BatchEvalRunner,
    generate_question_context_pairs
)

In [None]:
import asyncio
import nest_asyncio
import logging

# Enable nested async for Jupyter/Colab.
# Later cells call asyncio.run(...) directly; the notebook kernel presumably
# already has an event loop running, which is why the patch is applied first.
nest_asyncio.apply()

# Raise the HTTP-related loggers to WARNING so routine request logging
# stays out of the cell output.
for noisy_logger in ("httpx", "openai", "urllib3"):
    logging.getLogger(noisy_logger).setLevel(logging.WARNING)

# FaithfulnessEvaluator

In [None]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import Settings, VectorStoreIndex, StorageContext


# Open the DeepLake dataset "LlamaIndex_paulgraham_essay" that lives under
# your Activeloop organization.
my_activeloop_org_id = "" # TODO: use your organization id here
if not my_activeloop_org_id.strip():
    # Fail fast: a blank id would produce the malformed path
    # "hub:///LlamaIndex_paulgraham_essay" and a much less obvious error later.
    raise ValueError(
        "Set `my_activeloop_org_id` to your Activeloop organization id "
        "before loading the DeepLake dataset."
    )

dataset_path = f"hub://{my_activeloop_org_id}/LlamaIndex_paulgraham_essay"
vector_store = DeepLakeVectorStore(
    dataset_path=dataset_path,
    overwrite=False
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Create an index on top of the existing vector store
index = VectorStoreIndex.from_vector_store(
    vector_store,
    storage_context=storage_context
)


In [None]:
# Create evaluator
# No LLM is passed explicitly, so it presumably falls back to the global
# Settings.llm configured at the top of the notebook — TODO confirm.
evaluator = FaithfulnessEvaluator()

In [None]:
# Ask one question against the DeepLake-backed index (top 10 similar chunks),
# then run the faithfulness evaluator on the answer.
query_engine = index.as_query_engine(similarity_top_k=10)
response = query_engine.query("What does Paul Graham do?")
eval_result = evaluator.evaluate_response(response=response)

# Assemble the report first and emit it in one go
report_lines = [
    f"> Response: {response}",
    f"> Faithfulness evaluation passed: {eval_result.passing}",
    f"> Evaluation score: {eval_result.score}",
    f"> Evaluation feedback: {eval_result.feedback}",
]
print("\n".join(report_lines))

# RAGAS

In [None]:
# Load web content for RAGAS demonstration
from llama_index.readers.web import SimpleWebPageReader

# Download the Wikipedia page with HTML-to-text conversion enabled
nyc_urls = ["https://en.wikipedia.org/wiki/New_York_City"]
web_reader = SimpleWebPageReader(html_to_text=True)
documents = web_reader.load_data(nyc_urls)

# Index the page and expose it through a default query engine
vector_index = VectorStoreIndex.from_documents(documents)
query_engine = vector_index.as_query_engine()

# Smoke-test the pipeline with a single question
response_vector = query_engine.query("How did New York City get its name?")
print(f"Response: {response_vector}")

In [None]:
## RAGAS Evaluation Setup (Updated API)
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    AspectCritic
)

# Questions to evaluate; the last one is deliberately unrelated to the
# New York City page that was indexed.
eval_questions = [
    "What is the population of New York City as of 2020?",
    "Which borough of New York City has the highest population?",
    "What is the economic significance of New York City?",
    "How did New York City get its name?",
    "What is the significance of the Statue of Liberty in New York City?",
    "How many people got killed in WW2?"
]

# Reference answers, listed in the same order as the questions above
eval_answers = [
    "As of 2020, New York City has a population of approximately 8.3 million people.",
    "Brooklyn is the most populous borough of New York City.",
    "New York City is a global financial center and major economic hub, housing Wall Street and numerous multinational corporations.",
    "New York City was named after the Duke of York when the British took control from the Dutch in 1664.",
    "The Statue of Liberty symbolizes freedom and democracy, serving as a welcoming beacon for immigrants arriving in America.",
    "World War II resulted in an estimated 70 to 85 million fatalities, making it the deadliest conflict in human history"
]

# Run every question through the query engine, keeping the answer text and
# the text of each source node that came back with it.
eval_responses = []
eval_contexts = []
for question in eval_questions:
    response = query_engine.query(question)
    eval_responses.append(str(response))
    eval_contexts.append(
        [source.node.get_text() for source in response.source_nodes]
    )

In [None]:
# Create dataset for RAGAS evaluation
from datasets import Dataset

# Column name -> per-question values; the four lists are index-aligned
ragas_columns = {
    "question": eval_questions,
    "answer": eval_responses,
    "contexts": eval_contexts,
    "ground_truth": eval_answers,
}
eval_dataset = Dataset.from_dict(ragas_columns)

In [None]:
## Comment out this cell if the harmfulness metric is not required, and uncomment the cell below instead.

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Judge LLM and embedding model for RAGAS: LangChain OpenAI clients wrapped
# in the RAGAS adapter classes.
judge_chat_model = ChatOpenAI(model="gpt-4o-mini")
judge_embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
evaluator_llm = LangchainLLMWrapper(judge_chat_model)
evaluator_embeddings = LangchainEmbeddingsWrapper(judge_embedding_model)

# Custom critic defined by a natural-language question and given the judge LLM
harmfulness = AspectCritic(
    llm=evaluator_llm,
    name="harmfulness",
    definition="Does the submission cause or have the potential to cause harm to individuals, groups, or society at large?",
)

# Four stock RAGAS metrics plus the custom critic
metrics = [faithfulness, answer_relevancy, context_precision, context_recall, harmfulness]

# Score the prepared dataset with the wrapped judge LLM and embeddings
result = evaluate(
    dataset=eval_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
)

print("RAGAS Evaluation Results:", result, sep="\n")

In [None]:
## Uncomment if the harmfulness metric is not required

# metrics = [faithfulness, answer_relevancy, context_precision, context_recall]
# result = evaluate(eval_dataset, metrics=metrics)

# print("RAGAS Evaluation Results:")
# print(result)

In [None]:
# Convert to pandas for better visualization
import pandas as pd
# Tabular view of the RAGAS result
results_df = result.to_pandas()
print("\nDetailed Results:", results_df, sep="\n")

# Custom RAG pipeline

In [None]:
!wget 'https://raw.githubusercontent.com/idontcalculate/data-repo/main/venus_transmission.txt'

In [None]:
from llama_index.core import SimpleDirectoryReader

# The wget cell saves the file into the current working directory, so read it
# from there. A relative path also keeps the notebook runnable outside Colab,
# where /content does not exist.
venus_path = "venus_transmission.txt"
reader = SimpleDirectoryReader(input_files=[venus_path])

docs_venus = reader.load_data()
print(f"Loaded {len(docs_venus)} docs")

In [None]:
from llama_index.core.node_parser import TokenTextSplitter

# Split the loaded documents into nodes.
# NOTE(review): the splitter is built with no arguments, so it presumably uses
# its own default chunk size/overlap rather than Settings.chunk_size /
# Settings.chunk_overlap — confirm that this is intended.
node_parser = TokenTextSplitter()
nodes = node_parser.get_nodes_from_documents(
    documents=docs_venus,
    show_progress=True,
)

# Build the vector index directly from the nodes
vector_index = VectorStoreIndex(nodes=nodes, show_progress=True)

In [None]:
# Smoke test: one question, then the answer and the first source node returned
query_engine = vector_index.as_query_engine()
response = query_engine.query("What were the first beings to inhabit the planet?")

answer_text = response.response
print(f"Response: {answer_text}\n\n")

first_source = response.source_nodes[0]
print(f"First source node: {first_source.get_text()}")

## Generate Question-Context Pairs for Evaluation

In [None]:
from llama_index.core.evaluation import generate_question_context_pairs

# Build the evaluation set: the global LLM is asked for 2 questions per chunk
qa_dataset = generate_question_context_pairs(
    nodes,
    llm=Settings.llm,
    num_questions_per_chunk=2,
)

# Keep only the question texts for the later evaluation cells
queries = list(qa_dataset.queries.values())
print(f"Generated {len(queries)} questions for evaluation")
print("Sample queries:")
for position, sample_query in enumerate(queries[:3], start=1):
    print(f"{position}. {sample_query}")

In [None]:
## Retriever Evaluation
# Retriever under test: returns the 20 most similar nodes per query
retriever = vector_index.as_retriever(similarity_top_k=20)

# Metrics the evaluator computes for each query
retrieval_metrics = ["mrr", "hit_rate"]
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    retrieval_metrics,
    retriever=retriever,
)

async def evaluate_retriever():
    """Evaluate the retriever on the generated question/context dataset.

    Uses the notebook-level ``retriever_evaluator`` and ``qa_dataset`` and
    awaits ``aevaluate_dataset`` with 32 workers.

    Returns:
        The value produced by ``aevaluate_dataset``.
    """
    return await retriever_evaluator.aevaluate_dataset(qa_dataset, workers=32)

# Run evaluation
# NOTE(review): asyncio.run is called from inside the notebook; this presumably
# relies on the nest_asyncio.apply() call made earlier — confirm before reordering cells.
eval_results = asyncio.run(evaluate_retriever())

In [None]:
# Display results
def display_results(name, eval_results):
    """Summarise retriever evaluation results as a one-row DataFrame.

    Args:
        name: Label for the retriever; stored in the "Retriever Name" column.
        eval_results: Iterable of objects exposing a ``metric_vals_dict``
            mapping with "hit_rate" and "mrr" entries.

    Returns:
        pandas.DataFrame with the columns "Retriever Name", "Hit Rate" and
        "MRR", where the two metrics are the means over all results. A metric
        is NaN when ``eval_results`` is empty or no result carries it
        (previously this raised ``KeyError``).
    """
    metric_dicts = [eval_result.metric_vals_dict for eval_result in eval_results]

    # Pin the columns and dtype so that an empty result list still produces
    # both metric columns instead of a column-less frame.
    full_df = pd.DataFrame(metric_dicts, columns=["hit_rate", "mrr"], dtype=float)

    return pd.DataFrame({
        "Retriever Name": [name],
        "Hit Rate": [full_df["hit_rate"].mean()],
        "MRR": [full_df["mrr"].mean()],
    })

# NOTE: this rebinds `results_df`, which an earlier cell used for the RAGAS table
results_df = display_results("OpenAI Embedding Retriever", eval_results)
print("Retriever Evaluation Results:", results_df, sep="\n")

## Response Quality Evaluation

In [None]:
# Create evaluators for response quality.
# Both are built with default arguments; they are used on a single query in
# the next cell and then handed to the batch runner further down.
relevancy_evaluator = RelevancyEvaluator()
faithfulness_evaluator = FaithfulnessEvaluator()

In [None]:
# Use the first generated question, or a generic fallback if none were generated
if queries:
    eval_query = queries[0]
else:
    eval_query = "What is the main topic of this document?"
response = query_engine.query(eval_query)

print(f"\nEvaluation Query: {eval_query}")
print(f"Response: {response.response}")

# Faithfulness is evaluated from the response object alone
faithfulness_result = faithfulness_evaluator.evaluate_response(response=response)
print(
    f"Faithfulness - Passed: {faithfulness_result.passing}",
    f"Faithfulness - Score: {faithfulness_result.score}",
    sep="\n",
)

# Relevancy is given the query as well as the response
relevancy_result = relevancy_evaluator.evaluate_response(
    query=eval_query,
    response=response,
)
print(
    f"Relevancy - Passed: {relevancy_result.passing}",
    f"Relevancy - Score: {relevancy_result.score}",
    sep="\n",
)

## Batch Evaluation

In [None]:
# Limit the batch to the first 10 generated questions
sample_queries = queries[:10]

# Evaluators to run on every response; the keys are the names under which
# the results are looked up afterwards.
response_evaluators = {
    "faithfulness": faithfulness_evaluator,
    "relevancy": relevancy_evaluator,
}
batch_runner = BatchEvalRunner(response_evaluators, workers=4)

# Query the engine and evaluate all responses in one async pass
batch_eval_coro = batch_runner.aevaluate_queries(query_engine, queries=sample_queries)
batch_results = asyncio.run(batch_eval_coro)

# Calculate aggregate scores: collect the pass/fail flag of every result
faithfulness_scores = [result.passing for result in batch_results['faithfulness']]
relevancy_scores = [result.passing for result in batch_results['relevancy']]


def _pass_rate(passing_flags):
    """Return the fraction of truthy flags, or NaN when the list is empty.

    An empty list occurs when no questions were generated; returning NaN
    avoids the ZeroDivisionError a bare ``sum(...) / len(...)`` would raise.
    """
    if not passing_flags:
        return float("nan")
    return sum(passing_flags) / len(passing_flags)


faithfulness_avg = _pass_rate(faithfulness_scores)
relevancy_avg = _pass_rate(relevancy_scores)

print("\nBatch Evaluation Results:")
print(f"Average Faithfulness Score: {faithfulness_avg:.2f}")
print(f"Average Relevancy Score: {relevancy_avg:.2f}")