# Notebook 8 · Comparative Evaluation

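With multiple pipelines in place, this notebook sets up a benchmark for them on a small evaluation set. The cells below score the baseline pipeline on two RAGAS quality metrics (faithfulness and answer relevancy); evaluating the other pipelines and adding latency measurements are left as extensions (see the final section).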

In [None]:
from pprint import pprint

import pandas as pd
from ragas import EvaluationDataset
from ragas import evaluate as ragas_evaluate
from ragas.metrics import answer_relevancy, faithfulness

from shared import (
    DEFAULT_MODEL,
    RetrievalContext,
    build_baseline_chain,
    build_retrieval_context,
    pretty_print_json,
    time_execution,
)


In [None]:
# Hand-written evaluation questions; each one becomes a row of `evaluation_set`.
evaluation_questions = [
    'How do I transfer workspace ownership?',
    'What are the storage limits on the Enterprise tier?',
    'Can contractors access billing dashboards?',
]
evaluation_set = pd.DataFrame({'question': evaluation_questions})
# Build the retrieval context once and hand its retriever to the baseline chain.
top_k = 4  # forwarded unchanged as build_retrieval_context's `top_k` argument
context = build_retrieval_context(top_k=top_k)
qa_chain = build_baseline_chain(context.retriever)


In [None]:
def run_baseline(row):
    """Answer one evaluation row with the baseline QA chain.

    Args:
        row: A row of ``evaluation_set``; its ``question`` field is sent to the chain.

    Returns:
        Whatever ``qa_chain.run`` returns for that question.
    """
    return qa_chain.run(row.question)


# Store the baseline answers next to their questions. Re-running this cell
# overwrites the column rather than appending to it.
evaluation_set['baseline_answer'] = evaluation_set.apply(run_baseline, axis=1)
# Bare last expression instead of print(): the notebook renders the frame as a rich table.
evaluation_set.head()


In [None]:
# Retrieve the supporting passages for every question, keeping one string per
# retrieved document so each evaluation sample carries a list of contexts.
retrieved_passages = [
    [doc.page_content for doc in context.retriever.get_relevant_documents(question)]
    for question in evaluation_set['question']
]
# Joined form (one string per question), kept under its original name and
# value for any code that still reads `retrieved_corpora`.
retrieved_corpora = ['\n\n'.join(passages) for passages in retrieved_passages]

metric_suite = [faithfulness, answer_relevancy]

# ragas_evaluate() takes a dataset object as its first argument rather than
# separate questions=/answers=/contexts= keyword lists, so assemble the
# per-question samples first.
# NOTE(review): the field names below follow the ragas single-turn sample
# schema (user_input / response / retrieved_contexts) — confirm against the
# installed ragas version.
ragas_dataset = EvaluationDataset.from_list([
    {
        'user_input': question,
        'response': answer,
        'retrieved_contexts': passages,
    }
    for question, answer, passages in zip(
        evaluation_set['question'],
        evaluation_set['baseline_answer'],
        retrieved_passages,
    )
])
ragas_report = ragas_evaluate(ragas_dataset, metrics=metric_suite)
ragas_report


## Visual analysis

Use the resulting `ragas_report` to chart the baseline pipeline's scores. To compare pipelines, extend this notebook by importing the functions defined in previous notebooks and repeating the evaluation for the planner/executor, reflective, and verified variants, then chart the reports side by side.