# Ragas Evaluation with Llama Stack - Demo

This notebook demonstrates how to use the Ragas out-of-tree provider.


## 1. Setup and Imports

In [120]:
# Install dev packages if not already installed
# !pip install -e ".[dev]""

from datetime import datetime
from rich.pretty import pprint

from llama_stack_client import LlamaStackClient


## 2. Llama Stack Client Setup
- Make sure we have an inference model (model_type='llm')
- Make sure we have an embedding model (model_type='embedding')

In [121]:
client = LlamaStackClient(
    base_url="http://localhost:8321"
)

models = client.models.list()
pprint(models)

INFO:httpx:HTTP Request: GET http://localhost:8321/v1/models "HTTP/1.1 200 OK"


## 4. Dataset Preparation

Create a sample RAG evaluation dataset. In a real scenario, you would load your own dataset.

In [122]:
# Sample RAG evaluation dataset
# Each sample needs: question, contexts, answer, ground_truth
evaluation_data = [
    {
        "question": "What is the capital of France?",
        "contexts": [
            "France is a country in Western Europe. Its capital city is Paris, which is also the largest city in the country.",
            "Paris is located in northern France and serves as the political, economic, and cultural center of the nation."
        ],
        "answer": "The capital of France is Paris.",
        "ground_truth": "Paris"
    },
    {
        "question": "Who wrote the novel '1984'?",
        "contexts": [
            "George Orwell was a British author known for his dystopian novels.",
            "'1984' is a dystopian novel published in 1949 by George Orwell.",
            "The novel depicts a totalitarian society and is considered one of the most influential books of the 20th century."
        ],
        "answer": "George Orwell wrote the novel '1984'.",
        "ground_truth": "George Orwell"
    },
    {
        "question": "What is photosynthesis?",
        "contexts": [
            "Photosynthesis is the process by which plants convert light energy into chemical energy.",
            "During photosynthesis, plants use sunlight, carbon dioxide, and water to produce glucose and oxygen.",
            "This process is essential for life on Earth as it produces the oxygen we breathe."
        ],
        "answer": "Photosynthesis is the process by which plants convert sunlight, carbon dioxide, and water into glucose and oxygen using chlorophyll.",
        "ground_truth": "Photosynthesis is the process by which plants use sunlight, carbon dioxide, and water to produce glucose and oxygen."
    },
    {
        "question": "When did World War II end?",
        "contexts": [
            "World War II was a global conflict that lasted from 1939 to 1945.",
            "The war ended in 1945 with the surrender of Germany in May and Japan in September.",
            "VJ Day (Victory over Japan Day) on September 2, 1945, marked the official end of World War II."
        ],
        "answer": "World War II ended in 1945, officially on September 2, 1945 (VJ Day).",
        "ground_truth": "1945"
    },
    {
        "question": "What is the largest planet in our solar system?",
        "contexts": [
            "Jupiter is the fifth planet from the Sun and the largest in our solar system.",
            "Jupiter is a gas giant with a mass greater than all other planets combined.",
            "The planet has a diameter of about 143,000 kilometers, making it 11 times wider than Earth."
        ],
        "answer": "Jupiter is the largest planet in our solar system.",
        "ground_truth": "Jupiter"
    }
]

## 5. Dataset Registration

Register the dataset with Llama Stack's Datasets API using the direct rows approach.

In [123]:
# Register the dataset
dataset_id = f"ragas_demo_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

# Register dataset with Datasets API
dataset_response = client.datasets.register(
    dataset_id=dataset_id,
    purpose="eval/question-answer",  # RAG evaluation purpose
    source={
        "type": "rows",
        "rows": evaluation_data
    },
    metadata={
        "provider_id": "localfs", # seems there's a bug in datasets
        "description": "Sample RAG evaluation dataset for Ragas demo",
        "size": len(evaluation_data),
        "format": "ragas",
        "created_at": datetime.now().isoformat()
    }
)
pprint(dataset_response)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/datasets "HTTP/1.1 200 OK"


## 6. Benchmark Registration

Register a benchmark that defines what metrics to use for evaluation.

In [124]:
benchmark_id = f"ragas_demo_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

ragas_metrics = [
    # "answer_relevancy",      # How relevant is the answer to the question?
    # "context_precision",     # How precise are the retrieved contexts?
    # "faithfulness",          # How faithful is the answer to the contexts?
    # "context_recall",        # How much of the ground truth is covered by contexts?
    "answer_correctness"     # How correct is the answer compared to ground truth?
]

benchmark_response = client.benchmarks.register(
    benchmark_id=benchmark_id,
    dataset_id=dataset_id,
    scoring_functions=ragas_metrics,
    # metadata={
    #     "provider": "ragas",
    #     "version": "1.0",
    #     "metrics_count": len(ragas_metrics),
    #     "created_at": datetime.now().isoformat()
    # }
)

pprint(benchmark_response)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/eval/benchmarks "HTTP/1.1 200 OK"


In [125]:
benchmarks = client.benchmarks.list()
pprint(benchmarks)

INFO:httpx:HTTP Request: GET http://localhost:8321/v1/eval/benchmarks "HTTP/1.1 200 OK"


## 7. Evaluation Execution

Run the evaluation using our Ragas out-of-tree provider.

In [None]:
# since we can't set the embedding model in the benchmark config, 
# the embedding model is set in the distribution run.yaml file (all-MiniLM-L6-v2)

# TODO: this works, but it always times out, how do we return the job id while continuing to run?
job = client.eval.run_eval(
    benchmark_id=benchmark_id,
    benchmark_config={
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.2-3B-Instruct",
            "sampling_params": {
                "temperature": 0.7,
                "top_p": 0.9,
                "max_tokens": 256
            },
        },
        "scoring_params": {},
    }
)
pprint(job)

INFO:llama_stack_client._base_client:Retrying request to /v1/eval/benchmarks/ragas_demo_benchmark_20250620_145130/jobs in 0.472683 seconds
INFO:llama_stack_client._base_client:Retrying request to /v1/eval/benchmarks/ragas_demo_benchmark_20250620_145130/jobs in 0.762811 seconds


APITimeoutError: Request timed out.

## 8. Results Display


In [None]:
# coming soon
# results = client.eval.job_result(
#     benchmark_id=benchmark_id,
#     job_id=job.job_id
# )
# pprint(results)