# Ragas Evaluation Demo — Remote Execution on Kubeflow Pipelines

- This notebook demonstrates how to run Ragas evaluation on Kubeflow Pipelines.
- See README.md for details on how to set up the demo.


## Imports


In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from langchain_core.prompt_values import StringPromptValue
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from ragas.embeddings.base import LangchainEmbeddingsWrapper
from ragas.llms.base import LangchainLLMWrapper
from rich.pretty import pprint

from magenta_pipelines.config import EvalConfig, KubeflowConfig
from magenta_pipelines.pipeline_runner import PipelineRunner

# Load the local .env file; later cells read INFERENCE_URL and the pipeline configs from it.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

## Load configs from .env

In [2]:
# Build the Kubeflow Pipelines settings. No arguments are passed here, so the
# values are expected to come from the .env file loaded in the imports cell
# (see README.md for setup).
kfp_config = KubeflowConfig()

## Inference setup (generations and embeddings)
- You can use any OpenAI API-compatible model and wrap it with the Ragas LangchainLLMWrapper or LangchainEmbeddingsWrapper.
- This example uses Ollama

In [3]:
# Point the Ollama LLM at the inference endpoint configured in the environment,
# then wrap it so it can be used through the Ragas LLM interface.
ollama_llm = OllamaLLM(model="granite3.3:2b", base_url=os.environ["INFERENCE_URL"])
llm_client = LangchainLLMWrapper(ollama_llm)

# Smoke test: StringPromptValue is the way Ragas will invoke the LLM.
# Kept as the last expression so the notebook displays the LLMResult.
joke_prompt = StringPromptValue(text="Tell me a joke")
llm_client.generate_text(joke_prompt)

LLMResult(generations=[[GenerationChunk(text="Why don't scientists trust atoms? Because they make up everything!", generation_info={'model': 'granite3.3:2b', 'created_at': '2025-08-26T01:46:29.487983Z', 'done': True, 'done_reason': 'stop', 'total_duration': 1398585709, 'load_duration': 1103697375, 'prompt_eval_count': 48, 'prompt_eval_duration': 103471458, 'eval_count': 16, 'eval_duration': 190822500, 'response': '', 'thinking': None, 'context': [49152, 2946, 49153, 39558, 390, 17071, 2821, 44, 30468, 225, 36, 34, 36, 38, 32, 203, 4282, 884, 8080, 278, 659, 30, 18909, 810, 25697, 32, 2448, 884, 312, 17247, 19551, 47330, 32, 0, 203, 49152, 496, 49153, 46020, 597, 312, 8725, 479, 0, 203, 49152, 17594, 49153, 24402, 2800, 1330, 2197, 1606, 3045, 18074, 27898, 49, 18369, 2953, 1930, 973, 8989, 19]})]], llm_output=None, run=[RunInfo(run_id=UUID('9e2a7bb2-d3b9-4acb-80ac-968aecee61ce'))], type='LLMResult')

In [4]:
# Embedding model served from the same INFERENCE_URL endpoint as the LLM,
# wrapped so it can be used through the Ragas embeddings interface.
ollama_embeddings = OllamaEmbeddings(
    model="all-minilm:l6-v2",
    base_url=os.environ["INFERENCE_URL"],
)
embedding_client = LangchainEmbeddingsWrapper(ollama_embeddings)

# Smoke test: embed a single string and check the width of the returned vector.
embedded_documents = embedding_client.embed_documents(["Hello, world!"])
first_embedding = embedded_documents[0]
assert len(first_embedding) == 384  # l6-v2 has 384 dimensions

## Dataset Preparation

Create a sample RAG evaluation dataset. In a real scenario, you would load your own dataset.


In [5]:
# Sample Ragas evaluation dataset: each record holds a user question, a
# response to it, the retrieved contexts, and a reference answer.
evaluation_data = [
    dict(
        user_input="What is the capital of France?",
        response="The capital of France is Paris.",
        retrieved_contexts=[
            "Paris is the capital and most populous city of France."
        ],
        reference="Paris",
    ),
    dict(
        user_input="Who invented the telephone?",
        response="Alexander Graham Bell invented the telephone in 1876.",
        retrieved_contexts=[
            "Alexander Graham Bell was a Scottish-American inventor who patented the first practical telephone."
        ],
        reference="Alexander Graham Bell",
    ),
    dict(
        user_input="What is photosynthesis?",
        response="Photosynthesis is the process by which plants convert sunlight into energy.",
        retrieved_contexts=[
            "Photosynthesis is a process used by plants to convert light energy into chemical energy."
        ],
        reference="Photosynthesis is the process by which plants and other organisms convert light energy into chemical energy.",
    ),
]

## Upload the dataset to S3

Write the sample dataset to S3 as a JSON Lines file. The pipeline is given this input URI, along with an output URI under the same prefix.


In [6]:
# Bucket/prefix and dataset name shared by the input and output files.
S3_PREFIX = "s3://public-rhods/ragas-evaluation-pipeline"
dataset_id = "ragas_demo_dataset_remote"

# Both files sit under the same prefix and differ only in their suffix.
dataset_base_uri = f"{S3_PREFIX}/{dataset_id}"
input_dataset_uri = f"{dataset_base_uri}_input.jsonl"
output_dataset_uri = f"{dataset_base_uri}_output.jsonl"

In [7]:
# Upload the dataset to S3: one JSON object per line (JSON Lines), written
# directly to the input URI.
# NOTE(review): writing to an s3:// URL presumably needs an fsspec backend such
# as s3fs plus valid credentials — confirm both are covered by the demo setup.
evaluation_df = pd.DataFrame(evaluation_data)
evaluation_df.to_json(input_dataset_uri, orient="records", lines=True)

## Evaluation Execution

In [8]:
# Create the runner used below to submit the evaluation and query its status and result.
pipeline_runner = PipelineRunner(kfp_config)




In [9]:
# Generation parameters forwarded with the evaluation config.
generation_params = {"temperature": 0.1, "max_tokens": 100}

# Describe the run: same models as the local smoke tests above, plus the
# S3 locations of the input and output datasets.
# NOTE(review): Ragas itself names this metric "answer_relevancy" — confirm
# which spelling EvalConfig expects in metric_names.
eval_config = EvalConfig(
    model="granite3.3:2b",
    model_params=generation_params,
    embedding_model="all-minilm:l6-v2",
    metric_names=["answer_relevance"],
    input_dataset_uri=input_dataset_uri,
    output_dataset_uri=output_dataset_uri,
    # inference_url is read from .env
)

# Submit the evaluation and show the returned job; its job_id is used by the
# status and result cells below.
job = pipeline_runner.run_eval(eval_config)
pprint(job)

## Results Display


In [10]:
# Look up the current status of the job submitted above.
pprint(pipeline_runner.job_status(job_id=job.job_id))

In [14]:
# The pipeline runs remotely, so wait a few moments and re-run this cell
# until the reported status shows that the job has finished.
pprint(pipeline_runner.job_status(job_id=job.job_id))

In [15]:
# Fetch and display the job's result; run this after the status cell above shows the job has finished.
pprint(pipeline_runner.job_result(job_id=job.job_id))