In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
import nest_asyncio

# Patch asyncio so event loops can be nested; the notebook kernel already
# runs a loop and later cells drive async code on top of it.
nest_asyncio.apply()
# Blanket filter: suppresses ALL warnings for the rest of the session,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [3]:
from llama_index.llms.ollama import Ollama
from llama_index.core.llama_dataset import (
    LabelledRagDataset,
    LabelledRagDataExample,
    CreatedBy,
)
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

import pandas as pd
import json

# Embedding model served by Ollama; used when building the vector index.
embed_model = OllamaEmbedding(model_name="nomic-embed-text")

# Read every file under ../data/ into documents, showing a progress bar.
docs = SimpleDirectoryReader(input_dir="../data/").load_data(show_progress=True)

def get_rag_dataset_from_csv(csv_path: str) -> LabelledRagDataset:
    """Rebuild a ``LabelledRagDataset`` from a CSV file.

    The CSV must contain one row per example with these columns:

    * ``query`` and ``reference_answer`` -- free text.
    * ``reference_contexts`` -- a JSON array (decoded with ``json.loads``).
    * ``query_by`` and ``reference_answer_by`` -- JSON documents accepted by
      ``CreatedBy.model_validate_json``.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A ``LabelledRagDataset`` holding one ``LabelledRagDataExample`` per
        CSV row, in file order.

    Raises:
        KeyError: If one of the required columns is absent from the CSV.
        Exception: Whatever ``json.loads`` / ``CreatedBy.model_validate_json``
            raise for a malformed cell propagates unchanged.
    """
    # Pass the parsers directly; wrapping them in a lambda adds nothing.
    converters = {
        "reference_contexts": json.loads,
        "query_by": CreatedBy.model_validate_json,
        "reference_answer_by": CreatedBy.model_validate_json,
    }
    df = pd.read_csv(
        csv_path,
        converters=converters,
        # Keep the free-text columns as plain strings: without this pandas
        # infers numeric dtypes for digit-only cells ...
        dtype={"query": str, "reference_answer": str},
        # ... and turns literal texts such as "None", "NA" or "null" (and
        # empty cells) into float NaN, which is not a valid query/answer.
        keep_default_na=False,
    )

    # One example per row; records keep the converted Python objects as-is.
    examples = [
        LabelledRagDataExample(
            query=record["query"],
            query_by=record["query_by"],
            reference_contexts=record["reference_contexts"],
            reference_answer=record["reference_answer"],
            reference_answer_by=record["reference_answer_by"],
        )
        for record in df.to_dict(orient="records")
    ]
    return LabelledRagDataset(examples=examples)

# Load the held-out evaluation examples from the CSV next to this notebook.
holdout_dataset = get_rag_dataset_from_csv(csv_path="holdout_dataset.csv")

Loading files: 100%|██████████| 1/1 [00:00<00:00,  1.52file/s]


In [4]:
# Embed the loaded documents into a vector index.
index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)

# Ollama-served model that answers the queries under evaluation.
answer_llm = Ollama(
    model="hf.co/tituslhy/llama32_1bn_raft_non_traditional_credentials:Q4_K_M"
)

# Retrieve the 6 most similar chunks per query and let the model answer.
query_engine = index.as_query_engine(llm=answer_llm, similarity_top_k=6)

In [6]:
from llama_index.packs.rag_evaluator import RagEvaluatorPack

rag_evaluator = RagEvaluatorPack(
    query_engine=query_engine,
    rag_dataset=holdout_dataset,
    # Judge with the same LLM that was used to create the dataset; the
    # 120 s request timeout is set explicitly for the judge's calls.
    judge_llm=Ollama(model="qwen2.5", request_timeout=120.0),
    # Reuse the embedding model built earlier in the notebook rather than
    # constructing a second, identical OllamaEmbedding, so the model name
    # is configured in exactly one place.
    embed_model=embed_model,
)

In [7]:
# Run the evaluation pack asynchronously (top-level await in the notebook)
# and keep the result it returns for display below.
benchmark_df = await rag_evaluator.arun()

Batch processing of predictions: 100%|██████████| 10/10 [00:27<00:00,  2.79s/it]
Batch processing of predictions: 100%|██████████| 3/3 [00:09<00:00,  3.06s/it]
Batch processing of evaluations: 100%|██████████| 7/7.0 [03:08<00:00, 26.98s/it]


In [8]:
# Bare last expression: rendered by the notebook as the cell's output.
benchmark_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,2.653846
mean_relevancy_score,0.615385
mean_faithfulness_score,0.230769
mean_context_similarity_score,0.644645
