In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
import warnings
import nest_asyncio

_ = load_dotenv(find_dotenv())
warnings.filterwarnings("ignore")
nest_asyncio.apply()

In [3]:
azure_api_key = os.environ["AZURE_OPENAI_API_KEY"]
azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_api_version = os.environ["AZURE_API_VERSION"]

In [None]:
from llama_index.core import (
    Settings,
    VectorStoreIndex,
    SimpleDirectoryReader,
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

Settings.llm = OpenAI(model="gpt-4o",)
Settings.embed_model = OpenAIEmbedding()

In [6]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator

docs = SimpleDirectoryReader("../data/").load_data(show_progress=True)
data_gen = RagDatasetGenerator.from_documents(
    docs,
    llm= Settings.llm,
    question_gen_query="You are a teacher/professor. Using the provided context, formulat a single question and its answer",
    num_questions_per_chunk=10
)

Loading files:   0%|          | 0/1 [00:00<?, ?file/s]

Loading files: 100%|██████████| 1/1 [00:00<00:00,  1.56file/s]


In [7]:
qa_dataset = data_gen.generate_dataset_from_nodes()

In [8]:
qa_dataset.examples

[LabelledRagDataExample(query='Question: What impact did the intervention have on learners who completed MOOCs in terms of employment and credential sharing?', query_by=CreatedBy(model_name='gpt-4o', type=<CreatedByType.AI: 'ai'>), reference_contexts=['The value of non-traditional credentials in the labor market*\nSusan Athey & Emil Palikot\nMay 2, 2024\nAbstract\nThis study investigates the labor market value of credentials obtained from Massive Open On-\nline Courses (MOOCs) and shared on business networking platforms. We conducted a random-\nized experiment involving more than 800,000 learners, primarily from developing countries and\nwithout college degrees, who completed technology or business-related courses on the Coursera\nplatform between September 2022 and March 2023. The intervention targeted learners who had\nrecently completed their courses, encouraging them to share their credentials and simplifying the\nsharing process. One year after the intervention, we collected data 

## Evaluate RAG

In [12]:
from llama_index.llms.ollama import Ollama

llm = Ollama(model="llama3.1")

In [13]:
index = VectorStoreIndex.from_documents(docs, llm=llm)
query_engine = index.as_query_engine(similarity_top_k=6)

In [14]:
from llama_index.packs.rag_evaluator import RagEvaluatorPack

rag_evaluator = RagEvaluatorPack(
    query_engine=query_engine, 
    rag_dataset=qa_dataset,
    judge_llm=Settings.llm, #use the same llm that we use to create the dataset to judge
    embed_model=Settings.embed_model
)

In [15]:
benchmark_df = await rag_evaluator.arun()

Batch processing of predictions: 100%|██████████| 10/10 [00:08<00:00,  1.20it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:02<00:00,  3.34it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:03<00:00,  3.21it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:48<00:00,  4.89s/it]
Batch processing of predictions: 100%|██████████| 10/10 [00:08<00:00,  1.17it/s]
Batch processing of predictions: 100%|██████████| 10/10 [00:02<00:00,  3.64it/s]
Batch processing of evaluations:  95%|█████████▌| 30/31.5 [04:14<00:12,  8.49s/it]


In [16]:
benchmark_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,4.741667
mean_relevancy_score,0.95
mean_faithfulness_score,0.983333
mean_context_similarity_score,0.955815


Oof this is already so good!

## Using our original holdout set

In [21]:
from llama_index.core.llama_dataset import (
    LabelledRagDataset,
    LabelledRagDataExample,
    CreatedBy,
)

import pandas as pd
import json

def get_rag_dataset_from_csv(csv_path: str):
    converters = {
        "reference_contexts":    lambda s: json.loads(s),
        "query_by":             lambda s: CreatedBy.model_validate_json(s),
        "reference_answer_by":  lambda s: CreatedBy.model_validate_json(s),
    }
    df = pd.read_csv(csv_path, converters=converters)
    examples = []
    for _, row in df.iterrows():
        examples.append(
            LabelledRagDataExample(
                query=row["query"],
                query_by=row["query_by"],                      # now a CreatedBy
                reference_contexts=row["reference_contexts"],   # now a List[str]
                reference_answer=row["reference_answer"],
                reference_answer_by=row["reference_answer_by"], # now a CreatedBy
            )
        )

    # 4. Create the dataset
    dataset = LabelledRagDataset(examples=examples)
    return dataset

In [22]:
holdout_dataset = get_rag_dataset_from_csv("holdout_dataset.csv")

In [24]:
holdout_evaluator = RagEvaluatorPack(
    query_engine=query_engine, 
    rag_dataset=holdout_dataset,
    judge_llm=Settings.llm, #use the same llm that we use to create the dataset to judge
    embed_model=Settings.embed_model
)
holdout_df = await holdout_evaluator.arun()

Batch processing of predictions:   0%|          | 0/10 [00:00<?, ?it/s]

Batch processing of predictions: 100%|██████████| 10/10 [00:02<00:00,  3.43it/s]
Batch processing of predictions: 100%|██████████| 3/3 [00:02<00:00,  1.25it/s]
Batch processing of evaluations: 100%|██████████| 7/7.0 [01:03<00:00,  9.01s/it]


In [25]:
holdout_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,4.807692
mean_relevancy_score,1.0
mean_faithfulness_score,1.0
mean_context_similarity_score,0.941093
