In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Patch asyncio so an event loop can be re-entered while the notebook's own
# loop is already running.
# NOTE(review): presumably required by the async llama_index calls further
# down (dataset generation / evaluation) -- confirm.
import nest_asyncio
nest_asyncio.apply()

In [3]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.llms.ollama import Ollama

In [4]:
# Create ../data (no error if it already exists) and download the arXiv paper
# into it. `wget -O` rewrites the target file on every run of this cell.
!mkdir  -p ../data
!wget "https://arxiv.org/pdf/2405.00247.pdf" -O "../data/non_traditional_credentials.pdf"

--2025-05-12 17:28:13--  https://arxiv.org/pdf/2405.00247.pdf
Resolving arxiv.org (arxiv.org)... 151.101.195.42, 151.101.67.42, 151.101.131.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.195.42|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://arxiv.org/pdf/2405.00247 [following]
--2025-05-12 17:28:13--  http://arxiv.org/pdf/2405.00247
Connecting to arxiv.org (arxiv.org)|151.101.195.42|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1524369 (1.5M) [application/pdf]
Saving to: ‘../data/non_traditional_credentials.pdf’


2025-05-12 17:28:14 (21.7 MB/s) - ‘../data/non_traditional_credentials.pdf’ saved [1524369/1524369]



In [5]:
# Read the contents of ../data/ into `docs`, showing a progress bar while loading.
# `docs` feeds both the dataset generator and the vector index further down.
docs = SimpleDirectoryReader("../data/").load_data(show_progress=True)

Loading files: 100%|██████████| 1/1 [00:00<00:00,  1.92file/s]


In [6]:
# Build the synthetic question/answer generator over the loaded documents,
# using the qwen2.5 model served by Ollama and asking for 10 questions per chunk.
data_gen = RagDatasetGenerator.from_documents(
    docs,
    llm=Ollama("qwen2.5"),
    # Prompt typo fixed ("formulat" -> "formulate"); this text is sent to the LLM verbatim.
    question_gen_query="You are a teacher/professor. Using the provided context, formulate a single question and its answer",
    num_questions_per_chunk=10,
)

In [7]:
# Produce the labelled dataset from `data_gen`; the result is reused below for
# the JSONL export and as the evaluation set.
# NOTE(review): presumably one or more LLM calls per chunk, so expect this to be slow -- confirm.
qa_dataset = data_gen.generate_dataset_from_nodes()



In [8]:
# Preview only the first few examples; each one embeds its full reference
# context, so displaying the whole list floods the notebook output.
qa_dataset.examples[:3]

[LabelledRagDataExample(query='Question: What was the estimated impact of credential sharing on the likelihood of getting a new job according to the study?', query_by=CreatedBy(model_name='qwen2.5', type=<CreatedByType.AI: 'ai'>), reference_contexts=['The value of non-traditional credentials in the labor market*\nSusan Athey & Emil Palikot\nMay 2, 2024\nAbstract\nThis study investigates the labor market value of credentials obtained from Massive Open On-\nline Courses (MOOCs) and shared on business networking platforms. We conducted a random-\nized experiment involving more than 800,000 learners, primarily from developing countries and\nwithout college degrees, who completed technology or business-related courses on the Coursera\nplatform between September 2022 and March 2023. The intervention targeted learners who had\nrecently completed their courses, encouraging them to share their credentials and simplifying the\nsharing process. One year after the intervention, we collected data f

In [9]:
import json

def serialize_to_jsonl(examples, out_path="train.jsonl"):
    """
    Write question/answer pairs as chat-style JSONL, one JSON object per line.

    Each written line has the shape
        {"messages": [{"role": "user",      "content": <question>},
                      {"role": "assistant", "content": <answer>}]}

    examples: iterable of objects exposing .query (str or None) and,
              optionally, .reference_answer (str or None),
              e.g. LabelledRagDataExample
    out_path: path of the JSONL file to write (an existing file is overwritten)

    Only examples whose query starts with a question label are kept. The label
    may be plain ("Question:") or markdown-bold ("**Question:**"), in any
    letter case, and optionally numbered ("Question 3:"). The label is removed
    from the question, a leading answer label ("Answer:" / "**Answer:**") is
    removed from the answer, and pairs where either side is empty after
    cleaning are skipped.

    Returns None.
    """
    import re  # local import so this cell does not depend on another cell's imports

    def label_pattern(word):
        # Matches an optional bold marker, the label word, an optional number,
        # and then either ":" (optionally followed by "**") or "**" (optionally
        # followed by ":"). Requiring ":" or "**" keeps ordinary words such as
        # "Questionable" from being mistaken for a label.
        return re.compile(
            r"^\s*(?:\*\*\s*)?" + word + r"(?:\s+\d+)?\s*(?::\s*(?:\*\*)?|\*\*\s*:?)\s*",
            re.IGNORECASE,
        )

    question_label = label_pattern("question")
    answer_label = label_pattern("answer")

    with open(out_path, "w", encoding="utf8") as f:
        for ex in examples:
            q_raw = getattr(ex, "query", None) or ""
            a_raw = getattr(ex, "reference_answer", None) or ""

            # only serialize if this is a 'Question' example ...
            q_match = question_label.match(q_raw)
            if q_match is None:
                continue

            q = q_raw[q_match.end():].strip()
            a = answer_label.sub("", a_raw, count=1).strip()

            # ... and both sides still carry text once the labels are gone
            if not q or not a:
                continue

            obj = {
                "messages": [
                    {"role": "user",      "content": q},
                    {"role": "assistant", "content": a}
                ]
            }
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")

In [10]:
# Export the generated pairs; with no out_path given this writes "train.jsonl"
# in the notebook's current working directory.
serialize_to_jsonl(qa_dataset.examples)

## Evaluate RAG
This is the baseline RAG pipeline, evaluated against the question/answer dataset generated above.

In [11]:
from llama_index.embeddings.ollama import OllamaEmbedding

# Embedding model ("nomic-embed-text") used to build the vector index below.
embed_model = OllamaEmbedding(model_name="nomic-embed-text")

In [12]:
from llama_index.core import VectorStoreIndex

# Index the loaded documents with `embed_model`, then expose the index as the
# baseline query engine: similarity_top_k=6 for retrieval, llama3.2:1b as the LLM.
index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
query_engine = index.as_query_engine(similarity_top_k=6, llm = Ollama("llama3.2:1b"))

In [13]:
from llama_index.core.llama_pack import download_llama_pack

# Fetch the RagEvaluatorPack class, using ./pack as the download directory.
# The pip log below shows the pack being built and installed from that folder.
RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack")

Processing /Users/tituslim/Documents/Personal Learning Folder/Personal Projects/ideal-palm-tree/notebooks/pack
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: llama-index-packs-rag-evaluator
  Building wheel for llama-index-packs-rag-evaluator (pyproject.toml): started
  Building wheel for llama-index-packs-rag-evaluator (pyproject.toml): finished with status 'done'
  Created wheel for llama-index-packs-rag-evaluator: filename=llama_index_packs_rag_evaluator-0.3.0-py3-none-any.whl size=4929 sha256=5e36c5b106ef33696bbfafe9ac35b343b71542a4411152f9f15bbac843b5b52a
  Stored in directory: /private/var/folders/zb/r15p7t_d62d8m2s0623s22gh0000gn/T/pip-ephem-wheel-cache-w96xzi5w

In [14]:
# Wire the baseline query engine and the generated dataset into the evaluator.
rag_evaluator = RagEvaluatorPack(
    query_engine=query_engine,
    rag_dataset=qa_dataset,
    # Judge with the same LLM that was used to create the dataset.
    judge_llm=Ollama("qwen2.5", request_timeout=120.0),
    # Reuse the embedding model that built the index instead of constructing an
    # identical second one, so the two configurations cannot drift apart.
    embed_model=embed_model,
)

In [15]:
import warnings
# Blanket filter: ignores every warning category from here on, which keeps the
# evaluation output readable but also hides anything genuinely useful.
warnings.filterwarnings("ignore")

This cell takes a very long time to run (about 40 minutes in the run recorded below)!

In [16]:
# Run the full evaluation with batch_size=20 and keep the returned results
# table; the progress bars below come from this call.
benchmark_df = rag_evaluator.run(batch_size=20)

100%|██████████| 20/20 [01:17<00:00,  3.89s/it]
100%|██████████| 20/20 [01:25<00:00,  4.26s/it]
100%|██████████| 20/20 [01:07<00:00,  3.39s/it]
100%|██████████| 2/2 [00:06<00:00,  3.40s/it]
5it [02:57, 35.59s/it]
5it [02:51, 34.28s/it]
5it [02:59, 35.82s/it]
5it [02:35, 31.19s/it]
5it [03:22, 40.50s/it]
5it [06:33, 78.74s/it]
5it [02:58, 35.79s/it]
5it [02:16, 27.21s/it]
5it [02:01, 24.34s/it]
5it [02:10, 26.16s/it]
5it [02:58, 35.74s/it]
5it [02:43, 32.69s/it]
2it [00:53, 26.66s/it]


In [17]:
# Display the metrics table returned by rag_evaluator.run().
benchmark_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,2.774194
mean_relevancy_score,0.725806
mean_faithfulness_score,0.225806
mean_context_similarity_score,0.625543


Ooh! Not totally terrible LOL! Check out the mean correctness score (2.77/5) and the mean faithfulness score (0.23/1)!