In [1]:
# Reload edited modules automatically so local code changes are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import nest_asyncio
# NOTE(review): presumably needed so the async llama_index calls below can run inside
# the notebook's already-running event loop — confirm.
nest_asyncio.apply()

In [3]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.llms.ollama import Ollama

In [4]:
# Create the data directory (no error if it already exists) and download the source paper into it.
!mkdir  -p ../data
!wget "https://arxiv.org/pdf/2405.00247.pdf" -O "../data/non_traditional_credentials.pdf"

--2025-05-12 21:49:34--  https://arxiv.org/pdf/2405.00247.pdf
Resolving arxiv.org (arxiv.org)... 151.101.195.42, 151.101.131.42, 151.101.67.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.195.42|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://arxiv.org/pdf/2405.00247 [following]
--2025-05-12 21:49:35--  http://arxiv.org/pdf/2405.00247
Connecting to arxiv.org (arxiv.org)|151.101.195.42|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1524369 (1.5M) [application/pdf]
Saving to: ‘../data/non_traditional_credentials.pdf’


2025-05-12 21:49:35 (25.6 MB/s) - ‘../data/non_traditional_credentials.pdf’ saved [1524369/1524369]



In [5]:
# Read every file found under ../data/ and show a progress bar while loading.
reader = SimpleDirectoryReader("../data/")
docs = reader.load_data(show_progress=True)

Loading files: 100%|██████████| 1/1 [00:00<00:00,  2.09file/s]


In [6]:
# Build a question/answer generator over the loaded documents, backed by a local Ollama model.
data_gen = RagDatasetGenerator.from_documents(
    docs,
    llm=Ollama("qwen2.5"),
    # Instruction text handed to the generator for producing questions.
    question_gen_query=(
        "You are a teacher/professor. Using the provided context, "
        "formulate a single question and its answer"
    ),
    num_questions_per_chunk=10,
)

In [7]:
# Produce the labelled dataset from the generator's nodes.
# NOTE(review): presumably issues LLM calls per chunk, so this can be slow — confirm.
qa_dataset = data_gen.generate_dataset_from_nodes()



In [8]:
# Display the full LabelledRagDataset repr (very large output).
qa_dataset

LabelledRagDataset(examples=[LabelledRagDataExample(query='**Question:** What was the observed impact of encouraging learners to share their credentials from Massive Open Online Courses (MOOCs) on their job search outcomes?', query_by=CreatedBy(model_name='qwen2.5', type=<CreatedByType.AI: 'ai'>), reference_contexts=['The value of non-traditional credentials in the labor market*\nSusan Athey & Emil Palikot\nMay 2, 2024\nAbstract\nThis study investigates the labor market value of credentials obtained from Massive Open On-\nline Courses (MOOCs) and shared on business networking platforms. We conducted a random-\nized experiment involving more than 800,000 learners, primarily from developing countries and\nwithout college degrees, who completed technology or business-related courses on the Coursera\nplatform between September 2022 and March 2023. The intervention targeted learners who had\nrecently completed their courses, encouraging them to share their credentials and simplifying the\ns

In [9]:
# Display the underlying list of LabelledRagDataExample objects (very large output).
qa_dataset.examples

[LabelledRagDataExample(query='**Question:** What was the observed impact of encouraging learners to share their credentials from Massive Open Online Courses (MOOCs) on their job search outcomes?', query_by=CreatedBy(model_name='qwen2.5', type=<CreatedByType.AI: 'ai'>), reference_contexts=['The value of non-traditional credentials in the labor market*\nSusan Athey & Emil Palikot\nMay 2, 2024\nAbstract\nThis study investigates the labor market value of credentials obtained from Massive Open On-\nline Courses (MOOCs) and shared on business networking platforms. We conducted a random-\nized experiment involving more than 800,000 learners, primarily from developing countries and\nwithout college degrees, who completed technology or business-related courses on the Coursera\nplatform between September 2022 and March 2023. The intervention targeted learners who had\nrecently completed their courses, encouraging them to share their credentials and simplifying the\nsharing process. One year aft

## Train-test split

In [10]:
from sklearn.model_selection import train_test_split

# Work on the plain list of LabelledRagDataExample objects held by the dataset.
all_examples = qa_dataset.examples

# Shuffle and hold out 20% of the examples; the fixed seed keeps the split reproducible.
train_examples, test_examples = train_test_split(
    all_examples, test_size=0.2, random_state=42, shuffle=True
)

# Report how many examples landed in each split.
print(f"Training on {len(train_examples)} examples, testing on {len(test_examples)} examples")

Training on 49 examples, testing on 13 examples


In [11]:
from llama_index.core.llama_dataset import LabelledRagDataset

# Wrap each split's example list back into a LabelledRagDataset.
training_dataset = LabelledRagDataset(examples=train_examples)
holdout_dataset = LabelledRagDataset(examples=test_examples)

In [29]:
import json
import pandas as pd

# Flatten every holdout example into a dict of CSV-safe strings:
# the context list is JSON-encoded and the CreatedBy objects are dumped as JSON.
records = [
    {
        "query": ex.query,
        "reference_contexts": json.dumps(ex.reference_contexts),
        "reference_answer": ex.reference_answer,
        "query_by": ex.query_by.model_dump_json(),
        "reference_answer_by": ex.reference_answer_by.model_dump_json(),
    }
    for ex in holdout_dataset.examples
]

# One row per example, written without the DataFrame index column.
df = pd.DataFrame.from_records(records)
df.to_csv("holdout_dataset.csv", index=False)

#### Rebuild the holdout dataset from the CSV with pandas

In [30]:
from llama_index.core.llama_dataset import (
    LabelledRagDataset,
    LabelledRagDataExample,
    CreatedBy,
)

# 1. Converters that undo the JSON encoding applied when the CSV was written
converters = {
    "reference_contexts": json.loads,                    # back to a List[str]
    "query_by": CreatedBy.model_validate_json,            # back to a CreatedBy
    "reference_answer_by": CreatedBy.model_validate_json, # back to a CreatedBy
}

# 2. Read the CSV with converters.
#    keep_default_na=False stops pandas from turning empty strings or text such as
#    "NA"/"null" in the plain-text columns (query, reference_answer) into float NaN,
#    which would not round-trip into the string fields below.
df = pd.read_csv(
    "holdout_dataset.csv",
    converters=converters,
    keep_default_na=False,
)

# 3. Rebuild one LabelledRagDataExample per CSV row
examples = [
    LabelledRagDataExample(
        query=row["query"],
        query_by=row["query_by"],
        reference_contexts=row["reference_contexts"],
        reference_answer=row["reference_answer"],
        reference_answer_by=row["reference_answer_by"],
    )
    for row in df.to_dict(orient="records")
]

# 4. Create the dataset
holdout_dataset2 = LabelledRagDataset(examples=examples)

### Save training dataset to jsonl

In [12]:
import json

def serialize_to_jsonl(examples, out_path="train.jsonl"):
    """
    Write question/answer examples to a chat-style JSONL file.

    Each kept example becomes one line of the form
    {"messages": [{"role": "user", "content": <question>},
                  {"role": "assistant", "content": <answer>}]}.

    examples: iterable of objects with .query (str) and .reference_answer (str),
              e.g. LabelledRagDataExample
    out_path:  path to write the JSONL file (overwritten if it already exists)

    An example is skipped when its query does not start with "**question"
    (case-insensitive, leading whitespace ignored), or when the question or
    the answer is empty after cleaning.
    """
    prefixes = ("**question:**", "**answer:**")

    def strip_prefix(text):
        # Remove a leading **Question:** / **Answer:** marker (any casing) together
        # with surrounding whitespace; text without a marker is only trimmed.
        cleaned = text.strip()
        for p in prefixes:
            if cleaned[:len(p)].lower() == p:
                return cleaned[len(p):].strip()
        return cleaned

    with open(out_path, "w", encoding="utf8") as f:
        for ex in examples:
            q_raw = ex.query or ""
            a_raw = getattr(ex, "reference_answer", None) or ""
            # only serialize 'Question' examples; the check ignores leading
            # whitespace so it agrees with what strip_prefix removes
            if not q_raw.strip().lower().startswith("**question"):
                continue
            q = strip_prefix(q_raw)
            a = strip_prefix(a_raw)
            # never emit a training pair with an empty side
            if not q or not a:
                continue
            obj = {
                "messages": [
                    {"role": "user",      "content": q},
                    {"role": "assistant", "content": a}
                ]
            }
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")

In [13]:
# Write the training split to the function's default output path ("train.jsonl").
serialize_to_jsonl(train_examples)

## Evaluate RAG
This is the baseline RAG pipeline, evaluated on the holdout dataset.

In [14]:
from llama_index.embeddings.ollama import OllamaEmbedding

# Ollama embedding model used when building the vector index.
embed_model = OllamaEmbedding(model_name="nomic-embed-text")

In [15]:
from llama_index.core import VectorStoreIndex

# Baseline pipeline: index the documents with the embedding model, then expose a
# query engine that retrieves with similarity_top_k=6 and answers with a small Ollama model.
index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
query_engine = index.as_query_engine(
    llm=Ollama("llama3.2:1b"),
    similarity_top_k=6,
)

In [16]:
from llama_index.core.llama_pack import download_llama_pack

# Download the "RagEvaluatorPack" llama pack into ./pack; the returned object is instantiated below.
RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack")

Processing /Users/tituslim/Documents/Personal Learning Folder/Personal Projects/ideal-palm-tree/notebooks/pack
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: llama-index-packs-rag-evaluator
  Building wheel for llama-index-packs-rag-evaluator (pyproject.toml): started
  Building wheel for llama-index-packs-rag-evaluator (pyproject.toml): finished with status 'done'
  Created wheel for llama-index-packs-rag-evaluator: filename=llama_index_packs_rag_evaluator-0.3.0-py3-none-any.whl size=4929 sha256=5e36c5b106ef33696bbfafe9ac35b343b71542a4411152f9f15bbac843b5b52a
  Stored in directory: /private/var/folders/zb/r15p7t_d62d8m2s0623s22gh0000gn/T/pip-ephem-wheel-cache-_fy2jwzf

In [17]:
# Evaluate the baseline query engine against the held-out examples.
rag_evaluator = RagEvaluatorPack(
    rag_dataset=holdout_dataset,
    query_engine=query_engine,
    # use the same llm that we use to create the dataset to judge
    judge_llm=Ollama("qwen2.5", request_timeout=120.0),
    embed_model=OllamaEmbedding(model_name="nomic-embed-text"),
)

In [18]:
import warnings
# Silence warnings from here on; note this hides every warning category, not just noisy ones.
warnings.filterwarnings("ignore")

Warning: the next cell takes a long time to run (several minutes in the recorded run).

In [19]:
# Run the evaluation and keep the resulting benchmark table.
benchmark_df = rag_evaluator.run()

100%|██████████| 10/10 [00:38<00:00,  3.81s/it]
100%|██████████| 3/3 [00:11<00:00,  3.71s/it]
2it [00:28, 14.09s/it]
2it [00:48, 24.09s/it]
2it [00:39, 19.60s/it]
2it [00:49, 24.52s/it]
2it [01:43, 51.98s/it]
2it [01:14, 37.48s/it]
1it [00:27, 27.13s/it]


In [20]:
# Show the aggregated mean scores per metric.
benchmark_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,2.961538
mean_relevancy_score,0.692308
mean_faithfulness_score,0.076923
mean_context_similarity_score,0.664371


Ooh! Not totally terrible, LOL! The mean correctness score is 2.96/5, but the mean faithfulness score is only 0.077/1 - oof!