In [None]:
# --- Install Required Packages ---
!pip install -q huggingface-hub==0.27.1 llama-index==0.10.57 llama-index-vector-stores-chroma==0.1.10 google-generativeai==0.5.4 openai==1.59.8 chromadb==0.5.5 nest_asyncio tiktoken==0.8.0

# Setup LlamaIndex and download RAG data

In [None]:
# --- Set Environment Variables ---
import os

import nest_asyncio
from google.colab import userdata

# Never hardcode the key. Keep a key that is already exported in the environment;
# otherwise read it from the Colab "Secrets" panel (secret named OPENAI_API_KEY).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

# Patch asyncio so that async calls can run inside the notebook's already-running event loop.
nest_asyncio.apply()

In [None]:
# --- Load LLM and Embeddings ---
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Default LLM stored on the global LlamaIndex Settings object (gpt-4o-mini, temperature 0.7).
Settings.llm = OpenAI(temperature=0.7, model="gpt-4o-mini")
# Default embedding model stored on the global Settings object.
# NOTE(review): presumably this has to match the model used to build the downloaded vector store — confirm.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

In [None]:
# --- Download Vector Store from Hugging Face Hub ---
from huggingface_hub import hf_hub_download

# Fetch vectorstore.zip from the "jaiganesan/ai_tutor_knowledge" dataset repo into /content.
# The return value is kept in `vectorstore`; it is not used again in the visible cells.
vectorstore = hf_hub_download(
    repo_id="jaiganesan/ai_tutor_knowledge",
    filename="vectorstore.zip",
    repo_type="dataset",
    local_dir="/content"
)

# Extract the archive into /content; -o overwrites existing files without prompting,
# so re-running the cell does not block on a confirmation.
!unzip -o /content/vectorstore.zip -d /content/

# Create RAG pipeline (i.e. the query engine)

In [None]:
# --- Create RAG Pipeline ---
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex

# We could hide this logic in the presentation...
def get_query_engine_and_vector_store(
    top_k=5,
    persist_path="/content/ai_tutor_knowledge",
    collection_name="ai_tutor_knowledge",
):
    """Build a query engine on top of a persisted Chroma collection.

    Args:
        top_k: Number of most similar nodes retrieved for each query.
        persist_path: Directory that holds the persisted Chroma database.
        collection_name: Name of the Chroma collection to open.

    Returns:
        A ``(query_engine, vector_store)`` tuple: the query engine created from
        the index, and the ``ChromaVectorStore`` wrapping the collection.

    Raises:
        ValueError: If the opened collection contains no entries.
    """
    chroma_client = chromadb.PersistentClient(path=persist_path)
    chroma_collection = chroma_client.get_or_create_collection(collection_name)

    # get_or_create_collection never fails: a wrong path or name silently yields a
    # brand-new empty collection, and every later query would run without context.
    if chroma_collection.count() == 0:
        raise ValueError(
            f"Chroma collection '{collection_name}' at '{persist_path}' is empty; "
            "check that the vector store was downloaded and extracted."
        )

    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
    query_engine = index.as_query_engine(similarity_top_k=top_k)
    return query_engine, vector_store

# Create a query engine with a chosen top_k
query_engine, vector_store = get_query_engine_and_vector_store(top_k=5)

# Let's test the RAG pipeline

In [None]:
# Test the RAG Pipeline
example_query = "How does Parameter Efficient Fine-Tuning (PEFT) work?"
response = query_engine.query(example_query)
print("Query:", example_query)
print("Answer:", response.response)

# Inspect and update the prompt of the RAG pipeline

This section is probably not necessary to show in the presentation. It is meant to be used after the prompt optimization, to replace the RAG prompt with the optimized one.

In [None]:
# Let's see what prompt is used by the query engine
import json
from IPython.display import Markdown

def display_prompt_dict(prompts_dict):
    """Show every prompt in ``prompts_dict``.

    For each entry the key is rendered as a Markdown header, the template text
    (from ``get_template()``) is printed as plain text, and a Markdown spacer
    is rendered afterwards.
    """
    spacer_md = "<br><br>"
    for prompt_key in prompts_dict:
        header_md = f"**Prompt Key**: {prompt_key}<br>**Text:** <br>"
        display(Markdown(header_md))
        # Plain print keeps the template's placeholders and line breaks verbatim.
        print(prompts_dict[prompt_key].get_template())
        display(Markdown(spacer_md))

prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict)

In [None]:
# Let's edit the "response_synthesizer:text_qa_template" prompt, which is
# the one used when we call `query_engine.query(...)`
from llama_index.core import PromptTemplate

PROMPT_TEMPLATE_ORIGINAL = """\
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: \
"""

PROMPT_TEMPLATE_NEW = """\
Context information is below.
---------------------
{context_str}
---------------------

Given the context information and not prior knowledge, answer the query.
Answer in the style of Shakespear.

Query: {query_str}
Answer: \
"""

query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": PromptTemplate(PROMPT_TEMPLATE_NEW)}
)

In [None]:
# Test the RAG pipeline with the new prompt
example_query = "How does Parameter Efficient Fine-Tuning (PEFT) work?"
response = query_engine.query(example_query)
print("Query:", example_query)
print("Answer:", response.response)

In [None]:
# Let's put back the original prompt
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": PromptTemplate(PROMPT_TEMPLATE_ORIGINAL)}
)

# Let's create an evaluation dataset

We're going to get a dataset that will be analysed by a domain expert to learn more about the data and the task, and to ultimately improve the prompt of the RAG pipeline.

To jump-start this process, we use an LLM to generate questions from documents. Ideally, one would ask the domain expert to write them or they would be collected from real-world questions asked by the users.

In [None]:
from llama_index.core.llms.utils import LLM
from llama_index.core.schema import MetadataMode, TextNode
from tqdm import tqdm
import json
import re
import uuid
import warnings
import time
from typing import Dict, List, Tuple
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

DEFAULT_QA_GENERATE_PROMPT_TMPL = """\
I'm developing an AI tutor whose job is to answer questions about Artificial \
Intelligence from students. The AI tutor has a knowledge base of documents that \
leverages to answer questions from students.

Your task is to write questions that students would ask about AI. In particular, \
the questions that you have to write should be answerable with information \
present the following context, but consider that the student doesn't know that context \
in advance (so, the questions can't mention parts of the context).

Context:
---------------------
{context_str}
---------------------

Given the context information above and no prior knowledge, \
write {num_questions_per_chunk} questions that a student may ask.
The questions should be diverse in nature.
Restrict the questions to the context information provided.
The questions can't refer to the existence of the source context.

Here are some examples of good and bad questions:
-----
Question: What was the primary purpose of developing the AI-powered solution mentioned in the text?
Quality: Bad.
Critique: The question says "mentioned in the text", it can't refer to the existence \
of the source context. \
Since the user can't see the source context, he can't know what is the provided text and \
therefore he can't answer the question.
-----
Question: What is the primary purpose of developing the XYZ AI-powered solution?
Quality: Good.
Critique: The question is self-contained and atomic.
-----
Question: What is the range of the numerical values provided in the data?
Quality: Bad.
Critique: The question can't refer to the source context directly and this question \
says "provided in the data". It must be self-contained and atomic. \
Since the user can't see the source context, he can't know what is the provided data and \
therefore he can't answer the question.
-----
Question: What was the primary purpose of developing the AI-powered solution?
Quality: Bad.
Critique: It is not clear what solution it is talking about, it is not self-contained and atomic. \
Since the user can't see the source context, he can't know what is the solution mentioned and \
therefore he can't answer the question.
-----
Question: What technology was integrated with Slack for internal use in the AI-powered solution project mentioned?
Quality: Bad.
Critique: It is not clear what solution it is talking about, it is not self-contained and atomic. \
Since the user can't see the source context, he can't know what is the project mentioned and \
therefore he can't answer the question.
-----
Question: What is the main consequence of model quantization in terms of accuracy and expressive power?
Quality: Good.
Critique: The question is self-contained and atomic.
-----

Write one question per line. If it's not possible to generate self-contained and atomic \
questions because of the nature of the context, then simply write "NONE".
"""

def generate_question_context_pairs(
    nodes: List[TextNode],
    llm: LLM,
    qa_generate_prompt_tmpl: str = DEFAULT_QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk: int = 2,
    request_delay: float = 2.0
) -> EmbeddingQAFinetuneDataset:
    """Generate synthetic questions for each node, throttling the LLM requests.

    Args:
        nodes: Nodes whose text (without metadata) is used as generation context.
        llm: LLM used to write the questions; called once per node via ``complete``.
        qa_generate_prompt_tmpl: Template with ``{context_str}`` and
            ``{num_questions_per_chunk}`` placeholders.
        num_questions_per_chunk: Maximum number of questions kept per node.
        request_delay: Seconds to wait between two consecutive LLM requests.

    Returns:
        A dataset mapping a fresh UUID per question to the question text and to
        the single node it was generated from, plus the node-id -> text corpus.
    """
    node_dict = {
        node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
        for node in nodes
    }

    # List markers the LLM may prepend: "1.", "2)", "3 " or "-", "*", "•" bullets.
    list_marker = re.compile(r"^\s*(?:\d+[\).\s]|[-*\u2022]\s)")

    queries = {}
    relevant_docs = {}

    for position, (node_id, text) in enumerate(tqdm(node_dict.items())):
        # Throttle before every request except the first one: chunks answered with
        # "NONE" are rate-limited too, and no time is wasted after the last request.
        if position > 0:
            time.sleep(request_delay)

        query = qa_generate_prompt_tmpl.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )
        response = str(llm.complete(query))

        # The prompt asks for the literal "NONE" when no self-contained question
        # can be written for this context; such chunks contribute nothing.
        if "NONE" in response:
            continue

        candidates = (
            list_marker.sub("", line).strip()
            for line in response.strip().split("\n")
        )
        questions = [question for question in candidates if question][
            :num_questions_per_chunk
        ]

        num_questions_generated = len(questions)
        if num_questions_generated < num_questions_per_chunk:
            warnings.warn(
                f"Fewer questions generated ({num_questions_generated}) "
                f"than requested ({num_questions_per_chunk})."
            )

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]

    return EmbeddingQAFinetuneDataset(
        queries=queries, corpus=node_dict, relevant_docs=relevant_docs
    )

In [None]:
# We use the GPT-4o model for generating synthetic question-context pairs.

SYNTHETIC_DATASET_PATH = "./synthetic_rag_eval_dataset.json"

# Generation is slow, costs API calls and is not deterministic (temperature 0.7,
# random question ids). Reuse the saved dataset when it exists so that re-running
# the notebook keeps the later cleaned/labelled files consistent with it.
# Delete the file to force a fresh generation.
if os.path.exists(SYNTHETIC_DATASET_PATH):
    rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json(SYNTHETIC_DATASET_PATH)
else:
    # Retrieve all the nodes from the docstore for question generation
    all_nodes = vector_store.get_nodes([])

    # Create an instance of GPT-4o to generate questions
    gen_llm = OpenAI(temperature=0.7, model="gpt-4o")

    # Generate a smaller dataset to avoid excessive calls
    rag_eval_dataset = generate_question_context_pairs(
        nodes=all_nodes[:50],
        llm=gen_llm,
        num_questions_per_chunk=1,
        request_delay=2.0
    )

    # Save dataset locally
    rag_eval_dataset.save_json(SYNTHETIC_DATASET_PATH)

In [None]:
# Reload the dataset (optional)
rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json("./synthetic_rag_eval_dataset.json")
print(f"Number of queries in dataset: {len(rag_eval_dataset.queries)}")

# Review and clean the generated questions

Since we generated the questions synthetically, let's review them with a widget. We'll keep only the most-representative and varied questions.

In [None]:
# --- Review & Clean Generated Questions ---

import json
import ipywidgets as widgets
from IPython.display import display

# Load the synthetic RAG evaluation dataset
with open("synthetic_rag_eval_dataset.json", "r") as f:
    data = json.load(f)

queries = data["queries"]
corpus = data["corpus"]
relevant_docs = data["relevant_docs"]

# Dictionary to store checkboxes for each question
check_boxes = {}

# Create a list of checkboxes, one for each question
checkbox_list = []
for qid, question in queries.items():
    checkbox = widgets.Checkbox(
        value=True,
        description=question,
        indent=False,
        layout=widgets.Layout(width='100%'),  # ensure there's enough horizontal space
        style={'description_width': 'initial'}  # prevent truncation of description
    )
    check_boxes[qid] = checkbox
    checkbox_list.append(checkbox)

# Make a scrollable container for all the checkboxes
box_layout = widgets.Layout(
    overflow_y='scroll',
    border='1px solid gray',
    width='auto',
    height='400px'  # adjust this height as needed
)
scrollable_box = widgets.VBox(checkbox_list, layout=box_layout)

print("Review the questions below and uncheck those that are invalid or undesirable.")
print("Scroll within the box to see all questions. Then click 'Save Cleaned Dataset':\n")

display(scrollable_box)

def save_cleaned_dataset(_):
    """Button callback: write the questions whose checkbox is still ticked to disk.

    Reads the module-level ``check_boxes``, ``queries``, ``relevant_docs`` and
    ``corpus``; the corpus is written unchanged. The click event argument is ignored.
    """
    kept_ids = [qid for qid, box in check_boxes.items() if box.value]

    cleaned_data = {
        "queries": {qid: queries[qid] for qid in kept_ids},
        "corpus": corpus,
        "relevant_docs": {qid: relevant_docs[qid] for qid in kept_ids},
    }

    with open("cleaned_rag_eval_dataset.json", "w") as f:
        json.dump(cleaned_data, f, indent=4)

    print("\nCleaned dataset saved to 'cleaned_rag_eval_dataset.json'.")

# Create and display the "Save Cleaned Dataset" button
save_button = widgets.Button(description="Save Cleaned Dataset", button_style="success")
save_button.on_click(save_cleaned_dataset)

display(save_button)

In [None]:
# Load the cleaned dataset
cleaned_dataset_path = "./cleaned_rag_eval_dataset.json"
if not os.path.exists(cleaned_dataset_path):
    # The cleaned file only exists after "Save Cleaned Dataset" was clicked in the
    # review widget. Fall back to the unreviewed questions so that the notebook
    # still runs top to bottom without manual interaction.
    print("Cleaned dataset not found; using the unreviewed synthetic dataset instead.")
    cleaned_dataset_path = "./synthetic_rag_eval_dataset.json"

rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json(cleaned_dataset_path)
print(f"Number of queries in dataset: {len(rag_eval_dataset.queries)}")

# Generate answers to all the queries in the eval dataset

Then, let's use our RAG pipeline to answer all the questions of the evaluation dataset. Ideally, if we got the questions of the evaluation dataset from real-world data, we'd be able to get their responses from the same real-world data as well.

In [None]:
# Use the query engine to answer all the queries in the eval dataset

# Retrieve the prompt template used by the query engine
prompts_dict = query_engine.get_prompts()
text_qa_template = prompts_dict["response_synthesizer:text_qa_template"].get_template()

total_queries = len(rag_eval_dataset.queries)
template_query_context_answer = []
for query_number, question in enumerate(rag_eval_dataset.queries.values(), start=1):
    print(f"{query_number}/{total_queries}")

    # Get response from RAG
    response = query_engine.query(question)

    # Rebuild a prompt from the retrieved source nodes. The inner counter has its
    # own name so it no longer overwrites the progress counter of the outer loop.
    # NOTE(review): this is a reconstruction; the exact context layout the engine
    # sent to the LLM may differ — confirm if the stored prompt must match exactly.
    context_str = "".join(
        f"Source {source_number}:\n-----\n{node.get_content().strip()}\n-----\n\n"
        for source_number, node in enumerate(response.source_nodes, start=1)
    ).strip()
    prompt_with_context_query = text_qa_template.format(context_str=context_str, query_str=question)

    # Save the formatted prompt with context
    template_query_context_answer.append({
        "query": question,
        "prompt": prompt_with_context_query,
        "response": response.response
    })


# Save to a JSON file
with open("rag_eval_dataset_with_responses.json", "w", encoding="utf-8") as f:
    json.dump(template_query_context_answer, f, indent=4, ensure_ascii=False)

# Domain expert annotates the data with a yes/no label and a critique

Here we pretend that a domain expert reviewed every `<prompt, response>` pair, assigned it a "Good" or "Bad" label, and wrote a critique explaining the reasoning behind the chosen label.

Instead, what I did here was use o1-pro to write them for me, and I saved the final file as "rag_eval_dataset_with_labels.json".

In [None]:
import json
import pandas as pd

# Load the merged dataset
with open("rag_eval_dataset_with_labels.json", "r", encoding="utf-8") as f:
    d_dataset = json.load(f)

# Convert to pandas DataFrame
df = pd.DataFrame(d_dataset)

df

In [None]:
# Let's see how many "Good" and "Bad" annotations there are in the data
label_counts = df["domain_expert_label"].value_counts()
label_counts

# Now that we worked with the data a bit, let's craft the first prompt for the LLM-as-a-judge

We'll leverage the domain expert critiques for this!

In [None]:
# First version of the LLM-as-a-judge-prompt

judge_prompt = """
Your tasks is to evaluate pairs of input prompts and output responses according to the following criteria:

1. Matches the Source – The response should align with provided sources.
2. Covers Key Points – It should address the main aspects of the query.
3. Clear & Concise – The response should be easy to understand.
4. Relevant & Specific – It must be on-topic and provide useful details.
5. Accurate Information – No errors or misleading claims.

If the response meets these criteria well, label it "Good". If it has major issues, label it "Bad".
Provide also a critique that explains the reasoning behing the assigned label (write at most 100 words for it).
The critique should be written in a single line.

Here's the input prompt to evaluate:
-----
{prompt}
-----

Here's the output response to evaluate:
-----
{response}
-----

Use this format for your response (but don't rewrite the "-" characters):
-----
Label: <good-or-bad>

Critique: <critique>
-----
""".strip()

In [None]:
# Here we use GPT-4o as the judge to write the labels and the critiques

# Import the raw OpenAI SDK client under an alias. A bare `from openai import OpenAI`
# would rebind the name `OpenAI`, which other cells use for the LlamaIndex LLM
# wrapper, so re-running those cells afterwards would build the wrong object.
from openai import OpenAI as OpenAIClient

client = OpenAIClient()

def get_judge_response(judge_prompt, input_prompt, output_response, model="gpt-4o", temperature=0.7):
    """Ask the judge LLM to label and critique one prompt/response pair.

    Args:
        judge_prompt: Template with ``{prompt}`` and ``{response}`` placeholders.
        input_prompt: The prompt that was given to the system under evaluation.
        output_response: The response produced for ``input_prompt``.
        model: Chat model used as the judge.
        temperature: Sampling temperature of the judge.

    Returns:
        A ``(label, critique)`` tuple. The label is normalized to "Good" or "Bad"
        when the judge's answer is one of them in any casing or wrapped in
        quotes/markup; any other label is returned as written.

    Raises:
        ValueError: If the judge's reply has no "Label:" or no "Critique:" line.
    """
    formatted_prompt = judge_prompt.format(
        prompt=input_prompt,
        response=output_response
    )

    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": formatted_prompt}],
        temperature=temperature,
    )
    # The message content can be None; treat that like an empty reply.
    judge_response = (completion.choices[0].message.content or "").strip()

    label = None
    critique = None
    for line in judge_response.split("\n"):
        line = line.strip()
        if line.startswith("Label:"):
            label = line[len("Label:"):].strip()
        elif line.startswith("Critique:"):
            critique = line[len("Critique:"):].strip()

    if label is None or critique is None:
        raise ValueError(
            f"Judge response does not follow the expected format: {judge_response!r}"
        )

    # The prompt asks for "<good-or-bad>", so the judge may answer in lowercase;
    # the agreement metrics compare against the exact strings "Good" / "Bad".
    normalized_label = label.strip("\"'`*. ").capitalize()
    if normalized_label in ("Good", "Bad"):
        label = normalized_label

    return label, critique

for i, d in enumerate(d_dataset, start=1):
    print(f"{i}/{len(d_dataset)}")

    label, critique = get_judge_response(judge_prompt, d["prompt"], d["response"])

    d["llm_label"] = label
    d["llm_critique"] = critique

In [None]:
# Convert to pandas DataFrame
df = pd.DataFrame(d_dataset)
df

In [None]:
# How many "Good" and "Bad" annotations there are in the data (from the LLM)
label_counts = df["llm_label"].value_counts()
label_counts

In [None]:
# Let's save the final evaluation dataset with all the labels
with open("d_dataset.json", "w", encoding="utf-8") as f:
    json.dump(d_dataset, f, indent=4, ensure_ascii=False)

# Let's see if the domain expert and the LLM-as-a-judge agree!

We'll use a measure called "Cohen's Kappa", which quantifies the agreement between two lists of labels while correcting for the agreement that would be expected by chance.

Cohen suggested the Kappa result be interpreted as follows: values ≤ 0 as indicating no agreement, 0.01–0.20 as none to slight, 0.21–0.40 as fair, 0.41–0.60 as moderate, 0.61–0.80 as substantial, and 0.81–1.00 as almost perfect agreement.

In [None]:
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

# Labels: one entry per example, in the same order for both raters
expert_labels = [d["domain_expert_label"] for d in d_dataset]
llm_labels = [d["llm_label"] for d in d_dataset]

# Accuracy: fraction of examples where the LLM label equals the expert label exactly
accuracy = accuracy_score(expert_labels, llm_labels)

# Cohen's Kappa: agreement between the two label lists
kappa = cohen_kappa_score(expert_labels, llm_labels)

# Confusion Matrix: restricted to the labels "Good" and "Bad", in that order.
# The expert labels are passed first and the LLM labels second.
# NOTE(review): per scikit-learn's convention rows should be the expert labels and
# columns the LLM labels; labels other than "Good"/"Bad" would be left out — confirm.
conf_matrix = confusion_matrix(expert_labels, llm_labels, labels=["Good", "Bad"])

# Display Results
print(f"Accuracy: {accuracy:.2f}")
print(f"Cohen's Kappa: {kappa:.2f}")
print("Confusion Matrix:")
print(conf_matrix)

# There's low agreement! Let's try improving the judge prompt leveraging the critiques

We could do this manually, as we did here. Alternatively, we could use tools like AlignEval, where prompt optimizations are done automatically by LLMs and then A/B tested using the evaluation dataset.

In [None]:
# Second version of the LLM-as-a-judge-prompt

judge_prompt = """
Your task is to evaluate pairs of input prompts and output responses using the detailed criteria below, incorporating insights from domain experts so that your evaluations align closely with expert judgments:

1. Matches the Source: The response must directly reflect and reference details from the provided sources. It should include specific facts, examples, or terminology from the sources. Generic or unrelated content should result in a "Bad" rating.

2. Covers Key Points: The response must address all main aspects of the query. For example, if the query requires discussing technical details or listing several components, the response must cover each one clearly. Omissions or superficial coverage should be noted.

3. Clear & Concise: The response should be easy to understand, well-structured, and free of unnecessary jargon. It should provide sufficient context so that even non-experts can follow, but without extraneous detail.

4. Relevant & Specific: The response must be strictly on-topic, providing precise and detailed information pertinent to the query. It should avoid vague generalizations or overly broad statements.

5. Accurate Information: All details must be correct and consistent with the source material. Misleading or erroneous claims, even if partially correct, should lead to a "Bad" rating.

Evaluation Guidelines:

If the response explicitly incorporates source details, covers every key point with specificity, and is both clear and accurate, label it "Good".
If it is generic, omits critical details, lacks direct source reference, or contains inaccuracies, label it "Bad".
Provide a single-line critique (up to 100 words) explaining your reasoning—mention specific issues such as lack of technical detail, missing key points, or poor alignment with source content when applicable.

Here's the input prompt to evaluate:
-----
{prompt}
-----

Here's the output response to evaluate:
-----
{response}
-----

Use this format for your response (but don't rewrite the "-" characters):
-----
Label: <good-or-bad>

Critique: <critique>
-----
""".strip()

In [None]:
# We update the LLM labels and critiques using the new prompt (same code as before)
for i, d in enumerate(d_dataset, start=1):
    print(f"{i}/{len(d_dataset)}")

    label, critique = get_judge_response(judge_prompt, d["prompt"], d["response"])

    d["llm_label"] = label
    d["llm_critique"] = critique

# We compute the agreement scores (same code as before)
expert_labels = [d["domain_expert_label"] for d in d_dataset]
llm_labels = [d["llm_label"] for d in d_dataset]
accuracy = accuracy_score(expert_labels, llm_labels)
kappa = cohen_kappa_score(expert_labels, llm_labels)
conf_matrix = confusion_matrix(expert_labels, llm_labels, labels=["Good", "Bad"])
print(f"Accuracy: {accuracy:.2f}")
print(f"Cohen's Kappa: {kappa:.2f}")
print("Confusion Matrix:")
print(conf_matrix)

Great! Cohen's Kappa went from 0.06 to 0.25, signaling a better agreement between the domain expert and the judge LLM! It can be seen from the confusion matrix as well.

We stop here. In practice, we would do other iterations to improve the agreement.

# A different approach to evaluating RAG responses specifically: LLM-based metrics like Answer Relevancy and Answer Faithfulness

In [None]:
# --- Evaluate Generation: Relevancy and Faithfulness ---
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner

# We'll use GPT-4o as a judge for generation evaluation
judge_llm = OpenAI(temperature=0, model="gpt-4o")

faithfulness_evaluator = FaithfulnessEvaluator(llm=judge_llm)
relevancy_evaluator = RelevancyEvaluator(llm=judge_llm)

queries = list(rag_eval_dataset.queries.values())
batch_eval_queries = queries[:20]  # Limit the queries for demo purposes (avoid huge costs)

runner = BatchEvalRunner(
    {"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
    workers=8,
)

for k in [2, 4, 6, 8, 10]:
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
    query_engine = index.as_query_engine(similarity_top_k=k, llm=judge_llm)
    eval_results = await runner.aevaluate_queries(query_engine, queries=batch_eval_queries)

    faithfulness_score = sum(r.passing for r in eval_results["faithfulness"]) / len(eval_results["faithfulness"])
    relevancy_score = sum(r.passing for r in eval_results["relevancy"]) / len(eval_results["relevancy"])

    print(f"top_{k} faithfulness_score: {faithfulness_score}")
    print(f"top_{k} relevancy_score: {relevancy_score}")
    print("=" * 20)