In [13]:
# 1. Setup and Configuration

In [14]:
import pydantic

# Show which pydantic release this kernel is actually running.
installed_pydantic_version = pydantic.__version__
print(installed_pydantic_version)

2.12.5


In [15]:
import os
from dotenv import load_dotenv
import giskard

# Provider-prefixed model identifiers handed to Giskard ("gemini/<model>").
GISKARD_LLM_MODEL = "gemini/gemini-2.5-flash"
# Other embedding models kept here for reference:
#   "gemini/embedding-001", "gemini/text-embedding-004"
GISKARD_EMBEDDING_MODEL = "gemini/gemini-embedding-001"

# Load variables from a .env file into the process environment.
load_dotenv()
# Assigned after load_dotenv(), so this value replaces any USER_AGENT already set.
os.environ["USER_AGENT"] = "my-rag-evaluation-app"

# Point Giskard at the Gemini chat model and the Gemini embedding model.
giskard.llm.set_llm_model(GISKARD_LLM_MODEL)
giskard.llm.set_embedding_model(GISKARD_EMBEDDING_MODEL)

In [16]:
# 2. Scrape the Website

In [17]:
from langchain_community.document_loaders import WebBaseLoader

# Text splitters are imported from their own `langchain_text_splitters` package.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Page to scrape (source 2). Other candidate sources, kept for reference:
#   1: "https://www.ml.school/"
#   3: "https://docs.python.org/3/tutorial/datastructures.html"
SOURCE_URL = "https://en.wikipedia.org/wiki/Retrieval-augmented_generation"

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
loader = WebBaseLoader(SOURCE_URL)

# Fetch the page first, then cut it into overlapping chunks with the splitter.
loaded_pages = loader.load()
documents = text_splitter.split_documents(loaded_pages)

print(f"Scraped {len(documents)} documents.")

Scraped 34 documents.


In [18]:
# 3. Embed with Gemini and Store in the Vector DB

In [21]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch

# The model name passed to the LangChain wrapper must carry the "models/" prefix.
# Other models kept here for reference:
#   "models/embedding-001", "models/text-embedding-004"
LANGCHAIN_EMBEDDING_MODEL = "models/gemini-embedding-001"

embeddings = GoogleGenerativeAIEmbeddings(model=LANGCHAIN_EMBEDDING_MODEL)

# Embed every scraped chunk and keep the vectors in an in-memory store.
vectorstore = DocArrayInMemorySearch.from_documents(
    documents=documents,
    embedding=embeddings,
)

print("Successfully embedded and stored in VectorDB!")

Successfully embedded and stored in VectorDB!


In [22]:
# 4. Create the Knowledge Base & Generate Tests

In [23]:
import pandas as pd
from giskard.rag import KnowledgeBase, generate_testset

# One row per scraped chunk, held in a single "text" column.
chunk_texts = [doc.page_content for doc in documents]
df = pd.DataFrame(chunk_texts, columns=["text"])
knowledge_base = KnowledgeBase(df)

# Kept small for speed; scale up later.
NUM_QUESTIONS = 15
# Describes the agent under test; written for source 2 (the Wikipedia RAG article).
# Description used earlier for source 1, kept for reference:
#   "A chatbot answering questions about the Machine Learning School Website"
AGENT_DESCRIPTION = "A technical assistant answering questions about the concept of Retrieval-Augmented Generation (RAG) based on a Wikipedia article"

testset = generate_testset(
    knowledge_base,
    num_questions=NUM_QUESTIONS,
    agent_description=AGENT_DESCRIPTION,
)

# Persist the generated questions to disk.
testset.save("wiki-rag-test-set.jsonl")

# Peek at the first generated question and its reference answer.
test_set_df = testset.to_pandas()
first_sample = test_set_df.iloc[0]
print("Sample Question:", first_sample['question'])
print("Expected Answer:", first_sample['reference_answer'])

2026-02-25 14:34:41,324 pid:218502 MainThread giskard.rag  INFO     Finding topics in the knowledge base.


  warn(


2026-02-25 14:34:52,706 pid:218502 MainThread giskard.rag  INFO     Found 3 topics in the knowledge base.


Generating questions:   0%|          | 0/15 [00:00<?, ?it/s]

Sample Question: What political topics are mentioned in the provided list?
Expected Answer: The political topics mentioned are AI safety (Alignment), Ethics of AI, EU AI Act, Precautionary principle, and Regulation of AI.


In [24]:
# 5. Build the Gemini RAG Chain

In [25]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter


def format_docs(docs) -> str:
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved documents; each must expose `page_content`.

    Returns:
        The documents' `page_content` values separated by blank lines
        (an empty string when nothing was retrieved).
    """
    return "\n\n".join(doc.page_content for doc in docs)


retriever = vectorstore.as_retriever()

template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""
prompt = PromptTemplate.from_template(template)

# Use Gemini 2.5 Flash; temperature=0 is passed to the model.
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

# The chain takes {"question": ...}. The question goes to the retriever, and the
# retrieved documents are flattened to plain text by format_docs before they
# fill {context}. Without that step the prompt would receive the string form of
# the Document list (metadata included) rather than just the chunk text.
chain = (
    {
        "context": itemgetter("question") | retriever | format_docs,
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | StrOutputParser()
)

# Test the chain manually
print("Testing chain:")
print(chain.invoke({"question": "What are the main benefits of Retrieval-Augmented Generation?"}))

Testing chain:
The main benefits of Retrieval-Augmented Generation (RAG) are:

*   It helps reduce AI hallucinations.
*   It reduces the need to retrain large language models (LLMs) with new data, saving on computational and financial costs.
*   It allows LLMs to include sources in their responses, providing greater transparency and enabling users to verify the cited sources for accuracy and relevance.


In [26]:
# 6. Evaluate the Model

In [27]:
# 1. Wrap the chain in a plain function: question in, answer text out.
def answer_fn(question: str, history=None) -> str:
    """Answer a single question by running it through the RAG chain.

    Args:
        question: The question to ask.
        history: Accepted for interface compatibility; not used here.

    Returns:
        Whatever `chain.invoke` returns for this question.
    """
    # .invoke() handles one input at a time, which matches a single question.
    answer = chain.invoke({"question": question})
    return answer

# 2. Run the generated test set against the agent function.
from giskard.rag import evaluate

evaluation_inputs = {
    "answer_fn": answer_fn,
    "testset": testset,
    "knowledge_base": knowledge_base,
}
report = evaluate(**evaluation_inputs)

# Render the report inline in the notebook.
display(report)

Asking questions to the agent:   0%|          | 0/15 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/15 [00:00<?, ?it/s]