In [1]:
# 1. Setup and Configuration

In [2]:
import pydantic
print(pydantic.__version__)

2.12.5


In [4]:
import os
from dotenv import load_dotenv
import giskard

load_dotenv()
os.environ["USER_AGENT"] = "my-rag-evaluation-app"

# Use the standard gemini 2.5 flash model
giskard.llm.set_llm_model("gemini/gemini-2.5-flash")

# Tell Giskard to use the universally available Google embedding model
# giskard.llm.set_embedding_model("gemini/embedding-001")
# giskard.llm.set_embedding_model("gemini/text-embedding-004")
giskard.llm.set_embedding_model("gemini/gemini-embedding-001")

In [5]:
# 2. Scrape the Website

In [6]:
from langchain_community.document_loaders import WebBaseLoader

# UPDATE: The new, correct import path for modern LangChain
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)

# loader = WebBaseLoader("https://www.ml.school/") #1
# loader = WebBaseLoader("https://en.wikipedia.org/wiki/Retrieval-augmented_generation") #2
loader = WebBaseLoader("https://docs.python.org/3/tutorial/datastructures.html") #3
documents = loader.load_and_split(text_splitter)

print(f"Scraped {len(documents)} documents.")

Scraped 34 documents.


In [7]:
# 3: Embed using Gemini and Store in Vector DB

In [8]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch

# Must include the "models/" prefix!
# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

vectorstore = DocArrayInMemorySearch.from_documents(
    documents, embedding=embeddings
)

print("Successfully embedded and stored in VectorDB!")

Successfully embedded and stored in VectorDB!


In [11]:
# Cell 4: Create the Knowledge Base & Generate Tests

In [12]:
import pandas as pd
from giskard.rag import KnowledgeBase, generate_testset

df = pd.DataFrame([d.page_content for d in documents], columns=["text"])
knowledge_base = KnowledgeBase(df)

# Generate the test set (using 15 questions for speed; scale up later)
# testset = generate_testset(
#     knowledge_base,
#     num_questions=10,
#     agent_description="A chatbot answering questions about the Machine Learning School Website",
# )
# 2
# testset = generate_testset(
#     knowledge_base,
#     num_questions=15,
#     agent_description="A technical assistant answering questions about the concept of Retrieval-Augmented Generation (RAG) based on a Wikipedia article",
# )
# 3
testset = generate_testset(
    knowledge_base,
    num_questions=10,
    agent_description="A Python programming assistant answering technical questions about Python data structures (like lists, dictionaries, tuples, and sets) based on the official Python documentation.",
)

# testset.save("test-set.jsonl") #1
# testset.save("wiki-rag-test-set.jsonl") #2
testset.save("python-datastructures-test-set.jsonl") #3

# Let's peek at a generated question
test_set_df = testset.to_pandas()
print("Sample Question:", test_set_df.iloc[0]['question'])
print("Expected Answer:", test_set_df.iloc[0]['reference_answer'])

2026-02-25 14:41:01,937 pid:280791 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2026-02-25 14:41:04,332 pid:280791 MainThread giskard.rag  INFO     Found 1 topics in the knowledge base.


  warn(


Generating questions:   0%|          | 0/10 [00:00<?, ?it/s]

Sample Question: What are the components of a list comprehension?
Expected Answer: A list comprehension consists of brackets containing an expression followed by a for clause, then zero or more for or if clauses.


In [11]:
# Cell 5: Build the Gemini RAG Chain

In [14]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter

retriever = vectorstore.as_retriever()

template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""
prompt = PromptTemplate.from_template(template)

# Use Gemini 2.5 Flash
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | StrOutputParser()
)

# Test the chain manually
print("Testing chain:")
print(chain.invoke({"question": "What is the Machine Learning School?"}))

Testing chain:
The Machine Learning School (ml.school) offers a live, interactive program titled "Building AI/ML Systems That Don't Suck." It is a hands-on program designed to help developers build production-ready, world-class AI/ML systems from the ground up, focusing on real-world AI and Machine Learning engineering skills and practical strategies.


In [15]:
# Cell 6: Evaluate the Model

In [17]:
# 1. Create a simple Python function that takes a question and returns an answer
def answer_fn(question: str, history=None) -> str:
    # Notice we use .invoke() here for a single question
    return chain.invoke({"question": question})

# 2. Evaluate!
from giskard.rag import evaluate

report = evaluate(
    answer_fn=answer_fn, 
    testset=testset, 
    knowledge_base=knowledge_base
)

# Display report natively in the notebook
display(report)

Asking questions to the agent:   0%|          | 0/15 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/15 [00:00<?, ?it/s]