<a href="https://colab.research.google.com/github/xprilion/gemini-as-a-judge-for-rag-evals/blob/main/Step_3_Perform_Eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gemini As A Judge for RAG Evals

## Perform Evaluations

### Load the dataset

In [1]:
!wget https://raw.githubusercontent.com/xprilion/gemini-as-a-judge-for-rag-evals/refs/heads/main/qna_dataset.json

--2025-03-01 06:28:46--  https://raw.githubusercontent.com/xprilion/gemini-as-a-judge-for-rag-evals/refs/heads/main/qna_dataset.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 121396 (119K) [text/plain]
Saving to: ‘qna_dataset.json’


2025-03-01 06:28:47 (1.20 MB/s) - ‘qna_dataset.json’ saved [121396/121396]



### Packages

In [79]:
%%capture
!pip install qdrant-client[fastembed]
!pip install google-genai
!pip install weave

### Imports

In [90]:
import pandas as pd
import json
import os
import time
from tqdm import tqdm
from google import genai
from google.genai import types
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import uuid
import weave
from weave import Evaluation
import asyncio

from google.colab import userdata

### Helpers

In [4]:
# Name of the Qdrant collection that getRagResponse searches.
collection_name = "product_reviews"

In [93]:
# Copy the Colab secret "WANDB_TOKEN" into the WANDB_API_KEY environment variable.
# NOTE(review): presumably this is what weave.init() uses to authenticate — confirm.
os.environ["WANDB_API_KEY"] = userdata.get("WANDB_TOKEN")

In [5]:
# The Gemini API key is read from the Colab secret store rather than hard-coded.
GEMINI_KEY = userdata.get('GEMINI_API_KEY')
# Shared client that getGeminiResponse uses for every generation request.
gemini_client = genai.Client(
    api_key=GEMINI_KEY
)

In [6]:
def getGeminiResponse(prompt, max_tokens=8192, response_type="text/plain", model="gemini-2.0-flash"):
    """Send a single-turn user prompt to Gemini and return the generated text.

    Args:
        prompt: Text sent as the only user message.
        max_tokens: Upper bound passed to the API as ``max_output_tokens``.
        response_type: MIME type requested for the response (passed as
            ``response_mime_type``).
        model: Name of the Gemini model to call. Defaults to the model that
            was previously hard-coded, so existing calls behave the same.

    Returns:
        The ``text`` attribute of the generation response.
    """
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(
                    text=prompt
                ),
            ],
        ),
    ]
    # temperature=0 keeps sampling as repeatable as the API allows.
    generate_content_config = types.GenerateContentConfig(
        temperature=0,
        top_p=0.95,
        top_k=40,
        max_output_tokens=max_tokens,
        response_mime_type=response_type,
    )
    response = gemini_client.models.generate_content(
        model=model, contents=contents, config=generate_content_config
    )
    return response.text

In [7]:
# Quick smoke-test call of the helper with a trivial prompt.
getGeminiResponse("What is 2+3?")

'2 + 3 = 5\n'

### EDA: load the QA dataset

In [10]:
# Parse the downloaded QA file. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open("qna_dataset.json", encoding="utf-8") as qa_file:
    qa_dataset = json.load(qa_file)

In [13]:
# Keep only the first element of the parsed JSON; after this step qa_dataset is
# the sequence of question/answer records iterated by the evaluation cells.
# NOTE: not idempotent — running this cell twice would index into the records
# themselves, so re-run the json.load cell before re-running this one.
qa_dataset = qa_dataset[0]

### Connect Qdrant

In [14]:
# Qdrant endpoint; the API key is read from the Colab secret store.
QDRANT_URL = "https://qdrant-1.sg-1.cloudtop.dev"
QDRANT_KEY = userdata.get('PERSONAL_QDRANT_KEY')

In [15]:
# Client used by getRagResponse for the vector searches.
# NOTE(review): port=None presumably stops the client from appending its default
# port to the URL — confirm against the qdrant-client docs.
qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_KEY, port=None)

### Ask questions from the QA dataset

In [179]:
def getRagResponse(question, k=10, skip_ai=False):
    """Retrieve the closest documents for a question and optionally answer it.

    Args:
        question: The user's question; used as the search text and appended
            to the prompt.
        k: Maximum number of documents requested from Qdrant.
        skip_ai: When true, Gemini is not called and the assembled prompt is
            returned in place of a model answer.

    Returns:
        A ``(text, matched_ids)`` tuple. ``text`` is the Gemini answer, or the
        prompt itself when ``skip_ai`` is true. ``matched_ids`` holds the
        ``metadata["index"]`` value of every retrieved document, in the order
        the search returned them.
    """
    hits = qdrant_client.query(collection_name=collection_name, query_text=question, limit=k)

    preamble = """
      You are an intelligent assistant designed to provide accurate and informative answers based on retrieved documents.

      Your primary task is to:

      Understand the user's query.
      Retrieve relevant information from the provided context (documents).
      Synthesize the retrieved information into a coherent and accurate response.

      documents:

      """

    # Build one numbered section per hit (numbering starts at 1) and collect
    # the stored index of each hit alongside it.
    doc_sections = []
    matched_ids = []
    for position, hit in enumerate(hits, start=1):
        doc_sections.append(str(position) + ": \n" + hit.document + "\n\n")
        matched_ids.append(hit.metadata["index"])

    prompt = preamble + "".join(doc_sections) + "\n\n The user is asking: " + question

    if skip_ai:
        return prompt, matched_ids
    return getGeminiResponse(prompt), matched_ids

In [180]:
# Peek at the first QA record to see its fields.
qa_dataset[0]

{'answer': 'The description and review mention the high-density foam cover, smooth application, lightweight frame, and easy cleaning.',
 'matched_indexes': [272, 276],
 'question': 'What are the features of the 9-inch paint roller?'}

In [181]:
# Ad-hoc question for a manual retrieval check.
user_query = "What are the key features of the heavy-duty workbench?"

In [182]:
# skip_ai=True: retrieve the top 5 documents and build the prompt without calling
# Gemini, so `result` holds the prompt text rather than a model answer.
result, indexes = getRagResponse(user_query, 5, True)

In [183]:
# metadata["index"] values of the retrieved documents, in search order.
indexes

[355, 357, 351, 358, 350]

In [184]:
evals = []
matches = 0
k = 5  # number of documents retrieved per question
for eval_ques in tqdm(qa_dataset):
    query = eval_ques["question"]
    # skip_ai=True: retrieval only; the first return value is the assembled prompt.
    prompt, indexes = getRagResponse(query, k, True)
    expected = eval_ques["matched_indexes"]
    intersection = list(set(indexes) & set(expected))
    # A question passes when retrieval overlaps the expected documents, or when
    # nothing was expected and nothing was retrieved.
    if len(expected) > 0:
        is_match = len(intersection) > 0
    else:
        is_match = len(indexes) == 0
    if is_match:
        matches += 1
    # Record the verdict only after it has been computed for THIS question.
    # Appending before the retrieval step read `intersection` from the previous
    # iteration (or raised NameError on a fresh kernel). The stored result is
    # the same verdict that `matches` counts.
    evals.append({"query": eval_ques, "result": is_match})
    if len(evals) % 100 == 0:
        print(f"Checks: {matches}/{len(evals)} of {len(qa_dataset)}")

100%|██████████| 20/20 [00:02<00:00,  9.19it/s]


In [185]:
# Number of questions that passed the retrieval check in the loop.
matches

10

### Accuracy

In [186]:
# Fraction of evaluated questions that passed the retrieval check.
# Stored under its own name because `accuracy` is also the name of the Weave
# scorer function defined later in this notebook. Sharing the name meant that
# re-running this cell after the scorer cell silently replaced the scorer with
# a float.
retrieval_accuracy = matches / len(evals)
retrieval_accuracy

0.5

### Observing evaluations with Weave

In [187]:
@weave.op()
def accuracy(question, output):
    """Score one retrieval result against the expected document indexes.

    Args:
        question: QA record; its ``"matched_indexes"`` entry lists the
            indexes of the documents that should be retrieved.
        output: Indexes that were actually retrieved.

    Returns:
        True when at least one expected index was retrieved, or when nothing
        was expected and nothing was retrieved; False otherwise.
    """
    expected = question["matched_indexes"]
    common = set(expected) & set(output)
    if len(expected) == 0:
        # Nothing should have matched: pass only if retrieval is empty too.
        return len(output) == 0
    return len(common) > 0

In [188]:
@weave.op()
def top_5(question):
    """Return the indexes of the 5 documents retrieved for a QA record's question.

    Only retrieval runs (skip_ai=True); the prompt returned alongside the
    indexes is discarded.
    """
    _prompt, retrieved_ids = getRagResponse(question["question"], k=5, skip_ai=True)
    return retrieved_ids

In [189]:
@weave.op()
def top_10(question):
    """Return the indexes of the 10 documents retrieved for a QA record's question.

    Only retrieval runs (skip_ai=True); the prompt returned alongside the
    indexes is discarded.
    """
    _prompt, retrieved_ids = getRagResponse(question["question"], k=10, skip_ai=True)
    return retrieved_ids

In [190]:
@weave.op()
def top_20(question):
    """Return the indexes of the 20 documents retrieved for a QA record's question.

    Only retrieval runs (skip_ai=True); the prompt returned alongside the
    indexes is discarded.
    """
    _prompt, retrieved_ids = getRagResponse(question["question"], k=20, skip_ai=True)
    return retrieved_ids

In [191]:
# Each dataset row wraps one QA record under the "question" key, which is the
# argument name the retrieval ops and the accuracy scorer receive it under.
evaluation = Evaluation(
    scorers=[accuracy],
    dataset=[{"question": qa_record} for qa_record in qa_dataset],
)

In [192]:
# Initialise Weave for the 'gemini-rag-eval' project before running the evaluations.
weave.init('gemini-rag-eval')

<weave.trace.weave_client.WeaveClient at 0x797b84d7fc50>

In [193]:
# Run the evaluation with top-5 retrieval (top-level await is allowed in notebook cells).
await evaluation.evaluate(top_5)

{'accuracy': {'true_count': 10, 'true_fraction': 0.5},
 'model_latency': {'mean': 0.45109776258468626}}

In [194]:
# Same evaluation with top-10 retrieval, for comparison against top-5.
await evaluation.evaluate(top_10)

🍩 https://wandb.ai/xprilion/gemini-rag-eval/r/call/019550a2-a6be-7772-ad1b-80897ba55351


{'accuracy': {'true_count': 14, 'true_fraction': 0.7},
 'model_latency': {'mean': 0.5206799507141113}}

In [195]:
# Same evaluation with top-20 retrieval.
await evaluation.evaluate(top_20)

{'accuracy': {'true_count': 14, 'true_fraction': 0.7},
 'model_latency': {'mean': 0.488834810256958}}