<a href="https://colab.research.google.com/github/xprilion/gemini-as-a-judge-for-rag-evals/blob/main/Step_2_Eval_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gemini As A Judge for RAG Evals

## Evaluation Dataset

### Load the dataset

In [1]:
# Download the product-reviews dataset (reviews.json) from the project's GitHub repository.
!wget https://raw.githubusercontent.com/xprilion/gemini-as-a-judge-for-rag-evals/refs/heads/main/reviews.json

--2025-03-02 02:36:19--  https://raw.githubusercontent.com/xprilion/gemini-as-a-judge-for-rag-evals/refs/heads/main/reviews.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 507865 (496K) [text/plain]
Saving to: ‘reviews.json’


2025-03-02 02:36:20 (58.6 MB/s) - ‘reviews.json’ saved [507865/507865]



### Packages

In [None]:
%%capture
!pip install qdrant-client[fastembed]
!pip install google-genai

### Imports

In [None]:
import pandas as pd
import json
import os
import time
from tqdm import tqdm
from google import genai
from google.genai import types
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import uuid
import random

from google.colab import userdata

  warn(


### Helpers

In [None]:
# Presumably the Qdrant collection name; it is not referenced by the other
# cells shown in this notebook (TODO confirm whether it is still needed here).
collection_name = "product_reviews"

In [None]:
# Fetch the Gemini API key through Colab's `userdata` helper and build the
# client that getGeminiResponse() uses for every request.
GEMINI_KEY = userdata.get('GEMINI_API_KEY')
gemini_client = genai.Client(api_key=GEMINI_KEY)

In [None]:
def _query_pair_schema():
    """Build the structured-output schema: {"query_pair": [{question, answer, matched_indexes}]}."""
    return types.Schema(
        type=types.Type.OBJECT,
        required=["query_pair"],
        properties={
            "query_pair": types.Schema(
                type=types.Type.ARRAY,
                items=types.Schema(
                    type=types.Type.OBJECT,
                    required=["question", "answer", "matched_indexes"],
                    properties={
                        "question": types.Schema(type=types.Type.STRING),
                        "answer": types.Schema(type=types.Type.STRING),
                        # Document indexes are whole numbers, so request
                        # INTEGER rather than the generic NUMBER type.
                        "matched_indexes": types.Schema(
                            type=types.Type.ARRAY,
                            items=types.Schema(type=types.Type.INTEGER),
                        ),
                    },
                ),
            ),
        },
    )


def getGeminiResponse(prompt, max_tokens=8192, response_type="application/json", model="gemini-2.0-flash"):
    """
    Send a single-turn prompt to Gemini and return the response text.

    Args:
        prompt: The user prompt text.
        max_tokens: Upper bound passed as `max_output_tokens`.
        response_type: MIME type requested for the response. The query_pair
            schema is attached only for "application/json", because it
            describes a JSON object.
        model: Name of the Gemini model to call.

    Returns:
        `response.text` as produced by the SDK; for the default JSON response
        type this is a JSON string with a top-level "query_pair" list.
    """
    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt)],
        ),
    ]

    wants_json = response_type == "application/json"
    generate_content_config = types.GenerateContentConfig(
        temperature=0,
        top_p=0.95,
        top_k=40,
        max_output_tokens=max_tokens,
        response_mime_type=response_type,
        response_schema=_query_pair_schema() if wants_json else None,
    )

    response = gemini_client.models.generate_content(
        model=model, contents=contents, config=generate_content_config
    )
    return response.text

In [None]:
# Smoke test of the helper: because a response schema is enforced, even this
# arithmetic question comes back wrapped in a "query_pair" JSON object.
getGeminiResponse("What is 2+3?")

'{\n  "query_pair": [\n    {\n      "answer": "5",\n      "matched_indexes": [],\n      "question": "What is 2+3?"\n    }\n  ]\n}'

### EDA

In [None]:
# Load the downloaded reviews file into a DataFrame.
df = pd.read_json('reviews.json')

In [None]:
# Preview the first rows (columns: product_title, product_description, review).
df.head()

Unnamed: 0,product_title,product_description,review
0,Hammer,This 16 oz claw hammer is perfect for general ...,I've been using this hammer for a few months n...
1,Hammer,This 16 oz claw hammer is perfect for general ...,This hammer is a solid addition to my toolbox....
2,Hammer,This 16 oz claw hammer is perfect for general ...,I purchased this hammer for some home renovati...
3,Hammer,This 16 oz claw hammer is perfect for general ...,"As a professional carpenter, I rely on my tool..."
4,Hammer,This 16 oz claw hammer is perfect for general ...,This hammer is a great value for the price. Th...


### Connect Qdrant

In [None]:
# Qdrant endpoint; the API key is fetched through Colab's `userdata` helper
# instead of being hard-coded in the notebook.
QDRANT_URL = "https://qdrant-1.sg-1.cloudtop.dev"
QDRANT_KEY = userdata.get('PERSONAL_QDRANT_KEY')

In [None]:
# NOTE(review): port=None presumably stops the client from adding its default
# port to the HTTPS URL — confirm against the qdrant-client documentation.
# The client is not referenced by the other cells shown in this notebook.
qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_KEY, port=None)

### Create Documents

In [None]:
# Holds the formatted text documents built from the reviews DataFrame.
documents = []

In [None]:
# Combine product information and review into a single text document per row.
# The list is rebuilt from scratch on every run, so re-running this cell never
# appends a second copy of each document. The DataFrame index is embedded in
# the text so generated answers can refer back to the source row.
documents = [
    f"Index: {index}\nTitle: {row['product_title']}\nDescription: {row['product_description']}\nReview: {row['review']}"
    for index, row in df.iterrows()
]

In [None]:
# Sanity check: one document is expected per review row.
len(documents)

900

In [None]:
# Inspect the first document to verify the Index/Title/Description/Review layout.
documents[0]

"Index: 0\nTitle: Hammer\nDescription: This 16 oz claw hammer is perfect for general carpentry and DIY projects. It features a comfortable grip and a durable steel head.\nReview: I've been using this hammer for a few months now, and it's become my go-to tool for all my DIY projects. The 16 oz weight is perfect for driving nails without too much effort, and the claw is great for removing them. The grip is comfortable, even during extended use, and I haven't noticed any wear on it yet. The steel head is incredibly durable and shows no signs of rust or damage."

### Make data buckets

In [None]:
def create_buckets(items, num_subsets=100, subset_size=10, min_occurrences=3, seed=None):
    """
    Creates N subset lists from a given list of items, ensuring each item
    appears at least Q times across all subsets.

    Items are placed only into subsets that still have a free slot, so the
    minimum-occurrence requirement is always met once the capacity check
    passes. The same item may land in one subset more than once.

    Args:
        items: The original list of items (items must be hashable).
        num_subsets: The desired number of subsets.
        subset_size: The desired size of each subset.
        min_occurrences: The minimum number of times each item should appear.
        seed: Optional seed for a private random generator, making the result
            reproducible. When None, the module-level ``random`` state is used.

    Returns:
        A list of N subset lists, or None if it's impossible to fulfill the criteria.
    """
    if subset_size > len(items):
        return None  # Subset size cannot be larger than the original list

    total_required_occurrences = len(items) * min_occurrences
    total_subset_items = num_subsets * subset_size

    if total_subset_items < total_required_occurrences:
        return None  # Impossible to fulfill min_occurrences requirement.

    rng = random if seed is None else random.Random(seed)

    item_counts = {item: 0 for item in items}
    subsets = [[] for _ in range(num_subsets)]

    # Indexes of subsets that can still accept an item. The capacity check
    # above guarantees this list is non-empty whenever an item must be placed.
    open_indexes = list(range(num_subsets)) if subset_size > 0 else []

    # Distribute items to meet minimum occurrences
    for _ in range(min_occurrences):
        for item in items:
            position = rng.randrange(len(open_indexes))
            subset_index = open_indexes[position]
            subsets[subset_index].append(item)
            item_counts[item] += 1
            if len(subsets[subset_index]) >= subset_size:
                # Swap-remove the now-full subset in O(1).
                open_indexes[position] = open_indexes[-1]
                open_indexes.pop()

    if len(items) == 0:
        return subsets  # Nothing to fill with (subset_size is 0 in this case).

    # Fill remaining slots with random items, trying not to overpopulate any
    # single item: prefer items still below the per-item cap.
    max_count = (total_subset_items // len(items)) + 1
    under_cap = [item for item in items if item_counts[item] < max_count]
    for subset in subsets:
        while len(subset) < subset_size:
            # If every item has reached the cap, fall back to the full list.
            candidates = under_cap if under_cap else items
            random_item = rng.choice(candidates)
            subset.append(random_item)
            item_counts[random_item] += 1
            if under_cap and item_counts[random_item] >= max_count:
                # Refresh the candidate list only when an item reaches the cap.
                under_cap = [item for item in under_cap if item_counts[item] < max_count]

    return subsets

In [None]:
# 500 buckets of 25 documents each; every document must appear at least 3 times.
buckets = create_buckets(
    documents,
    num_subsets=500,
    subset_size=25,
    min_occurrences=3,
)

### Generate questions for each bucket

In [None]:
# Number of queries requested from Gemini per bucket; inserted into system_prompt.
# The prompt's per-category counts (3 per type, 4 per length band) are fixed
# text that adds up to 12, so changing this value means editing the prompt too.
NUM_Q_PER_BUCKET = 12

In [None]:
# Instruction block placed before each bucket's documents. Only the total query
# count is interpolated; the per-category counts in the rules are literal text.
system_prompt = f"""Given the following list of documents in a dataset, you have to come up with {NUM_Q_PER_BUCKET} queries that can be asked on that dataset such that:
1. 3 of the queries can bring up good search results
2. 3 of the queries are complex and need strong algorithms for better results
3. 3 of the queries are good for multiple document outcomes, assign multiple values to matched_indexes field.
4. 3 of the queries should not have any responses from the dataset, use invented words, questions with no sense and non english languages
5. Don't provide any numbering to the query phrases.
6. 4 queries will be between 1 to 3 words.
7. 4 queries will be between 4 to 6 words.
8. 4 queries will be 7 or more words.

documents:

"""

In [None]:
# Dry run on the first bucket only, to inspect the raw JSON string returned by
# Gemini before running the full generation loop.
prompt = system_prompt + "\n\n".join(buckets[0])
response = getGeminiResponse(prompt)
response

'{\n  "query_pair": [\n    {\n      "answer": "Index 0, 4, 12",\n      "matched_indexes": [\n        0,\n        4,\n        12\n      ],\n      "question": "Best hammer for DIY projects"\n    },\n    {\n      "answer": "Index 272, 274",\n      "matched_indexes": [\n        272,\n        274\n      ],\n      "question": "9-inch paint roller review"\n    },\n    {\n      "answer": "Index 203, 204, 208",\n      "matched_indexes": [\n        203,\n        204,\n        208\n      ],\n      "question": "Pliers set for professional use"\n    },\n    {\n      "answer": "Index 617",\n      "matched_indexes": [\n        617\n      ],\n      "question": "Socket set with quick release"\n    },\n    {\n      "answer": "Index 449",\n      "matched_indexes": [\n        449\n      ],\n      "question": "Nail gun with ergonomic handle"\n    },\n    {\n      "answer": "Index 498",\n      "matched_indexes": [\n        498\n      ],\n      "question": "Angle grinder for heavy duty tasks"\n    },\n    {\

In [None]:
# Parse the raw JSON string into Python objects and display the result.
res = json.loads(response)
res

{'query_pair': [{'answer': 'Index 0, 4, 12',
   'matched_indexes': [0, 4, 12],
   'question': 'Best hammer for DIY projects'},
  {'answer': 'Index 272, 274',
   'matched_indexes': [272, 274],
   'question': '9-inch paint roller review'},
  {'answer': 'Index 203, 204, 208',
   'matched_indexes': [203, 204, 208],
   'question': 'Pliers set for professional use'},
  {'answer': 'Index 617',
   'matched_indexes': [617],
   'question': 'Socket set with quick release'},
  {'answer': 'Index 449',
   'matched_indexes': [449],
   'question': 'Nail gun with ergonomic handle'},
  {'answer': 'Index 498',
   'matched_indexes': [498],
   'question': 'Angle grinder for heavy duty tasks'},
  {'answer': 'Index 203, 204, 208',
   'matched_indexes': [203, 204, 208],
   'question': 'Pliers sets with non-slip handles and durable cases'},
  {'answer': 'Index 272, 274, 291',
   'matched_indexes': [272, 274, 291],
   'question': 'Paint rollers for smooth application and small projects'},
  {'answer': 'Index 0,

In [None]:
# Collects the generated question/answer records across all buckets.
qna_dataset = []

In [None]:
# Start from an empty dataset so that re-running this cell does not append a
# second copy of every generated pair.
qna_dataset = []
failed_buckets = []  # indexes of buckets whose response could not be parsed

for bucket_index, bucket in enumerate(tqdm(buckets)):
    prompt = system_prompt + "\n\n".join(bucket)
    response = getGeminiResponse(prompt)
    try:
        res = json.loads(response)
        qna_dataset.extend(res['query_pair'])
    except (TypeError, ValueError, KeyError) as error:
        # One unusable response (invalid JSON, no text, missing "query_pair")
        # should not abort the whole long-running loop: record the bucket
        # and move on. API errors raised by the request itself still propagate.
        failed_buckets.append(bucket_index)
        tqdm.write(f"Skipping bucket {bucket_index}: {error!r}")

100%|██████████| 500/500 [40:05<00:00,  4.81s/it]


### Save dataset

In [None]:
# Total number of generated Q&A records. The recorded run yielded 6701 from 500
# buckets, so the model does not always return exactly NUM_Q_PER_BUCKET per bucket.
len(qna_dataset)

6701

In [None]:
# Serialise the full list of Q&A records and write it to qna_dataset.json.
with open('qna_dataset.json', 'w') as f:
    f.write(json.dumps(qna_dataset))