# Search Evaluation

# Required libraries

In [1]:
!pip install uv
!uv pip install -U minsearch qdrant_client



[2mUsing Python 3.10.18 environment at: C:\Users\CHUNGKE2\AppData\Local\anaconda3\envs\evaluation[0m
[2mResolved [1m33 packages[0m [2min 323ms[0m[0m
[2mAudited [1m33 packages[0m [2min 0.27ms[0m[0m


# Evaluation data

In [2]:
!uv pip install requests

[2mUsing Python 3.10.18 environment at: C:\Users\CHUNGKE2\AppData\Local\anaconda3\envs\evaluation[0m
[2mAudited [1m1 package[0m [2min 199ms[0m[0m


In [3]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

documents contains the documents from the FAQ database with unique IDs, and ground_truth contains generated question-answer pairs.

Also, we will need the code for evaluating retrieval:

In [4]:
!uv pip install tqdm

[2mUsing Python 3.10.18 environment at: C:\Users\CHUNGKE2\AppData\Local\anaconda3\envs\evaluation[0m
[2mResolved [1m2 packages[0m [2min 122ms[0m[0m
[2mInstalled [1m1 package[0m [2min 421ms[0m[0m
 [32m+[39m [1mtqdm[0m[2m==4.67.1[0m


In [5]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

  from .autonotebook import tqdm as notebook_tqdm


# Q1. Minsearch text

Now let's evaluate our usual minsearch approach, but tweak the parameters. Let's use the following boosting params:

In [10]:
import minsearch

In [11]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"],
)

index.fit(documents)

<minsearch.minsearch.Index at 0x28c8547c670>

In [14]:
boost_dict = {'question': 1.5, 'section': 0.1}

def search_function(q):
    # q is a dict from ground_truth, e.g. {'question': ..., 'course': ..., ...}
    return index.search(
        query=q['question'],
        filter_dict={'course': q['course']},
        boost_dict=boost_dict,
        num_results=5
    )

result = evaluate(ground_truth, search_function)
print("Hitrate:", result['hit_rate'])

100%|██████████| 4627/4627 [00:22<00:00, 207.03it/s]
100%|██████████| 4627/4627 [00:22<00:00, 207.03it/s]


Hitrate: 0.848714069591528


# Embeddings

In [15]:
from minsearch import VectorSearch

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [18]:
# Let's create embeddings for the "question" field:

texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

# Q2. Vector search for question

In [19]:
# index these embeddings with minsearch:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x28cffb521a0>

In [22]:
def vector_search_function(q):
    # Transform the query question using the pipeline
    query_vec = pipeline.transform([q['question']])[0]
    # Perform vector search with filter
    return vindex.search(query_vec, filter_dict={'course': q['course']})

result = evaluate(ground_truth, vector_search_function)
print("MRR:", result['mrr'])

100%|██████████| 4627/4627 [00:12<00:00, 382.24it/s]
100%|██████████| 4627/4627 [00:12<00:00, 382.24it/s]


MRR: 0.36761837866765423


# Q3 Vector search for question and answer

In [27]:
# Create combined question+answer texts
texts = []
for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

# Create embeddings
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

# Index embeddings with minsearch
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

# Define search function - only use question for query
def vector_search_function(q):
    query_vec = pipeline.transform([q['question']]).flatten()
    return vindex.search(query_vec, filter_dict={'course': q['course']})

# Evaluate
result = evaluate(ground_truth, vector_search_function)
print("Hitrate:", result['hit_rate'])

100%|██████████| 4627/4627 [00:17<00:00, 263.98it/s]

Hitrate: 0.8841582018586557





# Q4. Qdrant


In [28]:
# Q4. Qdrant
!uv pip install qdrant-client sentence-transformers

[2mUsing Python 3.10.18 environment at: C:\Users\CHUNGKE2\AppData\Local\anaconda3\envs\evaluation[0m
[2mResolved [1m48 packages[0m [2min 534ms[0m[0m
[36m[1mDownloading[0m[39m transformers [2m(10.3MiB)[0m
 [32m[1mDownloading[0m[39m transformers
[2mPrepared [1m3 packages[0m [2min 7.63s[0m[0m
[2mInstalled [1m16 packages[0m [2min 14.94s[0m[0m
 [32m+[39m [1mfilelock[0m[2m==3.18.0[0m
 [32m+[39m [1mfsspec[0m[2m==2025.7.0[0m
 [32m+[39m [1mhuggingface-hub[0m[2m==0.33.4[0m
 [32m+[39m [1mjinja2[0m[2m==3.1.6[0m
 [32m+[39m [1mmarkupsafe[0m[2m==3.0.2[0m
 [32m+[39m [1mmpmath[0m[2m==1.3.0[0m
 [32m+[39m [1mnetworkx[0m[2m==3.4.2[0m
 [32m+[39m [1mpillow[0m[2m==11.3.0[0m
 [32m+[39m [1mpyyaml[0m[2m==6.0.2[0m
 [32m+[39m [1mregex[0m[2m==2024.11.6[0m
 [32m+[39m [1msafetensors[0m[2m==0.5.3[0m
 [32m+[39m [1msentence-transformers[0m[2m==5.0.0[0m
 [32m+[39m [1msympy[0m[2m==1.14.0[0m
 [32m+[39m [1mtokeniz

In [36]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from sentence_transformers import SentenceTransformer
import uuid

In [37]:
# Initialize Qdrant client (in-memory)
client = QdrantClient(":memory:")

In [38]:
# Initialize the embedding model
model = SentenceTransformer("jinaai/jina-embeddings-v2-small-en")

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

In [40]:
# Create collection
collection_name = "documents"
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=512, distance=Distance.COSINE),
)

ValueError: Collection documents already exists

In [41]:
# Prepare documents and embeddings
texts = []
for doc in documents:
    text = doc['question'] + ' ' + doc['text']
    texts.append(text)

# Create embeddings
embeddings = model.encode(texts)

# Index documents
points = []
for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
    point = PointStruct(
        id=str(uuid.uuid4()),
        vector=embedding.tolist(),
        payload={
            "id": doc["id"],
            "course": doc["course"],
            "question": doc["question"],
            "text": doc["text"],
            "section": doc["section"]
        }
    )
    points.append(point)

client.upsert(collection_name=collection_name, points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [43]:
# Define search function
def qdrant_search_function(q):
    # Create embedding for the query
    query_embedding = model.encode([q['question']])
    
    # Search in Qdrant with proper filter syntax
    results = client.search(
        collection_name=collection_name,
        query_vector=query_embedding[0].tolist(),
        query_filter=Filter(
            must=[
                FieldCondition(
                    key="course",
                    match=MatchValue(value=q['course'])
                )
            ]
        ),
        limit=5
    )
    
    
    # Convert results to the expected format
    formatted_results = []
    for result in results:
        formatted_results.append({
            "id": result.payload["id"],
            "course": result.payload["course"],
            "question": result.payload["question"],
            "text": result.payload["text"],
            "section": result.payload["section"]
        })
    
    return formatted_results

# Evaluate
result = evaluate(ground_truth, qdrant_search_function)
print("MRR:", result['mrr'])

  results = client.search(
  results = client.search(
100%|██████████| 4627/4627 [03:47<00:00, 20.38it/s]

MRR: 0.15411713853468792





# Q5. Cosine simiarity

In [44]:
# Q5. Cosine similarity
import numpy as np

In [46]:
# Load the results data
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

# Create the pipeline for embeddings
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# Fit the pipeline on all text data
all_text = df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question
pipeline.fit(all_text)

# Define cosine similarity function
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

# Calculate cosine similarities
cosine_similarities = []

for _, row in df_results.iterrows():
    # Create embeddings for LLM answer and original answer
    v_llm = pipeline.transform([row.answer_llm])[0]
    v_orig = pipeline.transform([row.answer_orig])[0]
    
    # Calculate cosine similarity
    similarity = cosine(v_llm, v_orig)
    cosine_similarities.append(similarity)

# Calculate average cosine similarity
average_cosine = np.mean(cosine_similarities)
print(f"Average cosine similarity: {average_cosine:.3f}")


Average cosine similarity: 0.842


# Q6. Rouge

In [47]:
!uv pip install rouge

[2mUsing Python 3.10.18 environment at: C:\Users\CHUNGKE2\AppData\Local\anaconda3\envs\evaluation[0m
[2mResolved [1m2 packages[0m [2min 368ms[0m[0m
[2mInstalled [1m1 package[0m [2min 425ms[0m[0m
 [32m+[39m [1mrouge[0m[2m==1.0.1[0m


In [48]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [49]:
print("Sample ROUGE scores for document 10:")
print(scores)
print(f"Rouge-1 F1 for document 10: {scores['rouge-1']['f']:.2f}")

# Calculate ROUGE-1 F1 for all pairs
rouge_1_f1_scores = []

for _, row in df_results.iterrows():
    try:
        scores = rouge_scorer.get_scores(row.answer_llm, row.answer_orig)[0]
        rouge_1_f1 = scores['rouge-1']['f']
        rouge_1_f1_scores.append(rouge_1_f1)
    except:
        # Handle any potential errors (empty strings, etc.)
        rouge_1_f1_scores.append(0.0)

# Calculate average ROUGE-1 F1
average_rouge_1_f1 = np.mean(rouge_1_f1_scores)
print(f"\nAverage ROUGE-1 F1 score: {average_rouge_1_f1:.3f}")

Sample ROUGE scores for document 10:
{'rouge-1': {'r': 0.45454545454545453, 'p': 0.45454545454545453, 'f': 0.45454544954545456}, 'rouge-2': {'r': 0.21621621621621623, 'p': 0.21621621621621623, 'f': 0.21621621121621637}, 'rouge-l': {'r': 0.3939393939393939, 'p': 0.3939393939393939, 'f': 0.393939388939394}}
Rouge-1 F1 for document 10: 0.45

Average ROUGE-1 F1 score: 0.352
