## Import evaluation data

In [4]:
import requests
import pandas as pd

# Base URL of the evaluation material; every dataset below is fetched relative to it.
url_prefix = (
    "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/"
)

# Documents to index. Later cells read the "question", "section", "text",
# "course" and "id" fields of each record.
docs_url = url_prefix + "search_evaluation/documents-with-ids.json"
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on a 4xx/5xx instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents = docs_response.json()

# Ground truth: one record per generated question. Later cells read the
# "question", "course" and "document" (expected document id) keys.
ground_truth_url = url_prefix + "search_evaluation/ground-truth-data.csv"
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient="records")

## Define evaluation functions

In [12]:
# Code to evaluate retrieval

from tqdm.auto import tqdm


def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: sized collection with one entry per query. Each entry
            is a sequence of booleans, one per returned result, that is True
            where the result is the expected document.

    Returns:
        Number of entries containing True divided by the number of entries,
        or 0.0 when ``relevance_total`` is empty.
    """
    # Nothing was evaluated: report 0.0 rather than dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)

    return hits / len(relevance_total)


def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query the score is 1 / rank of the first relevant result
    (ranks start at 1), or 0 when no result is relevant. The scores are
    averaged over the number of queries.

    Args:
        relevance_total: sized collection with one entry per query. Each entry
            is a sequence of booleans ordered by rank, True where the result
            is the expected document.

    Returns:
        The mean reciprocal rank, or 0.0 when ``relevance_total`` is empty.
    """
    # Nothing was evaluated: report 0.0 rather than dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result counts; without this break a
                # query with several matching results would score above 1.
                break

    return total_score / len(relevance_total)


def evaluate(ground_truth, search_function):
    """Run every ground-truth question through a search function and score it.

    Args:
        ground_truth: iterable of records with "question", "course" and
            "document" keys, where "document" is the expected document id.
        search_function: callable accepting ``query`` and ``course`` keyword
            arguments and returning an iterable of results that each have an
            "id" key.

    Returns:
        Dict with the "hit_rate" and "mrr" computed over all records.
    """
    per_query_relevance = []

    for record in tqdm(ground_truth):
        expected_id = record["document"]
        hits = search_function(query=record["question"], course=record["course"])
        # One boolean per returned result, in rank order.
        per_query_relevance.append([hit["id"] == expected_id for hit in hits])

    metrics = {}
    metrics["hit_rate"] = hit_rate(per_query_relevance)
    metrics["mrr"] = mrr(per_query_relevance)
    return metrics

## Q1. Evaluating minsearch's text search

In [3]:
import minsearch

# Text index over the documents: "question", "section" and "text" are searched
# as free text, while "course" and "id" are keyword fields usable in filters.
index = minsearch.Index(
    text_fields=["question", "section", "text"], keyword_fields=["course", "id"]
)

# Build the index; the fitted Index is displayed as the cell output.
index.fit(documents)

<minsearch.minsearch.Index at 0x7df0286b0af0>

In [4]:
def minsearch_search(query, course):
    """Search the minsearch text index for ``query`` within one course.

    Args:
        query: question text to search for.
        course: value the "course" field of returned documents must match.

    Returns:
        Whatever ``index.search`` returns for the top 5 results, with the
        "question" field boosted by 1.5 and the "section" field by 0.1.
    """
    return index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict={"question": 1.5, "section": 0.1},
        num_results=5,
    )

In [5]:
# Q1: score the minsearch text search on every ground-truth question; the
# returned dict with hit_rate and mrr is displayed as the cell output.
evaluate(ground_truth=ground_truth, search_function=minsearch_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

## Evaluating minsearch's vector search

### Embeddings

In [6]:
from minsearch import VectorSearch

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [8]:
# Embed the documents from their "question" field only.
texts = [doc["question"] for doc in documents]

# TF-IDF (terms must occur in at least 3 documents) reduced to 128 dimensions
# with a truncated SVD; the fixed random_state keeps the projection repeatable.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [9]:
# Vector index over the question-only embeddings X, filterable by "course".
# NOTE(review): keyword_fields is a set literal here but a list in the
# minsearch.Index call earlier — confirm which type VectorSearch expects.
vindex = VectorSearch(keyword_fields={"course"})
# Row i of X is the vector for documents[i]; the fitted index is displayed.
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7defbaddd120>

### Q2. Vector search for question

In [10]:
def evaluate_vector_search(ground_truth, search_function):
    """Score a vector search function on the ground-truth questions.

    Each question is embedded with the module-level ``pipeline`` (looked up at
    call time) before being handed to ``search_function``.

    Args:
        ground_truth: iterable of records with "question", "course" and
            "document" keys, where "document" is the expected document id.
        search_function: callable accepting ``query`` (the embedded question)
            and ``course`` keyword arguments and returning an iterable of
            results that each have an "id" key.

    Returns:
        Dict with the "hit_rate" and "mrr" computed over all records.
    """

    def relevance_for(entry):
        # One boolean per returned result, in rank order.
        expected_id = entry["document"]
        embedded = pipeline.transform([entry["question"]])
        hits = search_function(query=embedded, course=entry["course"])
        return [hit["id"] == expected_id for hit in hits]

    relevance_total = [relevance_for(entry) for entry in tqdm(ground_truth)]

    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
    }

In [11]:
def minsearch_vector_search(query, course):
    """Look up the 5 nearest documents to a query vector within one course.

    Args:
        query: already-embedded query, passed to ``vindex.search`` as
            ``query_vector``.
        course: value the "course" field of returned documents must match.

    Returns:
        Whatever ``vindex.search`` returns for the top 5 results.
    """
    course_filter = {"course": course}
    return vindex.search(
        query_vector=query,
        filter_dict=course_filter,
        num_results=5,
    )

In [12]:
# Q2: score the vector search on the ground-truth set, using whatever
# `pipeline` and `vindex` are currently bound (question-only at this point).
evaluate_vector_search(ground_truth=ground_truth, search_function=minsearch_vector_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3572833369353793}

### Q3. Vector search for question and answer

In [13]:
# Embed the documents from their "question" and "text" fields joined by a space.
texts = [doc["question"] + " " + doc["text"] for doc in documents]

# Same TF-IDF + truncated SVD recipe as before; this rebinds `pipeline` and `X`,
# so anything evaluated from here on uses the question+text embedding.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [14]:
# Rebuild the vector index on the question+text embeddings; this rebinds
# `vindex`, replacing the question-only index.
# NOTE(review): keyword_fields is a set literal here but a list in the
# minsearch.Index call earlier — confirm which type VectorSearch expects.
vindex = VectorSearch(keyword_fields={"course"})
# Row i of X is the vector for documents[i]; the fitted index is displayed.
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7defbaddd030>

In [15]:
def evaluate_vector_search(ground_truth, search_function):
    """Score a vector search function on the ground-truth questions.

    The question of each record is embedded with the module-level ``pipeline``
    (resolved when the function runs) and the resulting vector is what
    ``search_function`` receives as ``query``.

    Args:
        ground_truth: iterable of records with "question", "course" and
            "document" keys, where "document" is the expected document id.
        search_function: callable accepting ``query`` and ``course`` keyword
            arguments and returning an iterable of results with an "id" key.

    Returns:
        Dict with the "hit_rate" and "mrr" computed over all records.
    """
    collected = []

    for item in tqdm(ground_truth):
        target_id = item["document"]
        question_vector = pipeline.transform([item["question"]])
        found = search_function(query=question_vector, course=item["course"])
        # One boolean per returned result, in rank order.
        collected.append([entry["id"] == target_id for entry in found])

    return dict(hit_rate=hit_rate(collected), mrr=mrr(collected))

In [16]:
def minsearch_vector_search(query, course):
    """Return the 5 nearest documents to a query vector within one course.

    Args:
        query: already-embedded query, forwarded as ``query_vector``.
        course: value the "course" field of returned documents must match.

    Returns:
        Whatever ``vindex.search`` returns for the top 5 results.
    """
    search_kwargs = {
        "filter_dict": {"course": course},
        "query_vector": query,
        "num_results": 5,
    }
    return vindex.search(**search_kwargs)

In [17]:
# Q3: score the vector search again, now with `pipeline` and `vindex` rebuilt
# on the question+text embedding; the metrics dict is the cell output.
evaluate_vector_search(
    ground_truth=ground_truth, search_function=minsearch_vector_search
)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

## Q4. Qdrant

In [19]:
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding

# Connect to a Qdrant instance at localhost:6333; the server has to be running
# already for the cells below to work.
client = QdrantClient("http://localhost:6333")

In [20]:
# Size of the vectors stored in the Qdrant collection created below.
# NOTE(review): presumably the output dimension of the model named below —
# confirm against its model card.
EMBEDDING_DIMENSIONALITY = 512
# Embedding model name passed to models.Document for both indexing and querying.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [21]:
collection_name = "zoomcamp_hw_eval"

# Creating a collection that already exists fails, which would break a
# Restart & Run All against a Qdrant instance that kept its data. Only create
# the collection when it is missing so the cell can be re-run safely.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
            distance=models.Distance.COSINE,  # Distance metric for similarity search
        ),
    )

True

In [23]:
# One point per document: the id is the document's position in `documents`,
# the vector is a models.Document over "question" + " " + "text" (embedded with
# model_handle), and the full document is kept as the payload.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=doc["question"] + " " + doc["text"],
            model=model_handle,
        ),
        payload=doc,
    )
    for position, doc in enumerate(documents)
]

In [24]:
# Send all points to the collection. The output below shows the embedding
# model files being downloaded while this call runs.
client.upsert(collection_name=collection_name, points=points)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

onnx/model.onnx:   0%|          | 0.00/130M [00:00<?, ?B/s]

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [25]:
# Enable efficient filtering for the course field: every search below filters
# on payload["course"], so index that field.
client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword",  # exact matching on string metadata fields
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [45]:
def evaluate_qdrant_vector_search(ground_truth, search_function):
    """Score a Qdrant-backed search function on the ground-truth questions.

    Args:
        ground_truth: iterable of records with "question", "course" and
            "document" keys, where "document" is the expected document id.
        search_function: callable accepting ``query`` (the raw question text)
            and ``course`` keyword arguments and returning an object whose
            ``points`` attribute is an iterable of hits, each with a
            ``payload`` mapping that contains "id".

    Returns:
        Dict with the "hit_rate" and "mrr" computed over all records.
    """

    def relevance_for(entry):
        # One boolean per returned point, in rank order.
        expected_id = entry["document"]
        response = search_function(query=entry["question"], course=entry["course"])
        return [hit.payload["id"] == expected_id for hit in response.points]

    relevance_total = [relevance_for(entry) for entry in tqdm(ground_truth)]

    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
    }

In [46]:
def qdrant_vector_search(query, course):
    """Query the Qdrant collection for the 5 closest points within one course.

    Args:
        query: question text; wrapped in a models.Document so it is embedded
            with ``model_handle``.
        course: value the "course" payload field of returned points must match.

    Returns:
        The response of ``client.query_points``, with payloads included.
    """
    # Restrict candidates to the requested course.
    course_condition = models.FieldCondition(
        key="course",
        match=models.MatchValue(value=course),
    )
    embedded_query = models.Document(text=query, model=model_handle)

    return client.query_points(
        collection_name=collection_name,
        query=embedded_query,
        query_filter=models.Filter(must=[course_condition]),
        limit=5,  # top closest matches
        with_payload=True,  # needed to read the document id from each hit
    )

In [47]:
# Q4: score the Qdrant vector search on every ground-truth question; the
# returned dict with hit_rate and mrr is displayed as the cell output.
evaluate_qdrant_vector_search(
    ground_truth=ground_truth, search_function=qdrant_vector_search
)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}

## Q5. Cosine similarity

In [2]:
import numpy as np

def cosine(u, v):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        u: first vector; must support ``.dot`` (e.g. a NumPy array).
        v: second vector, same length as ``u``.

    Returns:
        ``u . v / (|u| * |v|)``, or 0.0 when either vector has zero norm.
    """
    norm_product = np.sqrt(u.dot(u)) * np.sqrt(v.dot(v))

    # A zero vector has no direction; dividing would give NaN plus a
    # RuntimeWarning, so report "no similarity" instead.
    if norm_product == 0:
        return 0.0

    return u.dot(v) / norm_product

In [5]:
# Load the gpt-4o-mini RAG results used for Q5 and Q6. The cells below read
# the answer_llm, answer_orig and question columns.
results_url = url_prefix + "rag_evaluation/data/results-gpt4o-mini.csv"
df_results = pd.read_csv(results_url)

In [88]:
# Fresh, unfitted TF-IDF + SVD pipeline for comparing answers. This rebinds
# `pipeline`, so the one fitted on the documents earlier is no longer reachable
# under that name.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3), TruncatedSVD(n_components=128, random_state=1)
)

In [89]:
# Fit on one string per row: the LLM answer, the original answer and the
# question joined by spaces, so the vocabulary covers all three.
pipeline.fit(
    df_results.answer_llm + " " + df_results.answer_orig + " " + df_results.question
)

In [10]:
# One dict per row of df_results, for plain-Python iteration in the loops below.
results = df_results.to_dict(orient='records')

In [91]:
# Cosine similarity between the embedded LLM answer and the embedded original
# answer, one value per record and in the same order as `results`.
similarity = []

for row in tqdm(results):
    # transform() returns a 1 x n_components matrix; flatten it to a vector.
    vec_orig = pipeline.transform([row["answer_orig"]]).flatten()
    vec_llm = pipeline.transform([row["answer_llm"]]).flatten()
    similarity.append(cosine(vec_llm, vec_orig))

  0%|          | 0/1830 [00:00<?, ?it/s]

In [92]:
# Attach the per-record similarities as a new column (same row order as
# `results`) and show their summary statistics.
df_results["cosine"] = similarity
df_results["cosine"].describe()

count    1830.000000
mean        0.841584
std         0.173737
min         0.079093
25%         0.806927
50%         0.905812
75%         0.950711
max         0.996457
Name: cosine, dtype: float64

## Q6. Rouge

In [93]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [6]:
from rouge import Rouge

rouge_scorer = Rouge()

# Sanity check on a single row (position 10): get_scores returns a list, and
# element 0 holds the rouge-1 / rouge-2 / rouge-l dicts shown below.
r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [None]:
# ROUGE-1 F1 between the LLM answer and the original answer, one value per
# record and in the same order as `results`.
rouge_1f1_scores = [
    rouge_scorer.get_scores(row["answer_llm"], row["answer_orig"])[0]["rouge-1"]["f"]
    for row in tqdm(results)
]

  0%|          | 0/1830 [00:00<?, ?it/s]

In [21]:
# Attach the per-record ROUGE-1 F1 scores as a new column (same row order as
# `results`) and show their summary statistics.
df_results["rouge_1f1"] = rouge_1f1_scores
df_results["rouge_1f1"].describe()

count    1830.000000
mean        0.351695
std         0.158905
min         0.000000
25%         0.238887
50%         0.356300
75%         0.460133
max         0.950000
Name: rouge_1f1, dtype: float64