## Import evaluation data

In [1]:
import requests
import pandas as pd

# Base URL of the course repository folder that hosts the evaluation data.
url_prefix = (
    "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/"
)

# Documents to index. Later cells read the "question", "text", "course" and
# "id" fields from each entry.
docs_url = url_prefix + "search_evaluation/documents-with-ids.json"
# requests applies no timeout by default, so set one to avoid hanging forever,
# and fail loudly on an HTTP error instead of trying to parse an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground truth: one record per generated question. The evaluation functions
# below read the "question", "course" and "document" keys of each record.
ground_truth_url = url_prefix + "search_evaluation/ground-truth-data.csv"
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient="records")

## Define evaluation functions

In [22]:
# Code to evaluate retrieval

from tqdm.auto import tqdm


def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: A sequence with one entry per query. Each entry is a
            sequence of booleans, one per returned result, where ``True``
            marks the expected document.

    Returns:
        The share of queries whose entry contains ``True``, as a float in
        ``[0, 1]``. Returns ``0.0`` when ``relevance_total`` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total


def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query the score is ``1 / rank`` of the *first* relevant result
    (ranks start at 1), or 0 when no result is relevant. The per-query scores
    are then averaged.

    Args:
        relevance_total: A sequence with one entry per query. Each entry is a
            sequence of booleans ordered by rank, where ``True`` marks the
            expected document.

    Returns:
        The mean reciprocal rank as a float in ``[0, 1]``. Returns ``0.0``
        when ``relevance_total`` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality (not identity) so bool-like values that compare equal
            # to True still count.
            if is_relevant == True:  # noqa: E712
                total_score += 1 / rank
                # Only the first relevant hit counts; without this, a query
                # with several True flags would score more than 1.
                break

    return total_score / total


def evaluate(ground_truth, search_function):
    """Run every ground-truth question through a search function and score it.

    Args:
        ground_truth: Iterable of records with "question", "course" and
            "document" keys; "document" holds the id of the expected result.
        search_function: Callable invoked as
            ``search_function(query=..., course=...)`` that returns an
            iterable of result dicts, each with an "id" key.

    Returns:
        A dict with the "hit_rate" and "mrr" computed over all records.
    """

    def relevance_flags(record):
        # One boolean per returned result: does it match the expected document?
        expected_id = record["document"]
        hits = search_function(query=record["question"], course=record["course"])
        return [hit["id"] == expected_id for hit in hits]

    relevance_total = [relevance_flags(record) for record in tqdm(ground_truth)]

    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
    }

## Q1. Evaluating minsearch's text search

In [23]:
import minsearch

# Text index over the documents: "question", "section" and "text" are
# searched as text; "course" and "id" are registered as keyword fields
# (the search function in this section filters on "course").
index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"],
)

index.fit(documents)

<minsearch.minsearch.Index at 0x79f3fd5c39d0>

In [24]:
def minsearch_search(query, course):
    """Search the text index for ``query``, filtered to a single course.

    Args:
        query: The question text to search for.
        course: Value passed as the "course" filter.

    Returns:
        Whatever ``index.search`` returns for at most 5 results, using boost
        weights of 1.5 for "question" and 0.1 for "section".
    """
    field_weights = {"question": 1.5, "section": 0.1}
    return index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict=field_weights,
        num_results=5,
    )

In [25]:
# Score the text-search baseline on every ground-truth record.
evaluate(ground_truth, minsearch_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

## Evaluating minsearch's vector search

### Embeddings

In [59]:
from minsearch import VectorSearch

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [61]:
# Embed only the "question" field of each document.
texts = [doc["question"] for doc in documents]

# TF-IDF (min_df=3) followed by a 128-component truncated SVD; random_state
# is fixed so the embeddings are reproducible.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [62]:
# Vector index over the embeddings in X, with "course" available as a filter.
# keyword_fields is passed as a list (not a set literal) so the field order is
# deterministic and the call matches the minsearch.Index construction earlier
# in the notebook.
vindex = VectorSearch(keyword_fields=["course"])
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x79f3ea023fa0>

### Q2. Vector search for question

In [63]:
def evaluate_vector_search(ground_truth, search_function):
    """Score a vector search function against the ground-truth questions.

    Each question is embedded with the module-level ``pipeline`` (whichever
    one is currently fitted) before being handed to ``search_function``.

    Args:
        ground_truth: Iterable of records with "question", "course" and
            "document" keys; "document" holds the id of the expected result.
        search_function: Callable invoked as
            ``search_function(query=<vector>, course=...)`` that returns an
            iterable of result dicts, each with an "id" key.

    Returns:
        A dict with the "hit_rate" and "mrr" computed over all records.
    """

    def relevance_flags(record):
        expected_id = record["document"]
        question = record["question"]
        # transform() takes a batch, so wrap the single question in a list.
        question_vector = pipeline.transform([question])
        hits = search_function(query=question_vector, course=record["course"])
        return [hit["id"] == expected_id for hit in hits]

    relevance_total = [relevance_flags(record) for record in tqdm(ground_truth)]

    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
    }

In [64]:
def minsearch_vector_search(query, course):
    """Search the vector index with an embedded query, filtered to one course.

    Args:
        query: The query vector (already transformed by the caller).
        course: Value passed as the "course" filter.

    Returns:
        Whatever ``vindex.search`` returns for at most 5 results.
    """
    return vindex.search(
        query_vector=query,
        filter_dict={"course": course},
        num_results=5,
    )

In [65]:
# Score vector search over question-only embeddings on every ground-truth record.
evaluate_vector_search(ground_truth, minsearch_vector_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3572833369353793}

### Q3. Vector search for question and answer

In [71]:
# Embed each document's "question" and "text" fields, joined by one space.
texts = [doc["question"] + " " + doc["text"] for doc in documents]

# Same pipeline settings as for the question-only embeddings: TF-IDF
# (min_df=3) followed by a 128-component truncated SVD with a fixed seed.
# Note that this rebinds the module-level `pipeline` and `X`.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [72]:
# Rebuild the vector index on the question + text embeddings in X.
# keyword_fields is passed as a list (not a set literal) so the field order is
# deterministic and the call matches the minsearch.Index construction earlier
# in the notebook.
vindex = VectorSearch(keyword_fields=["course"])
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x79f3ea0625f0>

In [73]:
def evaluate_vector_search(ground_truth, search_function):
    """Score a vector search function against the ground-truth questions.

    This repeats the definition from the Q2 section. It reads the
    module-level ``pipeline`` at call time, so it embeds questions with
    whichever pipeline was fitted most recently.

    Args:
        ground_truth: Iterable of records with "question", "course" and
            "document" keys; "document" holds the id of the expected result.
        search_function: Callable invoked as
            ``search_function(query=<vector>, course=...)`` that returns an
            iterable of result dicts, each with an "id" key.

    Returns:
        A dict with the "hit_rate" and "mrr" computed over all records.
    """

    def relevance_flags(record):
        expected_id = record["document"]
        question = record["question"]
        # transform() takes a batch, so wrap the single question in a list.
        question_vector = pipeline.transform([question])
        hits = search_function(query=question_vector, course=record["course"])
        return [hit["id"] == expected_id for hit in hits]

    relevance_total = [relevance_flags(record) for record in tqdm(ground_truth)]

    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
    }

In [74]:
def minsearch_vector_search(query, course):
    """Search the current vector index with an embedded query for one course.

    Args:
        query: The query vector (already transformed by the caller).
        course: Value passed as the "course" filter.

    Returns:
        Whatever ``vindex.search`` returns for at most 5 results.
    """
    course_filter = {"course": course}
    return vindex.search(
        query_vector=query,
        filter_dict=course_filter,
        num_results=5,
    )

In [75]:
# Score vector search over question + text embeddings on every ground-truth record.
evaluate_vector_search(ground_truth, minsearch_vector_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

## Q4. Qdrant