In [1]:
import json

with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [2]:

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer

# Connect to Qdrant running locally
qdrant_client = QdrantClient(host="localhost", port=6333)

collection_name = "course-questions"

# Recreate the collection
qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")


  qdrant_client.recreate_collection(


In [4]:

from uuid import uuid4

# Convert and upload documents
points = []
for doc in documents:
    vector = model.encode(doc["text"]).tolist()
    points.append(
        PointStruct(
            id=str(uuid4()),
            vector=vector,
            payload=doc
        )
    )

# Upload points to Qdrant
qdrant_client.upsert(collection_name=collection_name, points=points)


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [5]:
from qdrant_client.http.models import Filter, FieldCondition, MatchValue, SearchParams

def qdrant_search(query, course, model, qdrant_client, collection_name):
    # Convert query to vector
    query_vector = model.encode(query).tolist()

    # Define filter for course
    course_filter = Filter(
        must=[
            FieldCondition(
                key="course",
                match=MatchValue(value=course)
            )
        ]
    )

    # Perform the search
    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=5,
        query_filter=course_filter,
        search_params=SearchParams(hnsw_ef=128)  # Optional: improves accuracy
    )

    # Extract payloads from search result
    result_docs = [hit.payload for hit in search_result]

    return result_docs


In [7]:
qdrant_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp",
    model=model,
    qdrant_client=qdrant_client,
    collection_name=collection_name
)

  search_result = qdrant_client.search(


[{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
  'section': 'General course-related questions',
  'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
  'course': 'data-engineering-zoomcamp',
  'id': 'cb257ee5'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'id': 'a482086d'},
 {'text': "You 

In [8]:
import pandas as pd

In [9]:
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [10]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [12]:
from tqdm import tqdm

relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = qdrant_search(query=q['question'], course=q['course'], model=model, qdrant_client=qdrant_client, collection_name=collection_name)
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  search_result = qdrant_client.search(
100%|██████████| 317/317 [00:06<00:00, 46.74it/s]


In [13]:
example = [
    [True, False, False, False, False], # 1, 
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0 
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0 
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1 
    [False, False, True, False, False],  # 1/3
    [False, False, False, False, False], # 0
]

# 1 => 1
# 2 => 1 / 2 = 0.5
# 3 => 1 / 3 = 0.3333
# 4 => 0.25
# 5 => 0.2
# rank => 1 / rank
# none => 0

In [14]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [15]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [16]:
hit_rate(example)

0.5833333333333334

In [17]:
mrr(example)

0.5277777777777778

- hit-rate (recall)
- Mean Reciprocal Rank (mrr)

In [18]:
hit_rate(relevance_total), mrr(relevance_total)

(0.807570977917981, 0.6714511041009464)

In [19]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x7460c1a70320>

In [20]:
def minsearch_search(query, course):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [21]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 317/317 [00:01<00:00, 272.72it/s]


In [22]:
hit_rate(relevance_total), mrr(relevance_total)

(0.7066246056782335, 0.5789695057833859)

Compare with Qdrant results:
```
(0.807570977917981, 0.6714511041009464)
```

In [23]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [24]:
evaluate(ground_truth, lambda q: qdrant_search(q['question'], q['course'], model=model, qdrant_client=qdrant_client, collection_name=collection_name))

  search_result = qdrant_client.search(
100%|██████████| 317/317 [00:06<00:00, 47.78it/s]


{'hit_rate': 0.807570977917981, 'mrr': 0.6714511041009464}

In [25]:
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

100%|██████████| 317/317 [00:01<00:00, 292.91it/s]


{'hit_rate': 0.7066246056782335, 'mrr': 0.5789695057833859}