In [1]:
import json

# Load the documents with ids
with open("documents-with-ids.json", "rt") as f:
    documents = json.load(f)

In [2]:
# Elastic search
from elasticsearch import Elasticsearch

es_client = Elasticsearch("http://localhost:9200")

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    },
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [3]:
# Index the document
from tqdm.auto import tqdm

for document in tqdm(documents):
    es_client.index(index=index_name, document=document)

  0%|          | 0/948 [00:00<?, ?it/s]

In [4]:
def elastic_search(query, course):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                "filter": {"term": {"course": course}},
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    result_docs = []

    for hit in response["hits"]["hits"]:
        result_docs.append(hit["_source"])

    return result_docs

In [5]:
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp",
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [6]:
import pandas as pd

df_gt = pd.read_csv("ground-truth-data.csv")
df_gt.head(10)

Unnamed: 0,question,course,document
0,Can I enroll in the course after it has alread...,data-engineering-zoomcamp,c02e79ef
1,Will I still be able to submit homework if I j...,data-engineering-zoomcamp,c02e79ef
2,Are there any deadlines for the final projects...,data-engineering-zoomcamp,c02e79ef
3,Can I begin the course after the official star...,data-engineering-zoomcamp,c02e79ef
4,Is it possible to join the course post-start d...,data-engineering-zoomcamp,c02e79ef
5,Can I enroll in the course after it has alread...,data-engineering-zoomcamp,1f6520ca
6,Will I still be able to submit homework if I j...,data-engineering-zoomcamp,1f6520ca
7,Are there any deadlines for the final projects...,data-engineering-zoomcamp,1f6520ca
8,Can I begin the course after the official star...,data-engineering-zoomcamp,1f6520ca
9,Is it possible to join the course post-start d...,data-engineering-zoomcamp,1f6520ca


In [7]:
ground_truth = df_gt.to_dict(orient="records")

In [8]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q["document"]
    results = elastic_search(query=q["question"], course=q["course"])
    relevance = [d["id"] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4735 [00:00<?, ?it/s]

In [9]:
relevance_total

[[False, False, False, False, False],
 [False, False, False, True, False],
 [False, False, True, False, False],
 [False, False, True, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, True, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, F

In [10]:
example = [
    [False, False, False, False, False],
    [False, False, False, True, False],
    [False, False, True, False, False],
    [False, False, True, False, False],
    [False, False, True, False, False],
    [False, False, False, False, False],
    [False, False, False, False, False],
    [False, True, False, False, False],
    [False, False, False, False, False],
    [False, False, False, False, False],
    [False, True, False, False, False],
    [True, False, False, False, False],
    [True, False, False, False, False],
    [True, False, False, False, False],
]

In [11]:
def hit_rate(data):
    cnt = 0
    for point in data:
        cnt += sum(point)
    return cnt / len(data)


hit_rate(example)

0.6428571428571429

In [12]:
def mrr(data):
    total_score = 0.0
    mean_score = 0.0
    for point in data:
        for rank in range(len(point)):
            if point[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(data)

In [13]:
mrr(example)

0.375

In [14]:
hit_rate(relevance_total)

0.01583949313621964

In [15]:
mrr(relevance_total)

0.007233368532206969

### Search using Minsearch

In [16]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"], keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.Index at 0x792d1d734560>

In [17]:
def minsearch_search(query, course):
    boost = {"question": 3.0, "section": 0.5}

    results = index.search(
        query=query, filter_dict={"course": course}, boost_dict=boost, num_results=5
    )

    return results

In [18]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q["document"]
    results = minsearch_search(query=q["question"], course=q["course"])
    relevance = [d["id"] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4735 [00:00<?, ?it/s]

In [19]:
hit_rate(relevance_total), mrr(relevance_total)

(0.01583949313621964, 0.007233368532206968)

In [20]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q["document"]
        results = search_function(q)
        relevance = [d["id"] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
    }

In [21]:
evaluate(ground_truth, lambda q: elastic_search(q["question"], q["course"]))

  0%|          | 0/4735 [00:00<?, ?it/s]

{'hit_rate': 0.01583949313621964, 'mrr': 0.007233368532206969}

In [22]:
evaluate(ground_truth, lambda q: minsearch_search(q["question"], q["course"]))

  0%|          | 0/4735 [00:00<?, ?it/s]

{'hit_rate': 0.01583949313621964, 'mrr': 0.007233368532206968}