In [1]:
import json
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
import pandas as pd
import minsearch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the documents to index. Later cells read an 'id' field from each one.
# JSON files are UTF-8 by specification, so the encoding is pinned instead of
# relying on the platform default (which is not UTF-8 on every OS).
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

### Index documents with ID using ElasticSearch

In [3]:
# Client for an Elasticsearch node; assumes one is reachable at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

# Index definition: a single shard with no replicas, plus an explicit mapping.
# The free-text fields are mapped as `text`; `course` and `id` are mapped as
# `keyword` (`course` is used in a `term` filter by elastic_search).
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    }
}

index_name = "course-questions"

# Drop any existing index before creating it so the cell can be re-run.
# NOTE(review): ignore_unavailable=True is presumably what keeps the delete
# from failing when the index does not exist yet — confirm against the client docs.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [4]:
# Index every document with its own request; tqdm shows progress.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

# Elasticsearch only makes newly indexed documents searchable after a refresh,
# which by default happens periodically rather than on every write. Force one
# here so the search cells that follow see all documents even when the
# notebook is executed with "Restart & Run All".
# The trailing semicolon keeps the API response out of the cell output.
es_client.indices.refresh(index=index_name);

100%|███████████████████████████████████████████████████████████████████████| 948/948 [00:02<00:00, 375.30it/s]


### Setup ES Search
Define an Elasticsearch search function that takes a query string and a course filter.

In [5]:
def elastic_search(query, course):
    """Search the Elasticsearch index for `query`, restricted to one course.

    Args:
        query: Free-text question to match against the indexed documents.
        course: Course identifier; only documents whose `course` field equals
            this value are returned.

    Returns:
        A list with the `_source` dict of each hit (at most 5, as requested
        via `size`), in the order Elasticsearch returned them.
    """
    # Text relevance: match the query against three fields, with `question`
    # weighted 3x via the `^3` suffix.
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields"
        }
    }

    # Exact-match restriction on the course.
    course_filter = {
        "term": {
            "course": course
        }
    }

    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": text_match,
                "filter": course_filter
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents, dropping scores and other hit metadata.
    return [hit['_source'] for hit in response['hits']['hits']]

Example using ES Search

In [6]:
# Sample call: one question, restricted to a single course.
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

### ES Search evaluation using ground-truth data

For each generated question in ground truth data, invoke ES Search using the question and its course, and store the 5 search results.

Each search result contains a document ID, which is compared with the document ID recorded in the ground-truth data. If they match, the result is relevant (`True`, counted as 1); otherwise it is not (`False`, counted as 0).

In [7]:
# Ground-truth set; later cells read the 'question', 'course' and 'document'
# fields of each row.
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [8]:
# One dict per CSV row, so each ground-truth entry can be accessed by column name.
ground_truth = df_ground_truth.to_dict(orient='records')

In [9]:
# Relevance flags for Elasticsearch: one list of booleans per ground-truth
# question, in the order the search results were returned.
relevance_total = []

for q in tqdm(ground_truth):
    # ID of the document this question is expected to retrieve.
    doc_id = q['document']
    results = elastic_search(query=q['question'], course=q['course'])
    # True where a returned document is the expected one, False elsewhere.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|█████████████████████████████████████████████████████████████████████| 4627/4627 [00:14<00:00, 323.27it/s]


### Hit Rate (HR) or Recall at k:

* Measures the proportion of queries for which at least one relevant document is retrieved in the top k results.
* Formula: ```HR@k = (Number of queries with at least one relevant document in top k) / |Q|```

### Mean Reciprocal Rank (MRR):

* Evaluates the rank position of the first relevant document.
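* Formula: ```MRR = (1 / |Q|) * Σ (1 / rank_i) for i = 1 to |Q|``` or $\displaystyle \text{MRR} = \frac{1}{|Q|} \sum_{i=1}^{|Q|} \frac{1}{\text{rank}_i}$, where $\text{rank}_i$ is the position of the first relevant document for query $i$ (the term is 0 if no relevant document is retrieved).
* Example: if the first `True` among the 5 search results for question $i$ is at position 3, then question $i$ contributes $\frac{1}{3}$.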

Example for calculating HR and MRR of a set of evaluation results.

In [10]:
# Hand-built relevance matrix: one row per query, one flag per result in rank
# order. The trailing comment on each row is that row's hit-rate contribution
# (1 if the row contains a True, else 0).
example = [
    [True, False, False, False, False], # 1
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [False, False, True, False, False],  # 1
    [False, False, False, False, False], # 0
]

# MRR contribution of a row, by the position of its True:
# 1 => 1
# 2 => 1 / 2 = 0.5
# 3 => 1 / 3 = 0.3333
# 4 => 1 / 4 = 0.25
# 5 => 1 / 5 = 0.2
# in general: rank => 1 / rank
# no True in the row => 0

In [11]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: Sequence of per-query relevance lists. Each inner
            list holds one boolean flag per retrieved result, in rank order.

    Returns:
        The hit rate as a float in [0.0, 1.0]. An empty `relevance_total`
        yields 0.0 rather than raising ZeroDivisionError.
    """
    # No queries means nothing was hit; avoid dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    # A query counts as a hit as soon as any of its results is relevant.
    hits = sum(1 for line in relevance_total if any(line))

    return hits / len(relevance_total)

In [12]:
def mrr(relevance_total):
    """Return the Mean Reciprocal Rank over a set of queries.

    For each query, only the first relevant result counts: it contributes
    1 / rank (rank starting at 1). A query with no relevant result
    contributes 0.

    Args:
        relevance_total: Sequence of per-query relevance lists. Each inner
            list holds one boolean flag per retrieved result, in rank order.

    Returns:
        The MRR as a float in [0.0, 1.0]. An empty `relevance_total` yields
        0.0 rather than raising ZeroDivisionError.
    """
    # No queries means no score; avoid dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score = total_score + 1 / rank
                # MRR uses the rank of the FIRST relevant result only. Without
                # this break, a row with several relevant results would be
                # counted more than once and the score could exceed 1.
                break

    return total_score / len(relevance_total)

In [13]:
# 7 of the 12 example rows contain a True, so the expected value is 7 / 12.
hit_rate(example)

0.5833333333333334

In [14]:
# Six rows hit at rank 1 and one at rank 3, so the expected value is (6 + 1/3) / 12.
mrr(example)

0.5277777777777778

### Evaluating Hit Rate and MRR on ES Search

In [15]:
# (hit rate, MRR) for the Elasticsearch relevance flags collected earlier.
hit_rate(relevance_total), mrr(relevance_total)

(0.7395720769397017, 0.6029788920106625)

### Setup MinSearch

In [16]:
# In-memory MinSearch index over the same documents, configured with the same
# field split as the Elasticsearch mapping: three text fields, two keyword fields.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

# Build the index from the loaded documents.
index.fit(documents)

<minsearch.Index at 0x17cb432c0>

In [17]:
def minsearch_search(query, course):
    """Search the MinSearch index for `query`, restricted to one course.

    Args:
        query: Free-text question to search for.
        course: Course identifier passed as the `course` filter.

    Returns:
        Whatever `index.search` returns for a request of 5 results, with
        boost weights of 3.0 for `question` and 0.5 for `section`.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5
    )

### Evaluating Hit Rate and MRR on MinSearch

In [18]:
# Relevance flags for MinSearch. This reassigns `relevance_total`, so the
# Elasticsearch flags collected earlier are no longer available under that name.
relevance_total = []

for q in tqdm(ground_truth):
    # ID of the document this question is expected to retrieve.
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    # True where a returned document is the expected one, False elsewhere.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|█████████████████████████████████████████████████████████████████████| 4627/4627 [00:07<00:00, 640.15it/s]


In [19]:
# (hit rate, MRR) for MinSearch; `relevance_total` now holds the MinSearch flags.
hit_rate(relevance_total), mrr(relevance_total)

(0.7722066133563864, 0.661454506159499)

Compare with ES Search results:
    
(0.7395720769397017, 0.6029788920106625)

### Setup evaluate()
Accepts a search function and returns the Hit Rate and MRR results.

In [20]:
def evaluate(ground_truth, search_function):
    """Run a search function over the ground truth and score its results.

    Args:
        ground_truth: Iterable of records; each must have a 'document' key
            holding the ID of the document the record should retrieve.
        search_function: Callable that takes one ground-truth record and
            returns an iterable of result dicts, each with an 'id' key.

    Returns:
        A dict with two entries: 'hit_rate' and 'mrr', computed over the
        per-record relevance flags.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        hits = search_function(record)
        # One flag per returned result: does it match the expected document?
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

In [21]:
# Score Elasticsearch; the lambda unpacks a ground-truth record into the
# (query, course) arguments elastic_search expects.
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

100%|█████████████████████████████████████████████████████████████████████| 4627/4627 [00:15<00:00, 301.59it/s]


{'hit_rate': 0.7395720769397017, 'mrr': 0.6029788920106625}

In [22]:
# Score MinSearch; the lambda unpacks a ground-truth record into the
# (query, course) arguments minsearch_search expects.
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

100%|█████████████████████████████████████████████████████████████████████| 4627/4627 [00:07<00:00, 619.47it/s]


{'hit_rate': 0.7722066133563864, 'mrr': 0.661454506159499}