# Index documents with IDs

In [5]:
# load created document with ids

import json 

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform's default text encoding, which differs between systems.
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [6]:
# index documents 

from elasticsearch import Elasticsearch 

# Client for the Elasticsearch node at localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

index_name = "course-questions"

# One shard and no replicas. "text", "section" and "question" are mapped as
# "text"; "course" and the added "id" field are mapped as "keyword".
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    },
}

# Drop any existing index of the same name first so re-running this cell
# starts from an empty index; a missing index is not treated as an error.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

from tqdm import tqdm

# Send the loaded documents to the index one request at a time, with a
# tqdm progress bar over the whole collection.
for doc in tqdm(documents):
    es_client.index(document=doc, index=index_name)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:05<00:00, 179.24it/s]


# Search query 

In [8]:
def elastic_search(query, course):
    """Search the course index and return the stored documents of the hits.

    The request asks for at most five hits. It combines a ``multi_match``
    (``best_fields``) over ``question``, ``text`` and ``section`` -- with the
    ``question`` field given a ``^3`` boost -- and a ``term`` filter on
    ``course``. The module-level ``es_client`` and ``index_name`` are used
    for the call.

    Args:
        query: Text to match against the indexed fields.
        course: Value the ``course`` field of a hit must equal.

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields"
        }
    }
    course_filter = {"term": {"course": course}}
    request_body = {
        "size": 5,
        "query": {"bool": {"must": text_match, "filter": course_filter}},
    }

    response = es_client.search(index=index_name, body=request_body)

    return [hit['_source'] for hit in response['hits']['hits']]

In [10]:
# example 
# Same arguments passed positionally: (query, course).
elastic_search(
    "I just discovered the course. Can I still join?",
    "data-engineering-zoomcamp",
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

# Iterate over ground truth data queries to find matches

In [11]:
import pandas as pd 

In [14]:
# Load the ground-truth CSV and preview its first row.
df = pd.read_csv(filepath_or_buffer='ground-truth-data.csv')
df.head(n=1)

Unnamed: 0,question,course,document
0,What is the start date and time for the course?,data-engineering-zoomcamp,c02e79ef


In [15]:
# Despite the name, this is a list holding one dict per row of `df`
# (that is what orient='records' produces).
ground_truth_dict = df.to_dict(orient='records')

# Preview only the first few records; displaying the whole list would dump
# every row into the notebook output.
ground_truth_dict[:5]

[{'question': 'What is the start date and time for the course?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': "How can I access the live 'Office Hours' session at the beginning of the course?",
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'Is there a specific platform I need to subscribe to for course updates?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What is the registration process before the course begins?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'Where can I find the Telegram channel for course announcements?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What should I have experience with before enrolling in this course?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520ca'},
 {'question': 'Are there any specific skills or tools I need to know for this course?',
  'course'

In [26]:
# One entry per ground-truth query: a list of booleans, in result order,
# telling whether each returned document is the expected one.
relevance_total = []

for q in tqdm(ground_truth_dict):
    doc_id = q['document']  # ID of the document this question should retrieve
    results = elastic_search(query=q['question'], course=q['course'])

    # Compare every returned document's ID with the expected ID.
    relevance_total.append([d['id'] == doc_id for d in results])


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 4735/4735 [00:37<00:00, 124.72it/s]


# Metrics

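- Hit Rate (recall@k): the fraction of queries for which the expected document appears anywhere in the top-k results (k = 5 here); a higher hit rate means the relevant document is found for more queries
- Mean Reciprocal Rank (MRR): the average, over all queries, of 1/rank of the first relevant result (0 when no relevant result is returned); a higher MRR means relevant documents appear earlier in the results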


In [27]:
def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: One list per query, holding a truthy value for each
            returned document that is relevant and a falsy value otherwise.

    Returns:
        The number of per-query lists containing at least one truthy entry,
        divided by the number of queries. Returns 0.0 when there are no
        queries, so an empty evaluation set does not raise ZeroDivisionError.
    """
    if not relevance_total:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)

In [28]:
def mrr(relevance_total):
    """Return the mean reciprocal rank of the first relevant result.

    Args:
        relevance_total: One list per query, in ranking order, holding a
            truthy value for each returned document that is relevant and a
            falsy value otherwise.

    Returns:
        The average over all queries of ``1 / rank`` of the first relevant
        entry (ranks start at 1); a query with no relevant entry contributes
        0. Returns 0.0 when there are no queries.
    """
    if not relevance_total:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a
                # list with several relevant entries would score above 1.
                break

    return total_score / len(relevance_total)

In [40]:
# Report both metrics for the relevance lists collected above, rounded to
# three decimal places.
print(f"hit rate: {round(hit_rate(relevance_total), 3)}")
print(f"mrr: {round(mrr(relevance_total), 3)}")

hit rate: 0.646
mrr: 0.495


In [42]:
def evaluate(ground_truth_dict, search_function):
    """Run a search function over the ground truth and return its metrics.

    Args:
        ground_truth_dict: Iterable of records; each record's 'document'
            entry is the ID of the document the search is expected to return.
        search_function: Callable that takes one ground-truth record and
            returns a list of result documents, each with an 'id' entry.

    Returns:
        A dict with the unrounded 'hit_rate' and 'mrr' values, so the caller
        can display, compare or store them rather than only seeing printed
        text.
    """
    relevance_total = []

    for q in tqdm(ground_truth_dict):
        doc_id = q['document']
        results = search_function(q)
        # Booleans in result order: is this result the expected document?
        relevance_total.append([d['id'] == doc_id for d in results])

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [44]:
# Adapt elastic_search to the one-argument interface evaluate() expects.
evaluate(
    ground_truth_dict,
    lambda q: elastic_search(query=q['question'], course=q['course']),
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 4735/4735 [00:23<00:00, 204.58it/s]


{'hit_rate': 0.6458289334741288, 'mrr': 0.4953220696937697}