In [95]:
import json

# Load the FAQ documents from the JSON file. The encoding is given explicitly
# so decoding does not depend on the platform's locale default.
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [96]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch instance addressed at http://localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

# Index configuration: one shard, no replicas, and an explicit mapping for
# every field of a document.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # "text" fields: elastic_search() runs its multi_match query over these three.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # "keyword" fields: elastic_search() applies a term filter on "course";
            # "id" is compared with the ground-truth document id during evaluation.
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    }
}

index_name = "course-questions"

# Delete the index first and then create it, so this cell can be re-run.
# ignore_unavailable=True presumably keeps the delete from failing when the
# index does not exist yet - confirm against the client documentation.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [97]:
from tqdm.auto import tqdm

# Send the documents to the index one request at a time; tqdm shows progress.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|███████████████████████████| 948/948 [00:02<00:00, 390.53it/s]


In [156]:
def elastic_search(query, course, num_results=5):
    """Search the Elasticsearch index and return the matching documents.

    Runs a ``multi_match`` query (type ``best_fields``) over the ``question``
    (boosted x3), ``text`` and ``section`` fields, restricted by a ``term``
    filter on ``course``. Uses the module-level ``es_client`` and
    ``index_name``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field of a hit must match.
        num_results: Maximum number of hits to request. Defaults to 5, the
            value that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        A list with the ``_source`` dict of every hit, in the order the
        response lists them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; scores and other hit metadata are dropped.
    return [hit['_source'] for hit in response['hits']['hits']]

In [99]:
# Manual check: run one sample question against a single course and
# display the returned documents.
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [100]:
import  pandas as pd

In [101]:
# Read the ground-truth file (comma-separated) with pandas' Python parsing engine.
# NOTE(review): the NaN check further down shows row 2156 arriving without
# course/document while row 2157 holds what looks like the rest of the same
# question - presumably a question containing a line break; confirm against the CSV.
df_ground_truth = pd.read_csv(
    'ground-truth-data.csv',
    sep=',',
    engine='python'
)


In [102]:
# Preview the first 10 rows of the ground-truth table.
df_ground_truth.head(10)

Unnamed: 0,questions,course,document
0,What is the specific date and time when the co...,data-engineering-zoomcamp,c02e79ef
1,How can I stay updated with course announcemen...,data-engineering-zoomcamp,c02e79ef
2,Is there a registration process required befor...,data-engineering-zoomcamp,c02e79ef
3,What platform should I use to access the cours...,data-engineering-zoomcamp,c02e79ef
4,Where can I find the link to register for the ...,data-engineering-zoomcamp,c02e79ef
5,What specific skills or knowledge do I need be...,data-engineering-zoomcamp,1f6520ca
6,Can you point me to where I can find the requi...,data-engineering-zoomcamp,1f6520ca
7,Are there any prior courses or experiences nec...,data-engineering-zoomcamp,1f6520ca
8,Is there a resource that outlines the prerequi...,data-engineering-zoomcamp,1f6520ca
9,What should I have completed before I start th...,data-engineering-zoomcamp,1f6520ca


In [103]:
# Show every row in which questions, course or document is missing.
df_ground_truth[df_ground_truth[['questions', 'course', 'document']].isna().any(axis=1)]

Unnamed: 0,questions,course,document
2156,Can you explain the significance of the,,


In [107]:
# Inspect the incomplete row (2156) together with the rows on either side of it.
df_ground_truth.loc[2155:2157]

Unnamed: 0,questions,course,document
2155,What kind of error might occur when running a ...,data-engineering-zoomcamp,d452b490
2156,Can you explain the significance of the,data-engineering-zoomcamp,d452b490
2157,character in the context of executing Python ...,data-engineering-zoomcamp,d452b490


In [108]:
# Row 2156 is parsed without course/document (see the NaN check above), while
# rows 2155 and 2157 on either side both carry this course and document id.
# Fill the gap in code so a fresh "Restart & Run All" yields complete data.
# A single .loc[row, columns] assignment is used because the chained form
# df.loc[2156]['course'] = ... may write to a temporary copy.
df_ground_truth.loc[2156, ['course', 'document']] = ['data-engineering-zoomcamp', 'd452b490']

In [109]:
#df_ground_truth.loc[2156]['document'] = 'd452b490'

In [110]:
# The evaluation code reads each record's 'question' key, so rename the column.
# Rebinding the name (rather than inplace=True) keeps the step a plain
# expression that can be chained and re-run.
df_ground_truth = df_ground_truth.rename(columns={'questions': 'question'})

In [111]:
# Convert to a list of per-question records. A record without a course or a
# document id cannot be searched or scored, so such rows are dropped first;
# this is a no-op when the table is already complete.
ground_truth = (
    df_ground_truth
    .dropna(subset=['course', 'document'])
    .to_dict(orient='records')
)

In [130]:
# For every ground-truth question, run the Elasticsearch query and record,
# per returned document, whether its id equals the expected document id.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_search(query=q['question'], course=q['course'])
    # One boolean per returned document, in result order.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|█████████████████████████| 4736/4736 [00:08<00:00, 543.91it/s]


In [131]:
# Relevance flags of the last question processed by the loop above.
relevance

[False, False, True, False, False]

In [132]:
#relevance_total

In [133]:
# Small hand-written relevance matrix: one row per query, one boolean per
# returned result (True marks the expected document).
example = [
    [False, False, True, False, False],
    [False, False, False, False, False],
    [False, False, False, False, False],
    [False, False, False, False, False],
    [False, False, False, False, False],
    [False, False, True, False, False],
    [False, False, False, False, False],
    [True, False, False, False, False],
    [True, False, False, False, False],
    [False, False, False, True, False],
    [False, False, False, False, True],
    [False, True, False, False, False],
    [False, False, False, False, False],
]

In [134]:
# Display the sample relevance matrix.
example

[[False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, True, False],
 [False, False, False, False, True],
 [False, True, False, False, False],
 [False, False, False, False, False]]

In [135]:
def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of truthy/falsy flags, one per returned result.

    Returns:
        Number of queries containing at least one truthy flag divided by the
        number of queries, or 0.0 when there are no queries.
    """
    total = len(relevance_total)
    if total == 0:
        # No queries to score; avoid dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / total

In [136]:
def mrr(relevance_total):
    """Return the mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of truthy/falsy flags, one per returned result, in rank
            order.

    Returns:
        The average over all queries of ``1 / rank`` of the first truthy flag
        (rank starts at 1); a query with no truthy flag contributes 0.
        Returns 0.0 when there are no queries.
    """
    total = len(relevance_total)
    if total == 0:
        # No queries to score; avoid dividing by zero.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for position, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / position
                # Only the first relevant result counts; without this break a
                # query with several relevant results would add more than one
                # reciprocal rank and the score could exceed 1.
                break
    return total_score / total

In [137]:
# Hit rate and MRR for the relevance flags collected from the Elasticsearch run.
hit_rate(relevance_total), mrr(relevance_total)

(0.6342905405405406, 0.4898754222972975)

In [144]:
import minsearch

# Build a minsearch index over the same documents: the same three text fields
# and two keyword fields that the Elasticsearch mapping declares.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x7d7782f408f0>

In [145]:
def minsearch_search(query, course):
    """Query the module-level minsearch ``index`` for one course.

    The search is filtered on ``course``, weights ``question`` by 3.0 and
    ``section`` by 0.5, and asks for 5 results.

    Args:
        query: Free-text question to search for.
        course: Value used for the ``course`` filter.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
    )

In [146]:
# Same relevance collection as for Elasticsearch, this time using minsearch.
# relevance_total is rebuilt from scratch, replacing the earlier results.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    # One boolean per returned document, in result order.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|█████████████████████████| 4736/4736 [00:13<00:00, 346.73it/s]


In [147]:
# Hit rate and MRR for the relevance flags collected from the minsearch run.
hit_rate(relevance_total), mrr(relevance_total)

(0.7065033783783784, 0.5795221002252254)

Compare with elastic_search:
```
(0.6342905405405406, 0.4898754222972975)
```

In [148]:
# Difference in hit rate: minsearch minus Elasticsearch, with both values
# copied by hand from the outputs above.
0.706503378378378-0.6342905405405406

0.07221283783783738

In [149]:
def evaluate(ground_truth, search_function):
    """Score a search function against ground-truth records.

    Args:
        ground_truth: Iterable of records; each record's ``'document'`` entry
            is the id of the document expected for that record.
        search_function: Callable that takes one record and returns an
            iterable of documents, each exposing an ``'id'`` entry.

    Returns:
        A dict with the keys ``'hit_rate'`` and ``'mrr'`` computed over all
        records.
    """
    def _relevance_flags(record):
        # Look up the expected id before searching, then flag each result.
        expected_id = record['document']
        return [found['id'] == expected_id for found in search_function(record)]

    flags_per_query = [_relevance_flags(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(flags_per_query),
        'mrr': mrr(flags_per_query),
    }

In [157]:
# Evaluate Elasticsearch: the lambda adapts a ground-truth record to the
# (query, course) arguments elastic_search expects.
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

100%|█████████████████████████| 4736/4736 [00:08<00:00, 542.53it/s]


{'hit_rate': 0.6342905405405406, 'mrr': 0.4898754222972975}

In [153]:
# Evaluate minsearch: the lambda adapts a ground-truth record to the
# (query, course) arguments minsearch_search expects.
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

100%|█████████████████████████| 4736/4736 [00:13<00:00, 349.94it/s]


{'hit_rate': 0.7065033783783784, 'mrr': 0.5795221002252254}