In [None]:
import json

# ground_truth.ipynb produced this dataset: the documents with ids on them.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default locale encoding.
with open('doc_with_ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [None]:
# Peek at the first five loaded documents.
documents[:5]

### With Elasticsearch

In [None]:
pip install elasticsearch

In [None]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch HTTP endpoint at localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

In [None]:
# Index definition: one shard, no replicas. The free-text fields are mapped as
# `text`; `course` and `id` are `keyword` fields (exact-match values).
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any previous copy first so that re-running this cell starts from a clean index.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

# Populate the index. Without this step the freshly created index stays empty
# and every search against it returns no hits.
for doc in documents:
    es_client.index(index=index_name, document=doc)

# Refresh so the documents indexed above are visible to the searches that follow.
es_client.indices.refresh(index=index_name)

In [None]:
from tqdm.auto import tqdm

In [None]:
# Added course parameter
def elastic_search(query, course):
    """Search the Elasticsearch index for `query`, restricted to one course.

    Args:
        query: free-text question to look up.
        course: value the `course` field of a hit must equal.

    Returns:
        A list with the `_source` dict of each hit (at most 5 are requested).
    """
    # Full-text part: match across three fields, with `question` boosted 3x.
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields"
        }
    }
    # Exact-match restriction on the course field.
    course_filter = {"term": {"course": course}}

    request_body = {
        "size": 5,
        "query": {
            "bool": {
                "must": text_match,
                "filter": course_filter
            }
        }
    }

    response = es_client.search(index=index_name, body=request_body)

    # Keep only the stored document of each hit.
    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
# Smoke test: one sample question, restricted to a single course.
elastic_search(
    query= "Can I still join the course?",
    course="data-engineering-zoomcamp"
)

Now we iterate over every query in the ground-truth data and check whether the search returns the document the query was generated from.

In [None]:
import pandas as pd

In [None]:
# Ground-truth set; the evaluation loops below read its 'question', 'course'
# and 'document' columns.
df_gt = pd.read_csv('ground-truth-data.csv')

In [None]:
# orient='records' yields a list with one dict per row (a list, despite the name).
gt_dict = df_gt.to_dict(orient='records')

In [None]:
# Relevance matrix for Elasticsearch: one row per ground-truth query, one
# boolean per returned result.
relevance_total = []

for q in tqdm(gt_dict):
    # Id of the document this ground-truth question was generated for.
    doc_id = q['document']
    # Run the generated question through Elasticsearch, filtered to its course.
    results = elastic_search(query=q['question'], course=q['course'])
    # A result is relevant when its id equals the ground-truth document id.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)


In [None]:
# Hand-written relevance matrix used to sanity-check the metrics below:
# 14 rows (queries) x 5 positions; True marks where the relevant document appeared.
example = [[True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [False, False, False, False, True]]

### Evaluating the model

In [None]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the relevant document.

    Args:
        relevance_total: list with one entry per query; each entry is a list of
            booleans, True where the returned result matched the ground truth.

    Returns:
        Number of rows containing a True divided by the number of rows,
        or 0.0 when `relevance_total` is empty.
    """
    # Nothing evaluated: report 0.0 instead of dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

In [None]:
# 9 of the 14 example rows contain a True, so the expected value is 9/14 (about 0.643).
hit_rate(example)

In [None]:
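# MRR (Mean Reciprocal Rank): each query scores 1/rank of its relevant document
# (0 if it is not among the results); the metric is the mean over all queries.

# rank 1 -> 1/1 = 1
# rank 2 -> 1/2 = 0.5
# rank 3 -> 1/3 = 0.333
# rank 4 -> 1/4 = 0.25
# rank 5 -> 1/5 = 0.2
# rank r -> 1/r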

In [None]:
def mrr(relevance_total):
    """Mean Reciprocal Rank over a list of per-query relevance lists.

    Args:
        relevance_total: list with one entry per query; each entry is a list of
            booleans ordered by rank, True where the result is relevant.

    Returns:
        The mean over all queries of 1/rank of the first relevant result
        (a query with no relevant result contributes 0), or 0.0 when
        `relevance_total` is empty.
    """
    # Nothing evaluated: report 0.0 instead of dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                score += 1 / rank
                # Only the first relevant position counts; without this a row
                # with several True values would score more than 1/rank.
                break

    return score / len(relevance_total)

In [None]:
# Sanity-check mrr on the hand-written example matrix.
mrr(example)

### With minsearch

In [None]:
# Download minsearch.py into the working directory so `import minsearch` can find it.
# NOTE(review): this pulls from the `main` branch, so the code fetched can change between runs.
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [None]:
import minsearch

# minsearch index over the same documents: three free-text fields to search on,
# plus `course` and `id` as keyword fields (`course` is used for filtering below).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

In [None]:
def minsearch_search(query, course):
    """Search the minsearch index for `query`, restricted to one course.

    Args:
        query: free-text question to look up.
        course: value the `course` keyword field of a result must match.

    Returns:
        Whatever `index.search` returns for up to 5 results.
    """
    # `question` is boosted 3x and `section` weighted 0.5x; `text` is not listed.
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        # Filter on the `course` argument. Previously this read the global loop
        # variable `q`, so the function only worked inside the evaluation loop.
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [None]:
# Relevance matrix for minsearch: one row per ground-truth query, one boolean
# per returned result. This rebinds `relevance_total`.
relevance_total = []

for q in tqdm(gt_dict):
    # Id of the document this ground-truth question was generated for.
    doc_id = q['document']
    # Run the generated question through minsearch, filtered to its course.
    results = minsearch_search(query=q['question'], course=q['course'])
    # A result is relevant when its id equals the ground-truth document id.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)


In [None]:
# Hit rate of the most recently built `relevance_total` (the minsearch run above).
hit_rate(relevance_total)

In [None]:
# MRR of the most recently built `relevance_total` (the minsearch run above).
mrr(relevance_total)

In [None]:
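# Elasticsearch:
# Hit rate: 0.6428571428571429
# Mrr: 0.4845238095238095
# NOTE(review): these two values are exactly hit_rate(example) and mrr(example)
# (9/14 and 6.7833.../14) for the hand-written `example` matrix, so they do not
# look like a measured Elasticsearch result. Re-run the Elasticsearch evaluation
# and update them.

# Minsearch:
# Hit rate: 0.7722066133563864
# Mrr: 0.661454506159499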