In [1]:
import json

# Load the FAQ documents that every later cell embeds, indexes and searches.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [2]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [4]:
# Sentence-embedding model used for every `model.encode(...)` call in this notebook.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [5]:
# Sanity check: embed one sample sentence and look at the vector length.
# This length has to equal the `dims` value of the dense_vector fields in the index mapping.
v = model.encode('I just discovered the course. Can I still join?')
len(v)

384

We need to create an embedding for each question, for each answer, and for the combination of question and answer, so three embeddings per document in total.

In [8]:
from elasticsearch import Elasticsearch

# Client for the local single-node Elasticsearch instance.
es_client = Elasticsearch('http://localhost:9200')

index_name = "course-questions"

# Index definition: the plain document fields plus three embedding fields.
# The embedding fields all share one spec (384 dims to match the length of the
# vectors produced by the encoder above, cosine similarity), so they are
# generated from a single template instead of being spelled out three times.
# NOTE: "question_Text_vector" is spelled with a capital T; anything that
# writes or queries that field must use exactly this name.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine"
                }
                for vector_field in (
                    "question_vector",
                    "text_vector",
                    "question_Text_vector",
                )
            }
        }
    }
}

# Start from a clean index on every run: drop it if it exists, then recreate it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x771d2ed0c670>: Failed to establish a new connection: [Errno 111] Connection refused))

In [51]:

def elastic_search(query, course):
    """Keyword search over the course FAQ index, restricted to one course.

    Args:
        query: free-text question to match against the ``question``
            (boosted x3), ``text`` and ``section`` fields.
        course: exact value of the ``course`` keyword field to filter on.

    Returns:
        A list with the ``_source`` of each returned hit (at most 5), in the
        order Elasticsearch returned them.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields"
        }
    }
    course_filter = {"term": {"course": course}}

    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": text_match,
                "filter": course_filter
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [52]:
from tqdm.auto import tqdm

# Embed every document and index it together with its three vectors.
# Previously the embeddings were computed and then thrown away, so the
# dense_vector fields declared in the mapping were never populated.
for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    qt = question + ' ' + text

    vq = model.encode(question)
    vt = model.encode(text)
    vqt = model.encode(qt)

    # Index a copy so `documents` itself stays free of vectors, and convert
    # the embeddings to plain lists so the client can serialize them to JSON.
    # Field names match the index mapping exactly, including the capital "T"
    # in "question_Text_vector".
    es_doc = {
        **doc,
        'question_vector': vq.tolist(),
        'text_vector': vt.tolist(),
        'question_Text_vector': vqt.tolist(),
    }
    es_client.index(index=index_name, document=es_doc)

100%|██████████| 948/948 [00:25<00:00, 37.00it/s]


In [15]:
import pandas as pd
# Evaluation set: one row per generated question, with the course it belongs to
# and the id of the document that is expected to answer it.
df_ground_truth = pd.read_csv('ground-truth-data.csv')
# Same data as a list of dicts (one per row) for plain-Python iteration.
ground_truth = df_ground_truth.to_dict(orient='records')

In [16]:
df_ground_truth

Unnamed: 0,question,course,document
0,When does the course begin?,data-engineering-zoomcamp,c02e79ef
1,How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef
2,What is the link for course registration?,data-engineering-zoomcamp,c02e79ef
3,How can I receive course announcements?,data-engineering-zoomcamp,c02e79ef
4,Where do I join the Slack channel?,data-engineering-zoomcamp,c02e79ef
...,...,...,...
4622,How should I destroy infrastructure created us...,mlops-zoomcamp,886d1617
4623,What is the first step to destroy AWS infrastr...,mlops-zoomcamp,886d1617
4624,Can I destroy infrastructure created with GitH...,mlops-zoomcamp,886d1617
4625,What command initializes Terraform with specif...,mlops-zoomcamp,886d1617


In [8]:
ground_truth

[{'Unnamed: 0': 0,
  'question': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'Unnamed: 0': 1,
  'question': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'Unnamed: 0': 2,
  'question': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'Unnamed: 0': 3,
  'question': 'How can I receive course announcements?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'Unnamed: 0': 4,
  'question': 'Where do I join the Slack channel?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'Unnamed: 0': 5,
  'question': 'Where can I find the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520ca'},
 {'Unnamed: 0': 6,
  'question': 'How do I check the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f

In [18]:
# For every ground-truth question, run the Elasticsearch query and record,
# per returned hit, whether it is the expected document.
relevance_total = []

for row in tqdm(ground_truth):
    doc_id = row['document']  # id of the document that should be retrieved
    results = elastic_search(query=row['question'], course=row['course'])
    # One boolean per hit, in ranking order.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 4627/4627 [00:14<00:00, 320.82it/s]


In [24]:
doc_id

'886d1617'

In [20]:
relevance_total

[[True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [False, False, False, False, True],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [],
 [],
 [],
 [],
 [],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, True, False, False],
 [False, True, False, False, False],
 [False, True, False, False, False],
 [True

In [21]:
# Small hand-checked sample of per-query relevance lists.
# Trailing comment on each row: (hit-rate contribution, reciprocal-rank contribution).
example = [
    [True, False, False, False, False],   # 1, 1
    [False, False, False, False, False],  # 0, 0
    [False, False, False, False, False],  # 0, 0
    [False, False, False, False, False],  # 0, 0
    [False, False, False, False, False],  # 0, 0
    [True, False, False, False, False],   # 1, 1
    [True, False, False, False, False],   # 1, 1
    [True, False, False, False, False],   # 1, 1
    [True, False, False, False, False],   # 1, 1
    [True, False, False, False, False],   # 1, 1
    [False, False, True, False, False],   # 1, 1/3
    [False, False, False, False, False],  # 0, 0
]

- hit rate (recall): a query scores 1 if at least one of its results is relevant (True), otherwise 0; the metric is the average over all queries.
- mean reciprocal rank (MRR): rewards not only that the relevant result is present, but how early it appears in the ranking.
ex: if True is at index 0 -> 1, at index 1 -> 1/2, at index 2 -> 1/3, etc.; no True -> 0

In [33]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: list with one entry per query; each entry is a list
            of booleans (one per returned result, in ranking order) that is
            True where the result is the expected document.

    Returns:
        Number of queries with at least one True, divided by the number of
        queries. Returns 0.0 for an empty ``relevance_total`` instead of
        raising ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

hit_rate(example)

0.5833333333333334

In [32]:
def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: list with one entry per query; each entry is a list
            of booleans (one per returned result, in ranking order) that is
            True where the result is the expected document.

    Returns:
        The average over all queries of 1 / rank of the first True
        (rank starts at 1), counting 0 for queries with no True.
        Returns 0.0 for an empty ``relevance_total`` instead of raising
        ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                score += 1 / rank
                # Only the first relevant result counts; without this, a row
                # with several matches would contribute more than 1.
                break

    return score / total

mrr(example)

0.5277777777777778

In [34]:
hit_rate(relevance_total), mrr(relevance_total)

(0.7395720769397017, 0.6029788920106625)

In [35]:
!wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py

--2024-07-21 13:36:53--  https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-21 13:36:53 (29.8 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [36]:
import minsearch

# In-memory minsearch index over the same documents, used as a baseline:
# free-text matching on question/text/section, exact-match filtering on course/id.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.Index at 0x70794b059ff0>

In [38]:
def minsearch_search(query, course):
    """Search the in-memory minsearch index, restricted to one course.

    Args:
        query: free-text question.
        course: value the ``course`` keyword field must equal.

    Returns:
        Whatever ``index.search`` returns for at most 5 results, with the
        ``question`` field boosted by 3 and ``section`` by 0.5.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 3, 'section': 0.5},
        num_results=5
    )

In [39]:
# Same relevance collection as for Elasticsearch, but querying minsearch.
# Note: this overwrites the `relevance_total` computed for Elasticsearch.
relevance_total = []

for row in tqdm(ground_truth):
    doc_id = row['document']  # id of the document that should be retrieved
    results = minsearch_search(query=row['question'], course=row['course'])
    # One boolean per hit, in ranking order.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 4627/4627 [00:16<00:00, 282.47it/s]


In [40]:
hit_rate(relevance_total), mrr(relevance_total)

(0.7722066133563864, 0.661454506159499)

minsearch is slightly better than Elasticsearch on this ground truth: hit rate 0.772 vs 0.740 and MRR 0.661 vs 0.603.

In [43]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth set.

    Args:
        ground_truth: iterable of records; each must have a ``'document'``
            key holding the id of the expected document.
        search_function: callable that takes one ground-truth record and
            returns a ranked list of documents, each with an ``'id'`` key.

    Returns:
        Dict with ``'hit_rate'`` and ``'mrr'`` computed over all records.
    """
    def relevance_for(q):
        # Read the expected id before searching, then flag each hit against it.
        expected_id = q['document']
        return [hit['id'] == expected_id for hit in search_function(q)]

    relevance_total = [relevance_for(q) for q in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total)
    }

In [46]:

evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

100%|██████████| 4627/4627 [00:16<00:00, 280.47it/s]


{'hit_rate': 0.7722066133563864, 'mrr': 0.661454506159499}

In [53]:
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

100%|██████████| 4627/4627 [00:17<00:00, 263.31it/s]


{'hit_rate': 0.7395720769397017, 'mrr': 0.6031950147683889}