## Import necessary libraries

In [1]:
import os, requests 

import numpy as np
from tqdm.auto import tqdm
from pprint import pprint

import pandas as pd

from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch


  from .autonotebook import tqdm as notebook_tqdm


## Download the requested HF model

In [2]:
# Name of the pretrained sentence-embedding model; SentenceTransformer
# resolves it and loads the model used for every embedding in this notebook.
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

In [None]:
%%time
# Reference query reused throughout the notebook; `vector` is its embedding.
user_question = "I just discovered the course. Can I still join it?"
vector = embedding_model.encode(user_question)

CPU times: user 73.9 ms, sys: 29.3 ms, total: 103 ms
Wall time: 103 ms


In [None]:
vector[0]

0.07822262

In [None]:
vector.shape

(768,)

## Fetch documents

In [None]:
# FAQ documents (with stable ids) published in the llm-zoomcamp repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'

docs_url = f'{base_url}/{relative_url}?raw=1'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON decoding failure on an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

In [None]:
type(documents)

list

In [None]:
len(documents)

948

In [None]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [None]:
# Keep only the FAQ entries of the course under study.
# A named loop variable is used on purpose: `_` is IPython's "last output"
# variable, and assigning to it at cell level shadows that mechanism.
documents_filtered = [
    doc for doc in documents
    if doc['course'] == "machine-learning-zoomcamp"
]

len(documents_filtered)

375

## Creating embeddings for both Q&A parts

In [None]:
%%time

embeddings = []

for _ in tqdm(documents_filtered):

    question, text = _['question'], _['text']
    qa_text = f'{question} {text}'

    embeddings.append(embedding_model.encode(qa_text))


# cast list to np.array
X = np.array(embeddings)
X.shape

100%|██████████| 375/375 [00:24<00:00, 15.07it/s]

CPU times: user 21.6 s, sys: 1.2 s, total: 22.8 s
Wall time: 24.9 s





(375, 768)

## Search based on given query

In [None]:
# Squared norm of the query embedding; a value of ~1 means the vector is
# (approximately) unit length.
vector.dot(vector)

0.9999999

In [None]:
# Dot product of every document embedding (rows of X) with the query vector:
# one similarity score per document.
scores = X.dot(vector)

print(scores.shape)

(375,)


In [None]:
# Highest similarity score and the row index of the document that achieved it.
np.max(scores), np.argmax(scores)

(0.6506574, 14)

In [None]:
class VectorSearchEngine():
    """Brute-force similarity search over an in-memory embedding matrix.

    Every query is scored against all stored rows with a dot product and the
    documents with the highest scores are returned.
    """

    def __init__(self, documents, embeddings):
        """Store the corpus.

        Args:
            documents: Sequence of documents; ``documents[i]`` corresponds to
                row ``i`` of ``embeddings``.
            embeddings: NumPy array with one row per document.
        """
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return the ``num_results`` best documents, best first.

        Uses a full sort of all scores; see ``search_`` for the faster
        partial-selection variant.
        """
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

    def search_(self, v_query, num_results=10):
        """Return the ``num_results`` best documents, best first.

        Selects the top candidates with ``np.argpartition`` (no full sort)
        and then orders only those candidates by score.

        Args:
            v_query: Query vector compatible with ``embeddings.dot``.
            num_results: Maximum number of documents to return. Values larger
                than the corpus size return the whole corpus; values <= 0
                return an empty list.
        """
        scores = self.embeddings.dot(v_query)

        # Clamp k: argpartition raises ValueError when kth is out of bounds,
        # which used to happen for num_results >= len(scores).
        k = min(num_results, len(scores))
        if k <= 0:
            return []

        # After partitioning on position k-1 the first k indices are the k
        # best scores, but in arbitrary order...
        top = np.argpartition(-scores, k - 1)[:k]
        # ...so rank just those k candidates to return them best first.
        top = top[np.argsort(-scores[top])]
        return [self.documents[i] for i in top]

# In-memory engine over the filtered FAQ documents and their embeddings (X).
search_engine = VectorSearchEngine(documents=documents_filtered, embeddings=X)

In [None]:
%%timeit -n 10
# Timing of the full-sort (argsort) variant.

search_engine.search(vector, num_results=5)

The slowest run took 575.73 times longer than the fastest. This could mean that an intermediate result is being cached.
1.23 ms ± 2.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit -n 10
# Timing of the partial-selection (argpartition) variant.

search_engine.search_(vector, num_results=5)

The slowest run took 166.77 times longer than the fastest. This could mean that an intermediate result is being cached.
243 μs ± 571 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Hit rate

In [None]:
# Ground-truth question -> document-id pairs used to evaluate retrieval.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# Load the CSV and keep only the rows of the course under study.
df_ground_truth = (
    pd.read_csv(ground_truth_url)
    .loc[lambda frame: frame.course == 'machine-learning-zoomcamp']
)

# One dict per row: {'question': ..., 'course': ..., 'document': ...}
ground_truth = df_ground_truth.to_dict(orient='records')

In [None]:
len(ground_truth)

1830

In [None]:
ground_truth[0:5]

[{'question': 'Where can I sign up for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Can you provide a link to sign up?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Is there an FAQ for this Machine Learning course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Does this course have a GitHub repository for the sign-up link?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'How can I structure my questions and answers for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'}]

In [None]:
def calc_HR(model, ground_truth, search_engine, num_results, log_every=200):
    """Compute the hit rate of ``search_engine`` on a ground-truth set.

    A record counts as a hit when the id of its expected document appears
    among the ``num_results`` documents returned for its question.

    Args:
        model: Object exposing ``encode(text)`` that returns the query vector.
        ground_truth: Sequence of dicts with keys ``'question'`` and
            ``'document'`` (the expected document id).
        search_engine: Object exposing ``search_(query_vector, num_results)``
            that returns dicts carrying an ``'id'`` key.
        num_results: Number of documents retrieved per question.
        log_every: Print the running hit rate every ``log_every`` processed
            records; a falsy value disables progress output.

    Returns:
        The hit rate rounded to 3 decimals (0.0 for empty ``ground_truth``).
    """
    total = len(ground_truth)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return np.round(0.0, 3)

    cnt = 0

    # Count from 1 so that `processed` is the number of records evaluated so
    # far, which is the correct denominator for the running hit rate.
    for processed, record in enumerate(ground_truth, start=1):

        query_vector = model.encode(record['question'])
        results = search_engine.search_(query_vector, num_results)

        if any(result['id'] == record['document'] for result in results):
            cnt += 1

        if log_every and processed % log_every == 0:
            print(f"HitRate at id {processed}", np.round(cnt / processed, 3))

    return np.round(cnt / total, 3)


In [None]:
# Evaluate the in-memory engine: is the expected document among the top 5?
hitrate = calc_HR(embedding_model, ground_truth, search_engine, num_results=5)
print(f"\n\nFinal Hit-rate: {hitrate}")

HitRate at id 200 0.965
HitRate at id 400 0.975
HitRate at id 600 0.953
HitRate at id 800 0.944
HitRate at id 1000 0.938
HitRate at id 1200 0.94
HitRate at id 1400 0.938
HitRate at id 1600 0.941
HitRate at id 1800 0.941


Final Hit-rate: 0.94


## Elasticsearch enters the stage

In [None]:
# Start the Docker application. NOTE: `open -a` is a macOS command; on other
# platforms start the Docker daemon by other means before continuing.
!open -a Docker

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Run a throwaway single-node Elasticsearch 8.4.3 container in the background
# (-d), removed on stop (--rm), with security disabled and ports 9200/9300
# published on localhost.
!docker run \
    --rm \
    --name elasticsearch \
    -d \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


8cb6ae902a3b8dae69a0ae06bd92a39fed25122275a3af3be06390de078e0844


In [None]:
es_client = Elasticsearch('http://localhost:9200')

index_name = "course-questions"

# One shard and no replicas: sized for the local single-node container.
shard_settings = {"number_of_shards": 1, "number_of_replicas": 0}

# Document fields plus the dense vector searched with cosine similarity.
# "dims" must equal the length of the embeddings stored in question_vector.
field_mappings = {
    "text": {"type": "text"},
    "section": {"type": "text"},
    "question": {"type": "text"},
    "course": {"type": "keyword"},
    "id": {"type": "keyword"},
    "question_vector": {
        "type": "dense_vector",
        "dims": 768,
        "index": True,
        "similarity": "cosine",
    },
}

index_settings = {
    "settings": shard_settings,
    "mappings": {"properties": field_mappings},
}

# Drop any previous index first so the cell starts from an empty index when
# it is re-run, then create it with the settings above.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [None]:
%%time

for doc, emb in zip( tqdm(documents_filtered), X):
    
    try:

        doc["question_vector"] = emb  
        es_client.index(index="course-questions", document=doc)

    except Exception as e:

        print(f"Error processing document: {e}")

100%|██████████| 375/375 [00:00<00:00, 393.58it/s]

CPU times: user 212 ms, sys: 15.8 ms, total: 227 ms
Wall time: 954 ms





In [None]:
# Read the mapping back from Elasticsearch to confirm the index was created
# with the expected field types.
index_mapping = es_client.indices.get_mapping(index=index_name)
pprint(index_mapping[index_name]['mappings'], indent=4)

{   'properties': {   'course': {'type': 'keyword'},
                      'id': {'type': 'keyword'},
                      'question': {'type': 'text'},
                      'question_vector': {   'dims': 768,
                                             'index': True,
                                             'similarity': 'cosine',
                                             'type': 'dense_vector'},
                      'section': {'type': 'text'},
                      'text': {'type': 'text'}}}


In [None]:
def elastic_search(field, vector, num_results, num_candidates=10000):
    """Run a kNN search against the Elasticsearch index ``index_name``.

    Uses the module-level ``es_client`` and ``index_name``.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query embedding.
        num_results: Number of nearest neighbours to return (kNN ``k``).
        num_candidates: Value sent as the kNN ``num_candidates`` option;
            defaults to the previously hard-coded 10000.

    Returns:
        A list of dicts with keys ``text``, ``section``, ``question``,
        ``course`` and ``id``, in the order Elasticsearch returned the hits.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": num_results,
            "num_candidates": num_candidates,
        },
        # Only fetch the stored fields needed downstream (not the vector).
        "_source": ["id", "text", "section", "question", "course"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [
        {
            "text": hit['_source']['text'],
            "section": hit['_source'].get('section', 'Unknown'),
            "question": hit['_source'].get('question', 'Unknown'),
            "course": hit['_source'].get('course', 'Unknown'),
            "id": hit['_source'].get('id'),
        }
        for hit in es_results['hits']['hits']
    ]

In [None]:
# Smoke test: the single best Elasticsearch match for the reference question.
elastic_search("question_vector", embedding_model.encode(user_question), num_results=1)


[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'}]

In [None]:
# Look up the embedding that the indexing step attached to the document
# Elasticsearch returned as the best match.
matches = [doc for doc in documents_filtered if doc["id"] == "ee58a693"]
check = matches[0]['question_vector']
check.shape

(768,)

In [None]:
# Recompute the similarity by hand for the document Elasticsearch returned,
# for comparison with the best score found by the in-memory search.
embedding_model.encode(user_question).dot(check)

0.6506574

In [None]:
def calc_HR_es(model, ground_truth, num_results, log_every=200):
    """Compute the hit rate of the Elasticsearch kNN search.

    A record counts as a hit when the id of its expected document appears
    among the ``num_results`` documents returned by ``elastic_search`` on the
    ``"question_vector"`` field.

    Args:
        model: Object exposing ``encode(text)`` that returns the query vector.
        ground_truth: Sequence of dicts with keys ``'question'`` and
            ``'document'`` (the expected document id).
        num_results: Number of documents retrieved per question.
        log_every: Print the running hit rate every ``log_every`` processed
            records; a falsy value disables progress output.

    Returns:
        The hit rate rounded to 3 decimals (0.0 for empty ``ground_truth``).
    """
    total = len(ground_truth)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return np.round(0.0, 3)

    cnt = 0

    # Count from 1 so that `processed` is the number of records evaluated so
    # far, which is the correct denominator for the running hit rate.
    for processed, record in enumerate(ground_truth, start=1):

        query_vector = model.encode(record['question'])
        results = elastic_search("question_vector", query_vector, num_results)

        if any(result['id'] == record['document'] for result in results):
            cnt += 1

        if log_every and processed % log_every == 0:
            print(f"HitRate at id {processed}", np.round(cnt / processed, 3))

    return np.round(cnt / total, 3)

In [None]:
# Same evaluation as for the in-memory engine, now against Elasticsearch.
hitrate = calc_HR_es(embedding_model, ground_truth, num_results=5)
print(f"\n\nFinal Hit-rate with ES: {hitrate}")

HitRate at id 200 0.87
HitRate at id 400 0.908
HitRate at id 600 0.88
HitRate at id 800 0.869
HitRate at id 1000 0.864
HitRate at id 1200 0.871
HitRate at id 1400 0.872
HitRate at id 1600 0.876
HitRate at id 1800 0.879


Final Hit-rate with ES: 0.878


In [None]:
#"That's a wrap"