Q1. Getting the embeddings model

In [4]:
from tqdm.autonotebook import tqdm, trange

  from tqdm.autonotebook import tqdm, trange


In [6]:
from sentence_transformers import SentenceTransformer
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

In [7]:
user_question = "I just discovered the course. Can I still join it?"
result = embedding_model.encode(user_question)
result[0]

0.07822261

In [8]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

len(documents)

948

In [9]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

Q2. Creating the embeddings

In [10]:
from tqdm.auto import tqdm

In [11]:
embeddings = []
for doc in tqdm(documents):
    course = doc["course"]
    qa_text = f'{doc["question"]} {doc["text"]}'
    embeddings.append(embedding_model.encode(qa_text))
embeddings[0]    

  0%|          | 0/948 [00:00<?, ?it/s]

array([ 0.06168452, -0.00477428,  0.06304011,  0.02210033,  0.07377933,
       -0.01046654, -0.00822813,  0.07933962,  0.01246424, -0.00729041,
       -0.0302358 , -0.02019793,  0.03527085, -0.05305605,  0.02293106,
        0.0078349 ,  0.01005515, -0.04411037, -0.01710245, -0.03001303,
       -0.02257778, -0.00115793,  0.01737353,  0.02371312,  0.00825784,
        0.08338156,  0.01001803,  0.03136029, -0.00705907,  0.01127569,
       -0.01551907, -0.02476792,  0.04632921,  0.04258855, -0.02803566,
       -0.00525214, -0.04778513,  0.05086436,  0.00728664,  0.04397769,
       -0.01340191,  0.04174938,  0.0275125 , -0.02929592, -0.03890917,
       -0.04555696, -0.02457903, -0.03068175, -0.04773024,  0.00363463,
       -0.03223592,  0.03005083,  0.03857941, -0.00880594, -0.03817934,
       -0.02932218,  0.05444925, -0.03060417, -0.04463731,  0.0515459 ,
       -0.04269945,  0.02487225, -0.03487107,  0.00680872,  0.01479469,
       -0.02789131,  0.03247127, -0.03554043, -0.023505  ,  0.02

In [13]:
operations = []
for doc in tqdm(documents):
    # Transforming the title into an embedding using the model
    question = doc['question']
    text = doc['text']
    qt = question + ' ' + text
    doc['question_vector'] = embedding_model.encode(question)
    doc['text_vector'] = embedding_model.encode(text)
    doc['question_text_vector'] = embedding_model.encode(qt)
    operations.append(doc)
operations[0]

  0%|          | 0/948 [00:00<?, ?it/s]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef',
 'question_vector': array([ 1.55808870e-02, -2.45603379e-02,  6.55574650e-02,  4.41504344e-02,
         8.81910846e-02, -2.61536911e-02, -2.68038027e-02,  6.40117601e-02,
        -2.01325975e-02,  1.58315767e-02,  1.42648127e-02, -2.26580556e-02,
         8.13051909e-02, -2.77104899e-02,  4.82684597e-02,  1.24280406e-02,
         6.43338412e-02, -4.93125431e-02

In [14]:
import numpy as np

In [15]:
X = np.array(embeddings)
X.shape

(948, 768)

Q3. Search

In [20]:
max(X.dot(result))

0.6506575

In [16]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=documents, embeddings=X)

Q4. Hit-rate for our search engine

In [23]:
!pip install -U pandas

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [24]:
import pandas as pd

In [26]:
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [27]:
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [16]:
len(ground_truth)

1830

In [34]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    result = embedding_model.encode(f'{q["question"]}')
    results = search_engine.search(result,num_results=5)
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

relevance_total

  0%|          | 0/1830 [00:00<?, ?it/s]

[[False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [False, False, True, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False]

In [30]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [35]:
hit_rate(relevance_total)

0.9218579234972678

Q5. Indexing with Elasticsearch

In [36]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 


In [37]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

In [38]:
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [39]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [67]:
query = {
    "field": "text_vector",
    "query_vector": result,
    "k": 5,
    "num_candidates": 10000, 
}

In [94]:
res = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course"])
res["hits"]["hits"][0]

{'_index': 'course-questions',
 '_id': '16yEmJABel8YS7aHtEGF',
 '_score': 0.7371134,
 '_source': {'question': 'Any advice for adding the Machine Learning Zoomcamp experience to your LinkedIn profile?',
  'course': 'machine-learning-zoomcamp',
  'section': 'Miscellaneous',
  'text': 'I’ve seen LinkedIn users list DataTalksClub as Experience with titles as:\nMachine Learning Fellow\nMachine Learning Student\nMachine Learning Participant\nMachine Learning Trainee\nPlease note it is best advised that you do not list the experience as an official “job” or “internship” experience since DataTalksClub did not hire you, nor financially compensate you.\nOther ways you can incorporate the experience in the following sections:\nOrganizations\nProjects\nSkills\nFeatured\nOriginal posts\nCertifications\nCourses\nBy Annaliese Bronz\nInteresting question, I put the link of my project into my CV as showcase and make posts to show my progress.\nBy Ani Mkrtumyan'}}

Q6. Hit-rate for Elasticsearch

In [80]:
def elastic_search_knn(field, vector, course):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [88]:
def question_vector_knn(q):
    question = q['question']
    course = q['course']

    v_q = embedding_model.encode(question)

    return elastic_search_knn('question_vector', v_q, course)

In [89]:
def evaluate(ground_truth):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = question_vector_knn(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total)
    }

In [90]:
evaluate(ground_truth)

  0%|          | 0/1830 [00:00<?, ?it/s]

{'hit_rate': 0.8076502732240437}