In [1]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("multi-qa-distilbert-cos-v1")

  from .autonotebook import tqdm as notebook_tqdm


Q1: Getting the embeddings model

In [2]:

user_question = "I just discovered the course. Can I still join it?"

embedding = model.encode(user_question)
print(embedding)

[ 7.82226548e-02 -4.01311405e-02  3.86135913e-02 -1.78966438e-04
  8.92347097e-02 -5.04591092e-02 -1.05026569e-02  3.71055678e-02
 -4.18713912e-02  3.48084792e-02 -1.20701883e-02 -2.36942340e-02
  3.87900174e-02  1.60988607e-02  3.50747295e-02  3.04746162e-03
  5.79672381e-02 -4.10627462e-02 -3.41552682e-02 -2.56396383e-02
 -3.55263911e-02  1.42908087e-02 -1.62799917e-02  3.21446545e-02
 -4.66897376e-02  7.89186060e-02  4.90160920e-02  1.56761166e-02
 -1.69110075e-02  2.26482227e-02  5.60206100e-02 -3.98361087e-02
  6.77409917e-02 -1.20209912e-02  1.12621894e-03 -1.94394365e-02
 -2.65951678e-02  1.06177367e-02  1.69687122e-02  1.13487840e-02
 -2.97063086e-02  5.25258258e-02 -1.41453547e-02  4.61699851e-02
  1.17066065e-02 -2.38053519e-02 -6.32558241e-02 -1.92042235e-02
 -7.10592186e-03  3.24167833e-02  2.49618199e-02 -5.27503015e-03
  2.01149024e-02 -3.72371152e-02  3.46405394e-02 -3.29310261e-02
 -2.01484803e-02  5.07839303e-03 -4.55506742e-02  7.89169688e-03
 -4.91713583e-02  4.69897

Q2: Creating the embeddings

In [3]:
import requests
import numpy as np

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

filtered_documents = [doc for doc in documents if doc['course'] == 'machine-learning-zoomcamp']

print(f'Number of documents: {len(filtered_documents)}')  # Should be 375

embeddings = []
for doc in filtered_documents:
    question = doc['question']
    text = doc['text']
    qa_text = f'{question} {text}'
    embedding = model.encode(qa_text)
    embeddings.append(embedding)
X = np.array(embeddings)
print(f'Shape of X: {X.shape}')

Number of documents: 375
Shape of X: (375, 768)


In [4]:
user_question = "I just discovered the course. Can I still join it?"
v = model.encode(user_question)

cosine_similarities = X @ v
sorted_indices = np.argsort(cosine_similarities)[::-1]

top_k = 5
for i in range(top_k):
    idx = sorted_indices[i]
    print(f'Document ID: {filtered_documents[idx]["id"]}, Similarity Score: {cosine_similarities[idx]}')

Document ID: ee58a693, Similarity Score: 0.6506572961807251
Document ID: 0a278fb2, Similarity Score: 0.4717075824737549
Document ID: 6ba259b1, Similarity Score: 0.4589996635913849
Document ID: 9f261648, Similarity Score: 0.45699024200439453
Document ID: e7ba6b8a, Similarity Score: 0.45055824518203735


In [5]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=documents, embeddings=X)
search_engine.search(v, num_results=5)

[{'text': 'You can find the latest and up-to-date deadlines here: https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml\nAlso, take note of Announcements from @Au-Tomator for any extensions or other news. Or, the form may also show the updated deadline, if Instructor(s) has updated it.',
  'section': 'General course-related questions',
  'question': 'Homework - What are homework and project deadlines?',
  'course': 'data-engineering-zoomcamp',
  'id': 'a1daf537'},
 {'text': 'After you submit your homework it will be graded based on the amount of questions in a particular homework. You can see how many points you have right on the page of the homework up top. Additionally in the leaderboard you will find the sum of all points you’ve earned - points for Homeworks, FAQs and Learning in Public. If homework is clear, others work as follows: if you submit something to FAQ, you get one point, for each learning i

In [6]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv("/workspaces/llm-zoomcamp/learning/vector_db/ground-truth-data.csv")
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')
ground_truth

[{'question': 'Where can I sign up for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Can you provide a link to sign up?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Is there an FAQ for this Machine Learning course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Does this course have a GitHub repository for the sign-up link?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'How can I structure my questions and answers for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Are the course videos live or pre-recorded?',
  'course': 'machine-learning-zoomcamp',
  'document': '39fda9f0'},
 {'question': 'When can I start watching the course videos?',
  'course': 'machine-learning-zoomcamp',
  'document': '39fda9f0'},
 {'question': 'Are the live office hours sessions recorded?',
  'cours

In [11]:
import numpy as np

document_texts = [doc['text'] for doc in documents]
X = model.encode(document_texts, convert_to_numpy=True)
v = model.encode('I just discovered the course. Can I still join?', convert_to_numpy=True)
search_engine = VectorSearchEngine(documents=documents, embeddings=X)
results = search_engine.search(v, num_results=5)
for result in results:
    print(result)

{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.', 'section': 'General course-related questions', 'question': 'The course has already started. Can I still join it?', 'course': 'machine-learning-zoomcamp', 'id': 'ee58a693'}
{'text': 'We won’t re-record the course videos. The focus of the course and the skills we want to teach remained the same, and the videos are still up-to-date.\nIf you haven’t taken part in the previous iteration, you can start watching the videos. It’ll be useful for you and you will learn new things. However, we recommend using Python 3.10 now instead of Python 3.8.', 'section': 'General course-related questions', 'question': 'The cours

In [12]:
import pandas as pd

def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt += 1
    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score += 1 / (rank + 1)
    return total_score / len(relevance_total)

# Evaluate the search engine
def evaluate(ground_truth, search_engine):
    relevance_total = []
    for q in ground_truth:
        query_text = q['question']
        doc_id = q['document']
        v_query = model.encode(query_text, convert_to_numpy=True)
        results = search_engine.search(v_query, num_results=5)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

evaluation_results = evaluate(ground_truth, search_engine)
print(evaluation_results)

{'hit_rate': 0.848087431693989, 'mrr': 0.7310018214936247}


In [39]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

# Define index settings and mappings
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

index_name = "machine-learning-zoomcamp-questions"
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)
es_client.indices.create(index=index_name, body=index_settings)

for doc in documents:
    question = doc['question']
    text = doc['text']
    course = doc['course']
    question_vector = model.encode(question).tolist()

    doc_to_index = {
        'text': text,
        'section': doc['section'],
        'question': question,
        'course': course,
        'id': doc['id'],
        'question_vector': question_vector
    }
    es_client.index(index=index_name, document=doc_to_index)

In [40]:
query = "I just discovered the course. Can I still join it?"
v_q = model.encode(query).tolist()

# Perform a k-NN search to get the document with the highest score
def get_highest_score_doc(query_vector, course):
    search_query = {
        "size": 1,
        "query": {
            "script_score": {
                "query": {
                    "bool": {
                        "must": [
                            {
                                "term": {
                                    "course": course
                                }
                            }
                        ]
                    }
                },
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                    "params": {"query_vector": query_vector}
                }
            }
        },
        "_source": ["id"]
    }

    es_results = es_client.search(index=index_name, body=search_query)
    highest_score_doc_id = es_results['hits']['hits'][0]['_source']['id']
    return highest_score_doc_id


highest_score_doc_id = get_highest_score_doc(v_q, 'machine-learning-zoomcamp')
print(f"ID of the document with the highest score: {highest_score_doc_id}")

ID of the document with the highest score: ee58a693


In [41]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

print("Computing embeddings for all questions...")
question_texts = [doc['question'] for doc in documents]
X = model.encode(question_texts, convert_to_numpy=True, show_progress_bar=True)

def exact_search(query_vector, course):
    course_indices = [i for i, doc in enumerate(documents) if doc['course'] == course]
    course_embeddings = X[course_indices]
    similarities = cosine_similarity([query_vector], course_embeddings)[0]
    most_similar_index = course_indices[np.argmax(similarities)]
    return documents[most_similar_index]['id']

def es_search(query_vector, course):
    search_query = {
        "size": 1,
        "query": {
            "script_score": {
                "query": {
                    "bool": {
                        "must": [
                            {
                                "term": {
                                    "course": course
                                }
                            }
                        ]
                    }
                },
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                    "params": {"query_vector": query_vector}
                }
            }
        },
        "_source": ["id"]
    }

    es_results = es_client.search(index=index_name, body=search_query)
    return es_results['hits']['hits'][0]['_source']['id']

def calculate_hit_rate(documents):
    correct_hits = 0
    total_queries = len(documents)

    print("Calculating hit-rate...")
    for doc in tqdm(documents):
        query = doc['question']
        course = doc['course']
        query_vector = model.encode(query, convert_to_numpy=True)

        exact_result = exact_search(query_vector, course)
        es_result = es_search(query_vector.tolist(), course)

        if exact_result == es_result:
            correct_hits += 1

    hit_rate = correct_hits / total_queries
    return hit_rate

hit_rate = calculate_hit_rate(documents)
print(f"Hit-rate for Elasticsearch: {hit_rate:.4f}")

Computing embeddings for all questions...


Batches: 100%|██████████| 30/30 [00:04<00:00,  7.39it/s]


Calculating hit-rate...


100%|██████████| 948/948 [00:38<00:00, 24.46it/s]

Hit-rate for Elasticsearch: 0.9958



