In [1]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("multi-qa-distilbert-cos-v1")

  from .autonotebook import tqdm as notebook_tqdm


Q1: Getting the embeddings model

In [2]:

user_question = "I just discovered the course. Can I still join it?"

embedding = model.encode(user_question)
print(embedding)

[ 7.82226548e-02 -4.01311405e-02  3.86135913e-02 -1.78966438e-04
  8.92347097e-02 -5.04591092e-02 -1.05026569e-02  3.71055678e-02
 -4.18713912e-02  3.48084792e-02 -1.20701883e-02 -2.36942340e-02
  3.87900174e-02  1.60988607e-02  3.50747295e-02  3.04746162e-03
  5.79672381e-02 -4.10627462e-02 -3.41552682e-02 -2.56396383e-02
 -3.55263911e-02  1.42908087e-02 -1.62799917e-02  3.21446545e-02
 -4.66897376e-02  7.89186060e-02  4.90160920e-02  1.56761166e-02
 -1.69110075e-02  2.26482227e-02  5.60206100e-02 -3.98361087e-02
  6.77409917e-02 -1.20209912e-02  1.12621894e-03 -1.94394365e-02
 -2.65951678e-02  1.06177367e-02  1.69687122e-02  1.13487840e-02
 -2.97063086e-02  5.25258258e-02 -1.41453547e-02  4.61699851e-02
  1.17066065e-02 -2.38053519e-02 -6.32558241e-02 -1.92042235e-02
 -7.10592186e-03  3.24167833e-02  2.49618199e-02 -5.27503015e-03
  2.01149024e-02 -3.72371152e-02  3.46405394e-02 -3.29310261e-02
 -2.01484803e-02  5.07839303e-03 -4.55506742e-02  7.89169688e-03
 -4.91713583e-02  4.69897

Q2: Creating the embeddings

In [3]:
import requests
import numpy as np

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

filtered_documents = [doc for doc in documents if doc['course'] == 'machine-learning-zoomcamp']

print(f'Number of documents: {len(filtered_documents)}')  # Should be 375

embeddings = []
for doc in filtered_documents:
    question = doc['question']
    text = doc['text']
    qa_text = f'{question} {text}'
    embedding = model.encode(qa_text)
    embeddings.append(embedding)
X = np.array(embeddings)
print(f'Shape of X: {X.shape}')

Number of documents: 375
Shape of X: (375, 768)


In [4]:
user_question = "I just discovered the course. Can I still join it?"
v = model.encode(user_question)

cosine_similarities = X @ v
sorted_indices = np.argsort(cosine_similarities)[::-1]

top_k = 5
for i in range(top_k):
    idx = sorted_indices[i]
    print(f'Document ID: {filtered_documents[idx]["id"]}, Similarity Score: {cosine_similarities[idx]}')

Document ID: ee58a693, Similarity Score: 0.6506572961807251
Document ID: 0a278fb2, Similarity Score: 0.4717075824737549
Document ID: 6ba259b1, Similarity Score: 0.4589996635913849
Document ID: 9f261648, Similarity Score: 0.45699024200439453
Document ID: e7ba6b8a, Similarity Score: 0.45055824518203735


In [15]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=documents, embeddings=X)
search_engine.search(v, num_results=5)

[{'text': 'You can find the latest and up-to-date deadlines here: https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml\nAlso, take note of Announcements from @Au-Tomator for any extensions or other news. Or, the form may also show the updated deadline, if Instructor(s) has updated it.',
  'section': 'General course-related questions',
  'question': 'Homework - What are homework and project deadlines?',
  'course': 'data-engineering-zoomcamp',
  'id': 'a1daf537',
  'question_vector': array([ 2.99051143e-02, -1.91440191e-02, -3.82667668e-02,  2.09317114e-02,
          7.67452791e-02, -5.97699508e-02,  4.89828316e-03,  2.84216534e-02,
         -9.92071509e-05, -2.79434863e-02,  2.46315021e-02, -2.54201833e-02,
          6.55184910e-02, -1.35817090e-02, -2.61942744e-02, -1.38668949e-02,
          2.12290138e-02, -5.78125678e-02,  4.73321136e-03, -2.24713190e-03,
          1.86651126e-02,  1.16054537e-02,  4.

In [38]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv("/workspaces/llm-zoomcamp/learning/vector_db/ground-truth-data.csv")
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')
ground_truth

[{'question': 'Where can I sign up for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Can you provide a link to sign up?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Is there an FAQ for this Machine Learning course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Does this course have a GitHub repository for the sign-up link?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'How can I structure my questions and answers for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Are the course videos live or pre-recorded?',
  'course': 'machine-learning-zoomcamp',
  'document': '39fda9f0'},
 {'question': 'When can I start watching the course videos?',
  'course': 'machine-learning-zoomcamp',
  'document': '39fda9f0'},
 {'question': 'Are the live office hours sessions recorded?',
  'cours

In [17]:
def get_embedding(query):
    return model.encode(query)

def calculate_hitrate(search_engine, ground_truth, num_results=5):
    hits = 0
    total = len(ground_truth)
    
    for item in ground_truth:
        question = item['question']
        relevant_document = item['document']
        
        # Get the embedding for the question
        v_query = get_embedding(question)
        
        # Perform the search
        results = search_engine.search(v_query, num_results=num_results)
        
        # Check if the relevant document is among the top results
        if relevant_document in results:
            hits += 1
    
    hitrate = hits / total
    return hitrate

# Assuming 'documents' is your list of all documents
# and 'X' is your pre-computed embeddings for these documents
search_engine = VectorSearchEngine(documents=documents, embeddings=X)

# Calculate the hitrate
hitrate = calculate_hitrate(search_engine, ground_truth, num_results=5)
print(f"Hitrate: {hitrate:.2f}")

Hitrate: 0.00


In [34]:
def calculate_hitrate(search_engine, ground_truth, num_results=5):
    hits = 0
    total = len(ground_truth)
    
    for item in ground_truth:
        question = item['question']
        relevant_document = str(item['document'])  # Ensure the ID is a string
        
        v_query = model.encode(question)  # Embed the query here
        results = search_engine.search(v_query, num_results=num_results)  # Pass the embedded query
        search_ids = [str(result['id']) for result in results]  # Ensure IDs are strings
        
        if relevant_document in search_ids:
            print(f"Search IDs: {search_ids}")  # Debug print
            print(f"Relevant Document: {relevant_document}")  # Debug print
            hits += 1
    
    hitrate = hits / total
    return hitrate

search_engine = VectorSearchEngine(documents=documents, embeddings=X)

hitrate = calculate_hitrate(search_engine, ground_truth, num_results=5)
print(f"Hitrate: {hitrate:.2f}")

Hitrate: 0.00


In [35]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [37]:
from tqdm import tqdm
import numpy as np

# Define hit_rate and mrr functions if they are not already defined
def hit_rate(relevance_total):
    # Flatten the list of relevance results and check if there is at least one True
    return np.mean([any(relevance) for relevance in relevance_total])

def mrr(relevance_total):
    # Compute Mean Reciprocal Rank
    mrr_sum = 0.0
    for relevance in relevance_total:
        try:
            rank = relevance.index(True) + 1  # Rank starts at 1
            mrr_sum += 1.0 / rank
        except ValueError:
            continue  # If there are no relevant documents, rank is considered infinity
    return mrr_sum / len(relevance_total)

# Define the evaluate function
def evaluate(ground_truth, search_engine, num_results=10):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = str(q['document'])  # Ensure the document ID is a string
        question = q['question']
        v_query = model.encode(question)  # Embed the query here
        results = search_engine.search(v_query, num_results=num_results)  # Pass the embedded query
        relevance = [d['id'] == doc_id for d in results]  # Check relevance
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

search_engine = VectorSearchEngine(documents=documents, embeddings=X)

results = evaluate(ground_truth, search_engine, num_results=5)
print(f"Hit Rate: {results['hit_rate']:.2f}")
print(f"MRR: {results['mrr']:.2f}")

100%|██████████| 1830/1830 [02:17<00:00, 13.32it/s]

Hit Rate: 0.00
MRR: 0.00



