In [21]:
from sentence_transformers import SentenceTransformer
# Instantiate the sentence-embedding model by name; every later cell that
# calls `model.encode(...)` relies on this object.
# NOTE(review): the "-cos-" in the model name suggests it was tuned for cosine
# similarity; the notebook scores with a plain dot product — confirm against
# the model card that the embeddings are normalized.
model = SentenceTransformer("multi-qa-distilbert-cos-v1")

In [22]:
# Example query; it is embedded in the next cell and used as the query vector.
user_question = "I just discovered the course. Can I still join it?"

In [23]:
# Embed the example question. `k` is reused further down as the query vector
# when scoring the document embeddings.
k = model.encode(user_question)
# First component of the embedding, printed as a quick check of the output.
res=k[0]
print(res)

0.078222655


In [30]:
import requests 

# Location of the FAQ documents (with ids) in the course repository.
# `base_url` is reused later to build the ground-truth URL.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

In [31]:
# Keep only the documents whose 'course' field matches the course under study;
# documents without a 'course' key are dropped (`.get` yields None for them).
course_name = 'machine-learning-zoomcamp'
filtered_documents = [entry for entry in documents if entry.get('course') == course_name]

In [32]:
import numpy as np
# One text per document: the question followed by the answer text, separated
# by a single space. Missing fields fall back to the empty string.
qa_texts = [
    f"{doc.get('question', '')} {doc.get('text', '')}"
    for doc in filtered_documents
]

# Encode the whole list in one call so the model can batch the work instead of
# running one forward pass per document.
# NOTE(review): batched padding may shift values at float-rounding level
# compared with per-document encoding — confirm downstream numbers are unaffected.
embeddings = model.encode(qa_texts)

# Embedding matrix with one row per document, in `filtered_documents` order.
X = np.asarray(embeddings)

In [12]:
# Dimensions of the embedding matrix X: (number of rows, embedding size).
print(X.shape)

(375, 768)


In [13]:
# Dot product of every row of X with the query vector k: one score per row.
scores = X.dot(k)

In [14]:
# Largest dot-product score across all rows of X for the example query.
highest_score = np.max(scores)
print(highest_score)

0.6506573


In [34]:
class VectorSearchEngine():
    """Brute-force vector search over a matrix of document embeddings.

    Documents are ranked by the dot product between their embedding row and
    the query vector.
    """

    def __init__(self, documents, embeddings):
        """Store the corpus.

        Args:
            documents: Indexable collection of documents; ``documents[i]`` is
                the document whose embedding is row ``i`` of ``embeddings``.
            embeddings: Array exposing ``.dot`` (e.g. a NumPy array) with one
                row per document.
        """
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return up to ``num_results`` documents, best score first.

        Args:
            v_query: Query vector compatible with ``embeddings.dot``.
            num_results: Maximum number of documents to return. Values larger
                than the corpus size return the whole corpus; values <= 0
                return an empty list.

        Returns:
            List of documents ordered by descending dot-product score.
        """
        scores = self.embeddings.dot(v_query)

        # Clamp k: np.argpartition raises ValueError when kth is out of bounds,
        # which happened whenever num_results >= number of documents.
        top_k = min(num_results, len(scores))
        if top_k <= 0:
            return []

        # kth = top_k - 1 guarantees the first top_k positions hold the top_k
        # highest scores (in arbitrary order) and is valid even for top_k == n.
        idx = np.argpartition(-scores, top_k - 1)[:top_k]

        # argpartition does not order the selected items; rank them explicitly
        # so callers receive the best match first.
        idx = idx[np.argsort(-scores[idx], kind='stable')]
        return [self.documents[i] for i in idx]

# Search index over the filtered documents; row i of X is expected to be the
# embedding of filtered_documents[i].
search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)

In [35]:
import pandas as pd
# Load the evaluation set and keep only the rows for the course under study.
# Each record carries a question, its course, and the id of the document that
# should be retrieved for it.
ground_truth_url = f'{base_url}/03-vector-search/eval/ground-truth-data.csv?raw=1'
df_ground_truth = pd.read_csv(ground_truth_url)
is_target_course = df_ground_truth['course'] == 'machine-learning-zoomcamp'
df_ground_truth = df_ground_truth.loc[is_target_course]
ground_truth = df_ground_truth.to_dict(orient='records')
print("Sample ground truth records:", ground_truth[:3])

Sample ground truth records: [{'question': 'Where can I sign up for the course?', 'course': 'machine-learning-zoomcamp', 'document': '0227b872'}, {'question': 'Can you provide a link to sign up?', 'course': 'machine-learning-zoomcamp', 'document': '0227b872'}, {'question': 'Is there an FAQ for this Machine Learning course?', 'course': 'machine-learning-zoomcamp', 'document': '0227b872'}]


In [36]:
# Hit rate over the ground truth: the fraction of questions for which the
# expected document id appears among the 5 results returned by the engine.
hits = 0
for item in ground_truth:
    query_embedding = model.encode(item['question'])
    results = search_engine.search(query_embedding, num_results=5)
    # Results lacking an 'id' key can never count as a hit.
    retrieved_ids = [result['id'] for result in results if 'id' in result]
    hits += item['document'] in retrieved_ids

hitrate = hits / len(ground_truth)
print(hitrate)

0.9398907103825137
