In [None]:
# Q1 Getting the embeddings model

In [1]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("multi-qa-distilbert-cos-v1")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


  from tqdm.autonotebook import tqdm, trange


In [2]:
user_question = "I just discovered the course. Can I still join it?"

In [3]:
v = embedding_model.encode(user_question)
Q1 = embedding_model.encode(user_question)[0]

In [None]:
# Prepare the documents

In [5]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [6]:
import json
filtered_data = [entry for entry in documents if entry['course'] == 'machine-learning-zoomcamp']
len(filtered_data)

375

In [None]:
# Q2 Creating the embeddings

In [7]:
embeddings = []
for doc in filtered_data:
    question = doc['question']
    text = doc['text']
    qa_text = f'{question} {text}'
    doc["qa_embedding"] = embedding_model.encode(qa_text)
    embeddings.append(doc["qa_embedding"])

In [8]:
import numpy as np
X = np.array(embeddings)

In [9]:
Q2 = X.shape

In [None]:
# Q3 Search

In [11]:
scores = X.dot(v)

In [12]:
Q3 = max(scores)

In [None]:
# Vector search

In [14]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=documents, embeddings=X)
search_engine.search(v, num_results=5)

[{'text': 'You can find the latest and up-to-date deadlines here: https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml\nAlso, take note of Announcements from @Au-Tomator for any extensions or other news. Or, the form may also show the updated deadline, if Instructor(s) has updated it.',
  'section': 'General course-related questions',
  'question': 'Homework - What are homework and project deadlines?',
  'course': 'data-engineering-zoomcamp',
  'id': 'a1daf537'},
 {'text': 'After you submit your homework it will be graded based on the amount of questions in a particular homework. You can see how many points you have right on the page of the homework up top. Additionally in the leaderboard you will find the sum of all points you’ve earned - points for Homeworks, FAQs and Learning in Public. If homework is clear, others work as follows: if you submit something to FAQ, you get one point, for each learning i

In [None]:
# Q4 Hit-rate for our search engine

In [15]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [14]:
hits = 0

for item in ground_truth:
    query = item['question']
    ground_truth_id = item['document']  
    
    # Compute the embedding for the query
    v_query = embedding_model.encode(query)
    
    # Get the top N results using the search engine
    results = search_engine.search(v_query, num_results=5)
    
    # Check if the ground truth id (document) is in the top N results
    result_ids = [res['id'] for res in results]
    if ground_truth_id in result_ids:
        hits += 1


In [75]:
hit_rate = hits / len(ground_truth)
Q4 = hit_rate
# needs troubleshooting =(

[{'question': 'Where can I sign up for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Can you provide a link to sign up?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Is there an FAQ for this Machine Learning course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Does this course have a GitHub repository for the sign-up link?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'How can I structure my questions and answers for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Are the course videos live or pre-recorded?',
  'course': 'machine-learning-zoomcamp',
  'document': '39fda9f0'},
 {'question': 'When can I start watching the course videos?',
  'course': 'machine-learning-zoomcamp',
  'document': '39fda9f0'},
 {'question': 'Are the live office hours sessions recorded?',
  'cours

In [None]:
# Q5 Indexing with Elasticsearch

In [16]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

In [17]:

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

In [18]:
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [19]:
for doc in filtered_data:
    es_client.index(index=index_name, document=doc)

In [20]:
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": user_question,
                    "fields": ["text","question", "section", "course","id"],
                    "type": "best_fields"
                }
            }
        }
    }
}

In [21]:
es_results = es_client.search(
    index=index_name,
    body=search_query
)

In [22]:
Q5 = es_results['hits']['hits'][0]['_source']["id"]

'ee58a693'

In [None]:
# Q6 Hit-rate for Elasticsearch

In [30]:
def elasticsearch_search(query_vector, num_results=5):
    knn = {
        "field": "question_text_vector",
        "query_vector": query_vector,
        "k": num_results,
        "num_candidates": 10000
    }

    search_query = {
        "knn": knn,
        "_source": ["id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    result_docs = []
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [31]:
hits = 0

for item in ground_truth:
    query = item['question']
    ground_truth_id = item['document']  # Using 'document' as the identifier
    
    # Compute the embedding for the query
    v_query = embedding_model.encode(query).tolist()
    
    # Get the top N results using Elasticsearch
    results = elasticsearch_search(v_query, num_results=5)
    
    # Check if the ground truth id (document) is in the top N results
    result_ids = [res['id'] for res in results]
    if ground_truth_id in result_ids:
        hits += 1

In [None]:
hit_rate = hits / len(ground_truth)
Q6 = hit_rate