### Q1. Getting the embeddings model

In [1]:
model_name = "multi-qa-distilbert-cos-v1"
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.52k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [2]:
user_question = "I just discovered the course. Can I still join it?"

In [3]:
vec = embedding_model.encode(user_question)

#### First value in embedding vector

In [4]:
vec[0]

0.07822263

#### Prepare document

In [5]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [7]:
mlcamp = [dict for dict in documents if dict['course'] == "machine-learning-zoomcamp"]

In [8]:
len(mlcamp)

375

### Q2. Creating the embeddings

In [12]:
from tqdm.auto import tqdm
import numpy as np
embeddings = []
for doc in tqdm(mlcamp):
    qa_text = f"{doc['question']} {doc['text']}"
    embedding = embedding_model.encode(qa_text)
    embeddings.append(embedding)

X = np.array(embeddings)

  0%|          | 0/375 [00:00<?, ?it/s]

#### Shape of embeddings list

In [14]:
X.shape

(375, 768)

### Q3. Search

In [15]:
scores = X.dot(vec)

In [18]:
scores[np.argmax(scores)]

0.65065753

#### Vector Search

In [25]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=documents, embeddings=X)
results = search_engine.search(vec, num_results=5)

In [33]:
results[1]['id']

'ac25d3af'

### Q4. Hit-rate for our seach engine

In [20]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [21]:
def hit_rate(total_hits):
    cnt = 0

    for line in total_hits:
        if True in line:
            cnt = cnt + 1

    return cnt / len(total_hits)

In [32]:
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [37]:
total_hits = []

search_engine = VectorSearchEngine(documents=mlcamp, embeddings=X)
for q in tqdm(ground_truth):
    doc_id = q['document']
    vec = embedding_model.encode(q['question'])
    results = search_engine.search(vec, num_results=5)
    hits = [d['id'] == doc_id for d in results]
    total_hits.append(hits)
  



  0%|          | 0/1830 [00:00<?, ?it/s]

#### Calculate hit rate

In [38]:
total_hits

[[False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],

#### Hit Rate

In [39]:
hit_rate(total_hits)

0.9398907103825137

### Q5. Indexing with Elasticsearch

In [40]:
mlcamp[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872'}

In [44]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "embedding": {
                "type": "dense_vector",
                "dims": 768,
                "index": True, 
                "similarity": "cosine"
            }
        }
    }
}

index_name = "mlzoomcamp-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'mlzoomcamp-questions'})

In [67]:
type(embeddings[0])

numpy.ndarray

In [80]:
for idx,doc in tqdm(enumerate(mlcamp)):
    doc['embedding'] = embeddings[idx].tolist()
    try:
        es_client.index(index=index_name,id=doc['id'], document=doc)
    except Exception as e:
        print(e)

0it [00:00, ?it/s]

#### Search

In [82]:
knn = {
        "field": 'embedding',
        "query_vector": vec,
        "k": 5,
        "num_candidates": 10000,
        
    }

search_query = {
    "knn": knn,
    "_source": ["text", "section", "question", "course", "id"]
}

In [83]:
res = es_client.search(index=index_name, knn=knn, source=["text", "section", "question", "course"])
res["hits"]["hits"]

[{'_index': 'mlzoomcamp-questions',
  '_id': 'RDWukpABlXkXunfNd2VQ',
  '_score': 0.8855488,
  '_source': {'question': 'Random seed 42',
   'course': 'machine-learning-zoomcamp',
   'section': '2. Machine Learning for Regression',
   'text': 'One of the questions on the homework calls for using a random seed of 42. When using 42, all my missing values ended up in my training dataframe and not my validation or test dataframes, why is that?\nThe purpose of the seed value is to randomly generate the proportion split. Using a seed of 42 ensures that all learners are on the same page by getting the same behavior (in this case, all missing values ending up in the training dataframe). If using a different seed value (e.g. 9), missing values will then appear in all other dataframes.'}},
 {'_index': 'mlzoomcamp-questions',
  '_id': 'uzWukpABlXkXunfNnWYB',
  '_score': 0.8855488,
  '_source': {'question': 'Random seed 42',
   'course': 'machine-learning-zoomcamp',
   'section': '2. Machine Learnin

#### ID of the highest scoring document

In [84]:
res['hits']['hits'][0]['_id']

'RDWukpABlXkXunfNd2VQ'

### Q6. Hit-rate for Elastic search

In [85]:
def elastic_search(index_name, vec):
    knn = {
        "field": 'embedding',
        "query_vector": vec,
        "k": 5,
        "num_candidates": 10000,
        
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    res = es_client.search(index=index_name, knn=knn, source=["text", "section", "question", "course"])
    return res["hits"]["hits"]

In [86]:
elastic_search(index_name, vec)

[{'_index': 'mlzoomcamp-questions',
  '_id': 'RDWukpABlXkXunfNd2VQ',
  '_score': 0.8855488,
  '_source': {'question': 'Random seed 42',
   'course': 'machine-learning-zoomcamp',
   'section': '2. Machine Learning for Regression',
   'text': 'One of the questions on the homework calls for using a random seed of 42. When using 42, all my missing values ended up in my training dataframe and not my validation or test dataframes, why is that?\nThe purpose of the seed value is to randomly generate the proportion split. Using a seed of 42 ensures that all learners are on the same page by getting the same behavior (in this case, all missing values ending up in the training dataframe). If using a different seed value (e.g. 9), missing values will then appear in all other dataframes.'}},
 {'_index': 'mlzoomcamp-questions',
  '_id': 'uzWukpABlXkXunfNnWYB',
  '_score': 0.8855488,
  '_source': {'question': 'Random seed 42',
   'course': 'machine-learning-zoomcamp',
   'section': '2. Machine Learnin

In [88]:
total_hits = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    vec = embedding_model.encode(q['question']).tolist()
    results = elastic_search(index_name, vec)
    hits = [d['_id'] == doc_id for d in results]
    total_hits.append(hits)
  



  0%|          | 0/1830 [00:00<?, ?it/s]

In [89]:
hit_rate(total_hits)

0.7874316939890711