# Q1. Getting the embeddings model

In [15]:
# pip install sentence_transformers

In [41]:
import pickle

import numpy as np
import pandas as pd
import requests
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

In [14]:
# Load the pretrained sentence-transformers model used for every embedding
# computed in this notebook.
model_name = 'multi-qa-distilbert-cos-v1'
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [22]:
# Encode the sample question and display only the first component of the
# resulting vector (the value asked for in Q1).
user_question = "I just discovered the course. Can I still join it?"
embedding_model.encode(user_question)[0]

0.078222655

What's the first value of the resulting vector?  
`0.07`

In [2]:
# Download the FAQ documents (with ids) from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents = docs_response.json()

In [20]:
# Keep only the documents that belong to the machine-learning-zoomcamp course
# and show how many are left.
documents = [
    doc
    for doc in documents
    if doc['course'] == 'machine-learning-zoomcamp'
]
len(documents)

375

# Q2. Creating the embeddings


In [21]:
# Build one embedding per document from its question and answer text joined
# by a single space, then stack the vectors into a 2-D array.
embeddings = []
for doc in documents:
    question, text = doc['question'], doc['text']
    qa_text = f'{question} {text}'
    vector = embedding_model.encode(qa_text)
    embeddings.append(vector)

X = np.array(embeddings)
X.shape

(375, 768)

What's the shape of X? (`X.shape`). Include the parentheses.  
`(375, 768)`


# Q3. Search

In [23]:
# Embed the user question, score every row of X against it with a dot
# product, and display the highest score.
# NOTE(review): presumably the model emits normalised vectors, which would
# make these scores cosine similarities — confirm.
v = embedding_model.encode(user_question)
scores = X.dot(v)
max(scores)

0.65065736

What's the highest score in the results?  
`0.6506574`

# Vector search

In [24]:
class VectorSearchEngine():
    """In-memory search over precomputed document embeddings.

    Documents are ranked by the dot product between their embedding and the
    query vector, highest score first.
    """

    def __init__(self, documents, embeddings):
        """Store the documents and their embeddings.

        Args:
            documents: Indexable collection of documents.
            embeddings: Object exposing ``.dot``; assumed to hold one row per
                entry of ``documents``, in the same order — TODO confirm.
        """
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return the best-scoring documents for a query vector.

        Args:
            v_query: Query vector passed to ``self.embeddings.dot``.
            num_results: Maximum number of documents to return.

        Returns:
            List of documents ordered by descending score.
        """
        similarity = self.embeddings.dot(v_query)
        # Sorting the negated scores ascending yields a descending ranking.
        ranking = np.argsort(-similarity)
        top_positions = ranking[:num_results]
        return [self.documents[position] for position in top_positions]

# Build the in-memory engine over the filtered documents and their embeddings,
# then show the five best matches for the query vector `v`.
search_engine = VectorSearchEngine(documents=documents, embeddings=X)
search_engine.search(v, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693',
  'question_vector': array([ 0.07717118, -0.04749376,  0.02866324, -0.01141071,  0.08245   ,
         -0.04042275, -0.02613375,  0.04122073, -0.04840765,  0.01509399,
         -0.00149671, -0.01334827,  0.0461828 ,  0.02318393,  0.04547328,
         -0.00809982,  0.07718319, -0.03334849, -0.04180022, -0.02304639,
         -0.01866885,  0.00298916, -0.00631761,  0.039316  , -0.02289939,
          0.07724465,  0.06296352,  0.037800

# Q4. Hit-rate for our search engine

In [25]:
# Load the ground-truth questions and keep only the rows for the
# machine-learning-zoomcamp course, as a list of record dicts.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
is_ml_course = df_ground_truth['course'] == 'machine-learning-zoomcamp'
df_ground_truth = df_ground_truth[is_ml_course]
ground_truth = df_ground_truth.to_dict(orient='records')

In [26]:
# Preview the first five ground-truth records.
ground_truth[:5]

[{'question': 'Where can I sign up for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Can you provide a link to sign up?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Is there an FAQ for this Machine Learning course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Does this course have a GitHub repository for the sign-up link?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'How can I structure my questions and answers for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'}]

In [27]:
# Sanity check: the single best match for the query vector `v`.
search_engine.search(v, num_results=1)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693',
  'question_vector': array([ 0.07717118, -0.04749376,  0.02866324, -0.01141071,  0.08245   ,
         -0.04042275, -0.02613375,  0.04122073, -0.04840765,  0.01509399,
         -0.00149671, -0.01334827,  0.0461828 ,  0.02318393,  0.04547328,
         -0.00809982,  0.07718319, -0.03334849, -0.04180022, -0.02304639,
         -0.01866885,  0.00298916, -0.00631761,  0.039316  , -0.02289939,
          0.07724465,  0.06296352,  0.037800

In [28]:
def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: Sequence with one entry per query. Each entry is a
            sequence of booleans where ``True`` marks a retrieved document
            that matches the expected one.

    Returns:
        The number of entries containing ``True`` divided by the number of
        entries, or ``0.0`` when ``relevance_total`` is empty.
    """
    # Guard the empty case: dividing by len() would raise ZeroDivisionError.
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)


def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query only the first relevant result counts: it contributes
    ``1 / rank`` (rank starting at 1). A query with no relevant result
    contributes 0.

    Args:
        relevance_total: Sequence with one entry per query. Each entry is a
            sequence of booleans, ordered by rank, where ``True`` marks a
            relevant result.

    Returns:
        The average reciprocal rank, or ``0.0`` when ``relevance_total`` is
        empty.
    """
    # Guard the empty case: dividing by len() would raise ZeroDivisionError.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality (not `is`) so NumPy booleans are recognised as well.
            if is_relevant == True:
                total_score += 1 / rank
                # Stop at the first hit; counting later hits too would let a
                # single query contribute more than 1.
                break

    return total_score / len(relevance_total)

In [30]:
def search_function(q):
    """Search the in-memory engine for one ground-truth record.

    Args:
        q: Mapping with a ``'question'`` key holding the query text.

    Returns:
        Whatever ``search_engine.search`` returns for the encoded question
        with ``num_results=5``.
    """
    query_embedding = embedding_model.encode(q['question'])
    top_documents = search_engine.search(query_embedding, num_results=5)
    return top_documents

In [31]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each record's ``'document'`` value
            is the id of the expected document.
        search_function: Callable that takes one record and returns a
            sequence of results, each with an ``'id'`` key.

    Returns:
        Dict with the keys ``'hit_rate'`` and ``'mrr'``, computed over the
        per-query relevance lists.
    """
    def relevance_for(q):
        # Read the expected id before searching, then flag each result.
        expected_id = q['document']
        return [hit['id'] == expected_id for hit in search_function(q)]

    # tqdm only wraps the iteration to display progress.
    relevance_total = [relevance_for(q) for q in tqdm(ground_truth)]

    hit_rate_value = hit_rate(relevance_total)
    mrr_value = mrr(relevance_total)
    return {'hit_rate': hit_rate_value, 'mrr': mrr_value}

In [32]:
# Hit-rate and MRR of the in-memory VectorSearchEngine over the ground truth.
evaluate(ground_truth, search_function)

100%|██████████| 1830/1830 [02:24<00:00, 12.67it/s]


{'hit_rate': 0.9398907103825137, 'mrr': 0.8516484517304189}

Now use the code from the module to calculate the hit-rate of `VectorSearchEngine` with `num_results=5`.  
What did you get?  
`0.9398907103825137`




# Q5. Indexing with Elasticsearch

In [1]:
# Client for the Elasticsearch instance at localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

# The three embedding fields share one dense_vector configuration, so they are
# generated in a loop rather than spelled out three times.
vector_field_names = ["question_vector", "text_vector", "question_text_vector"]

field_mappings = {
    "text": {"type": "text"},
    "section": {"type": "text"},
    "question": {"type": "text"},
    "course": {"type": "keyword"},
    "id": {"type": "keyword"},
}
for field_name in vector_field_names:
    field_mappings[field_name] = {
        "type": "dense_vector",
        "dims": 768,
        "index": True,
        "similarity": "cosine",
    }

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": field_mappings},
}

index_name = "course-questions"

# Recreate the index from scratch: drop it if it already exists
# (ignore_unavailable avoids an error when it does not), then create it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [5]:
# Reload the embedding matrix from disk and display its shape.
# NOTE(review): no cell visible in this notebook writes 'embeddings.npy' —
# presumably it was saved in another session; confirm before a fresh run.
X = np.load('embeddings.npy')
X.shape

(375, 768)

In [7]:
# Reload the documents from a local pickle file, replacing `documents`.
# NOTE(review): pickle.load can execute arbitrary code — only load files you
# created yourself. No cell visible here writes 'documents.pkl'; confirm its
# origin.
with open('documents.pkl', "rb") as fin:
    documents = pickle.load(fin)

In [10]:
# Send every document to the index, one call per document; tqdm shows progress.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  1%|          | 2/375 [00:00<00:19, 19.50it/s]

100%|██████████| 375/375 [00:12<00:00, 31.23it/s]


In [11]:
# Query text for the Elasticsearch search in this section.
query = 'I just discovered the course. Can I still join it?'

In [16]:
# Embedding of `query`, produced by the same model as the document vectors.
v_q = embedding_model.encode(query)

In [17]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a course-filtered kNN search against the Elasticsearch index.

    Args:
        field: Name of the vector field to search on.
        vector: Query vector, sent as ``query_vector``.
        course: Value used in the ``course`` term filter.
        k: Number of nearest neighbours to request. Defaults to 5, the value
            that was previously hard-coded.
        num_candidates: Value sent as ``num_candidates``. Defaults to 10000,
            the value that was previously hard-coded.

    Returns:
        List with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        # Return only the text fields, leaving the stored vectors out.
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [35]:
def question_vector_knn(q):
    """Search the ``question_vector`` field for one ground-truth record.

    Args:
        q: Mapping with the keys ``'question'`` (query text) and ``'course'``
            (course used as the filter).

    Returns:
        The result of ``elastic_search_knn`` for the encoded question.
    """
    question_text, course_name = q['question'], q['course']
    query_embedding = embedding_model.encode(question_text)
    return elastic_search_knn('question_vector', query_embedding, course_name)

In [40]:
# Search with `v_q`, the embedding of `query` computed in this section,
# rather than with a vector left over from an earlier section.
elastic_search_knn('question_vector', v_q, course='machine-learning-zoomcamp')

[{'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'id': 'ee58a693'},
 {'question': 'I just joined. What should I do next? How can I access course materials?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the vid

After indexing, let's perform the search of the same query from `Q1`.  
What's the `ID` of the document with the highest score?  
`'id': 'ee58a693'`



# Q6. Hit-rate for Elasticsearch

In [36]:
# Hit-rate and MRR of the Elasticsearch kNN search on the question_vector field.
evaluate(ground_truth, question_vector_knn)

100%|██████████| 1830/1830 [01:24<00:00, 21.68it/s]


{'hit_rate': 0.8076502732240437, 'mrr': 0.6985519125683067}

What's the hit-rate for our dataset with Elasticsearch?  
`'hit_rate': 0.8076502732240437`