Q1. Getting the embeddings model

In [1]:
from tqdm.autonotebook import tqdm, trange

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the sentence-embedding model; `embedding_model.encode(...)` is used by
# every later cell that turns text into a vector.
from sentence_transformers import SentenceTransformer
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

In [3]:
# Embed the sample user question. `result` is read again later by the Q3
# max-score cell (`X.dot(result)`).
user_question = "I just discovered the course. Can I still join it?"
result = embedding_model.encode(user_question)
result[0]  # first component of the embedding vector

0.07822261

In [4]:
import requests

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# Fail fast on a hung connection or an HTTP error response instead of letting
# .json() choke on an unexpected body further down.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()

# Keep only the Machine Learning Zoomcamp FAQ entries.
documents = [
    doc for doc in docs_response.json()
    if doc['course'] == 'machine-learning-zoomcamp'
]
len(documents)

375

In [5]:
# Peek at the first filtered FAQ record to see which fields are available.
documents[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872'}

Q2. Creating the embeddings

In [6]:
from tqdm.auto import tqdm

In [7]:
# Embed "question + answer text" for each FAQ entry. `documents` was already
# restricted to this course when it was loaded, so the guard skips nothing and
# `embeddings[i]` lines up with `documents[i]`.
embeddings = []
for doc in tqdm(documents):
    if doc["course"] != "machine-learning-zoomcamp":
        continue
    qa_text = f'{doc["question"]} {doc["text"]}'
    embeddings.append(embedding_model.encode(qa_text))
embeddings[0]

  0%|          | 0/375 [00:00<?, ?it/s]

array([ 8.80591944e-02,  1.55936796e-02,  7.92557448e-02,  2.52757389e-02,
        7.55764320e-02, -3.90596129e-02, -4.13813330e-02,  2.52917148e-02,
        2.43242439e-02,  3.62593238e-03, -7.28290249e-03, -3.28750350e-02,
        6.12956882e-02, -5.71100600e-02,  1.16774552e-02, -1.79441590e-02,
        4.49205451e-02, -5.41606322e-02, -1.92250684e-03,  1.48329362e-02,
        7.91348703e-03, -3.43125984e-02,  1.21481074e-02,  1.27185760e-02,
       -1.60090867e-02,  7.12127425e-03,  1.58389155e-02, -3.27610120e-04,
        3.01287975e-03, -4.58917879e-02, -1.34771899e-03,  2.01149881e-02,
        2.33724136e-02, -4.28200280e-03, -3.09382956e-02,  1.94312131e-03,
       -1.88219603e-02,  5.35163609e-03,  1.29352398e-02, -3.23959291e-02,
        3.10556311e-02,  5.12988679e-03, -3.24466149e-04, -3.15542221e-02,
        1.93377323e-02, -6.38601482e-02, -3.05380318e-02, -6.27885908e-02,
       -1.74859427e-02,  3.52949090e-02, -2.98070312e-02,  7.45346323e-02,
       -2.06181742e-02, -

In [8]:
# Attach three embeddings to every document: question only, answer text only,
# and question + text.
# NOTE: each `doc` is mutated in place, so `operations` holds the very same dict
# objects as `documents`. The Elasticsearch indexing cell iterates `documents`
# and therefore relies on these vector fields having been added here.
operations = []
for doc in tqdm(documents):
    # Encode the question, the answer text, and their concatenation separately
    question = doc['question']
    text = doc['text']
    qt = question + ' ' + text
    doc['question_vector'] = embedding_model.encode(question)
    doc['text_vector'] = embedding_model.encode(text)
    doc['question_text_vector'] = embedding_model.encode(qt)
    operations.append(doc)
operations[0]

  0%|          | 0/375 [00:00<?, ?it/s]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872',
 'question_vector': array([ 8.70802924e-02,  5.98305166e-02,  4.62658592e-02, -8.17991495e-02,
         4.93381359e-02,  6.70092478e-02,  5.20505011e-02, -3.99582088e-03,
         2.49936637e-02, -1.05813043e-02, -2.35179393e-03, -3.07780541e-02,
         8.50448459e-02, -3.35007831e-02, -6.30881172e-03, -3.81490253e-02,
         1.95373837e-02,  1.93471219e-02,  1.81184597e-02, -3.73530984e-02,
        -1.95184331e-02,  9.90939233e-03, -1

In [9]:
import numpy as np

In [10]:
# Stack the per-document embeddings into one 2-D matrix (one row per document).
X = np.array(embeddings)
X.shape

(375, 768)

Q3. Search

In [11]:
# Highest dot-product score between the user-question embedding (`result`)
# and any document row of X.
max(X.dot(result))

0.65065753

In [12]:
class VectorSearchEngine():
    """Brute-force vector search over an in-memory embedding matrix.

    Every row of the embedding matrix is scored against the query vector with
    a dot product, and the documents with the highest scores are returned.
    """

    def __init__(self, documents, embeddings):
        """Store the corpus and its embeddings.

        Args:
            documents: Indexable sequence of documents; ``documents[i]`` must
                correspond to row ``i`` of ``embeddings``.
            embeddings: 2-D array-like with one row per document. A NumPy
                array is kept as-is; any other sequence (e.g. a list of
                vectors) is converted once here so ``search`` can use ``.dot``.
        """
        self.documents = documents
        self.embeddings = np.asarray(embeddings)

    def search(self, v_query, num_results=10):
        """Return the ``num_results`` documents with the highest dot-product score.

        Args:
            v_query: Query vector with the same length as one embedding row.
            num_results: Maximum number of documents to return.

        Returns:
            List of documents ordered from highest to lowest score.
        """
        scores = self.embeddings.dot(v_query)
        # Negate so that argsort's ascending order yields best-first indices.
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

# Build the in-memory engine. X was built by iterating `documents` in order,
# so row i of X is the embedding of documents[i].
search_engine = VectorSearchEngine(documents=documents, embeddings=X)

Q4. Hit-rate for our search engine

In [19]:
!pip install -U pandas

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [13]:
import pandas as pd

In [14]:
# Download the ground-truth questions and keep the rows for this course only.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth.loc[
    df_ground_truth['course'] == 'machine-learning-zoomcamp'
]
# One dict per row, which is the shape the evaluation loops iterate over.
ground_truth = df_ground_truth.to_dict(orient='records')

In [15]:
# Peek at one ground-truth record: a question plus the id of the document that answers it.
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [16]:
# Number of ground-truth questions used for evaluation.
len(ground_truth)

1830

In [17]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    # Use a dedicated name: `result` already holds the user-question embedding
    # from the Q1 cell and is read by the Q3 max-score cell, so rebinding it
    # here would silently change that cell's output on a re-run.
    query_vector = embedding_model.encode(q['question'])
    results = search_engine.search(query_vector, num_results=5)
    # One boolean per returned document: is it the expected one?
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

# Preview only; the full list has one row per ground-truth question.
relevance_total[:5]

  0%|          | 0/1830 [00:00<?, ?it/s]

[[False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],

In [18]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: List with one entry per query; each entry is a list
            of booleans marking which returned documents were the expected one.

    Returns:
        Number of queries with at least one ``True`` divided by the number of
        queries, or ``0.0`` when ``relevance_total`` is empty.

    The hit count is also printed, as before.
    """
    cnt = sum(1 for line in relevance_total if True in line)
    print(cnt)

    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; avoid ZeroDivisionError.
        return 0.0
    return cnt / total

In [19]:
# Hit rate of the in-memory VectorSearchEngine over the ground-truth questions.
hit_rate(relevance_total)

1720


0.9398907103825137

Q5. Indexing with Elasticsearch

In [23]:
# Client for the Elasticsearch instance expected at localhost:9200.
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 


In [22]:
!pip install -U elasticsearch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting elasticsearch
  Using cached elasticsearch-8.14.0-py3-none-any.whl.metadata (7.2 kB)
Collecting elastic-transport<9,>=8.13 (from elasticsearch)
  Using cached elastic_transport-8.13.1-py3-none-any.whl.metadata (3.7 kB)
Using cached elasticsearch-8.14.0-py3-none-any.whl (480 kB)
Using cached elastic_transport-8.13.1-py3-none-any.whl (64 kB)
Installing collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.13.1 elasticsearch-8.14.0


In [24]:
# Index definition: plain text/keyword fields plus three dense-vector fields
# that all share the same mapping.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # A fresh mapping dict is created for each vector field.
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 768,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "question_vector",
                    "text_vector",
                    "question_text_vector",
                )
            },
        }
    },
}

index_name = "course-questions"

In [25]:
# Start from a clean slate: drop the index if it exists (a missing index is
# ignored), then recreate it from `index_settings`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [26]:
# Index the documents one request at a time. Each `doc` carries the vector
# fields that the Q2 cell added to the `documents` entries in place.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/375 [00:00<?, ?it/s]

In [39]:
# kNN options for the sample question: search the question-only vectors,
# return 5 hits, and restrict the search to this course.
query = {
    "field": "question_vector",
    "query_vector": embedding_model.encode(user_question),
    "k": 5,
    "num_candidates": 10000, 
    "filter": {
            "term": {
                "course": "machine-learning-zoomcamp"
            }
        }
}



In [40]:
# Run the kNN search, requesting only the non-vector fields, and display the
# last of the returned hits.
res = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course","id"])
res["hits"]["hits"][-1]


{'_index': 'course-questions',
 '_id': 'iUhvzJABnItI8E55uXZy',
 '_score': 0.6082219,
 '_source': {'question': 'How to conduct peer reviews for projects?',
  'course': 'machine-learning-zoomcamp',
  'section': 'Projects (Midterm and Capstone)',
  'text': 'Answer: Previous cohorts projects page has instructions (youtube).\nhttps://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2022/projects.md#midterm-project\nAlexey and his team will compile a g-sheet with links to submitted projects with our hashed emails (just like when we check leaderboard for homework) that are ours to review within the evaluation deadline.\n~~~ Added by Nukta Bhatia ~~~',
  'id': '7a1fcfd9'}}

In [36]:
def elastic_search(query):
    """Run a ``multi_match`` text search and return the source of up to 5 hits.

    Args:
        query: Query text handed to the ``multi_match`` clause.

    Returns:
        List of the hits' ``_source`` dicts, in the order Elasticsearch
        returned them.
    """
    multi_match_clause = {"multi_match": {"query": query}}
    search_query = {
        "size": 5,
        "query": {"bool": {"must": multi_match_clause}},
    }

    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [38]:
# Text-search baseline for the sample question.
elastic_search(user_question)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693',
  'question_vector': [0.07717117667198181,
   -0.047493770718574524,
   0.028663232922554016,
   -0.011410684324800968,
   0.08245004713535309,
   -0.04042275249958038,
   -0.026133768260478973,
   0.04122072830796242,
   -0.048407651484012604,
   0.015093980357050896,
   -0.0014966968446969986,
   -0.013348274864256382,
   0.046182796359062195,
   0.023183928802609444,
   0.04547326639294624,
   -0.008099835366010666,
   0.07718

Q6. Hit-rate for Elasticsearch

In [35]:
def elastic_search_knn(field, vector, course):
    """kNN search on one vector field, restricted to a single course.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query embedding to compare against ``field``.
        course: Value the ``course`` field must match.

    Returns:
        List of the hits' ``_source`` dicts (text, section, question, course,
        id), in the order Elasticsearch returned them.
    """
    course_filter = {"term": {"course": course}}
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "filter": course_filter,
        },
        # Leave the stored vectors out of the response.
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in es_results['hits']['hits']]

In [36]:
def question_vector_knn(q):
    """Embed ``q['question']`` and kNN-search the ``question_vector`` field.

    Args:
        q: Ground-truth record with ``question`` and ``course`` keys.

    Returns:
        The documents returned by ``elastic_search_knn`` for ``q['course']``.
    """
    question, course = q['question'], q['course']
    query_embedding = embedding_model.encode(question)
    return elastic_search_knn('question_vector', query_embedding, course)

In [37]:
def evaluate(ground_truth,search_function):
    """Compute the hit rate of ``search_function`` over the ground truth.

    Args:
        ground_truth: Iterable of records, each with a ``document`` key holding
            the id of the expected document.
        search_function: Callable that takes one record and returns a list of
            documents, each with an ``id`` key.

    Returns:
        Dict with a single ``hit_rate`` entry.
    """
    def relevance_for(record):
        # One boolean per returned document: is it the expected one?
        expected_id = record['document']
        return [found['id'] == expected_id for found in search_function(record)]

    relevance_total = [relevance_for(q) for q in tqdm(ground_truth)]
    return {'hit_rate': hit_rate(relevance_total)}

In [38]:
def question_text_vector_knn(q):
    """Embed ``q['question']`` and kNN-search the ``question_text_vector`` field.

    Args:
        q: Ground-truth record with ``question`` and ``course`` keys.

    Returns:
        The documents returned by ``elastic_search_knn`` for ``q['course']``.
    """
    question, course = q['question'], q['course']
    query_embedding = embedding_model.encode(question)
    return elastic_search_knn('question_text_vector', query_embedding, course)

In [39]:
def text_vector_knn(q):
    """Embed ``q['question']`` and kNN-search the ``text_vector`` field.

    Args:
        q: Ground-truth record with ``question`` and ``course`` keys.

    Returns:
        The documents returned by ``elastic_search_knn`` for ``q['course']``.
    """
    question, course = q['question'], q['course']
    query_embedding = embedding_model.encode(question)
    return elastic_search_knn('text_vector', query_embedding, course)

In [40]:
# Hit rate when searching the question-only embeddings.
evaluate(ground_truth,question_vector_knn)

  0%|          | 0/1830 [00:00<?, ?it/s]

{'hit_rate': 0.8076502732240437}

In [46]:
# Hit rate when searching the question + text embeddings.
evaluate(ground_truth,question_text_vector_knn)


  0%|          | 0/1830 [00:00<?, ?it/s]

{'hit_rate': 0.9398907103825137}

In [42]:
# Hit rate when searching the answer-text-only embeddings.
evaluate(ground_truth,text_vector_knn)

  0%|          | 0/1830 [00:00<?, ?it/s]

{'hit_rate': 0.8803278688524591}

In [43]:
def elastic_search_knn_combined(vector, course):
    """Rank a course's documents by the summed cosine similarity of all three vector fields.

    Despite the name, this sends a ``script_score`` query rather than the
    ``knn`` option used by ``elastic_search_knn``.

    Args:
        vector: Query embedding, passed to the script as ``params.query_vector``.
        course: Value the ``course`` field must match.

    Returns:
        List of up to 5 ``_source`` dicts (text, section, question, course,
        id), in the order Elasticsearch returned them.
    """
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": [
                    {
                        "script_score": {
                            # Only documents of the requested course are scored.
                            "query": {
                                "term": {
                                    "course": course
                                }
                            },
                            "script": {
                                # Sum of the three cosine similarities; the trailing
                                # "+ 1" is presumably there to keep the score
                                # non-negative -- TODO confirm.
                                "source": """
                                    cosineSimilarity(params.query_vector, 'question_vector') + 
                                    cosineSimilarity(params.query_vector, 'text_vector') + 
                                    cosineSimilarity(params.query_vector, 'question_text_vector') + 
                                    1
                                """,
                                "params": {
                                    "query_vector": vector
                                }
                            }
                        }
                    }
                ],
                # Same course restriction as the term query inside script_score.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        },
        # Leave the stored vectors out of the response.
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [44]:
def vector_combined_knn(q):
    """Embed ``q['question']`` and search with the combined three-vector score.

    Args:
        q: Ground-truth record with ``question`` and ``course`` keys.

    Returns:
        The documents returned by ``elastic_search_knn_combined`` for
        ``q['course']``.
    """
    question, course = q['question'], q['course']
    query_embedding = embedding_model.encode(question)
    return elastic_search_knn_combined(query_embedding, course)



In [45]:
# Hit rate when ranking by the sum of all three cosine similarities.
evaluate(ground_truth, vector_combined_knn)

  0%|          | 0/1830 [00:00<?, ?it/s]

{'hit_rate': 0.9218579234972678}