### Building semantic search engine using elastic search vector db

In [1]:
import json

In [34]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [35]:
# Elastic search needs them to be in same hierarchy:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course'] # For each document, a new key 'course' is added
        documents.append(doc)


## Create embeddings using Pretrained models

In [6]:
pip install sentence_transformers==2.7.0

Note: you may need to restart the kernel to use updated packages.


In [36]:
# Everytime we passed the data to tranformer, it gives us set of vector.
# Then we will give this vector to the vector db.
from sentence_transformers import SentenceTransformer 

In [8]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay          32G   13G   18G  41% /
tmpfs            64M     0   64M   0% /dev
shm              64M     0   64M   0% /dev/shm
/dev/root        29G   23G  6.1G  80% /vscode
/dev/loop3       32G   13G   18G  41% /workspaces
/dev/sdb1        44G  108K   42G   1% /tmp


In [65]:
# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-mpnet-base-v2")

In [66]:
len(model.encode("Hey my name is Berry.")) # Different models have different lengths.

768

In [12]:
documents[5]

{'text': "There are 3 Zoom Camps in a year, as of 2024. However, they are for separate courses:\nData-Engineering (Jan - Apr)\nMLOps (May - Aug)\nMachine Learning (Sep - Jan)\nThere's only one Data-Engineering Zoomcamp “live” cohort per year, for the certification. Same as for the other Zoomcamps.\nThey follow pretty much the same schedule for each cohort per zoomcamp. For Data-Engineering it is (generally) from Jan-Apr of the year. If you’re not interested in the Certificate, you can take any zoom camps at any time, at your own pace, out of sync with any “live” cohort.",
 'section': 'General course-related questions',
 'question': 'Course - how many Zoomcamps in a year?',
 'course': 'data-engineering-zoomcamp'}

In [67]:
operations = []

# Creating the dense vector using the pretrained model
for doc in documents:
    doc["text_vector"] = model.encode(doc["text"]).tolist()
    operations.append(doc)

- The whole idea is creating embeddings for our dataset.

In [68]:
operations[5]

{'text': "There are 3 Zoom Camps in a year, as of 2024. However, they are for separate courses:\nData-Engineering (Jan - Apr)\nMLOps (May - Aug)\nMachine Learning (Sep - Jan)\nThere's only one Data-Engineering Zoomcamp “live” cohort per year, for the certification. Same as for the other Zoomcamps.\nThey follow pretty much the same schedule for each cohort per zoomcamp. For Data-Engineering it is (generally) from Jan-Apr of the year. If you’re not interested in the Certificate, you can take any zoom camps at any time, at your own pace, out of sync with any “live” cohort.",
 'section': 'General course-related questions',
 'question': 'Course - how many Zoomcamps in a year?',
 'course': 'data-engineering-zoomcamp',
 'text_vector': [-0.01657216250896454,
  -0.02443241886794567,
  -0.008768061175942421,
  0.01739579811692238,
  0.0015266425907611847,
  -0.012990646064281464,
  -0.023258810862898827,
  -0.005358885508030653,
  0.06164291501045227,
  0.016793811693787575,
  0.0015134389977902

### Setup elasticsearch connection

In [69]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') # After docker run

In [70]:
es_client.info()

ObjectApiResponse({'name': '1e2f86cccae5', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'HcbKzks6RXSAKpOSoR4iOQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### Create Mappings and Index

- Mapping is the process of defining how a document, and the fields it contains, are stored and indexed.

In [71]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
        }
    }
}

In [72]:
index_name = "course-questions"

# Metadata
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings) # Indexes are created

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

### Ad documents into index

In [73]:
for doc in documents:
    es_client.index(index=index_name, document=doc)


### Create end user query

In [81]:
search_q = "Window or mac?"
vector_search_term = model.encode(search_q)

In [51]:
# query = {
#     "field": "text_vector",
#     "query_vector": vector_search_term,
#     "k": 5,
#     "num_candidates": 10000,
# }

In [52]:
# # source: Fields that you want in results pack
# resp = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course"])

In [53]:
# search_query = {
#         "size": 5, # we will get 5 answers in the result
#         "query": {
#             "bool": {
#                 "must": {
#                     "multi_match": {
#                         "query": search_q,
#                         "fields": ["question^3", "text", "section"], # question is 3 times more important. boost
#                         "type": "best_fields"
#                     }
#                 },
#                 "filter": {
#                     "term": {
#                         "course": "data-engineering-zoomcamp" # filtering
#                     }
#                 }
#             }
#         }
#     }

# resp = es_client.search(index=index_name, body=search_query)

In [82]:
# Advanced semantic search
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000,
}



In [83]:

respo = es_client.search(
    index=index_name,
    query = {
        "match": {
                "course": "data-engineering-zoomcamp"
            }
        },
    knn=knn_query,
    size=5
)

In [84]:
respo["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': '6Mf_5JABgyTMvWL8ugFa',
  '_score': 1.480098,
  '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully',
   'section': 'General course-related questions',
   'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'text_vector': [-0.026965461671352386,
    -0.000626126304268837,
    -0.01662949100136757,
    0.05285150930285454,
    0.05476527288556099,
    -0.03133990615606308,
    0.029942581430077553,
    -0.04808562621474266,
    0.04467551037669182,
    0.005839474033564329,
    0.016233040019869804,
    0.012001154012978077,
    -0.031222281977534294,
    0.016600528731942177,
    -0.04886901378631592,
    -0.06496307998895645,
    0.046434223651885986,
    -0.009297756478190422,
    -0.0642528235912323,
    -0.01373267825692892,
    -0.015976183116436005,
    0.008629541844129562,
    -0.024478990584611