### Building a semantic search engine using Elasticsearch as a vector database

In [None]:
import json

In [None]:
# Load the raw course/FAQ data. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [None]:
# Elasticsearch needs flat records: tag every document with the name of the
# course it belongs to and collect them all into a single list.
documents = []

for course_entry in docs_raw:
    course_docs = course_entry['documents']
    for faq in course_docs:
        # The dict is updated in place, so docs_raw sees the new 'course' key too.
        faq['course'] = course_entry['course']
    documents.extend(course_docs)


## Create embeddings using pretrained models

In [None]:
pip install sentence_transformers==2.7.0

In [None]:
# Each time we pass text to the transformer model, it returns an embedding vector.
# We then store these vectors in the vector database.
from sentence_transformers import SentenceTransformer 

In [None]:
# Shell out to report disk usage per filesystem in human-readable units.
# NOTE(review): presumably a check for free space before the model download — confirm or remove.
!df -h

In [None]:
# 1. Load a pretrained Sentence Transformer model.
# The same `model` object is used below to embed both the document texts and
# the search query, so both live in the same vector space.
model = SentenceTransformer("all-mpnet-base-v2")

In [None]:
# Length of one embedding produced by the loaded model. The "dims" value of the
# dense_vector mapping defined further down has to agree with this number.
len(model.encode("Hey my name is Berry.")) # Embedding size differs between models.

In [None]:
# Inspect one flattened document (on a fresh run it has no "text_vector" key yet).
documents[5]

In [None]:
# Create the dense vectors with the pretrained model.
# All texts are passed to encode() in a single call so the model can process
# them in batches instead of running one forward pass per document.
text_vectors = model.encode([doc["text"] for doc in documents])

operations = []
for doc, text_vector in zip(documents, text_vectors):
    # Stored as a plain list of floats so the document is JSON-serializable.
    # The dict is mutated in place: `documents` and `operations` reference the
    # same enriched objects.
    doc["text_vector"] = text_vector.tolist()
    operations.append(doc)

- The goal of this step is to create an embedding for every document in our dataset.

In [None]:
# Same record after the embedding step: it now also carries the "text_vector" list.
operations[5]

### Setup elasticsearch connection

In [None]:
from elasticsearch import Elasticsearch

# Client for a local Elasticsearch node on port 9200. The server must already
# be running before this cell is executed (e.g. started via `docker run`).
es_client = Elasticsearch('http://localhost:9200') # After docker run

In [None]:
# Sanity check: ask the server for its basic info to confirm the connection works.
es_client.info()

### Create Mappings and Index

- Mapping is the process of defining how a document, and the fields it contains, are stored and indexed.

In [None]:
# Index definition: a single shard without replicas (enough for a local,
# single-node setup) plus the mapping of every field we store.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword = exact-match field, suitable for filtering by course
            "course": {"type": "keyword"},
            "text_vector": {
                "type": "dense_vector",
                # Taken from the loaded model rather than hard-coded, so the
                # mapping stays correct if a different model is used
                # (768 for all-mpnet-base-v2).
                "dims": model.get_sentence_embedding_dimension(),
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

In [None]:
index_name = "course-questions"

# Drop any previous copy of the index so this cell can be re-run safely
# (ignore_unavailable avoids an error when the index does not exist yet).
es_client.indices.delete(index=index_name, ignore_unavailable=True)

# Create the index using the keyword-argument API, consistent with the other
# client calls in this notebook, instead of the legacy `body=` parameter.
es_client.indices.create(
    index=index_name,
    settings=index_settings["settings"],
    mappings=index_settings["mappings"],
)

### Add documents to the index

In [None]:
from elasticsearch.helpers import bulk

# Index the embedded documents with chunked bulk requests instead of issuing
# one HTTP round trip per document. `operations` is the list prepared by the
# embedding step, so every record is guaranteed to carry its text_vector.
bulk(
    es_client,
    ({"_index": index_name, "_source": doc} for doc in operations),
)

# Force a refresh so the freshly indexed documents are visible to the search
# cells that follow, even when the notebook is executed with "Run All".
es_client.indices.refresh(index=index_name)


### Create the end-user query

In [None]:
search_q = "Window or mac?"
# Embed the query with the same model as the documents. Converted to a plain
# list of floats, matching how the document vectors were stored, so the search
# request does not depend on the client serializing numpy arrays.
vector_search_term = model.encode(search_q).tolist()

In [None]:
# query = {
#     "field": "text_vector",
#     "query_vector": vector_search_term,
#     "k": 5,
#     "num_candidates": 10000,
# }

In [None]:
# # source: Fields that you want in results pack
# resp = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course"])

In [None]:
# search_query = {
#         "size": 5, # we will get 5 answers in the result
#         "query": {
#             "bool": {
#                 "must": {
#                     "multi_match": {
#                         "query": search_q,
#                         "fields": ["question^3", "text", "section"], # question is 3 times more important. boost
#                         "type": "best_fields"
#                     }
#                 },
#                 "filter": {
#                     "term": {
#                         "course": "data-engineering-zoomcamp" # filtering
#                     }
#                 }
#             }
#         }
#     }

# resp = es_client.search(index=index_name, body=search_query)

In [None]:
# Advanced semantic search: kNN clause that looks up the 5 nearest neighbours
# of the query embedding in the "text_vector" field.
knn_query = dict(
    field="text_vector",
    query_vector=vector_search_term,
    k=5,
    num_candidates=10000,
)



In [None]:

# Restrict the semantic search to one course. The restriction is applied as a
# kNN `filter`: a top-level `query` next to `knn` would instead run a hybrid
# search whose hits are the union of both clauses, letting nearest neighbours
# from other courses into the results.
course_filter = {"term": {"course": "data-engineering-zoomcamp"}}

respo = es_client.search(
    index=index_name,
    knn={**knn_query, "filter": course_filter},
    # Return only the readable fields; leaves the 768-float text_vector out of each hit.
    source=["text", "section", "question", "course"],
    size=5
)

In [None]:
# Show the list of matching documents returned by the search.
respo["hits"]["hits"]