In [None]:
# This python notebook is a companion to the semantic search article here <insert url after publishing>
# Prerequisites:
# $ pip install -U sentence-transformers
# $ pip install -U opensearch-py
# Install OpenSearch; the Docker install is the easiest route: https://opensearch.org/docs/latest/install-and-configure/install-opensearch/index/


In [None]:
# compute embeddings for test data
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model and keep it on the CPU.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

# Toy corpus: every description is encoded on its own, giving one embedding per entry
# in the same order as `descriptions`.
descriptions = ['car', 'bus', 'house', 'cat', 'dog']
embeddings = list(map(model.encode, descriptions))
embeddings

In [None]:
# Compute the embeddings again, this time in parallel; use this approach when you have a large number of embeddings to compute
import multiprocessing
from tqdm import tqdm
import concurrent.futures
from time import time

# Size the worker pool to the CPU core count reported by the OS.
# If your SentenceTransformer device is 'cuda' or 'mps', size it from the GPU core count instead.
num_processes = multiprocessing.cpu_count()
with concurrent.futures.ThreadPoolExecutor(max_workers=num_processes) as pool:
    # map() schedules one encode call per description and yields results in input order
    embeddings = list(pool.map(model.encode, descriptions))
embeddings

In [None]:
from opensearchpy import OpenSearch

# Connect to OpenSearch
# Note your configuration may be different depending on how you set it up
# We ran it via docker locally with -e "plugins.security.disabled=true"
# Don't forget to add your OpenSearch password below

host = 'localhost'
port = 9200
auth = ('admin', '<your opensearch password>')

# Create the client over plain HTTP: SSL/TLS is turned off (use_ssl=False), which matches
# the local docker setup described above. The credentials are passed as HTTP basic auth so
# the password you fill in is actually used; a cluster running with the security plugin
# disabled should simply ignore them.
# NOTE(review): a cluster with security/TLS enabled will need use_ssl=True (and suitable
# certificate settings) instead -- confirm against your own deployment.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_auth = auth,
    use_ssl = False
)


In [None]:
# recreate index
index_name = "semantic_index"

# Delete the index only if it already exists. An unconditional delete raises a
# not-found error on a fresh cluster, which breaks the first run of the notebook.
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)

# index schema
mapping = {
    "settings": {
        "index": {
            # enable k-NN search on this index
            "knn": True,
            "knn.algo_param.ef_search": 100
        }
    },
    "mappings": {
        "properties": {
            # vector field queried by the k-NN search
            "description_emb": {
                "type": "knn_vector",
                # must equal the length of the embedding vectors stored in this field
                "dimension": 384,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "nmslib",
                    "parameters": {
                        "ef_construction": 128,
                        "m": 24
                    }
                }
            },
            # The documents store a plain URL string here, not a Base64-encoded payload,
            # so the field is mapped as "keyword" rather than "binary".
            "media_url": {
                "type": "keyword"
            },
            "description": {
                "type": "text"
            }
        }
    }
}

# create index
client.indices.create(index=index_name, body=mapping)

In [None]:
# index documents, one request per document
# Note that for large data sets bulk indexing will be much faster as documented here:
# https://opensearch.org/docs/latest/clients/python-low-level/

for i, description in enumerate(descriptions):
    # pair each description with the embedding stored at the same position
    document = dict(
        description=description,
        media_url='http://this.would/point/to/the/media.jpg',
        description_emb=embeddings[i],
    )

    client.index(index=index_name, body=document)



In [None]:
# embed the user's search text with the same model object used for the descriptions
user_query = "motorcycle"
user_query_emb = model.encode(user_query)

# build the k-NN query: the query vector is compared against the `description_emb` field,
# and both `k` and the response size are set to the number of results wanted
desired_results = 2
knn_clause = {"vector": user_query_emb, "k": desired_results}
opensearch_query = {
    "size": desired_results,
    "query": {"knn": {"description_emb": knn_clause}},
}

from time import sleep
# Refresh the index explicitly so the documents indexed earlier in the notebook are
# visible to search when running top to bottom. A fixed sleep only works if the
# cluster's periodic refresh happens to fire within that window.
client.indices.refresh(index=index_name)
results = client.search(index=index_name, body=opensearch_query)

# print the description of each hit, in the order the search returned them
for result in results['hits']['hits']:
    print(result['_source']['description'])