# Introduction Search Index

This notebook uses OpenSearch to create the index mappings for the recipes.

# Import and parse JSON file with the recipes

In [1]:
import json as json

# Load the recipe collection from disk. The encoding is passed explicitly
# because open() otherwise uses the platform's locale encoding, which is not
# guaranteed to be UTF-8.
with open("recipes_data.json", "r", encoding="utf-8") as read_file:
    recipes_data = json.load(read_file)

In [2]:
# Number of recipes that were loaded, displayed as a string.
recipe_book_len = len(recipes_data)
f"{recipe_book_len}"

'994'

In [None]:
# Display the record stored under key "0" to inspect the fields of a recipe.
recipes_data["0"]

# OpenSearch client creation


An OpenSearch client is needed to create the search index mappings and to run the queries later on.

## Credentials and index name

In [3]:
import pprint as pp
import requests
from config import CONFIG

# Read the connection parameters from the project configuration.
host, port, user, password = (
    CONFIG[key] for key in ("host", "port", "user", "password")
)

# The index is named after the user; CONFIG["index_name"] is not used here.
index_name = user

print(index_name)

user219


## Create the OpenSearch client and check if index_name exists

In [4]:
from opensearchpy import OpenSearch
from opensearchpy import helpers

# Connect over SSL/TLS. verify_certs=False turns off certificate verification,
# ssl_assert_hostname=False turns off the hostname check, and
# ssl_show_warn=False hides the warning that would otherwise be shown.
client = OpenSearch(
    hosts=[dict(host=host, port=port)],
    http_auth=(user, password),
    url_prefix="opensearch",
    http_compress=True,  # gzip-compress request bodies
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# index_name_test = 'python-test-index'
# index_body_test = {
#   'settings': {
#     'index': {
#       'number_of_shards': 4
#     }
#   }
# }

# response = client.indices.create(index_name_test, body=index_body_test)

# Index mappings

In [5]:
# recipe_mapping = {
#     "properties": {
#             "test": {"type": "keyword"},
#         }
# }

## Update and print index properties

In [6]:
# 1) Settings: set the refresh interval, then read the settings back.
print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
index_settings = {"settings": {"index": {"refresh_interval": "1s"}}}
client.indices.put_settings(index=index_name, body=index_settings)
settings = client.indices.get_settings(index=index_name)
pp.pprint(settings)

# 2) Mappings: register `ingredients` as a nested field, then read the
#    mappings back.
print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
ingredient_fields = {
    "displayText": {"type": "text"},
    "ingredient": {"type": "text"},
    "quantity": {"type": "integer"},
    "unit": {"type": "keyword"},
}
client.indices.put_mapping(
    index=index_name,
    body={
        "properties": {
            "ingredients": {"type": "nested", "properties": ingredient_fields}
        }
    },
)
mappings = client.indices.get_mapping(index=index_name)
pp.pprint(mappings)

# 3) Number of documents currently in the index.
print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
print(client.count(index=index_name))


----------------------------------------------------------------------------------- INDEX SETTINGS
{'user219': {'settings': {'index': {'creation_date': '1649241367846',
                                    'knn': 'true',
                                    'number_of_replicas': '0',
                                    'number_of_shards': '4',
                                    'provided_name': 'user219',
                                    'refresh_interval': '1s',
                                    'uuid': 'PpMrLHMUSLqJ2_V185OjRA',
                                    'version': {'created': '135238227'}}}}}

----------------------------------------------------------------------------------- INDEX MAPPINGS
{'user219': {'mappings': {'dynamic': 'strict',
                          'properties': {'canonicalName': {'type': 'keyword'},
                                         'contents': {'analyzer': 'standard',
                                                      'similarity': 'BM25',
   

# Add recipes to index with the correct mappings

In [None]:
for recipe_id, recipe_data in recipes_data.items():
    # The total time is only defined when both parts are present.
    prep_minutes = recipe_data['prepTimeMinutes']
    cook_minutes = recipe_data['cookTimeMinutes']
    total_minutes = (
        None
        if prep_minutes is None or cook_minutes is None
        else prep_minutes + cook_minutes
    )

    # Keep the four ingredient fields declared in the nested mapping;
    # a missing quantity or unit is stored as None.
    ingredients = [
        {
            "displayText": item['displayText'],
            "ingredient": item['ingredient'],
            "quantity": item.get('quantity'),
            "unit": item.get('unit'),
        }
        for item in recipe_data['ingredients']
    ]

    is_first_recipe = recipe_id == '0'

    # Log the ingredients of the first recipe as a quick check.
    if is_first_recipe:
        print(ingredients)

    # "title_embedding" is not included yet (it would need an embedding call).
    recipe = {
        "recipe_id": recipe_id,
        "title": recipe_data['displayName'],
        "description": recipe_data['description'],
        "ingredients": ingredients,
        "time": total_minutes,
    }

    # Only recipe '0' is sent to the index for now, as a trial run while the
    # mappings are still being settled.
    if is_first_recipe:
        result = client.index(index=index_name, id=int(recipe_id), body=recipe)
        print(result)
    

# Text Based Search

In [None]:
def text_based_search(size, sources, query_txt, fields):
    """Run a ``multi_match`` text query against the notebook's index.

    Uses the module-level ``client`` and ``index_name``.

    Args:
        size: Maximum number of hits to request.
        sources: Document fields to return in each hit's ``_source``.
        query_txt: Free-text query string.
        fields: Index fields the query text is matched against.

    Returns:
        The response returned by ``client.search``, unchanged.
    """
    query_bm25 = {
        'size': size,
        '_source': sources,
        'query': {
            'multi_match': {
                'query': query_txt,
                'fields': fields,
            }
        },
    }
    return client.search(body=query_bm25, index=index_name)


# Request up to one hit per loaded recipe, matching the query on the title only.
result = text_based_search(
    size=recipe_book_len,
    sources=['title', 'description', 'ingredients'],
    query_txt='chicken parmesan.',
    fields=['title'],
)

print('\nSearch results:')
pp.pprint(result)


# Embeddings based Search with kNN

Approximate kNN is used because it performs better than exact kNN on high-dimensional data.

Dense vectors are used because they capture more complex relationships and similarities between data points when working with high-dimensional data.

In [20]:
# HNSW method configuration for the kNN vector field (faiss engine,
# inner-product space).
hnsw_method = {
    "name": "hnsw",
    "space_type": "innerproduct",
    "engine": "faiss",
    "parameters": {"ef_construction": 256, "m": 48},
}

# Only description_embedding is mapped for now; a title_embedding field with
# the same configuration is not enabled yet.
emebedding_mappings = {
    "properties": {
        "description_embedding": {
            "type": "knn_vector",
            "dimension": 768,
            "method": hnsw_method,
        }
    }
}

# Register the vector field on the index, then read the mappings back.
client.indices.put_mapping(index=index_name, body=emebedding_mappings)
mappings = client.indices.get_mapping(index=index_name)
pp.pprint(mappings)

{'user219': {'mappings': {'dynamic': 'strict',
                          'properties': {'canonicalName': {'type': 'keyword'},
                                         'contents': {'analyzer': 'standard',
                                                      'similarity': 'BM25',
                                                      'type': 'text'},
                                         'cookTimeMinutes': {'type': 'integer'},
                                         'cookingMethod': {'type': 'keyword'},
                                         'description': {'type': 'text'},
                                         'description_embedding': {'dimension': 768,
                                                                   'method': {'engine': 'faiss',
                                                                              'name': 'hnsw',
                                                                              'parameters': {'ef_construction': 256,
                      