# Introduction Search Index

Using OpenSearch to create index mappings for the recipes

# Import and parse JSON file with the recipes

In [1]:
import json as json

# Load the recipe book into a Python object. The encoding is given explicitly
# so the result does not depend on the platform's default text encoding
# (assumes the file is UTF-8, the standard encoding for JSON).
with open("recipes_data.json", "r", encoding="utf-8") as read_file:
    recipes_data = json.load(read_file)

In [2]:
# Number of recipes in the book; the trailing expression shows it as the cell output.
recipe_book_len = len(recipes_data)
f"{recipe_book_len}"

'994'

# OpenSearch client creation


We need to create the OpenSearch client in order to create the search index mappings and run further queries.

## Credentials and index name

In [3]:
import pprint as pp
import requests
from config import CONFIG

# Connection parameters are read from the local config module.
host, port = CONFIG["host"], CONFIG["port"]
user, password = CONFIG["user"], CONFIG["password"]

# The index is named after the user instead of being read from the config.
#index_name = CONFIG["index_name"]
index_name = user

## Create the OpenSearch client and check if index_name exists

In [4]:
from opensearchpy import OpenSearch
from opensearchpy import helpers

# Build the client over TLS. Note that both certificate verification and
# hostname checking are switched off here.
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_auth=(user, password),
    url_prefix='opensearch',
    http_compress=True,  # gzip-compress request bodies
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

Release the resources by closing the index handle.

In [5]:
# Closing an index that does not exist raises an error, so only close it when
# it is present (the same guard the delete and create cells use). This keeps
# the cell runnable on a fresh cluster.
if client.indices.exists(index=index_name):
    resp = client.indices.close(index=index_name, timeout=600)
    print(resp)
else:
    print("Index does not exist. Nothing to close.")

{'acknowledged': True, 'shards_acknowledged': True, 'indices': {'user219': {'closed': True}}}


Delete the previous index

In [6]:
#be absolutely sure that you want to comment this line and actually delete the index!!!

# Drop the index only when it is actually there, then report the server reply.
index_present = client.indices.exists(index=index_name)
if index_present:
    response = client.indices.delete(index=index_name, timeout=600)
    print('\nDeleting index:', response, sep='\n')


Deleting index:
{'acknowledged': True}


# Index creation

In [7]:

# Index definition: shard/replica settings (k-NN enabled) plus the explicit
# field mappings for a recipe document.
index_body = {
    "settings": {
        "index": {
            "number_of_replicas": 0,
            "number_of_shards": 4,
            "refresh_interval": "1s",
            "knn": "true",
        }
    },
    "mappings": {
        "properties": {
            "recipe_id": {"type": "keyword"},
            "title": {"type": "text"},
            "description": {"type": "text"},
            "time": {"type": "integer"},
            "difficulty": {"type": "keyword"},
            # One nested object per ingredient line.
            "ingredients": {
                "type": "nested",
                "properties": {
                    "name": {"type": "text"},
                    "quantity": {"type": "float"},
                    "unit": {"type": "keyword"},
                },
            },
            # One nested object per preparation step.
            "instructions": {
                "type": "nested",
                "properties": {
                    "stepNumber": {"type": "integer"},
                    "text": {"type": "text"},
                    "durationSeconds": {"type": "integer"},
                },
            },
            # Every nutrient shares the same {quantity, measurement} shape.
            "nutrients": {
                "type": "object",
                "properties": {
                    nutrient: {
                        "type": "object",
                        "properties": {
                            "quantity": {"type": "float"},
                            "measurement": {"type": "keyword"},
                        },
                    }
                    for nutrient in ("calories", "protein", "fat", "carbohydrates")
                },
            },
        }
    },
}

# Create the index unless a previous run already did.
if not client.indices.exists(index=index_name):
    response = client.indices.create(index=index_name, body=index_body)
    print('\nCreating index:')
    print(response)
else:
    print("Index already existed. Nothing to be done.")



Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'user219'}


## Print index properties

In [8]:
# Settings currently stored for the index.
print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
settings = client.indices.get_settings(index=index_name)
print(pp.pformat(settings))

# Field mappings currently stored for the index.
print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
mappings = client.indices.get_mapping(index=index_name)
print(pp.pformat(mappings))

# Document count reported by the cluster.
print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
print(client.count(index=index_name))


----------------------------------------------------------------------------------- INDEX SETTINGS
{'user219': {'settings': {'index': {'creation_date': '1712597116194',
                                    'knn': 'true',
                                    'number_of_replicas': '0',
                                    'number_of_shards': '4',
                                    'provided_name': 'user219',
                                    'refresh_interval': '1s',
                                    'uuid': 'A4XzDgHGQP-8sgsYJGw80A',
                                    'version': {'created': '135238227'}}}}}

----------------------------------------------------------------------------------- INDEX MAPPINGS
{'user219': {'mappings': {'properties': {'description': {'type': 'text'},
                                         'difficulty': {'type': 'keyword'},
                                         'ingredients': {'properties': {'name': {'type': 'text'},
                                   

# Add recipes to index with the correct mappings

In [15]:
Comment line if you need to add to index

# Source nutrient key -> sub-field declared under "nutrients" in the index
# mapping. Writing the source keys (or "unit" instead of "measurement") would
# produce documents that do not match the declared mapping.
nutrient_fields = {
    'calories': 'calories',
    'carbohydrateContent': 'carbohydrates',
    'fatContent': 'fat',
    'proteinContent': 'protein',
}

for recipe_id, recipe_data in recipes_data.items():

    # Reshape each ingredient entry to the mapped {name, quantity, unit} fields.
    ingredients = [
        {
            "name": ingredient_data['ingredient'],
            "quantity": ingredient_data['quantity'],
            "unit": ingredient_data['unit']
        }
        for ingredient_data in recipe_data['ingredients']
    ]

    # Reshape each step to the mapped {stepNumber, text, durationSeconds} fields.
    instructions = [
        {
            "stepNumber": instruction_data['stepNumber'],
            "text": instruction_data['stepText'],
            "durationSeconds": instruction_data['stepDurationSeconds']
        }
        for instruction_data in recipe_data['instructions']
    ]

    # Nutrition is optional; copy only the nutrients that are present.
    nutrients_data = recipe_data.get('nutrition')
    if nutrients_data:
        available = nutrients_data.get('nutrients', {})
        nutrients = {}
        for source_name, field_name in nutrient_fields.items():
            nutrient_data = available.get(source_name)
            if nutrient_data:
                nutrients[field_name] = {
                    "quantity": nutrient_data.get('quantity'),
                    "measurement": nutrient_data.get('measurement')
                }
    else:
        nutrients = None

    recipe = {
        "recipe_id": recipe_id,
        "title": recipe_data['displayName'],
        "description": recipe_data['description'],
        "difficulty": recipe_data['difficultyLevel'],
        "ingredients": ingredients,
        "instructions": instructions,
        "nutrients": nutrients,
        "time": recipe_data['totalTimeMinutes'],
    }

    # Add recipe to index; the document id is the numeric recipe id.
    result = client.index(index=index_name, id=int(recipe_id), body=recipe)

print('DONE')


DONE


### Delete the recipes (optional)

In [None]:
comment this line if you need to delete the recipes

# Documents are indexed under id=int(recipe_id), so delete by those same ids
# rather than assuming they are 0..recipe_book_len-1. ignore=404 skips ids that
# are not in the index instead of raising.
for recipe_id in recipes_data:
    response = client.delete(index=index_name, id=int(recipe_id), ignore=404)

# Text Based Search

In [16]:
def text_based_search(size, sources, query_txt, fields):
    """Run a ``multi_match`` text query against the recipe index.

    Args:
        size: Value sent as the ``size`` of the search request.
        sources: Value sent as ``_source`` (the fields to return per hit).
        query_txt: Text to search for.
        fields: Fields the text is matched against.

    Returns:
        Whatever ``client.search`` returns for ``index_name``.
    """
    multi_match = {'query': query_txt, 'fields': fields}
    request_body = {
        'size': size,
        '_source': sources,
        'query': {'multi_match': multi_match},
    }
    return client.search(index=index_name, body=request_body)


# Match "chicken parmesan" against recipe titles; size is the full recipe
# count, so the request is not capped below the size of the book.
result = text_based_search(
    size=recipe_book_len,
    sources=['title', 'description', 'ingredients'],
    query_txt='chicken parmesan',
    fields=['title'],
)

print('\nSearch results:')
print(pp.pformat(result))



Search results:
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4},
 'hits': {'hits': [{'_id': '557',
                    '_index': 'user219',
                    '_score': 9.647913,
                    '_source': {'description': None,
                                'ingredients': [{'name': None,
                                                 'quantity': 4,
                                                 'unit': 'COUNT'},
                                                {'name': 'salt',
                                                 'quantity': 1,
                                                 'unit': 'TO_TASTE'},
                                                {'name': 'egg',
                                                 'quantity': 2,
                                                 'unit': 'COUNT'},
                                                {'name': None,
                                                 'quantity': 1,
                              

# Embeddings based Search with kNN

Using approximate kNN for better performance than exact kNN on high-dimensional data.

Dense vectors are used because they capture more complex relationships and similarities between data points when working with high-dimensional data.

## Create an index with dense vectors

In [None]:
# Adds the dense-vector field used for embedding search to the existing index.
# The dimension is 384 because the encoder loaded in this notebook
# (sentence-transformers/all-MiniLM-L6-v2) produces 384-dimensional vectors and
# a knn_vector field must be declared with the dimension of the stored vectors.
emebedding_mappings = {
    "properties": {
        # "title_embedding":{
        #     "type":"knn_vector",
        #     "dimension": 384,
        #     "method":{
        #         "name":"hnsw",
        #         "space_type":"innerproduct",
        #         "engine":"faiss",
        #         "parameters":{
        #           "ef_construction":256,
        #           "m":48
        #         }
        #     }
        # },
        "description_embedding":{
            "type":"knn_vector",
            "dimension": 384,
            "method":{
                "name":"hnsw",
                "space_type":"innerproduct",
                "engine":"faiss",
                "parameters":{
                  "ef_construction":256,
                  "m":48
                }
            }
        }
    }
}

# Apply the new field, then print the resulting mappings to confirm it.
client.indices.put_mapping(index=index_name, body=emebedding_mappings)
mappings = client.indices.get_mapping(index = index_name)
pp.pprint(mappings)

## Define dual-encoder

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take average of all tokens
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, weighted by the mask.

    Args:
        model_output: Object whose ``last_hidden_state`` holds the token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast over the
            embedding dimension and used as the weight of each token.

    Returns:
        The masked sum over dimension 1 divided by the mask total, where the
        divisor is clamped to at least 1e-9 to avoid division by zero.
    """
    hidden_states = model_output.last_hidden_state
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(1)
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / weight_total


#Encode text
def encode(texts):
    """Embed ``texts`` with the module-level ``tokenizer`` and ``model``.

    The texts are tokenized (padded and truncated), run through the model
    without gradient tracking, mean-pooled with ``mean_pooling`` and finally
    L2-normalised along dimension 1.

    Args:
        texts: Input passed straight to ``tokenizer``.

    Returns:
        The normalised embeddings produced by ``F.normalize``.
    """
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    # Inference only, so gradients are not tracked.
    with torch.no_grad():
        outputs = model(**batch, return_dict=True)

    pooled = mean_pooling(outputs, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)


# Load tokenizer and model from the HuggingFace Hub. The checkpoint id lives in
# one constant so the tokenizer and the model cannot drift apart when the
# checkpoint is swapped.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Sample recipe titles we want sentence embeddings for.
recipes = ["Chicken pasta", "Vodka drink"]
recipes_emb = encode(recipes)

print(recipes_emb)