# Introduction Search Index

Using OpenSearch to create index mappings for the recipes.

Supports text-based search, embedding-based search, boolean filters, and searches combined with boolean filters.

# Import and parse JSON file with the recipes

In [14]:
import json as json
    
# JSON is UTF-8 by specification; pin the encoding so parsing does not depend
# on the platform's locale default.
with open("recipes_data.json", "r", encoding="utf-8") as read_file:
    recipes_data = json.load(read_file)

In [15]:
# Count the loaded recipes; the last expression shows the count as text.
recipe_book_len = len(recipes_data)
f"{recipe_book_len}"

'994'

# OpenSearch client creation


The OpenSearch client is needed to create the search index mappings and to run queries later on.

## Credentials and index name

In [16]:
import pprint as pp
import requests
from config import CONFIG

# Read the connection settings from the local config module in one pass.
host, port, user, password = (
    CONFIG[key] for key in ("host", "port", "user", "password")
)
#index_name = CONFIG["index_name"]
# The index name is the user name rather than the configured "index_name".
index_name = user

## Create the OpenSearch client and check if index_name exists

In [17]:
from opensearchpy import OpenSearch
from opensearchpy import helpers

# Connection options for the cluster. TLS is enabled, but the certificate and
# the hostname are NOT verified and the related warnings are silenced.
# NOTE(review): presumably the cluster uses a self-signed certificate - confirm
# before reusing these options anywhere else.
client_options = dict(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=(user, password),
    url_prefix='opensearch',
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)
client = OpenSearch(**client_options)

Release the resources by closing the index handle.

In [27]:
# Closing an index that does not exist raises an error, which would stop a
# fresh "Run All". Guard the call the same way the delete/create cells do.
if client.indices.exists(index=index_name):
    resp = client.indices.close(index=index_name, timeout=600)
    print(resp)
else:
    print(f"Index '{index_name}' does not exist. Nothing to close.")

{'acknowledged': True, 'shards_acknowledged': True, 'indices': {'user219': {'closed': True}}}


Delete the previous index

In [46]:
# WARNING: running this cell permanently deletes the index. Be absolutely sure
# that this is what you want before executing it.

index_exists = client.indices.exists(index=index_name)
if index_exists:
    # Delete the index and report the server's answer.
    response = client.indices.delete(index=index_name, timeout=600)
    print('\nDeleting index:', response, sep='\n')


Deleting index:
{'acknowledged': True}


# Index creation

In [47]:

def _knn_vector_field(dimension=768):
    """Return the mapping of one approximate-kNN embedding field.

    Every text embedding field of the index shares this definition: an HNSW
    graph built by the faiss engine and scored by inner product.

    Args:
        dimension: length of the stored vectors (768 for all fields below).

    Returns:
        A fresh dict, so the fields never share mutable state.
    """
    return {
        "type": "knn_vector",
        "dimension": dimension,
        "method": {
            "name": "hnsw",
            "space_type": "innerproduct",
            "engine": "faiss",
            "parameters": {
                "ef_construction": 256,
                "m": 48
            }
        }
    }


def _nutrient_field():
    """Return the mapping of one nutrient entry: a quantity and its measurement unit."""
    return {
        "type": "object",
        "properties": {
            "quantity": {"type": "float"},
            "measurement": {"type": "keyword"}
        }
    }


# Nutrients stored per recipe; each one uses the same quantity/measurement shape.
_NUTRIENT_NAMES = ("calories", "protein", "fat", "carbohydrates")

# Recipe fields that get a "<field>_embedding" kNN vector next to them.
_EMBEDDED_FIELDS = (
    "title",
    "description",
    "time",
    "difficulty",
    "ingredients",
    "instructions",
    "nutrients",
)

index_body = {
    "settings": {
        "index": {
            "number_of_replicas": 0,
            "number_of_shards": 4,
            "refresh_interval": "1s",
            "knn": "true"  # required for knn_vector fields to be searchable
        }
    },
    "mappings": {
        "properties": {
            "recipe_id": {"type": "keyword"},
            "title": {"type": "text"},
            "description": {"type": "text"},
            "time": {"type": "integer"},
            "difficulty": {"type": "keyword"},
            "ingredients": {
                "type": "nested",
                "properties": {
                    "text": {"type": "text"},
                    "name": {"type": "text"},
                    "quantity": {"type": "float"},
                    "unit": {"type": "keyword"}
                }
            },
            "instructions": {
                "type": "nested",
                "properties": {
                    "stepNumber": {"type": "integer"},
                    "text": {"type": "text"},
                    "durationSeconds": {"type": "integer"}
                }
            },
            "nutrients": {
                "type": "object",
                "properties": {
                    name: _nutrient_field() for name in _NUTRIENT_NAMES
                }
            },
            **{
                f"{field}_embedding": _knn_vector_field()
                for field in _EMBEDDED_FIELDS
            },
            # NOTE(review): image embeddings are stored as opaque binary data,
            # so they cannot be used in kNN queries - confirm this is intended.
            "images_embedding": {"type": "binary"}
        }
    }
}

# Creating an index that already exists is an error, so only create it when missing.
if client.indices.exists(index=index_name):
    print("Index already existed. Nothing to be done.")
else:
    response = client.indices.create(index=index_name, body=index_body)
    print('\nCreating index:')
    print(response)



Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'user219'}


## Define dual-encoder

# Embeddings based Search with kNN

Using approximate kNN for better performance than exact kNN on high-dimensional data.

Using dense vector for capturing more complex relationships and similarities between data points while working with high-dimensional data

In [48]:
from transformers import AutoTokenizer, AutoModel, CLIPProcessor, CLIPModel

import torch
import torch.nn.functional as F
import pickle
import os
from PIL import Image
import requests
import json


# Mean pooling: attention-mask-weighted average over the token axis
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked tokens.

    Args:
        model_output: object exposing `last_hidden_state`, the per-token
            embeddings (assumed to be laid out as batch x tokens x hidden).
        attention_mask: mask with one entry per token; tokens whose entry is 0
            do not contribute to the average.

    Returns:
        The masked sum over dimension 1 divided by the number of unmasked
        tokens. The divisor is clamped to 1e-9 so a fully masked sequence
        yields zeros instead of a division by zero.
    """
    hidden_states = model_output.last_hidden_state
    # Broadcast the mask over the hidden dimension so it can weight every value.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts


#Encode text
def encode(texts):
    """Embed text with the module-level `tokenizer` and `model`.

    Args:
        texts: a string or a list of strings to embed.

    Returns:
        A CPU tensor with one L2-normalized, mean-pooled embedding per input
        text (one row per text).
    """
    # Tokenize sentences and move the tensors to the device the model lives on;
    # without this the forward pass fails as soon as the model is on a GPU.
    encoded_input = tokenizer(
        texts, padding=True, truncation=True, return_tensors='pt'
    ).to(model.device)

    # Compute token embeddings (inference only, no gradients needed)
    with torch.no_grad():
        model_output = model(**encoded_input, return_dict=True)

    # Perform pooling
    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)

    # Return CPU tensors so callers can call `.numpy()` or pickle the result
    # regardless of where the model ran.
    return embeddings.cpu()

_CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"
# Holds the CLIP processor/model after the first load so batches can reuse them.
_clip_cache = {}


def _load_clip():
    """Load the pre-trained CLIP processor and model once and reuse them afterwards."""
    if not _clip_cache:
        _clip_cache["processor"] = CLIPProcessor.from_pretrained(_CLIP_MODEL_NAME)
        _clip_cache["model"] = CLIPModel.from_pretrained(_CLIP_MODEL_NAME).to(device)
    return _clip_cache["processor"], _clip_cache["model"]


def encode_images(image_urls):
    """Download the images behind `image_urls` and embed them with CLIP.

    Args:
        image_urls: list of image URLs; every URL is fetched over HTTP.

    Returns:
        A CPU tensor with one CLIP image embedding per URL, in input order.

    Raises:
        Whatever `requests` or PIL raise when a URL cannot be fetched or its
        content is not a readable image.
    """
    # Reuse the cached model: loading CLIP for every batch is very slow, and a
    # local name `model` would shadow the module-level text model.
    processor, clip_model = _load_clip()

    # Load all images. The timeout keeps one unresponsive host from hanging
    # the whole run.
    images = [
        Image.open(requests.get(url, stream=True, timeout=30).raw).resize((224, 224))
        for url in image_urls
    ]

    # Encode images
    image_inputs = processor(images=images, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        image_embeddings = clip_model.get_image_features(**image_inputs)

    # Move the result off the GPU so accumulated batches do not pin GPU memory
    # and the embeddings can be pickled and reloaded on any machine.
    return image_embeddings.cpu()

def encode_images_in_batches(images, batch_size=32):
    """Lazily yield image embeddings, one `batch_size`-sized slice of `images` at a time.

    Each slice is passed to `encode_images`; nothing is encoded until the
    generator is iterated.
    """
    batch_starts = range(0, len(images), batch_size)
    yield from (
        encode_images(images[start:start + batch_size]) for start in batch_starts
    )
        
def get_embeddings():
    """Return the cached embeddings, generating them first when no cache exists.

    Looks for 'embeddings.pkl' in the working directory. When the file is
    present its unpickled content is returned; otherwise the result of
    `add_embeddings()` is returned.
    """
    cache_path = 'embeddings.pkl'

    if not os.path.exists(cache_path):
        print('Embeddings file not found. Generating embeddings...')
        return add_embeddings()

    # Load embeddings from the cache file.
    # NOTE: pickle can execute arbitrary code; only load cache files you created.
    print('Embeddings file found. Loading embeddings...')
    with open(cache_path, 'rb') as cache_file:
        return pickle.load(cache_file)
    
def add_embeddings():
    """Compute title, description and image embeddings and cache them on disk.

    Reads the module-level `recipes_data`, embeds every recipe's
    "displayName" and "description" with `encode`, embeds every image URL
    with `encode_images_in_batches`, and pickles the result to
    'embeddings.pkl'.

    Returns:
        A dict with the keys 'titles', 'descriptions' and 'images', or None
        when encoding the images or writing the cache file fails (the error
        is printed).
    """
    titles = []
    descriptions = []
    image_urls = []

    for recipe in recipes_data.values():
        # Missing text is embedded as the literal string 'None'.
        titles.append(recipe["displayName"] if recipe["displayName"] is not None else 'None')
        descriptions.append(recipe["description"] if recipe["description"] is not None else 'None')
        # Skip images without a URL: there is nothing to download for them, and
        # a placeholder string would make the HTTP request fail.
        image_urls.extend(
            image["url"] for image in recipe["images"] if image["url"] is not None
        )

    # Calculate text embeddings
    titles_emb = encode(titles)
    descriptions_emb = encode(descriptions)

    # Calculate image embeddings batch by batch; each batch contributes one
    # entry per image to the list.
    all_images_emb = []
    try:
        for images_emb in encode_images_in_batches(image_urls):
            all_images_emb.extend(images_emb)
    except Exception as e:
        print(f"Error while encoding images: {e}")
        return None

    embeddings = {
        'titles': titles_emb,
        'descriptions': descriptions_emb,
        'images': all_images_emb
    }

    # Save embeddings to a file. Write to a temporary file and rename it so an
    # interrupted write never leaves a truncated 'embeddings.pkl' behind for
    # get_embeddings() to load.
    tmp_path = 'embeddings.pkl.tmp'
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(embeddings, f)
        os.replace(tmp_path, 'embeddings.pkl')
    except Exception as e:
        print(f"Error while writing embeddings to file: {e}")
        return None

    # Free up GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return embeddings
    
    
    
# Run on the first GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the text encoder and its tokenizer from the HuggingFace Hub
text_model_name = "sentence-transformers/msmarco-distilbert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(text_model_name)
model = AutoModel.from_pretrained(text_model_name).to(device)

## Print index properties

In [49]:
# Settings of the index as stored by the cluster.
print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
settings = client.indices.get_settings(index=index_name)
pp.pprint(settings)

# Field mappings of the index.
print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
mappings = client.indices.get_mapping(index=index_name)
pp.pprint(mappings)

# Number of documents currently in the index.
print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
doc_count = client.count(index=index_name)
print(doc_count)


----------------------------------------------------------------------------------- INDEX SETTINGS
{'user219': {'settings': {'index': {'creation_date': '1715254190148',
                                    'knn': 'true',
                                    'number_of_replicas': '0',
                                    'number_of_shards': '4',
                                    'provided_name': 'user219',
                                    'refresh_interval': '1s',
                                    'uuid': 'DZvs30CqQl23MJAftCmC2Q',
                                    'version': {'created': '135238227'}}}}}

----------------------------------------------------------------------------------- INDEX MAPPINGS
{'user219': {'mappings': {'properties': {'description': {'type': 'text'},
                                         'description_embedding': {'dimension': 768,
                                                                   'method': {'engine': 'faiss',
                           

# Add recipes

## Embeddings based search

In [27]:
# Compute the query embedding
query = "drink"
query_emb = encode(query)

# Approximate kNN search over the title embeddings, returning only the fields
# needed to read the hits.
query_denc = {
    'size': 10,
    '_source': ['title', 'description', 'ingredients'],
    "query": {
        "knn": {
            "title_embedding": {
                # Send plain Python floats: `.cpu()` makes this work when the
                # embedding lives on a GPU, and a list is JSON-serializable
                # without relying on numpy support in the client.
                "vector": query_emb[0].cpu().tolist(),
                "k": 2
            }
        }
    }
}

response = client.search(
    body=query_denc,
    index=index_name
)

print('\nSearch results:')
pp.pprint(response)


Search results:
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 2}
