In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/data.csv")
df.head()

Unnamed: 0,id,recipe_name,type_of_dish,main_ingredient,cuisine,cooking_method,prep_time,cook_time,instructions
0,0,Spaghetti Bolognese,Main Course,Beef,Italian,Simmering,10 minutes,40 minutes,Heat olive oil in a large pan over medium heat...
1,1,Chicken Curry,Main Course,Chicken,Indian,Sautéing,15 minutes,30 minutes,Heat oil in a large pan over medium heat. Add ...
2,2,Caesar Salad,Appetizer,Lettuce,Western,Tossing,10 minutes,0 minutes,"In a large bowl, toss chopped romaine lettuce ..."
3,3,Chocolate Cake,Dessert,Chocolate,Western,Baking,20 minutes,35 minutes,Preheat oven to 180°C (350°F). In a large bowl...
4,4,Grilled Salmon,Main Course,Salmon,Western,Grilling,5 minutes,10 minutes,Preheat grill to high heat. Rub salmon fillets...


In [42]:
df.columns

Index(['id', 'recipe_name', 'type_of_dish', 'main_ingredient', 'cuisine',
       'cooking_method', 'prep_time', 'cook_time', 'instructions'],
      dtype='object')

In [3]:
# Convert the DataFrame into a list of per-recipe dicts (one dict per row).
documents = df.to_dict(orient="records")
# Preview only the first records; echoing the whole list floods the cell output.
documents[:2]

[{'id': 0,
  'recipe_name': 'Spaghetti Bolognese',
  'type_of_dish': 'Main Course',
  'main_ingredient': 'Beef',
  'cuisine': 'Italian',
  'cooking_method': 'Simmering',
  'prep_time': '10 minutes',
  'cook_time': '40 minutes',
  'instructions': 'Heat olive oil in a large pan over medium heat. Add finely chopped onions, carrots, and celery. Cook for 5-7 minutes until softened. Add minced beef and cook until browned. Pour in canned tomatoes, beef broth, and Italian herbs. Simmer uncovered for 30-40 minutes. Stir occasionally and season with salt and pepper. Serve over cooked spaghetti and garnish with fresh basil.'},
 {'id': 1,
  'recipe_name': 'Chicken Curry',
  'type_of_dish': 'Main Course',
  'main_ingredient': 'Chicken',
  'cuisine': 'Indian',
  'cooking_method': 'Sautéing',
  'prep_time': '15 minutes',
  'cook_time': '30 minutes',
  'instructions': 'Heat oil in a large pan over medium heat. Add finely chopped onions and sauté until golden brown. Stir in minced garlic, ginger, and s

### Elasticsearch

Run the Elasticsearch service with the following command:

```bash
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.9.0
```

Check that Elasticsearch is running:
```bash
curl -X GET "http://localhost:9200/"
```

Check the Elasticsearch cluster health:
```bash
curl -X GET "localhost:9200/_cluster/health?pretty"
```

In [4]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

In [5]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

  return torch._C._cuda_getDeviceCount() > 0


In [62]:
# Attach three embeddings to every recipe, mutating the dicts in `documents` in place:
# one for the recipe name, one for the instructions, and one for both joined by a space.
for recipe in tqdm(documents):
    name_text = recipe['recipe_name']
    steps_text = recipe['instructions']

    text_by_vector_field = {
        'recipe_name_vector': name_text,
        'instructions_vector': steps_text,
        'combined_vector': name_text + ' ' + steps_text,
    }

    for vector_field, text in text_by_vector_field.items():
        # .tolist() stores the encoder output as a plain Python list
        recipe[vector_field] = model.encode(text).tolist()


  0%|          | 0/182 [00:00<?, ?it/s]

In [63]:
documents[0]

{'id': 0,
 'recipe_name': 'Spaghetti Bolognese',
 'type_of_dish': 'Main Course',
 'main_ingredient': 'Beef',
 'cuisine': 'Italian',
 'cooking_method': 'Simmering',
 'prep_time': '10 minutes',
 'cook_time': '40 minutes',
 'instructions': 'Heat olive oil in a large pan over medium heat. Add finely chopped onions, carrots, and celery. Cook for 5-7 minutes until softened. Add minced beef and cook until browned. Pour in canned tomatoes, beef broth, and Italian herbs. Simmer uncovered for 30-40 minutes. Stir occasionally and season with salt and pepper. Serve over cooked spaghetti and garnish with fresh basil.',
 'recipe_name_vector': [-0.006667444948107004,
  0.046345990151166916,
  0.006302243564277887,
  0.003957842011004686,
  -0.021067073568701744,
  -0.02932151034474373,
  0.016785765066742897,
  0.010842581279575825,
  -0.04205571115016937,
  -0.08049868047237396,
  -0.06755813956260681,
  -0.02072698436677456,
  0.05509309098124504,
  -0.004102345556020737,
  0.030918583273887634,
  

In [64]:
# Client for the local Elasticsearch node; requests use a 30-second timeout
es_client = Elasticsearch('http://localhost:9200', request_timeout=30)

# Name of the index that stores the recipes
index_name = "recipes"

# Index definition: a single shard without replicas, text/keyword fields for the
# recipe attributes, and three identically configured embedding fields.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            # Full-text fields
            "recipe_name": {"type": "text"},
            "instructions": {"type": "text"},
            # Exact-match fields
            "cuisine": {"type": "keyword"},
            "type_of_dish": {"type": "keyword"},
            "main_ingredient": {"type": "keyword"},
            "id": {"type": "keyword"},
            # Embedding fields: every one gets its own (fresh) mapping dict
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "recipe_name_vector",
                    "instructions_vector",
                    "combined_vector",
                )
            },
        }
    },
}

# Start from a clean slate: drop the index if it is there, then create it again
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

print(f"Index '{index_name}' created successfully.")

Index 'recipes' created successfully.


In [65]:
# Index every recipe under its own recipe id. With an explicit `_id`, re-running
# this cell overwrites the stored documents instead of adding duplicates (which
# would occupy several top-5 slots and distort hit rate / MRR).
for doc in tqdm(documents):
    es_client.index(index=index_name, id=str(doc['id']), document=doc)

# Make the freshly indexed documents visible to the searches that follow
es_client.indices.refresh(index=index_name);

  0%|          | 0/182 [00:00<?, ?it/s]

In [66]:
query = "What is the main ingredient of Spaghetti Bolognese? and how to make it?"
v_q = model.encode(query)

In [67]:
v_q.shape

(384,)

In [69]:
# Vector (kNN) part of the hybrid search
knn_query = dict(
    field="combined_vector",
    query_vector=v_q,       # query embedding generated by the encoding model
    k=5,                    # number of nearest neighbors to retrieve
    num_candidates=10000,   # candidate pool size for the vector search
    boost=0.5,              # importance of the vector search in the final score
    # Example filter to narrow down the search results:
    # filter={"term": {"cuisine": "Italian"}},
)

# Text (keyword) part of the hybrid search
keyword_query = {
    "bool": {
        "must": {
            "multi_match": {
                # The text query, e.g. "Chicken curry with coconut milk"
                "query": query,
                # Fields to search
                "fields": ["recipe_name^3", "instructions", "cuisine", "main_ingredient"],
                "type": "best_fields",
                # Importance of the keyword search in the final score
                "boost": 0.5,
            }
        },
        # Example filter to narrow down by dish type:
        # "filter": {"term": {"type_of_dish": "Main Course"}},
    }
}

In [70]:
# Run the keyword query and the kNN query together in a single search request.
# The index is addressed through `index_name` (set when the index was created)
# so the name is defined in exactly one place.
response = es_client.search(
    index=index_name,
    query=keyword_query,  # keyword-based query
    knn=knn_query,        # kNN (vector-based) query
    size=5,               # number of results to return
)


In [71]:
# Show name, cuisine and instructions of every returned recipe
for hit in response['hits']['hits']:
    recipe = hit['_source']
    print(f"Recipe: {recipe['recipe_name']}")
    print(f"Cuisine: {recipe['cuisine']}")
    print(f"Instructions: {recipe['instructions']}\n")

Recipe: Spaghetti Bolognese
Cuisine: Italian
Instructions: Heat olive oil in a large pan over medium heat. Add finely chopped onions, carrots, and celery. Cook for 5-7 minutes until softened. Add minced beef and cook until browned. Pour in canned tomatoes, beef broth, and Italian herbs. Simmer uncovered for 30-40 minutes. Stir occasionally and season with salt and pepper. Serve over cooked spaghetti and garnish with fresh basil.

Recipe: Spaghetti Carbonara
Cuisine: Italian
Instructions: Cook pasta according to package instructions. In a skillet, cook pancetta or bacon until crispy. Whisk eggs and Parmesan cheese together. Toss cooked pasta with bacon and the egg mixture. Serve immediately with extra cheese and black pepper.

Recipe: Spaghetti Aglio e Olio
Cuisine: Italian
Instructions: Cook spaghetti and toss with sautéed garlic, red pepper flakes, and olive oil. Garnish with parsley and Parmesan cheese.

Recipe: Macaroni and Cheese
Cuisine: American
Instructions: Cook macaroni accord

### Hybrid search pipeline

In [72]:
df_ground_truth = pd.read_csv('../data/ground-truth-retrieval.csv')

In [73]:
df_ground_truth.head()

Unnamed: 0,id,question
0,0,"What should I do after cooking the onions, car..."
1,0,How long should I simmer the mixture after add...
2,0,What ingredients do I need to season the Bolog...
3,0,How long does it take to prepare the ingredien...
4,0,What type of pasta is recommended to serve wit...


In [74]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [75]:
def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains a relevant hit.

    Args:
        relevance_total: one list of booleans per query; an entry is True when
            the result at that position is the expected document.

    Returns:
        Number of lists containing at least one True, divided by the number
        of lists.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    queries_with_hit = sum(1 for flags in relevance_total if True in flags)
    return queries_with_hit / len(relevance_total)

In [76]:
def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For every query only the FIRST relevant result counts: it contributes
    ``1 / rank`` (rank is 1-based). Queries without a relevant result
    contribute 0.

    Args:
        relevance_total: one list of booleans per query; an entry is True when
            the result at that position is the expected document.

    Returns:
        Sum of the per-query reciprocal ranks divided by the number of queries.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant == True:
                total_score += 1 / rank
                # Stop at the first relevant result; without this, a list with
                # several True entries would add several reciprocal ranks and
                # the per-query score could exceed 1.
                break

    return total_score / len(relevance_total)

In [77]:
def elastic_search_hybrid(field, query, vector):
    """Run one hybrid (kNN + keyword) search and return the matching recipes.

    Args:
        field: name of the dense-vector field searched by the kNN clause.
        query: text matched against ``recipe_name`` (boosted x3),
            ``instructions``, ``cuisine`` and ``main_ingredient``.
        vector: query embedding used by the kNN clause.

    Returns:
        List with the ``_source`` of every hit (the request asks for 5 hits),
        limited to the non-vector recipe fields.
    """
    # Only these fields are returned, which leaves the embedding vectors out
    returned_fields = [
        'id', 'recipe_name', 'type_of_dish', 'main_ingredient', 'cuisine',
        'cooking_method', 'prep_time', 'cook_time', 'instructions',
    ]

    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "boost": 0.5,
        },
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # Fields to search
                        "fields": ["recipe_name^3", "instructions", "cuisine", "main_ingredient"],
                        "type": "best_fields",
                        "boost": 0.5,
                    }
                },
            }
        },
        "size": 5,
        "_source": returned_fields,
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [78]:
def question_hybrid(q):
    """Answer one ground-truth record with the hybrid search.

    Args:
        q: mapping with a ``'question'`` entry holding the question text.

    Returns:
        Whatever ``elastic_search_hybrid`` returns for the question, searched
        against the ``combined_vector`` field.
    """
    text = q['question']
    embedding = model.encode(text)
    return elastic_search_hybrid('combined_vector', text, embedding)

In [79]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth questions.

    Args:
        ground_truth: iterable of records; each record has an ``'id'`` entry
            (the expected document id) and is passed as-is to
            ``search_function``.
        search_function: callable taking one record and returning a list of
            documents, each with an ``'id'`` entry.

    Returns:
        Dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['id']
        retrieved = search_function(record)
        # One flag per returned document: is it the expected one?
        relevance_total.append([doc['id'] == expected_id for doc in retrieved])

    metrics = {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }
    return metrics

In [80]:
evaluate(ground_truth, question_hybrid)

  0%|          | 0/910 [00:00<?, ?it/s]

{'hit_rate': 0.8384615384615385, 'mrr': 0.737912087912088}

### Reranking (RRF)

In [81]:
def elastic_search_hybrid_rrf(field, query, vector):
    """Hybrid (kNN + keyword) search that asks Elasticsearch to rank with RRF.

    Same request as ``elastic_search_hybrid`` plus a ``"rank": {"rrf": {}}``
    section. The notebook notes that RRF is not available on a free-tier
    subscription.

    NOTE: a later cell defines another function with this same name (a
    client-side RRF implementation); once that cell runs, it replaces this
    definition.

    Args:
        field: name of the dense-vector field searched by the kNN clause.
        query: text for the ``multi_match`` keyword clause.
        vector: query embedding used by the kNN clause.

    Returns:
        List with the ``_source`` of every hit (the request asks for 5 hits),
        limited to the non-vector recipe fields.
    """
    # Only these fields are returned, which leaves the embedding vectors out
    returned_fields = [
        'id', 'recipe_name', 'type_of_dish', 'main_ingredient', 'cuisine',
        'cooking_method', 'prep_time', 'cook_time', 'instructions',
    ]

    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "boost": 0.5,
        },
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # Fields to search
                        "fields": ["recipe_name^3", "instructions", "cuisine", "main_ingredient"],
                        "type": "best_fields",
                        "boost": 0.5,
                    }
                },
            }
        },
        "size": 5,
        "rank": {"rrf": {}},
        "_source": returned_fields,
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [82]:
# elastic_search_hybrid_rrf('instructions_vector', query, v_q)

By default, RRF isn't available with a free-tier subscription. To use it, you can start a 30-day trial or upgrade your subscription plan.



### RRF Implementation

In [83]:
def compute_rrf(rank, k=60):
    """Return the reciprocal-rank-fusion score of one ranking position.

    Args:
        rank: position of a document in a result list (the callers in this
            notebook pass 1-based ranks).
        k: constant added to the rank; defaults to 60.

    Returns:
        ``1 / (k + rank)``.
    """
    denominator = k + rank
    return 1 / denominator

def elastic_search_hybrid_rrf(field, query, vector, k=60):
    """Hybrid search with a client-side Reciprocal Rank Fusion (RRF) rerank.

    Runs the kNN search and the keyword search as two separate requests. Every
    document gets ``compute_rrf(rank, k)`` (1-based rank) for each result list
    it appears in; the scores are summed per document and the five documents
    with the highest total are returned.

    Args:
        field: name of the dense-vector field used by the kNN search.
        query: text for the ``multi_match`` keyword search.
        vector: query embedding for the kNN search.
        k: RRF constant forwarded to ``compute_rrf`` (unrelated to the kNN
            ``"k"`` setting below).

    Returns:
        List of up to five ``_source`` dicts ordered by descending fused score.
    """
    knn_query = {
        "field": field,
        "query_vector": vector,
        # NOTE(review): the kNN request asks for 5 neighbours while `size` below
        # is 10, so this leg presumably contributes at most 5 ranks - confirm
        # whether that is intended.
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["recipe_name^3", "instructions", "cuisine", "main_ingredient"],  # Fields to search
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
        }
    }

    knn_results = es_client.search(
        index=index_name,
        body={
            "knn": knn_query,
            "size": 10
        }
    )['hits']['hits']

    keyword_results = es_client.search(
        index=index_name,
        body={
            "query": keyword_query,
            "size": 10
        }
    )['hits']['hits']

    rrf_scores = {}
    # `_source` of every candidate, kept so the winners can be returned without
    # fetching each document again by id (previously one extra GET per result).
    sources = {}

    # Vector results first, keyword results second: this keeps the insertion
    # order (and therefore the tie-breaking of the stable sort) unchanged.
    for hits in (knn_results, keyword_results):
        for rank, hit in enumerate(hits, start=1):
            doc_id = hit['_id']
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0.0) + compute_rrf(rank, k)
            sources.setdefault(doc_id, hit['_source'])

    # Sort RRF scores in descending order and keep the top five documents
    reranked_docs = sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)

    return [sources[doc_id] for doc_id, _score in reranked_docs[:5]]

In [84]:
def question_text_hybrid_rrf(q):
    """Answer one ground-truth record with the RRF-reranked hybrid search.

    Args:
        q: mapping with a ``'question'`` entry holding the question text.

    Returns:
        Whatever ``elastic_search_hybrid_rrf`` returns for the question,
        searched against the ``combined_vector`` field.
    """
    text = q['question']
    embedding = model.encode(text)
    return elastic_search_hybrid_rrf('combined_vector', text, embedding)

evaluate(ground_truth, question_text_hybrid_rrf)

  0%|          | 0/910 [00:00<?, ?it/s]

{'hit_rate': 0.845054945054945, 'mrr': 0.7454761904761906}