# Food Recipe Browser Project

By `652115013 Narongchai Rongthong`

Firstly we load the data from parquet file provided

In [11]:
import pandas as pd

# Check if recipes_df is already loaded
if 'recipes_df' not in globals():
    recipes_df = pd.read_parquet('../resource/recipes.parquet')
    recipes_df['RecipeServings'].fillna(0.0, inplace=True)  # Fill NaN with default value
    print(f"Loaded {len(recipes_df)} recipes.")
else:
    print("Recipes data already loaded.")
    print(f"Loaded {len(recipes_df)} recipes.") # type: ignore


Recipes data already loaded.
Loaded 522517 recipes.


In [None]:
import os
from dotenv import load_dotenv
from elasticsearch import Elasticsearch

# Load environment variables from .env
load_dotenv()

# Get values from .env
es_password = os.getenv("ES_PASSWORD")

index_name = "recipes"

# Create Elasticsearch client
es_client = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic", es_password),
    ca_certs='~/http_ca.crt'
)

if es_client.ping():
    print("Connected to Elasticsearch")
else:
    raise ConnectionRefusedError("Elasticsearch connection failed")

ConnectionRefusedError: Elasticsearch connection failed

Then we can start indexing the data
- applying fields we need
    - id
    - name
    - ingredients
    - instuctions

For searching i want to join those together so its easier to find into `cleaned` "search text"

along with extra cleaned name

Through `stemming` and removing `stopwords`

In [None]:
# Setup text cleaner
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# Exclude specific stopwords
important_stop_words =  {"with", "and"}
custom_stopwords = set(stopwords.words('english')) - important_stop_words  

def clean_text(text):
    tokens = word_tokenize(text.lower())  
    filtered_tokens = [word for word in tokens if word not in custom_stopwords]  
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]  
    return " ".join(stemmed_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Additionally, since if we send very short query like `"t"` or `"to"` we'd get completely empty results
instead we can make it try to show up something that matches their `ngrams`.

In [None]:
from elasticsearch.helpers import bulk
import numpy as np

# Define index name and sample size for development
sample_size = 20000 # Set the sample size for testing (adjust as needed)

# Delete the index if it already exists
es_client.indices.delete(index=index_name, ignore=[400, 404])

# Create the index with a mapping that uses an English analyzer
mapping = {
    "settings": {
        "analysis": {
            "tokenizer": {
                "ngram_tokenizer": {
                    "type": "ngram",
                    "min_gram": 2,  # Minimum length of n-grams
                    "max_gram": 3,  # Maximum length of n-grams
                    "token_chars": ["letter", "digit"]
                }
            },
            "analyzer": {
                "default": {
                    "type": "english"
                },
                "ngram_analyzer": {  # Add a custom n-gram analyzer
                    "type": "custom",
                    "tokenizer": "ngram_tokenizer",
                    "filter": ["lowercase"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "recipe_id": {"type": "keyword"},
            "name": { 
                "type": "text", 
                "analyzer": "english",
                "fields": { 
                    "ngram": {  # Add an n-gram variant of the name field
                        "type": "text", 
                        "analyzer": "ngram_analyzer"
                    }
                }
            },
            "cleaned_name": {"type": "text", "analyzer": "english"},
            "author_name": {"type": "text", "analyzer": "english"},
            "recipe_category": {"type": "text", "analyzer": "english"},
            "description": {"type": "text", "analyzer": "english"},
            "ingredients": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "raw": {
                        "type": "keyword"
                    }
                }
            },
            "instructions": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "raw": {
                        "type": "keyword"
                    }
                }
            },
            "keywords": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "raw": {
                        "type": "keyword"
                    }
                }
            },
            "search_text": {"type": "text", "analyzer": "english"},
            "image_urls": {"type": "keyword"},
            # Time-related fields
            "cook_time": {"type": "text"},
            "prep_time": {"type": "text"},
            "total_time": {"type": "text"},
            # Nutritional content fields
            "calories": {"type": "float"},
            "fat_content": {"type": "float"},
            "cholesterol_content": {"type": "float"},
            "carbohydrate_content": {"type": "float"},
            "fiber_content": {"type": "float"},
            "sugar_content": {"type": "float"},
            "protein_content": {"type": "float"},
            "recipe_servings": {"type": "float"},
            # Allergens
            "allergens": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "raw": {
                        "type": "keyword"
                    }
                }
            },
        }
    }
}

# Create the index
es_client.indices.create(index=index_name, body=mapping)
print(f"Created index: {index_name}")

# Get a sample of the recipes for development (you can adjust sample size)
recipes_sample = recipes_df.head(sample_size)

# Detects allergens helper
ALLERGENS = {
    "peanuts": "peanuts",
    "milk": "dairy",
    "cheese": "dairy",
    "butter": "dairy",
    "wheat": "gluten",
    "flour": "gluten",
    "soy sauce": "soy",
    "soybean": "soy",
    "shrimp": "shellfish",
    "crab": "shellfish",
    "lobster": "shellfish",
    "egg": "egg",
    "almond": "tree nuts",
    "cashew": "tree nuts",
    "walnut": "tree nuts",
}
def detect_allergens(ingredients):
    detected_allergens = set()
    for ingredient, _ in ingredients:  # Extract ingredient names
        for allergen, category in ALLERGENS.items():
            if allergen in ingredient.lower():  # Check for allergen keywords
                detected_allergens.add(category)
    return list(detected_allergens)

# Prepare the documents for bulk indexing
def generate_docs(df):
    for idx, row in df.iterrows():
        # Main Informations
        recipe_id = str(int(float(row.get('RecipeId', idx))))  # Ensures it's always an integer string
        name = str(row.get('Name') or '')
        cleaned_name = clean_text(name)
        author_name = str(row.get('AuthorName') or '')
        recipe_category = str(row.get('RecipeCategory') or '')
        description = str(row.get('Description') or '')
        
        # Process ingredients as a list of (ingredient, quantity) pairs
        ingredients = list(zip(row.get('RecipeIngredientParts', []), row.get('RecipeIngredientQuantities', [])))
        
        # Process instructions safely
        instructions_val = row.get('RecipeInstructions')
        if instructions_val is None:
            instructions_list = []
        elif isinstance(instructions_val, np.ndarray):
            instructions_list = instructions_val.tolist()
        else:
            instructions_list = instructions_val
        instructions_text = " ".join(map(str, instructions_list)) if instructions_list else ''
        
        # Process keywords safely
        keywords_val = row.get('Keywords')
        if keywords_val is None:
            keywords = []
        elif isinstance(keywords_val, np.ndarray):
            keywords = keywords_val.tolist()
        else:
            keywords = keywords_val
        keywords_text = " ".join(filter(None, map(str, keywords))) if len(keywords) > 0 else ''
        
        # Time
        cook_time = str(row.get('CookTime') or '')
        prep_time = str(row.get('PrepTime') or '')
        total_time = str(row.get('TotalTime') or '')
        
        # Nutritional Contents
        calories = float(row.get('Calories') or 0.0)
        fat_content = float(row.get('FatContent') or 0.0)
        cholesterol_content = float(row.get('CholesterolContent') or 0.0)
        carbohydrate_content = float(row.get('CarbohydrateContent') or 0.0)
        fiber_content = float(row.get('FiberContent') or 0.0)
        sugar_content = float(row.get('SugarContent') or 0.0)
        protein_content = float(row.get('ProteinContent') or 0.0)
        recipe_servings = float(row.get('RecipeServings') or 0.0)
        
        # Process ingredients into text for search_text
        ingredients_text = " ".join([f"{str(ing)} {str(qty)}" for ing, qty in ingredients]) if ingredients else ''
        
        # Combine and clean everything
        combined_text = " ".join([name, description, ingredients_text, instructions_text, keywords_text])
        search_text = clean_text(combined_text)
        
        # Process image_urls safely
        image_urls_val = row.get('Images')
        if image_urls_val is None:
            image_urls = []
        elif isinstance(image_urls_val, np.ndarray):
            image_urls = image_urls_val.tolist()
        else:
            image_urls = image_urls_val
            
        # Detects allergens
        allergens = detect_allergens(ingredients)
        
        doc = {
            "_op_type": "index",
            "_index": index_name,
            "_id": recipe_id,
            "_source": {
                "recipe_id": recipe_id,
                "name": name,
                "cleaned_name": cleaned_name,
                "author_name": author_name,
                "recipe_category": recipe_category,
                "description": description,
                "ingredients": ingredients,  # stored as list of tuples
                "instructions": instructions_list,  # stored as a list
                # Time
                "cook_time": cook_time,
                "prep_time": prep_time,
                "total_time": total_time,
                # Nutritional Contents
                "calories": calories,
                "fat_content": fat_content,
                "cholesterol_content": cholesterol_content,
                "carbohydrate_content": carbohydrate_content,
                "fiber_content": fiber_content,
                "sugar_content": sugar_content,
                "protein_content": protein_content,
                "recipe_servings": recipe_servings,
                # Searching words
                "keywords": keywords,
                "search_text": search_text,
                "image_urls": image_urls,
                # Allergens
                "allergens": allergens
            }
        }
        yield doc


# Bulk index the sample documents
run_all = False
if run_all == True:
    sample_size = len(recipes_df)
    bulk(es_client, generate_docs(recipes_df)) # full size
else:
    bulk(es_client, generate_docs(recipes_sample)) # limited size

print(f"Indexed {sample_size} recipes into Elasticsearch.")


  es_client.indices.delete(index=index_name, ignore=[400, 404])


Created index: recipes
Indexed 20000 recipes into Elasticsearch.


User related database were created in docker-compose's sql

Now we can Create flask app to expose api (Moved to api.ipynb for better organization)