# Food Recipe Browser Project

By `652115013 Narongchai Rongthong`

Firstly we load the data from parquet file provided

In [57]:
import pandas as pd

# Check if recipes_df is already loaded
if 'recipes_df' not in globals():
    recipes_df = pd.read_parquet('resource/recipes.parquet')
    recipes_df['RecipeServings'].fillna(0.0, inplace=True)  # Fill NaN with default value
    print(f"Loaded {len(recipes_df)} recipes.")
else:
    print("Recipes data already loaded.")
    print(f"Loaded {len(recipes_df)} recipes.")


Recipes data already loaded.
Loaded 522517 recipes.


In [58]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic", "_Z9BSk2zcMuFD=-1LlAX"),
    ca_certs="~/http_ca.crt"
)

if es_client.ping():
    print("Connected to Elasticsearch")
else:
    print("Elasticsearch connection failed")

Connected to Elasticsearch


Then we can start indexing the data
- applying fields we need
    - id
    - name
    - ingredients
    - instuctions

For searching i want to join those together so its easier to find into `cleaned` "search text"

along with extra cleaned name

Through `stemming` and removing `stopwords`

In [59]:
# Setup text cleaner
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# Exclude specific stopwords
important_stop_words =  {"with", "and"}
custom_stopwords = set(stopwords.words('english')) - important_stop_words  

def clean_text(text):
    tokens = word_tokenize(text.lower())  
    filtered_tokens = [word for word in tokens if word not in custom_stopwords]  
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]  
    return " ".join(stemmed_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Additionally, since if we send very short query like `"t"` or `"to"` we'd get completely empty results
instead we can make it try to show up something that matches their `ngrams`.

In [None]:
from elasticsearch.helpers import bulk
import numpy as np

# Define index name and sample size for development
index_name = "recipes"
sample_size = 1000 # Set the sample size for testing (adjust as needed)

# Delete the index if it already exists
es_client.indices.delete(index=index_name, ignore=[400, 404])

# Create the index with a mapping that uses an English analyzer
mapping = {
    "settings": {
        "analysis": {
            "tokenizer": {
                "ngram_tokenizer": {
                    "type": "ngram",
                    "min_gram": 2,  # Minimum length of n-grams
                    "max_gram": 3,  # Maximum length of n-grams
                    "token_chars": ["letter", "digit"]
                }
            },
            "analyzer": {
                "default": {
                    "type": "english"
                },
                "ngram_analyzer": {  # Add a custom n-gram analyzer
                    "type": "custom",
                    "tokenizer": "ngram_tokenizer",
                    "filter": ["lowercase"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "recipe_id": {"type": "keyword"},
            "name": { 
                "type": "text", 
                "analyzer": "english",
                "fields": { 
                    "ngram": {  # Add an n-gram variant of the name field
                        "type": "text", 
                        "analyzer": "ngram_analyzer"
                    }
                }
            },
            "cleaned_name": {"type": "text", "analyzer": "english"},
            "author_name": {"type": "text", "analyzer": "english"},
            "recipe_category": {"type": "text", "analyzer": "english"},
            "description": {"type": "text", "analyzer": "english"},
            "ingredients": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "raw": {
                        "type": "keyword"
                    }
                }
            },
            "instructions": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "raw": {
                        "type": "keyword"
                    }
                }
            },
            "keywords": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "raw": {
                        "type": "keyword"
                    }
                }
            },
            "search_text": {"type": "text", "analyzer": "english"},
            "image_urls": {"type": "keyword"},
            # Time-related fields
            "cook_time": {"type": "text"},
            "prep_time": {"type": "text"},
            "total_time": {"type": "text"},
            # Nutritional content fields
            "calories": {"type": "float"},
            "fat_content": {"type": "float"},
            "cholesterol_content": {"type": "float"},
            "carbohydrate_content": {"type": "float"},
            "fiber_content": {"type": "float"},
            "sugar_content": {"type": "float"},
            "protein_content": {"type": "float"},
            "recipe_servings": {"type": "float"},
        }
    }
}


# Create the index
es_client.indices.create(index=index_name, body=mapping)
print(f"Created index: {index_name}")

# Get a sample of the recipes for development (you can adjust sample size)
recipes_sample = recipes_df.head(sample_size)

# Prepare the documents for bulk indexing
def generate_docs(df):
    for idx, row in df.iterrows():
        # Main Informations
        recipe_id = str(int(float(row.get('RecipeId', idx))))  # Ensures it's always an integer string
        name = row.get('Name', '')
        cleaned_name = clean_text(name)
        author_name = row.get('AuthorName', '')
        recipe_category = row.get('RecipeCategory', '')
        description = row.get('Description', '')
        ingredients = list(zip(row.get('RecipeIngredientParts', []), row.get('RecipeIngredientQuantities', [])))
        instructions = row.get('RecipeInstructions', [])
        keywords = row.get('Keywords', [])
        
        # Time
        cook_time = row.get('CookTime', '')
        prep_time = row.get('PrepTime', '')
        total_time = row.get('TotalTime', '')
        
        # Nutritional Contents
        calories = row.get('Calories', 0.0)
        fat_content = row.get('FatContent', 0.0)
        cholesterol_content = row.get('CholesterolContent', 0.0)
        carbohydrate_content = row.get('CarbohydrateContent', 0.0)
        fiber_content = row.get('FiberContent', 0.0)
        sugar_content = row.get('SugarContent', 0.0)
        protein_content = row.get('ProteinContent', 0.0)
        recipe_servings = row.get('RecipeServings', 0.0)
        
        # Process instructions: if it's a numpy array, convert it to a list
        if instructions is not None:
            if isinstance(instructions, np.ndarray):
                instructions_list = instructions.tolist()
            else:
                instructions_list = instructions
        else:
            instructions_list = []
            
        instructions_text = " ".join(map(str, instructions_list)) if len(instructions_list) > 0 else ''

        # Process ingredients text (list of tuples)
        ingredients_text = " ".join([f"{ingredient} {quantity}" for ingredient, quantity in ingredients]) if ingredients else ''
        # Process keywords
        # Convert keywords to a list if it is a numpy array
        keywords_list = list(keywords) if isinstance(keywords, np.ndarray) else keywords
        keywords_text = " ".join(filter(None, map(str, keywords_list))) if len(keywords_list) > 0 else ''

        # Combine and clean everything
        search_text = clean_text(" ".join([name, description, ingredients_text, instructions_text, keywords_text]))
        
        # Handle image_url field
        image_urls = row.get('Images', [])

        doc = {
            "_op_type": "index",
            "_index": index_name,
            "_id": recipe_id,
            "_source": {
                "recipe_id": recipe_id,
                "name": name,
                "cleaned_name": cleaned_name,
                "author_name": author_name,
                "recipe_category": recipe_category,
                "description": description,
                "ingredients": ingredients,
                "instructions": instructions,  # You can also store the list as is
                # Time
                "cook_time": cook_time,
                "prep_time": prep_time,
                "total_time": total_time,
                # Nutritional Contents
                "calories": calories,
                "fat_content": fat_content,
                "cholesterol_content": cholesterol_content,
                "carbohydrate_content": carbohydrate_content,
                "fiber_content": fiber_content,
                "sugar_content": sugar_content,
                "protein_content": protein_content,
                "recipe_servings": recipe_servings,
                # Searching words
                "keywords": keywords,
                "search_text": search_text,
                "image_urls": image_urls
            }
        }
        yield doc


# Bulk index the sample documents
bulk(es_client, generate_docs(recipes_sample)) # limited size
# bulk(es_client, generate_docs(recipes_df)) # full size

print(f"Indexed {len(recipes_sample)} recipes into Elasticsearch.")


  es_client.indices.delete(index=index_name, ignore=[400, 404])


Created index: recipes


KeyboardInterrupt: 

Create user system for token and tracking for recommendations

In [None]:
# --- Dummy User & In-Memory Data for Auth, Bookmarks, and Folders ---

# Dummy user database (for UC-001: Authentication)
users = {
    "user1": "password1",
    "user2": "password2"
}

# In-memory storage
sessions = {}  # token -> username
user_bookmarks = {}  # username -> list of {recipe_id, rating}
user_folders = {}   # username -> {folder_name: [recipe_ids]}

import uuid

def generate_token():
    return str(uuid.uuid4())

print("Initialized dummy user authentication and storage.")

Initialized dummy user authentication and storage.


Create flask app to expose api

In [None]:
# --- Flask API Endpoints ---
import json
import time
import random
from flask_cors import CORS
from flask import Flask, request, jsonify

app = Flask(__name__)
CORS(app, supports_credentials=True, resources={r"/*": {"origins": "*"}})

# Development mode token (for easier development)
DEV_TOKEN = "dev" 

def generate_token():
    return str(random.randint(100000, 999999))

# Including elapsed time
from flask import g

@app.before_request
def start_timer():
    g.start_time = time.time()

@app.after_request
def add_elapsed_time(response):
    if hasattr(g, 'start_time'):
        response_time = time.time() - g.start_time
        response_json = response.get_json()
        if response_json:  # Only modify if response is JSON
            response_json["response_time"] = round(response_time, 4)
            response.set_data(json.dumps(response_json))  # Update response body
    return response


# UC-001: User Authentication
@app.route('/login', methods=['POST'])
def login():
    data = request.get_json()
    username = data.get("username")
    password = data.get("password")
    if username in users and users[username] == password:
        token = generate_token()
        sessions[token] = username
        return jsonify({"message": "Login successful", "token": token})
    return jsonify({"message": "Invalid credentials"}), 401

@app.route('/logout', methods=['POST'])
def logout():
    token = request.headers.get("Authorization")
    if token in sessions:
        sessions.pop(token)
        return jsonify({"message": "Logout successful"})
    return jsonify({"message": "Invalid token"}), 401

# Helper function to check authentication
def is_authenticated(request):
    token = request.headers.get("Authorization")
    return token == DEV_TOKEN or token in sessions

# UC-002 & UC-003: Recipe Search Functionality & Display Results
@app.route('/search', methods=['GET'])
def search():
    # Authorization
    if not is_authenticated(request):
        return jsonify({"message": "Unauthorized"}), 401
    
    # Responds
    query = request.args.get("query", "")
    cleaned_query = clean_text(query)
    res = es_client.search(index=index_name, body={
        "query": {
            "bool": {
                "should": [
                    { "match": { "name": { "query": query, "boost": 3 } } },  # No stemming (Best match)
                    { "match": { "name.ngram": { "query": query, "boost": 2 } } },  # Partial match with n-grams
                    { "match": { "stemmed_name": { "query": cleaned_query, "boost": 2 } } },  # Stemmed query
                    { "match": { "search_text": { "query": cleaned_query, "fuzziness": "AUTO", "boost": 1 } } }  # Stemmed + Fuzzy
                ]
            }
        }
    })
    hits = res["hits"]["hits"]
    results = [
        {
            "recipe_id": hit["_source"]["recipe_id"],
            "name": hit["_source"]["name"],
            "snippet": hit["_source"]["description"][:75],
            "image_urls": hit["_source"].get("image_urls", "")
        } for hit in hits
    ]
    return jsonify({"results": results})

# I'll be using this to get image when result gives no image
@app.route('/search_nearest_image', methods=['GET'])
def search_nearest_image():
    # Authorization
    if not is_authenticated(request):
        return jsonify({"message": "Unauthorized"}), 401
    
    # Responds
    query = request.args.get("query", "")
    cleaned_query = clean_text(query)
    
    res = es_client.search(index=index_name, body={
        "query": {
            "bool": {
                "should": [
                    { "match": { "name": { "query": query, "boost": 3 } } },  # No stemming (Best match)
                    { "match": { "name.ngram": { "query": query, "boost": 2 } } },  # Partial match with n-grams
                    { "match": { "stemmed_name": { "query": cleaned_query, "boost": 2 } } },  # Stemmed query
                    { "match": { "search_text": { "query": cleaned_query, "fuzziness": "AUTO", "boost": 1 } } }  # Stemmed + Fuzzy
                ]
            }
        }
    })
    
    # Get all hits
    hits = res["hits"]["hits"]
    
    # Iterate over hits to find the first result with an image
    for hit in hits:
        top_hit = hit["_source"]
        if "image_urls" in top_hit and top_hit["image_urls"]:
            result = {
                "recipe_id": top_hit["recipe_id"],
                "name": top_hit["name"],
                "image_urls": top_hit["image_urls"]
            }
            return jsonify({"result": result})
    
    # If no image was found
    return jsonify({"message": "No results with images found"}), 404



# UC-004: Detailed Dish Information
@app.route('/recipe/<recipe_id>', methods=['GET'])
def recipe_detail(recipe_id):
    # Authorization
    if not is_authenticated(request):
        return jsonify({"message": "Unauthorized"}), 401

    # Fetch the document from Elasticsearch
    res = es_client.get(index=index_name, id=recipe_id)
    result = res["_source"]

    # Remove unwanted fields
    result.pop("cleaned_name", None)
    result.pop("search_text", None)

    return jsonify(result)


# UC-006: Bookmarking and Rating
@app.route('/bookmark', methods=['POST'])
def bookmark():
    # Authorization
    if not is_authenticated(request):
        return jsonify({"message": "Unauthorized"}), 401
    data = request.get_json()
    recipe_id = data.get("recipe_id")
    rating = data.get("rating")
    username = sessions.get(request.headers.get("Authorization"), "dev_user")
    user_bookmarks.setdefault(username, []).append({"recipe_id": recipe_id, "rating": rating})
    return jsonify({"message": "Bookmarked successfully"})

# UC-005: Folder Management
@app.route('/folders', methods=['GET', 'POST'])
def folders():
    # Authorization
    if not is_authenticated(request):
        return jsonify({"message": "Unauthorized"}), 401
    username = sessions.get(request.headers.get("Authorization"), "dev_user")
    if request.method == 'GET':
        return jsonify(user_folders.get(username, {}))
    elif request.method == 'POST':
        data = request.get_json()
        folder_name = data.get("folder_name")
        user_folders.setdefault(username, {})[folder_name] = []
        return jsonify({"message": f"Folder '{folder_name}' created"})

# UC-007: Personalized Recommendations (dummy implementation)
@app.route('/recommendations', methods=['GET'])
def recommendations():
    # Authorization
    if not is_authenticated(request):
        return jsonify({"message": "Unauthorized"}), 401
    res = es_client.search(index=index_name, body={
        "query": {"match_all": {}},
        "size": 5
    })
    hits = res["hits"]["hits"]
    recs = [
        {
            "recipe_id": hit["_source"]["recipe_id"],
            "name": hit["_source"]["name"],
            "snippet": hit["_source"]["search_text"][:150],
            "image_urls": hit["_source"].get("image_urls", "")
        } for hit in hits
    ]
    return jsonify({"recommendations": recs})

print("Flask API endpoints defined.")


Flask API endpoints defined.


In [None]:
# Run the Flask app on port 5000
app.run(port=5000, debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [06/Mar/2025 23:04:36] "GET /recipe/109 HTTP/1.1" 200 -
127.0.0.1 - - [06/Mar/2025 23:06:51] "OPTIONS /search_nearest_image?query=cake HTTP/1.1" 200 -
127.0.0.1 - - [06/Mar/2025 23:06:51] "GET /search_nearest_image?query=cake HTTP/1.1" 200 -
127.0.0.1 - - [06/Mar/2025 23:06:51] "OPTIONS /search?query=cake HTTP/1.1" 200 -
127.0.0.1 - - [06/Mar/2025 23:06:52] "GET /search?query=cake HTTP/1.1" 200 -
127.0.0.1 - - [06/Mar/2025 23:07:13] "OPTIONS /search_nearest_image?query=ca HTTP/1.1" 200 -
127.0.0.1 - - [06/Mar/2025 23:07:13] "GET /search_nearest_image?query=ca HTTP/1.1" 200 -
127.0.0.1 - - [06/Mar/2025 23:07:13] "OPTIONS /search?query=ca HTTP/1.1" 200 -
127.0.0.1 - - [06/Mar/2025 23:07:14] "GET /search?query=ca HTTP/1.1" 200 -
127.0.0.1 - - [06/Mar/2025 23:07:29] "OPTIONS /search_nearest_image?query=poyato HTTP/1.1" 200 -
127.0.0.1 - - [06/Mar/2025 23:07:29] "GET /search_nearest_image?query=poyato HTTP/1.1" 200 -
127

### Testing Instructions

1. **Authentication:** Use a REST client (or cURL) to POST to `/login` with JSON payload, e.g.: 
   ```json
   {"username": "user1", "password": "password1"}
   ```
   You'll receive a token in the response. Use that token in the `Authorization` header for subsequent requests.

2. **Search:** GET `/search?query=chicken` with the header `Authorization: <token>` to retrieve matching recipes.

3. **Detailed View:** GET `/recipe/<recipe_id>` to fetch full details for a recipe.

4. **Bookmarking:** POST to `/bookmark` with JSON payload containing a `recipe_id` and an optional `rating`.

5. **Folder Management:** GET or POST to `/folders` to list or create folders.

6. **Recommendations:** GET `/recommendations` to retrieve a list of recommended recipes (dummy implementation).