# Food Recipe Browser Project

By `652115013 Narongchai Rongthong`

Firstly we load the data from parquet file provided

In [8]:
import pandas as pd

# Check if recipes_df is already loaded
if 'recipes_df' not in globals():
    recipes_df = pd.read_parquet('resource/recipes.parquet')
    recipes_df['RecipeServings'].fillna(0.0, inplace=True)  # Fill NaN with default value
    print(f"Loaded {len(recipes_df)} recipes.")
else:
    print("Recipes data already loaded.")
    print(f"Loaded {len(recipes_df)} recipes.")


Recipes data already loaded.
Loaded 522517 recipes.


In [9]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic", "_Z9BSk2zcMuFD=-1LlAX"),
    ca_certs="~/http_ca.crt"
)

if es_client.ping():
    print("Connected to Elasticsearch")
else:
    print("Elasticsearch connection failed")

Elasticsearch connection failed


Then we can start indexing the data
- applying fields we need
    - id
    - name
    - ingredients
    - instuctions

For searching i want to join those together so its easier to find into `cleaned` "search text"

along with extra cleaned name

Through `stemming` and removing `stopwords`

In [10]:
# Setup text cleaner
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# Exclude specific stopwords
important_stop_words =  {"with", "and"}
custom_stopwords = set(stopwords.words('english')) - important_stop_words  

def clean_text(text):
    tokens = word_tokenize(text.lower())  
    filtered_tokens = [word for word in tokens if word not in custom_stopwords]  
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]  
    return " ".join(stemmed_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Additionally, since if we send very short query like `"t"` or `"to"` we'd get completely empty results
instead we can make it try to show up something that matches their `ngrams`.

In [11]:
from elasticsearch.helpers import bulk
import numpy as np

# Define index name and sample size for development
index_name = "recipes"
sample_size = 1000 # Set the sample size for testing (adjust as needed)

# Delete the index if it already exists
es_client.indices.delete(index=index_name, ignore=[400, 404])

# Create the index with a mapping that uses an English analyzer
mapping = {
    "settings": {
        "analysis": {
            "tokenizer": {
                "ngram_tokenizer": {
                    "type": "ngram",
                    "min_gram": 2,  # Minimum length of n-grams
                    "max_gram": 3,  # Maximum length of n-grams
                    "token_chars": ["letter", "digit"]
                }
            },
            "analyzer": {
                "default": {
                    "type": "english"
                },
                "ngram_analyzer": {  # Add a custom n-gram analyzer
                    "type": "custom",
                    "tokenizer": "ngram_tokenizer",
                    "filter": ["lowercase"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "recipe_id": {"type": "keyword"},
            "name": { 
                "type": "text", 
                "analyzer": "english",
                "fields": { 
                    "ngram": {  # Add an n-gram variant of the name field
                        "type": "text", 
                        "analyzer": "ngram_analyzer"
                    }
                }
            },
            "cleaned_name": {"type": "text", "analyzer": "english"},
            "author_name": {"type": "text", "analyzer": "english"},
            "recipe_category": {"type": "text", "analyzer": "english"},
            "description": {"type": "text", "analyzer": "english"},
            "ingredients": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "raw": {
                        "type": "keyword"
                    }
                }
            },
            "instructions": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "raw": {
                        "type": "keyword"
                    }
                }
            },
            "keywords": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "raw": {
                        "type": "keyword"
                    }
                }
            },
            "search_text": {"type": "text", "analyzer": "english"},
            "image_urls": {"type": "keyword"},
            # Time-related fields
            "cook_time": {"type": "text"},
            "prep_time": {"type": "text"},
            "total_time": {"type": "text"},
            # Nutritional content fields
            "calories": {"type": "float"},
            "fat_content": {"type": "float"},
            "cholesterol_content": {"type": "float"},
            "carbohydrate_content": {"type": "float"},
            "fiber_content": {"type": "float"},
            "sugar_content": {"type": "float"},
            "protein_content": {"type": "float"},
            "recipe_servings": {"type": "float"},
        }
    }
}


# Create the index
es_client.indices.create(index=index_name, body=mapping)
print(f"Created index: {index_name}")

# Get a sample of the recipes for development (you can adjust sample size)
recipes_sample = recipes_df.head(sample_size)

# Prepare the documents for bulk indexing
def generate_docs(df):
    for idx, row in df.iterrows():
        # Main Informations
        recipe_id = str(int(float(row.get('RecipeId', idx))))  # Ensures it's always an integer string
        name = str(row.get('Name') or '')
        cleaned_name = clean_text(name)
        author_name = str(row.get('AuthorName') or '')
        recipe_category = str(row.get('RecipeCategory') or '')
        description = str(row.get('Description') or '')
        
        # Process ingredients as a list of (ingredient, quantity) pairs
        ingredients = list(zip(row.get('RecipeIngredientParts', []), row.get('RecipeIngredientQuantities', [])))
        
        # Process instructions safely
        instructions_val = row.get('RecipeInstructions')
        if instructions_val is None:
            instructions_list = []
        elif isinstance(instructions_val, np.ndarray):
            instructions_list = instructions_val.tolist()
        else:
            instructions_list = instructions_val
        instructions_text = " ".join(map(str, instructions_list)) if instructions_list else ''
        
        # Process keywords safely
        keywords_val = row.get('Keywords')
        if keywords_val is None:
            keywords = []
        elif isinstance(keywords_val, np.ndarray):
            keywords = keywords_val.tolist()
        else:
            keywords = keywords_val
        keywords_text = " ".join(filter(None, map(str, keywords))) if len(keywords) > 0 else ''
        
        # Time
        cook_time = str(row.get('CookTime') or '')
        prep_time = str(row.get('PrepTime') or '')
        total_time = str(row.get('TotalTime') or '')
        
        # Nutritional Contents
        calories = float(row.get('Calories') or 0.0)
        fat_content = float(row.get('FatContent') or 0.0)
        cholesterol_content = float(row.get('CholesterolContent') or 0.0)
        carbohydrate_content = float(row.get('CarbohydrateContent') or 0.0)
        fiber_content = float(row.get('FiberContent') or 0.0)
        sugar_content = float(row.get('SugarContent') or 0.0)
        protein_content = float(row.get('ProteinContent') or 0.0)
        recipe_servings = float(row.get('RecipeServings') or 0.0)
        
        # Process ingredients into text for search_text
        ingredients_text = " ".join([f"{str(ing)} {str(qty)}" for ing, qty in ingredients]) if ingredients else ''
        
        # Combine and clean everything
        combined_text = " ".join([name, description, ingredients_text, instructions_text, keywords_text])
        search_text = clean_text(combined_text)
        
        # Process image_urls safely
        image_urls_val = row.get('Images')
        if image_urls_val is None:
            image_urls = []
        elif isinstance(image_urls_val, np.ndarray):
            image_urls = image_urls_val.tolist()
        else:
            image_urls = image_urls_val

        doc = {
            "_op_type": "index",
            "_index": index_name,
            "_id": recipe_id,
            "_source": {
                "recipe_id": recipe_id,
                "name": name,
                "cleaned_name": cleaned_name,
                "author_name": author_name,
                "recipe_category": recipe_category,
                "description": description,
                "ingredients": ingredients,  # stored as list of tuples
                "instructions": instructions_list,  # stored as a list
                # Time
                "cook_time": cook_time,
                "prep_time": prep_time,
                "total_time": total_time,
                # Nutritional Contents
                "calories": calories,
                "fat_content": fat_content,
                "cholesterol_content": cholesterol_content,
                "carbohydrate_content": carbohydrate_content,
                "fiber_content": fiber_content,
                "sugar_content": sugar_content,
                "protein_content": protein_content,
                "recipe_servings": recipe_servings,
                # Searching words
                "keywords": keywords,
                "search_text": search_text,
                "image_urls": image_urls
            }
        }
        yield doc


# Bulk index the sample documents
run_all = False
if run_all == True:
    bulk(es_client, generate_docs(recipes_df)) # full size
else:
    bulk(es_client, generate_docs(recipes_sample)) # limited size

print(f"Indexed {len(recipes_sample)} recipes into Elasticsearch.")


  es_client.indices.delete(index=index_name, ignore=[400, 404])


ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: ProtocolError(('Connection aborted.', ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None))))

User related database were created in docker-compose's sql

Create flask app to expose api

In [None]:
# --- Flask API Endpoints ---
import os
import json
import time
import random
import uuid
from flask_cors import CORS
from flask import Flask, request, jsonify, g
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)
CORS(app, supports_credentials=True, resources={r"/*": {"origins": "*"}})
# Connection to database
app.config['SQLALCHEMY_DATABASE_URI'] = 'mysql+pymysql://user:user_password@localhost:3309/my_database'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False

db = SQLAlchemy(app)
# Development mode token (for easier development)
DEV_TOKEN = "dev" 

def generate_token():
    return str(random.randint(100000, 999999))

class User(db.Model):
    __tablename__ = "users"
    
    username = db.Column(db.String(50), primary_key=True)
    password_hash = db.Column(db.String(255), nullable=False)

    sessions = db.relationship("Session", backref="user", cascade="all, delete", lazy=True)
    bookmarks = db.relationship("Bookmark", backref="user", cascade="all, delete", lazy=True)
    folders = db.relationship("Folder", backref="user", cascade="all, delete", lazy=True)


class Session(db.Model):
    __tablename__ = "sessions"

    token = db.Column(db.String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    username = db.Column(db.String(50), db.ForeignKey("users.username", ondelete="CASCADE"), nullable=False)


class Bookmark(db.Model):
    __tablename__ = "bookmarks"

    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    username = db.Column(db.String(50), db.ForeignKey("users.username", ondelete="CASCADE"), nullable=False)
    recipe_id = db.Column(db.Integer, nullable=False)
    rating = db.Column(db.Integer, nullable=True)

    created_at = db.Column(db.TIMESTAMP, server_default=db.func.current_timestamp())

    __table_args__ = (
        db.CheckConstraint("rating BETWEEN 1 AND 5", name="valid_rating"),
    )


class Folder(db.Model):
    __tablename__ = "folders"

    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    username = db.Column(db.String(50), db.ForeignKey("users.username", ondelete="CASCADE"), nullable=False)
    folder_name = db.Column(db.String(100), nullable=False)

    folder_recipes = db.relationship("FolderRecipe", backref="folder", cascade="all, delete", lazy=True)

    __table_args__ = (
        db.UniqueConstraint("username", "folder_name", name="unique_folder"),
    )


class FolderRecipe(db.Model):
    __tablename__ = "folder_recipes"

    folder_id = db.Column(db.Integer, db.ForeignKey("folders.id", ondelete="CASCADE"), primary_key=True)
    recipe_id = db.Column(db.Integer, primary_key=True)


: 

In [None]:
# App routes

@app.before_request
def start_timer():
    g.start_time = time.time()

@app.after_request
def add_elapsed_time(response):
    if hasattr(g, 'start_time'):
        response_time = time.time() - g.start_time
        response_json = response.get_json()
        if response_json:  # Only modify if response is JSON
            response_json["response_time"] = round(response_time, 4)
            response.set_data(json.dumps(response_json))  # Update response body
    return response

# USER HANDLING
# UC-001: User Authentication (using the database)
from werkzeug.security import generate_password_hash
from werkzeug.security import check_password_hash
@app.route('/register', methods=['POST'])
def register():
    data = request.get_json()
    username = data.get("username")
    password = data.get("password")

    # Validate that username and password are provided
    if not username or not password:
        return jsonify({"message": "Username and password are required"}), 400

    # Check if the username already exists
    existing_user = User.query.filter_by(username=username).first()
    if existing_user:
        return jsonify({"message": "Username already taken"}), 400

    # Hash the password before saving it to the database
    password_hash = generate_password_hash(password)

    # Create a new user and save it to the database
    new_user = User(username=username, password_hash=password_hash)
    db.session.add(new_user)
    db.session.commit()

    return jsonify({"message": "User registered successfully"}), 201

@app.route('/login', methods=['POST'])
def login():
    data = request.get_json()
    username = data.get("username")
    password = data.get("password")
    
    user = User.query.filter_by(username=username).first()
    if user and check_password_hash(user.password_hash, password):
        token = generate_token()
        new_session = Session(token=token, username=user.username)
        db.session.add(new_session)
        db.session.commit()
        return jsonify({"message": "Login successful", "username": username, "token": token})
    
    return jsonify({"message": "Invalid credentials"}), 401

@app.route('/logout', methods=['POST'])
def logout():
    token = request.headers.get("Authorization")
    session_obj = Session.query.filter_by(token=token).first()
    if session_obj:
        db.session.delete(session_obj)
        db.session.commit()
        return jsonify({"message": "Logout successful"})
    
    return jsonify({"message": "Invalid token"}), 401

# Helper function to check authentication
def is_authenticated(request):
    token = request.headers.get("Authorization")
    if token == DEV_TOKEN:
        return True
    return Session.query.filter_by(token=token).first() is not None

# SEARCHING
# UC-002 & UC-003: Recipe Search Functionality & Display Results
@app.route('/search', methods=['GET'])
def search():
    if not is_authenticated(request):
        return jsonify({"message": "Unauthorized"}), 401
    
    query = request.args.get("query", "")
    cleaned_query = clean_text(query)
    res = es_client.search(index=index_name, body={
        "query": {
            "bool": {
                "should": [
                    { "match": { "name": { "query": query, "boost": 3 } } },
                    { "match": { "name.ngram": { "query": query, "boost": 2 } } },
                    { "match": { "stemmed_name": { "query": cleaned_query, "boost": 2 } } },
                    { "match": { "search_text": { "query": cleaned_query, "fuzziness": "AUTO", "boost": 1 } } }
                ]
            }
        }
    })
    hits = res["hits"]["hits"]
    results = [
        {
            "recipe_id": hit["_source"]["recipe_id"],
            "name": hit["_source"]["name"],
            "snippet": hit["_source"]["description"][:75],
            "image_urls": hit["_source"].get("image_urls", "")
        } for hit in hits
    ]
    return jsonify({"results": results})

@app.route('/search_nearest_image', methods=['GET'])
def search_nearest_image():
    if not is_authenticated(request):
        return jsonify({"message": "Unauthorized"}), 401
    
    query = request.args.get("query", "")
    cleaned_query = clean_text(query)
    res = es_client.search(index=index_name, body={
        "query": {
            "bool": {
                "should": [
                    { "match": { "name": { "query": query, "boost": 3 } } },
                    { "match": { "name.ngram": { "query": query, "boost": 2 } } },
                    { "match": { "stemmed_name": { "query": cleaned_query, "boost": 2 } } },
                    { "match": { "search_text": { "query": cleaned_query, "fuzziness": "AUTO", "boost": 1 } } }
                ]
            }
        }
    })
    
    hits = res["hits"]["hits"]
    for hit in hits:
        top_hit = hit["_source"]
        if "image_urls" in top_hit and top_hit["image_urls"]:
            return jsonify({"result": {
                "recipe_id": top_hit["recipe_id"],
                "name": top_hit["name"],
                "image_urls": top_hit["image_urls"]
            }})
    
    return jsonify({"message": "No results with images found"}), 404

# UC-004: Detailed Dish Information
@app.route('/recipe/<recipe_id>', methods=['GET'])
def recipe_detail(recipe_id):
    if not is_authenticated(request):
        return jsonify({"message": "Unauthorized"}), 401
    res = es_client.get(index=index_name, id=recipe_id)
    result = res["_source"]
    result.pop("cleaned_name", None)
    result.pop("search_text", None)
    return jsonify(result)

# UC-006: Bookmarking and Rating (using the database)
@app.route('/bookmark', methods=['POST'])
def bookmark():
    if not is_authenticated(request):
        return jsonify({"message": "Unauthorized"}), 401
    
    data = request.get_json()
    recipe_id = data.get("recipe_id")
    rating = data.get("rating")
    token = request.headers.get("Authorization")
    session_obj = Session.query.filter_by(token=token).first()
    if not session_obj:
        return jsonify({"message": "Invalid session"}), 401
    
    new_bookmark = Bookmark(user_id=session_obj.user_id, recipe_id=recipe_id, rating=rating)
    db.session.add(new_bookmark)
    db.session.commit()
    return jsonify({"message": "Bookmarked successfully"})

# UC-005: Folder Management
@app.route('/folders', methods=['GET', 'POST'])
def folders():
    # Authorization
    token = request.headers.get("Authorization")
    session_obj = Session.query.filter_by(token=token).first()
    if not session_obj:
        return jsonify({"message": "Unauthorized"}), 401

    username = session_obj.username  # Get the authenticated user

    if request.method == 'GET':
        # Fetch all folders for the user
        user_folders = Folder.query.filter_by(username=username).all()
        return jsonify([folder.folder_name for folder in user_folders])

    elif request.method == 'POST':
        data = request.get_json()
        folder_name = data.get("folder_name")

        if not folder_name:
            return jsonify({"message": "Folder name is required"}), 400

        # Check if the folder already exists
        existing_folder = Folder.query.filter_by(username=username, folder_name=folder_name).first()
        if existing_folder:
            return jsonify({"message": "Folder already exists"}), 400

        # Create and save the new folder
        new_folder = Folder(username=username, folder_name=folder_name)
        db.session.add(new_folder)
        db.session.commit()

        return jsonify({"message": f"Folder '{folder_name}' created"}), 201


: 

In [None]:
# Run the Flask app on port 5000
app.run(port=5000, debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [11/Mar/2025 23:16:32] "OPTIONS /login HTTP/1.1" 200 -
127.0.0.1 - - [11/Mar/2025 23:16:32] "POST /login HTTP/1.1" 200 -
127.0.0.1 - - [11/Mar/2025 23:18:37] "OPTIONS /login HTTP/1.1" 200 -
127.0.0.1 - - [11/Mar/2025 23:18:38] "POST /login HTTP/1.1" 200 -
127.0.0.1 - - [11/Mar/2025 23:19:27] "OPTIONS /login HTTP/1.1" 200 -
127.0.0.1 - - [11/Mar/2025 23:19:27] "POST /login HTTP/1.1" 200 -
127.0.0.1 - - [11/Mar/2025 23:23:58] "OPTIONS /login HTTP/1.1" 200 -
127.0.0.1 - - [11/Mar/2025 23:23:58] "POST /login HTTP/1.1" 200 -
127.0.0.1 - - [11/Mar/2025 23:26:33] "OPTIONS /login HTTP/1.1" 200 -
127.0.0.1 - - [11/Mar/2025 23:26:34] "POST /login HTTP/1.1" 200 -
127.0.0.1 - - [11/Mar/2025 23:29:11] "OPTIONS /login HTTP/1.1" 200 -
127.0.0.1 - - [11/Mar/2025 23:29:11] "POST /login HTTP/1.1" 401 -
127.0.0.1 - - [11/Mar/2025 23:29:13] "POST /login HTTP/1.1" 200 -
127.0.0.1 - - [12/Mar/2025 16:03:17] "GET /predict HTTP/1.1" 404 -


: 

### Testing Instructions

1. **Authentication:**
   - **Login:** Use a REST client (or cURL) to POST to `/login` with a JSON payload containing `username` and `password`, e.g.:
     ```json
     {"username": "user1", "password": "password1"}
     ```
     You will receive a response with a token:
     ```json
     {
       "message": "Login successful",
       "token": "<token>"
     }
     ```
     Use this token in the `Authorization` header for subsequent requests.

   - **Logout:** POST to `/logout` with the token in the `Authorization` header to log out:
     ```json
     {"Authorization": "<token>"}
     ```
     The response will confirm successful logout:
     ```json
     {"message": "Logout successful"}
     ```

2. **Search Recipes:**
   - **Search by Query:** Send a GET request to `/search?query=chicken` with the `Authorization` header:
     ```json
     {"Authorization": "<token>"}
     ```
     The response will return matching recipes:
     ```json
     {
       "results": [
         {
           "recipe_id": 123,
           "name": "Grilled Chicken",
           "snippet": "A delicious grilled chicken recipe...",
           "image_urls": ["url1", "url2"]
         },
         ...
       ]
     }
     ```

3. **Search for Recipes with Images:**
   - **Search Nearest Image:** Send a GET request to `/search_nearest_image?query=chicken` with the `Authorization` header. If there are results with images, you will get a response like:
     ```json
     {
       "result": {
         "recipe_id": 123,
         "name": "Grilled Chicken",
         "image_urls": ["url1", "url2"]
       }
     }
     ```
     If no images are found:
     ```json
     {"message": "No results with images found"}
     ```

4. **Detailed Recipe Information:**
   - **Recipe Details:** Send a GET request to `/recipe/<recipe_id>` (replace `<recipe_id>` with a valid ID). The response will return the full details of the recipe:
     ```json
     {
       "recipe_id": 123,
       "name": "Grilled Chicken",
       "ingredients": "chicken, spices, oil...",
       "steps": ["Step 1", "Step 2"],
       "image_urls": ["url1", "url2"]
     }
     ```

5. **Bookmarking a Recipe:**
   - **Bookmark Recipe:** Send a POST request to `/bookmark` with a JSON payload containing `recipe_id` and an optional `rating`:
     ```json
     {
       "recipe_id": 123,
       "rating": 4
     }
     ```
     The response will confirm the bookmark:
     ```json
     {"message": "Bookmarked successfully"}
     ```

6. **Folder Management:**
   - **View Folders:** Send a GET request to `/folders` with the `Authorization` header. The response will list the user’s folders:
     ```json
     {
       "folders": ["Favorites", "Quick Meals", ...]
     }
     ```

   - **Create Folder:** Send a POST request to `/folders` with a JSON payload containing `folder_name`:
     ```json
     {
       "folder_name": "Healthy Recipes"
     }
     ```
     If successful, the response will be:
     ```json
     {"message": "Folder 'Healthy Recipes' created"}
     ```

     If the folder already exists, you will receive an error:
     ```json
     {"message": "Folder already exists"}
     ```

   - **Unauthorized Requests:** For any request that requires authentication (e.g., search, bookmarking, folders), if no valid token is provided in the `Authorization` header, you will receive:
     ```json
     {"message": "Unauthorized"}
     ```

7. **Response Time:**
   - Each response will include the `response_time` in milliseconds, which can be checked for performance testing.