---

## Section A: Setup and Configuration

## Prerequisites

- **Environment:** Google Colab (recommended for data persistence via Drive)
- **Credentials:** Google account with Drive access
- **API:** Open Food Facts (free, no key required)

Run cells in order from top to bottom.

# Notebook 01: Data Collection and Quality Check

Collect product images and allergen data from the Open Food Facts API, with a focus on Malaysian snacks and popular English-language brands.

**Outputs:**
- Raw images: `data/raw/` (product images and ingredient labels)
- Dataset metadata: `data/annotations.csv` and `data/annotations_malaysia_only.csv`
- Dataset inventory: `allergen_dictionary.json`

**Environment:** Google Colab (mounts Google Drive)

In [None]:
# Step 1: Install required packages
# The leading `!` hands the line to the notebook's shell; `-q` asks pip for quiet output.
!pip install -q requests pandas tqdm pillow

In [None]:
# Step 2: Import libraries and setup Google Colab
import requests
import pandas as pd
from tqdm.notebook import tqdm
import os
import time
from google.colab import drive

# Mount Google Drive so collected data is kept on Drive (data persistence)
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Step 3: Setup data directory structure

# All outputs of this notebook are written below this Google Drive folder
data_dir = "/content/drive/MyDrive/allergen-detection-fyp/data"

# Create the sub-folders; exist_ok makes re-running this cell harmless
for subfolder in ("raw", "processed"):
    os.makedirs(os.path.join(data_dir, subfolder), exist_ok=True)

print("‚úì Setup complete")
print(f"Data directory: {data_dir}")
print(f"Raw images folder: {os.path.join(data_dir, 'raw')}")

# Step 4: Configure Open Food Facts API and Search Strategy

Configure the Open Food Facts API endpoint and define the search parameters used to fetch Malaysian snacks and popular English-language brands.

In [None]:
# Search endpoint and paging
base_url = "https://world.openfoodfacts.org/api/v2/search"
page_size = 100

# Product fields to retrieve (focus on English-language data), grouped by purpose
identity_fields = ["code", "product_name", "lang"]
image_fields = ["selected_images", "image_ingredients_url", "image_url"]
allergen_fields = ["allergens_tags", "allergens"]
ingredient_fields = ["ingredients_text", "ingredients_text_en"]
origin_fields = ["countries_tags", "brands", "popularity_key"]

fields = [
    *identity_fields,
    *image_fields,
    *allergen_fields,
    *ingredient_fields,
    *origin_fields,
]
# Comma-separated form of the field list
fields_param = ",".join(fields)

# Popular snack brands to prioritize
popular_brands = [
    "Lay's",
    "Pringles",
    "Doritos",
    "Cheetos",
    "Oreo",
    "Kit Kat",
    "Snickers",
    "M&M's",
    "Nestl√©",
    "Cadbury",
    "Hershey's",
    "Twix",
    "Ruffles",
    "Tostitos",
    "Ritz",
    "Pepperidge Farm",
    "Kellogg's",
]

print("‚úì API configuration ready")
print("Strategy: Malaysia snacks + Popular English-language snack brands")
print(f"Fields to retrieve: {len(fields)}")
print(f"Page size: {page_size}")

In [None]:
# Step 5: Fetch Malaysia snacks data

# Section banner: a blank line, then the title framed by two 60-character rules
malaysia_rule = "=" * 60
print("\n" + malaysia_rule)
print("Fetching MALAYSIA SNACKS")
print(malaysia_rule)
# Code preserved unchanged

In [None]:
# Step 6: Fetch popular English-language snack brands

# Code preserved unchanged

## Section B: Data Collection and Fetching

This section fetches product images and allergen information from the Open Food Facts API and saves them locally for annotation and model training.

In [None]:
# Step 7: Extract English-only image URLs and create annotations
print("Extracting ingredient images and metadata (English only)...")


def _allergen_string(product):
    """Return the product's allergen tags joined as "a, b, c".

    A tag containing a colon is reduced to the segment that follows the
    first colon; other tags are kept as-is. A product without
    ``allergens_tags`` yields an empty string.
    """
    tags = product.get("allergens_tags") or []
    return ", ".join(tag.split(":")[1] if ":" in tag else tag for tag in tags)


def _selected_ingredients_image(product):
    """Return the selected ingredients image URL of *product*, or None.

    Looks under ``selected_images -> ingredients -> display``. The "en"
    entry is preferred; without one, the first entry of any language is used.
    Missing or empty levels of the structure yield None.
    """
    selected = product.get("selected_images") or {}
    display = (selected.get("ingredients") or {}).get("display") or {}
    if "en" in display:
        return display["en"]
    return next(iter(display.values()), None)


annotations = []
used_image_urls = set()
english_only_count = 0   # products that have English ingredient text
skipped_no_english = 0   # products dropped for lacking English ingredient text

for product in products:
    ingredients_text_en = product.get("ingredients_text_en") or ""

    # Skip if no English ingredients text
    if not ingredients_text_en.strip():
        skipped_no_english += 1
        continue
    # Counted here, before the image checks, so the figure really is
    # "products with English text" and not a copy of len(annotations).
    english_only_count += 1

    # Image priority: selected ingredients image, then image_ingredients_url,
    # then the main product image. The fallbacks are consulted only when the
    # product has no selected image. If the chosen URL was already used, the
    # product is skipped instead of falling back, because a fallback row would
    # reuse the same "<code>_en.jpg" filename as the row that already exists.
    img_url = (
        _selected_ingredients_image(product)
        or product.get("image_ingredients_url")
        or product.get("image_url")
    )
    if not img_url or img_url in used_image_urls:
        continue
    used_image_urls.add(img_url)

    code = product.get("code", "")
    # NOTE(review): rows are labelled "en" even when the image came from the
    # any-language or main-image fallback; only the text is known to be English.
    annotations.append({
        "code": code,
        "product_name": product.get("product_name", ""),
        "language": "en",
        "allergens": _allergen_string(product),
        # Underscores are stripped from the ingredient text
        "text": ingredients_text_en.replace("_", ""),
        "image_url": img_url,
        "filename": f"{code}_en.jpg",
    })

print(f"‚úì Total English images found: {len(annotations):,}")
print(f"‚úì Products with English text: {english_only_count:,}")
print(f"‚ö† Products skipped (no English text): {skipped_no_english:,}")

In [None]:
# Step 8: Download images
print("Downloading images...")

downloaded_annotations = []   # entries whose image was fetched and written
failed_downloads = []         # (filename, error message) for every failure

# One Session for the whole loop so HTTP connections are reused across
# requests instead of being re-established for every image.
with requests.Session() as session:
    for entry in tqdm(annotations, desc="Downloading images"):
        img_url = entry["image_url"]
        filename = entry["filename"]
        save_path = os.path.join(data_dir, "raw", filename)

        try:
            res = session.get(img_url, timeout=15)
            res.raise_for_status()  # treat HTTP 4xx/5xx as a failed download

            with open(save_path, "wb") as f:
                f.write(res.content)

        except Exception as e:
            # Deliberately broad: this is a best-effort batch job, so a single
            # bad image is recorded and reported below rather than aborting
            # the whole run.
            failed_downloads.append((filename, str(e)))
        else:
            downloaded_annotations.append(entry)

print(f"\n‚úì Images downloaded successfully: {len(downloaded_annotations):,}")
if failed_downloads:
    print(f"‚ö† Failed downloads: {len(failed_downloads)}")
    print("First 5 failures:")
    for filename, error in failed_downloads[:5]:
        print(f"  - {filename}: {error}")

## Section C: Quality Check and Dataset Export

This section performs comprehensive quality analysis on the collected data, validates the dataset, and exports clean data for annotation and model training.

In [None]:
# Step 9: Create DataFrame and analyze data

# Name the columns explicitly (same keys and order as the annotation dicts
# built in Step 7). With an empty download list, pd.DataFrame([]) would have
# no columns at all and every later df['...'] lookup would raise KeyError.
annotation_columns = [
    "code",
    "product_name",
    "language",
    "allergens",
    "text",
    "image_url",
    "filename",
]
df = pd.DataFrame(downloaded_annotations, columns=annotation_columns)

print(f"‚úì Dataset created with {len(df):,} images")
print(f"\nDataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

In [None]:
# Step 10: Data quality analysis

print("=" * 60)
print("DATA QUALITY ANALYSIS")
print("=" * 60)

total_images = len(df)

if total_images == 0:
    # Nothing to analyse: the column lookups below can raise KeyError on a
    # column-less DataFrame and every percentage would divide by zero.
    # Zero the counters that the final summary cell prints so it still runs.
    print("\nNo images in the dataset - skipping quality analysis.")
    malaysia_count = popular_count = 0
    num_with_allergens = num_without_allergens = 0
    pct_with_allergens = 0.0
    num_with_text = num_with_name = 0
else:
    # Verify all entries are English
    lang_counts = df['language'].value_counts()
    print(f"\nüìä Language distribution:")
    for lang, count in lang_counts.items():
        print(f"  {lang}: {count:,} ({count/total_images*100:.1f}%)")

    # Source breakdown (Malaysia vs Popular): a row counts as Malaysian when
    # its code appears in products_malaysia. A set is used for the lookup.
    print(f"\nüåç Dataset composition:")
    malaysia_codes = {p.get('code') for p in products_malaysia}
    malaysia_count = df['code'].isin(malaysia_codes).sum()
    popular_count = total_images - malaysia_count
    print(f"  Malaysia snacks: {malaysia_count:,} ({malaysia_count/total_images*100:.1f}%)")
    print(f"  Popular global snacks: {popular_count:,} ({popular_count/total_images*100:.1f}%)")

    # Allergen field presence (one mask, reused for the breakdown below so
    # the count and the breakdown always describe the same rows)
    has_allergens = df['allergens'].str.strip() != ""
    num_with_allergens = has_allergens.sum()
    num_without_allergens = total_images - num_with_allergens
    pct_with_allergens = num_with_allergens / total_images * 100

    print(f"\nüîç Allergen information:")
    print(f"  With allergens: {num_with_allergens:,} ({pct_with_allergens:.1f}%)")
    print(f"  Without allergens: {num_without_allergens:,}")

    # Common allergens breakdown
    if num_with_allergens > 0:
        print(f"\nü•ú Most common allergens:")
        # Flatten the comma-separated allergen strings into one list of names
        all_allergens = [
            a.strip()
            for allergen_str in df.loc[has_allergens, 'allergens']
            for a in allergen_str.split(",")
        ]

        allergen_counts = pd.Series(all_allergens).value_counts()
        for allergen, count in allergen_counts.head(10).items():
            print(f"  {allergen}: {count:,}")

    # Text field presence; the average length is taken over the same rows
    # that are counted as having text
    has_text = df['text'].str.strip() != ""
    num_with_text = has_text.sum()
    avg_text_length = df.loc[has_text, 'text'].str.len().mean()
    print(f"\nüìù Ingredient text analysis:")
    print(f"  With text: {num_with_text:,} ({num_with_text/total_images*100:.1f}%)")
    print(f"  Average text length: {avg_text_length:.0f} characters")

    # Product name presence
    num_with_name = (df['product_name'].str.strip() != "").sum()
    print(f"\nüè∑Ô∏è Product name presence:")
    print(f"  With name: {num_with_name:,} ({num_with_name/total_images*100:.1f}%)")
    print(f"  Without name: {total_images - num_with_name:,}")

In [None]:
# Step 11: Save annotations to CSV
# The file is written directly inside data_dir; the DataFrame index is omitted.
csv_path = os.path.join(data_dir, "annotations.csv")
df.to_csv(path_or_buf=csv_path, index=False)
print(f"\n‚úì Annotations saved to: {csv_path}")

In [None]:
# Step 12: Display sample images from dataset
from IPython.display import Image, display
import random

sample_rule = "=" * 60
print("\n" + sample_rule)
print("SAMPLE IMAGES FROM DATASET")
print(sample_rule)

# Pick up to 5 distinct row positions at random (fewer if the dataset is smaller)
sample_size = min(5, len(df))
sample_indices = random.sample(range(len(df)), sample_size)

for i, idx in enumerate(sample_indices, start=1):
    row = df.iloc[idx]

    # Metadata for the sampled row
    print(f"\nüì¶ Sample {i}/{sample_size}")
    print(f"Product: {row['product_name']}")
    print(f"Language: {row['language']}")
    print(f"Allergens: {row['allergens'] if row['allergens'] else 'None listed'}")

    # Long ingredient texts are cut to a 100-character preview
    if len(row['text']) > 100:
        print(f"Text preview: {row['text'][:100]}...")
    else:
        print(f"Text: {row['text']}")

    # Render the image from the raw folder, if it is there
    img_path = os.path.join(data_dir, "raw", row['filename'])
    if not os.path.exists(img_path):
        print("‚ö† Image file not found")
    else:
        display(Image(filename=img_path, width=400))
    print("-" * 60)

In [None]:
# Step 13: Final summary statistics and dataset evaluation
summary_rule = "=" * 60

# Headline figures collected by the earlier cells
print("\n" + summary_rule)
print("FINAL SUMMARY - TARGET: 10,000 IMAGES")
print(summary_rule)
summary_lines = (
    f"‚úì Malaysia snacks: {len(products_malaysia):,}",
    f"‚úì Popular global snacks: {len(products_to_add):,}",
    f"‚úì Total products fetched: {len(products):,}",
    f"‚úì Products with English text: {english_only_count:,}",
    f"‚úì Unique English images identified: {len(annotations):,}",
    f"‚úì Images successfully downloaded: {len(downloaded_annotations):,}",
    f"‚úì All entries are in: English",
    f"‚úì Products with allergen info: {num_with_allergens:,} ({pct_with_allergens:.1f}%)",
    f"‚úì CSV file saved: {csv_path}",
    f"‚úì Raw images folder: {os.path.join(data_dir, 'raw')}",
)
for summary_line in summary_lines:
    print(summary_line)

# Dataset size evaluation against 10k target
target = 10000
actual = len(downloaded_annotations)
percentage = (actual / target) * 100

print("\n" + summary_rule)
print("TARGET ACHIEVEMENT")
print(summary_rule)
print(f"Target: {target:,} images")
print(f"Achieved: {actual:,} images ({percentage:.1f}% of target)")

# Thresholds are checked from highest to lowest; the first one met decides
# the verdict, and anything below the last threshold gets the LOW message.
verdict_thresholds = (
    (target, f"‚úì SUCCESS! Exceeded 10,000 image target"),
    (target * 0.8, f"‚úì GOOD! Reached 80%+ of target - sufficient for training"),
    (target * 0.5, f"‚ö†Ô∏è  FAIR: Reached 50%+ of target - usable but consider expanding"),
)
low_verdict = f"‚ö†Ô∏è  LOW: Below 50% of target - recommend adjusting target_total_images in Cell 6"
verdict = next(
    (message for threshold, message in verdict_thresholds if actual >= threshold),
    low_verdict,
)
print(verdict)

# Closing recap of the dataset composition
print("\nüéâ Data collection complete!")
closing_lines = (
    f"\n‚ÑπÔ∏è  Dataset contains:",
    f"   ‚Ä¢ Malaysian snacks (local context): {malaysia_count:,}",
    f"   ‚Ä¢ Popular international brands: {popular_count:,}",
    f"   ‚Ä¢ All with English ingredient labels",
    f"   ‚Ä¢ Brands include: Lay's, Pringles, Oreo, Kit Kat, Doritos, and more",
)
for closing_line in closing_lines:
    print(closing_line)