In [1]:
# Food Additive Embeddings Generator for Google Colab
# Run this in Google Colab to generate embeddings for your CSV data

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import json
from google.colab import files

# Upload your CSV files
print("Please upload your food additive CSV files...")
uploaded = files.upload()

# Load the model (using a lightweight but effective model)
print("Loading sentence transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dimensional embeddings, fast and efficient

# Process each uploaded CSV file
additive_data = []

for filename in uploaded.keys():
    print(f"\nProcessing {filename}...")
    df = pd.read_csv(filename)

    # Check if this is the substances CSV (has Substance, Other Names, Used for columns)
    if 'Substance' in df.columns and 'Other Names' in df.columns:
        print("Processing substances CSV...")
        for _, row in df.iterrows():
            substance = str(row['Substance']).strip()
            other_names = str(row['Other Names']).strip() if pd.notna(row['Other Names']) else ""
            technical_effect = str(row['Used for (Technical Effect)']).strip() if pd.notna(row['Used for (Technical Effect)']) else ""

            if substance and substance != 'nan':
                # Create searchable text combining all information
                searchable_text = f"{substance} {other_names} {technical_effect}".strip()

                additive_data.append({
                    'substance': substance,
                    'other_names': other_names,
                    'technical_effect': technical_effect,
                    'searchable_text': searchable_text
                })

    # Check if this is the technical effects CSV (has Technical_Effect, Consumer_Explanation)
    elif 'Technical_Effect' in df.columns and 'Consumer_Explanation' in df.columns:
        print("Processing technical effects CSV...")
        for _, row in df.iterrows():
            technical_effect = str(row['Technical_Effect']).strip()
            consumer_explanation = str(row['Consumer_Explanation']).strip() if pd.notna(row['Consumer_Explanation']) else ""

            if technical_effect and technical_effect != 'nan':
                # Create searchable text
                searchable_text = f"{technical_effect} {consumer_explanation}".strip()

                additive_data.append({
                    'substance': technical_effect,  # Use technical effect as substance name
                    'other_names': "",
                    'technical_effect': consumer_explanation,
                    'searchable_text': searchable_text
                })

print(f"\nTotal additive records: {len(additive_data)}")

# Remove duplicates based on substance name
seen_substances = set()
unique_data = []
for item in additive_data:
    substance_lower = item['substance'].lower()
    if substance_lower not in seen_substances:
        seen_substances.add(substance_lower)
        unique_data.append(item)

print(f"Unique additive records: {len(unique_data)}")

# Generate embeddings
print("\nGenerating embeddings...")
searchable_texts = [item['searchable_text'] for item in unique_data]
embeddings = model.encode(searchable_texts, show_progress_bar=True)

# Convert numpy arrays to lists for JSON serialization
embeddings_list = embeddings.tolist()

# Create the final data structure
final_data = []
for i, item in enumerate(unique_data):
    final_data.append({
        'substance': item['substance'],
        'other_names': item['other_names'],
        'technical_effect': item['technical_effect'],
        'searchable_text': item['searchable_text'],
        'embedding': embeddings_list[i]
    })

# Save as JSON
output_filename = 'food_additive_embeddings.json'
with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump({
        'model_name': 'all-MiniLM-L6-v2',
        'embedding_dimension': len(embeddings_list[0]) if embeddings_list else 0,
        'total_records': len(final_data),
        'data': final_data
    }, f, indent=2, ensure_ascii=False)

print(f"\nSaved embeddings to {output_filename}")
print(f"Embedding dimension: {len(embeddings_list[0]) if embeddings_list else 0}")
print(f"File size: {len(json.dumps(final_data)) / 1024 / 1024:.2f} MB")

# Download the file
files.download(output_filename)

print("\nDone! Add the downloaded JSON file to your Xcode project bundle.")

Please upload your food additive CSV files...


Saving FoodSubstances_cleaned_v2.csv to FoodSubstances_cleaned_v2.csv
Loading sentence transformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Processing FoodSubstances_cleaned_v2.csv...
Processing substances CSV...

Total additive records: 3971
Unique additive records: 3971

Generating embeddings...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]


Saved embeddings to food_additive_embeddings.json
Embedding dimension: 384
File size: 33.89 MB


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Done! Add the downloaded JSON file to your Xcode project bundle.
