In [1]:
import os
import sys
import json
from pathlib import Path

import faiss
import numpy as np
import polars as pl
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Put the parent of the current working directory on the import path (once),
# so modules located one level above the notebook can be imported.
parent_dir = os.path.abspath(os.pardir)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Data layout: everything lives under <parent_dir>/data.
DATA_DIR = Path(parent_dir, "data")
IMAGES_DIR = DATA_DIR.joinpath("images")
INDEX_DIR = DATA_DIR.joinpath("index")
INDEX_DIR.mkdir(exist_ok=True)  # parents are not created: DATA_DIR must already exist

# Use GPU if available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


Using device: cpu


In [2]:
# Load the pretrained CLIP checkpoint and its matching processor; the model is
# moved to the selected device and switched to evaluation mode.
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
model = model.to(device)
processor = CLIPProcessor.from_pretrained(model_name)
model.eval()

print(f"Loaded CLIP model: {model_name}")


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Loaded CLIP model: openai/clip-vit-base-patch32


In [3]:
# Read the product table from the Excel workbook stored in DATA_DIR
dataset_path = DATA_DIR / "smartdiet_dataset.xlsx"
df = pl.read_excel(dataset_path)
print(f"Loaded {len(df)} products from dataset")
df.head()


Loaded 4095 products from dataset


protein,fat,title,is_edible,carbohydrates,category,calories,public_url,id,_id
f64,f64,str,bool,f64,str,f64,str,i64,str
0.0,0.0,"""–ó–µ–ª—ë–Ω—ã–π —á–∞–π_Bayce 95-11 400–≥—Ä""",True,0.0,"""Beverage""",2.0,"""https://safebite-s3.s3.amazona‚Ä¶",69972,"""01Uz9Eh4tMKkkUI8zmTr"""
1.0,2.5,"""–ú–æ—Ä–æ–∂. –ë–æ–Ω –ü–∞—Ä–∏ —Å —Ñ—Ä—É–∫—Ç.—Å–æ–∫–æ–º""",True,25.0,"""Ice Cream""",125.0,"""https://safebite-s3.s3.amazona‚Ä¶",69332,"""01YaRoAioWVJaPOTy85E"""
-1.0,-1.0,"""–®–æ–∫ –ø–ª–∏—Ç –†–æ—Å—Å–∏—è —Ç–µ–º –º–∏–Ω–¥–∞–ª—å 82‚Ä¶",True,-1.0,"""Chocolate""",-1.0,"""https://safebite-s3.s3.amazona‚Ä¶",9041,"""02UfvwkJzBRyUHDFdq2v"""
27.5,25.0,"""Delmark_–°/–ö_–û—Ö–æ—Ç–Ω–∏—á—å—è""",True,2.5,"""Meat Product""",350.0,"""https://safebite-s3.s3.amazona‚Ä¶",43181,"""035LM4tsGLrKLY9QpQ2m"""
0.0,0.0,"""–ì–†–ò–ù–§–ò–õ–î –ì–æ–ª–¥–µ–Ω –¶–µ–π–ª–æ–Ω 100–≥.—á–∞‚Ä¶",True,0.0,"""Tea""",2.0,"""https://safebite-s3.s3.amazona‚Ä¶",56510,"""03zvGGbYBDpKSi1VJNbH"""


In [6]:
def get_image_embedding(image_path: str) -> np.ndarray | None:
    """Generate an L2-normalized CLIP embedding for a single image.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A 1-D unit-length array with the flattened CLIP image features, or
        ``None`` when the image cannot be read or embedded. Failures are
        printed instead of raised so one bad file does not abort a batch run.
    """
    try:
        # The context manager releases the file handle as soon as the pixels
        # are loaded; convert() returns an independent, fully loaded copy.
        # CLIP takes 3-channel input, so convert to RGB (not RGBA).
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(device)

        with torch.no_grad():
            image_features = model.get_image_features(**inputs)

        embedding = image_features.cpu().numpy().flatten()

        # Normalize the embedding; refuse degenerate vectors rather than
        # returning NaNs that would poison the similarity index.
        norm = np.linalg.norm(embedding)
        if not np.isfinite(norm) or norm == 0:
            raise ValueError("embedding has zero or non-finite norm")
        return embedding / norm
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None


def get_product_embedding(product_id: int, aggregation: str = "mean") -> np.ndarray | None:
    """Get one aggregated, L2-normalized embedding for all images of a product.

    Images are read from ``IMAGES_DIR / str(product_id)``. Files with a
    ``.jpg``, ``.jpeg`` or ``.png`` suffix (case-insensitive) are used and are
    processed in sorted path order so the result is reproducible.

    Args:
        product_id: Product identifier; also the name of its image directory.
        aggregation: ``"mean"`` averages the per-image embeddings; any other
            value selects the first successfully embedded image.

    Returns:
        A unit-length embedding, or ``None`` when the product has no image
        directory, no supported image files, no image that could be embedded,
        or an aggregate with zero norm.
    """
    product_dir = IMAGES_DIR / str(product_id)

    if not product_dir.is_dir():
        return None

    supported_suffixes = {".jpg", ".jpeg", ".png"}
    # Sorted so that "first" aggregation does not depend on directory order.
    image_files = sorted(
        path
        for path in product_dir.iterdir()
        if path.is_file() and path.suffix.lower() in supported_suffixes
    )

    if not image_files:
        return None

    embeddings = []
    for img_path in image_files:
        emb = get_image_embedding(str(img_path))
        if emb is not None:
            embeddings.append(emb)

    if not embeddings:
        return None

    # Aggregate embeddings (mean or first)
    if aggregation == "mean":
        agg_embedding = np.mean(embeddings, axis=0)
    else:
        agg_embedding = embeddings[0]

    # Normalize again after aggregation: the mean of unit vectors is not unit
    # length. A zero-norm aggregate cannot be normalized, so report "no embedding".
    norm = np.linalg.norm(agg_embedding)
    if norm == 0:
        return None
    return agg_embedding / norm


In [7]:
# Embed every product that has usable images. The two lists grow in lockstep,
# so metadata_list[i] always describes embeddings_list[i].
embeddings_list = []
metadata_list = []

for record in df.iter_rows(named=True):
    product_vector = get_product_embedding(record["id"])
    if product_vector is None:
        # No embedding could be produced for this product; leave it out.
        continue

    embeddings_list.append(product_vector)
    metadata_list.append(
        {
            field: record[field]
            for field in (
                "id",
                "_id",
                "title",
                "category",
                "calories",
                "protein",
                "fat",
                "carbohydrates",
            )
        }
    )

    # Report progress after every 100 successfully embedded products
    processed = len(embeddings_list)
    if processed % 100 == 0:
        print(f"Processed {processed} products...")

print(f"\nTotal products with embeddings: {len(embeddings_list)}")


Processed 100 products...
Processed 200 products...
Processed 300 products...
Processed 400 products...
Processed 500 products...
Processed 600 products...
Processed 700 products...
Processed 800 products...
Processed 900 products...
Processed 1000 products...
Processed 1100 products...
Processed 1200 products...
Processed 1300 products...
Processed 1400 products...
Processed 1500 products...
Processed 1600 products...
Processed 1700 products...
Processed 1800 products...
Processed 1900 products...
Processed 2000 products...
Processed 2100 products...
Processed 2200 products...
Processed 2300 products...
Processed 2400 products...
Processed 2500 products...
Processed 2600 products...
Processed 2700 products...
Processed 2800 products...
Processed 2900 products...




Processed 3000 products...
Processed 3100 products...
Processed 3200 products...
Processed 3300 products...
Processed 3400 products...
Processed 3500 products...
Processed 3600 products...
Processed 3700 products...
Processed 3800 products...
Processed 3900 products...
Processed 4000 products...

Total products with embeddings: 4084


In [8]:
# Stack the per-product embeddings into one float32 matrix (rows = products)
embeddings_array = np.asarray(embeddings_list, dtype="float32")
dimension = embeddings_array.shape[1]

print(f"Embedding dimension: {dimension}")
print(f"Embeddings shape: {embeddings_array.shape}")

# Inner-product flat index: with unit-length vectors the inner product
# equals cosine similarity.
index = faiss.IndexFlatIP(dimension)
index.add(embeddings_array)

print(f"FAISS index created with {index.ntotal} vectors")


Embedding dimension: 512
Embeddings shape: (4084, 512)
FAISS index created with 4084 vectors


In [9]:
# Persist the FAISS index and the row-aligned metadata side by side in INDEX_DIR
index_path = INDEX_DIR / "products.index"
metadata_path = INDEX_DIR / "products_metadata.json"

faiss.write_index(index, str(index_path))

# ensure_ascii=False keeps non-ASCII titles readable in the JSON file
with metadata_path.open("w", encoding="utf-8") as metadata_file:
    json.dump(metadata_list, metadata_file, ensure_ascii=False, indent=2)

print(f"Saved index to: {index_path}")
print(f"Saved metadata to: {metadata_path}")


Saved index to: /home/nedogeek/Documents/code/smartdiet/data/data/index/products.index
Saved metadata to: /home/nedogeek/Documents/code/smartdiet/data/data/index/products_metadata.json


In [10]:
def _collect_search_results(distances: np.ndarray, indices: np.ndarray) -> list[dict]:
    """Turn the FAISS output for a single query into ranked metadata records."""
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS pads the result with index -1 when fewer than top_k vectors
        # are available; metadata_list[-1] would silently return the last
        # product as a bogus hit, so padded slots are skipped.
        if idx < 0:
            continue
        result = metadata_list[int(idx)].copy()
        result["similarity_score"] = float(dist)
        result["rank"] = len(results) + 1
        results.append(result)
    return results


def search_similar_products(query_image_path: str, top_k: int = 5) -> list[dict]:
    """Search for similar products given an image path.

    Args:
        query_image_path: Path of the query image.
        top_k: Maximum number of hits to return.

    Returns:
        Up to ``top_k`` copies of metadata records, best match first, each
        extended with ``similarity_score`` (inner product with the query) and
        a 1-based ``rank``. Empty when the query image cannot be embedded.
    """
    # Get embedding for query image
    query_embedding = get_image_embedding(query_image_path)

    if query_embedding is None:
        return []

    # FAISS expects a 2-D float32 batch of queries
    query_embedding = query_embedding.reshape(1, -1).astype("float32")

    # Search in FAISS index
    distances, indices = index.search(query_embedding, top_k)
    return _collect_search_results(distances, indices)


def search_by_text(query_text: str, top_k: int = 5) -> list[dict]:
    """Search for products using a text query (CLIP text encoder).

    Args:
        query_text: Free-text description of the product to look for.
        top_k: Maximum number of hits to return.

    Returns:
        Up to ``top_k`` copies of metadata records, best match first, each
        extended with ``similarity_score`` and a 1-based ``rank``.
    """
    # truncation=True keeps over-long prompts within the text encoder's
    # maximum sequence length instead of failing inside the model.
    inputs = processor(
        text=[query_text], return_tensors="pt", padding=True, truncation=True
    ).to(device)

    with torch.no_grad():
        text_features = model.get_text_features(**inputs)

    # Normalize so the inner product against the unit-length image vectors
    # is a cosine similarity
    query_embedding = text_features.cpu().numpy().flatten()
    query_embedding = query_embedding / np.linalg.norm(query_embedding)
    query_embedding = query_embedding.reshape(1, -1).astype("float32")

    # Search
    distances, indices = index.search(query_embedding, top_k)
    return _collect_search_results(distances, indices)


In [11]:
# Demo: query the image index with a text prompt and list the ranked hits
print("üîç Text search: 'chocolate bar'\n")
results = search_by_text("chocolate bar", top_k=5)

for hit in results:
    rank, score = hit["rank"], hit["similarity_score"]
    print(f"#{rank} [{score:.3f}] {hit['title']} ({hit['category']})")


üîç Text search: 'chocolate bar'

#1 [0.349] –®–æ–∫.–±–∞—Ç–æ–Ω—á–∏–∫ –ê–ª—ë–Ω–∫–∞ —Å –≤–∞—Ä —Å–≥—É—â 48–≥ (Snack)
#2 [0.346] –®–æ–∫. KinChocolate 100–≥ (Chocolate)
#3 [0.345] –®–æ–∫–æ–ª.–ø–ª–∏—Ç–∫–∞ Geisha –º–æ–ª.—à–æ–∫.—Å –Ω–∞—á–∏–Ω. (Confectionery)
#4 [0.345] –®–æ–∫–æ–ª.–±–∞—Ç.Karl Fazer —Ö—Ä—É—Å.—à–æ–∫–æ–ª.—Ç—Ä—é—Ñ.37–≥ (Snack)
#5 [0.343] –®–æ–∫–æ–ª–∞–¥.–∫–æ–Ω—Ñ.Karl Fazer —Ç–µ–º–Ω.—à–æ–∫.70% (Chocolate)


In [12]:
# Demo: Search by image (using first product's image as query)
sample_product_id = metadata_list[0]["id"]
sample_image_dir = IMAGES_DIR / str(sample_product_id)
# Accept JPEG and PNG files, sorted so the query image is the same on every run
sample_images = sorted(sample_image_dir.glob("*.jpg")) + sorted(sample_image_dir.glob("*.png"))

if sample_images:
    print(f"üñºÔ∏è Image search: using image from product '{metadata_list[0]['title']}'\n")
    results = search_similar_products(str(sample_images[0]), top_k=5)

    for r in results:
        print(f"#{r['rank']} [{r['similarity_score']:.3f}] {r['title']} ({r['category']})")
else:
    # Make the skipped demo visible instead of producing no output at all
    print(f"No sample images found in {sample_image_dir}")


üñºÔ∏è Image search: using image from product '–ó–µ–ª—ë–Ω—ã–π —á–∞–π_Bayce 95-11 400–≥—Ä'

#1 [0.913] –ó–µ–ª—ë–Ω—ã–π —á–∞–π_Bayce 95-11 400–≥—Ä (Beverage)
#2 [0.889] –ß–∞–π Bayce –∑–µ–ª—ë–Ω—ã–π ‚Ññ95 80–≥—Ä (Beverage)
#3 [0.877] –ó–µ–ª—ë–Ω—ã–π —á–∞–π_Bayce 110-11 400–≥—Ä (Beverages)
#4 [0.856] –ß–∞–π Bayce –ó–µ–ª—ë–Ω—ã–π ‚Ññ66 –º/—É 400–≥ (Beverage)
#5 [0.805] –ß–∞–π Bayce –∑–µ–ª—ë–Ω—ã–π 110-11 (Beverage)


## Loading the Index for Production Use

To load the saved index in another script or application:


In [13]:
# Example: How to load the saved index in another script.
# The triple-quoted text below is an unassigned string literal, so none of it
# runs in this notebook; it is reference text only.
# Its "data/index/..." paths are relative and resolve against the working
# directory of the consuming script.
# NOTE(review): confirm they point at the same files this notebook wrote to INDEX_DIR.
# As in the search helpers defined in this notebook, query_embedding should be
# an L2-normalized float32 array reshaped to (1, dimension) before index.search.
"""
import faiss
import json

# Load index
index = faiss.read_index("data/index/products.index")

# Load metadata
with open("data/index/products_metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Now you can use:
# distances, indices = index.search(query_embedding, top_k)
# product_info = metadata[indices[0][0]]
"""

print("Index files saved! Ready for production use.")


Index files saved! Ready for production use.


In [None]:
# Pass any image path to find similar products.
# The path is built from IMAGES_DIR instead of a hard-coded home directory so
# the cell runs on any checkout that has the same data layout.
query_image_path = IMAGES_DIR / "830" / "000003.jpg"
results = search_similar_products(str(query_image_path), top_k=5)

for r in results:
    print(f"{r['title']} - Score: {r['similarity_score']:.3f}")