# Build FAISS Index for Image Search

This notebook builds a FAISS index from the images located in the `data_dir/index_images` folder.

The process:
1. Load configuration from `configs/configs.yaml`
2. Extract features from all images using a pre-trained CNN model
3. Build a FAISS index for fast similarity search
4. Save the embeddings and the index to the cache

In [None]:
import yaml
import sys
from pathlib import Path

# Add parent directory to path to import modules
sys.path.insert(0, str(Path.cwd().parent))

from settings import settings
from ml import ImageSearchEngine

In [None]:
# Resolve the project root as the parent of the working directory, matching the
# sys.path entry added in the import cell. The earlier `Path(parent_dir)` read
# `parent_dir` before it was ever assigned and raised NameError on a fresh kernel.
parent_dir = Path.cwd().parent
config_path = parent_dir / "configs" / "configs.yaml"
data_dir = parent_dir / "data"

# Configuration is now loaded from settings.py; echo the values in effect
print("Current configuration:")
print(f"  Device: {settings.ml.device}")
print(f"  Model: {settings.ml.model_name}")
print(f"  Image size: {settings.ml.img_size}")
print(f"  Batch size: {settings.ml.batch_size}")
print(f"  Embedding dim: {settings.ml.embedding_dim}")
print(f"  Similarity metric: {settings.ml.similarity_metric}")
print(f"  Data directory: {settings.ml.data_dir}")
print(f"  Cache directory: {settings.ml.cache_dir}")

In [None]:
# The images to index live in the "index_images" subfolder of the configured
# data directory; show both paths and whether the subfolder is present.
base_data_dir = Path(settings.ml.data_dir)
index_images_dir = base_data_dir.joinpath("index_images")

print(
    f"Base data directory: {base_data_dir}",
    f"Index images directory: {index_images_dir}",
    f"Directory exists: {index_images_dir.exists()}",
    sep="\n",
)

In [None]:
# Settings come from configs/configs.yaml through settings.py, so no Config
# object has to be assembled by hand in this notebook.
print(
    "\nUsing settings from configs/configs.yaml",
    "All configuration is managed centrally via settings.py",
    sep="\n",
)

In [8]:
# Create a `config` view of the active settings.
# The earlier version called `Config(...)` with values read from `config_dict`,
# but neither name is defined or imported anywhere in this notebook, so the cell
# raised NameError on a fresh kernel. The same fields are now taken from
# `settings`, and exposed as attributes so the `config.<field>` reads below
# (and in later cells) keep working.
from types import SimpleNamespace

config = SimpleNamespace(
    device=settings.ml.device,
    img_size=settings.ml.img_size,
    batch_size=settings.ml.batch_size,
    embedding_dim=settings.ml.embedding_dim,
    model_name=settings.ml.model_name,
    similarity_metric=settings.ml.similarity_metric,
    data_dir=str(index_images_dir),  # Use index_images subfolder
    cache_dir=settings.ml.cache_dir,
    # NOTE(review): where log_level lives in `settings` is not visible from this
    # notebook; this falls back to None when there is no top-level attribute —
    # confirm against settings.py.
    log_level=getattr(settings, "log_level", None),
)

print("\nConfiguration created:")
print(f"  Device: {config.device}")
print(f"  Model: {config.model_name}")
print(f"  Image size: {config.img_size}")
print(f"  Batch size: {config.batch_size}")
print(f"  Embedding dim: {config.embedding_dim}")
print(f"  Similarity metric: {config.similarity_metric}")
print(f"  Data directory: {config.data_dir}")
print(f"  Cache directory: {config.cache_dir}")


Configuration created:
  Device: cpu
  Model: resnet50
  Image size: 224
  Batch size: 32
  Embedding dim: 2048
  Similarity metric: cosine
  Data directory: /home/nedogeek/Documents/code/smartdiet/data/index_images
  Cache directory: ./cache


## 2. Check Dataset

In [9]:
# Verify dataset exists and count images.
# Each direct subdirectory of index_images_dir is treated as one product; images
# are searched recursively inside it and matched by lowercase file extension.
if not index_images_dir.exists():
    print(f"⚠️ WARNING: Directory {index_images_dir} does not exist!")
    print("Please create it and add product images in subdirectories.")
else:
    # Count images and products
    image_extensions = {'.jpg', '.jpeg', '.png', '.webp'}
    products = [d for d in index_images_dir.iterdir() if d.is_dir()]

    total_images = 0
    for product_dir in products:
        # is_file() keeps directories (or broken links) that merely carry an
        # image-like suffix from being counted as images.
        images = [
            f for f in product_dir.rglob("*")
            if f.is_file() and f.suffix.lower() in image_extensions
        ]
        total_images += len(images)
        print(f"  {product_dir.name}: {len(images)} images")

    print(f"\n📊 Total: {len(products)} products, {total_images} images")

  ee052f13-487e-447a-a671-9bbf7d814e6b: 5 images
  8c5f49b1-1ca4-42fe-a96f-6e1ed3a577d4: 5 images
  d8988451-5d93-4192-a085-52159fef356b: 5 images
  536c35ee-0be5-466e-bc2b-17ccbeaae7d2: 5 images
  3bbe2cc5-bb03-4044-acb3-dda3835e9421: 5 images
  3aac158e-844a-4efa-bc90-f1057031181e: 5 images
  60e54823-74c2-4f50-8eeb-bdc269c42b38: 5 images
  e01febca-c0a3-40dd-a5f6-4d254d93f233: 5 images
  3a1ba631-076c-4e8a-a6f5-32cb90049bcb: 5 images
  06f0e2a8-e2ee-450c-8d37-d54644532ee1: 5 images
  48ddfe1e-731e-4f1a-b10c-063a1d9291fe: 5 images
  28bb2fd4-558b-4704-8251-a892fb1f9e4e: 5 images
  b25a86dc-9d64-4c66-8578-6bdbac11dfce: 5 images
  f95e62dd-928e-42ce-ab54-562c186c6232: 5 images
  2f6ea84e-3724-4bcb-a775-199e91c63e29: 5 images
  f31de8e0-422b-47aa-a9ae-f4d8aaac841c: 5 images
  de8ebac3-1b78-4c28-a66e-f004ded6f68c: 5 images
  f4b1af73-b35b-4cae-8ea4-794d664c624e: 5 images
  6ebdeae0-5f9a-4ed1-b60e-da4e641ba21f: 5 images
  069f96a4-7f3a-474f-9b5b-96dbaf40e17d: 5 images
  b20b12f5-84c4-42dd

# Initialize the search engine (no config parameter needed)
# `search_engine` is the object the later cells use to build the index, read
# statistics, run the test search and export the index/metadata.
print("Initializing Image Search Engine...")
search_engine = ImageSearchEngine()
print("✅ Search engine initialized")

In [10]:
# Initialize the search engine
# Constructed without arguments, like the other initialization cell in this
# notebook. Passing `config` tied this cell to the legacy Config cell, the only
# place that name is assigned.
# NOTE(review): assumes ImageSearchEngine takes its configuration from
# `settings` — confirm against ml.ImageSearchEngine.
print("Initializing Image Search Engine...")
search_engine = ImageSearchEngine()
print("✅ Search engine initialized")

2025-10-04 19:59:02,068 - ml.utils - INFO - Loading resnet50 model...


Initializing Image Search Engine...
Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /home/nedogeek/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth


100%|██████████| 97.8M/97.8M [00:17<00:00, 5.97MB/s]
2025-10-04 19:59:20,760 - ml.utils - INFO - Model loaded successfully on cpu


✅ Search engine initialized


## 4. Build FAISS Index

This step will:
- Load all images from the dataset
- Extract features using the pre-trained model
- Create FAISS index
- Save embeddings to cache

**Note**: This may take several minutes depending on dataset size.

In [11]:
# Rebuild the index, passing force_rebuild=True so an existing cache is ignored
print(
    "Building FAISS index...",
    "This may take a few minutes depending on dataset size...\n",
    sep="\n",
)

search_engine.build_index(force_rebuild=True)

print("\n✅ Index built successfully!")

2025-10-04 19:59:22,915 - ml.utils - INFO - Building index from scratch...
2025-10-04 19:59:23,065 - ml.utils - INFO - Loaded 14257 images from 2893 categories


Building FAISS index...
This may take a few minutes depending on dataset size...



Extracting features: 100%|██████████| 446/446 [17:32<00:00,  2.36s/it]
2025-10-04 20:16:55,643 - ml.utils - INFO - Embeddings saved to cache/embeddings.pkl
2025-10-04 20:16:55,792 - ml.utils - INFO - FAISS index created with 14257 vectors



✅ Index built successfully!


## 5. Verify Index and Get Statistics

In [12]:
# Ask the engine for its summary statistics and print them; keys that are
# missing from the returned mapping are reported as 0 / an empty distribution.
stats = search_engine.get_statistics()

print(
    "📊 Dataset Statistics:",
    f"  Total images: {stats.get('total_images', 0)}",
    f"  Unique products: {stats.get('unique_products', 0)}",
    f"  Avg images per product: {stats.get('avg_images_per_product', 0):.2f}",
    sep="\n",
)

print("\n🔝 Top 10 products by image count:")
for product_id, count in stats.get('product_distribution', {}).items():
    print(f"  {product_id}: {count} images")

📊 Dataset Statistics:
  Total images: 14257
  Unique products: 2893
  Avg images per product: 4.93

🔝 Top 10 products by image count:
  ee052f13-487e-447a-a671-9bbf7d814e6b: 5 images
  8c5f49b1-1ca4-42fe-a96f-6e1ed3a577d4: 5 images
  d8988451-5d93-4192-a085-52159fef356b: 5 images
  536c35ee-0be5-466e-bc2b-17ccbeaae7d2: 5 images
  3bbe2cc5-bb03-4044-acb3-dda3835e9421: 5 images
  3aac158e-844a-4efa-bc90-f1057031181e: 5 images
  60e54823-74c2-4f50-8eeb-bdc269c42b38: 5 images
  e01febca-c0a3-40dd-a5f6-4d254d93f233: 5 images
  3a1ba631-076c-4e8a-a6f5-32cb90049bcb: 5 images
  06f0e2a8-e2ee-450c-8d37-d54644532ee1: 5 images


In [13]:
# Describe the FAISS index held by the engine, or warn when there is none
if search_engine.index is None:
    print("\n⚠️ WARNING: Index not created")
else:
    print(
        "\n✅ FAISS index created successfully",
        f"  Index type: {type(search_engine.index).__name__}",
        f"  Total vectors: {search_engine.index.ntotal}",
        f"  Vector dimension: {search_engine.index.d}",
        sep="\n",
    )


✅ FAISS index created successfully
  Index type: IndexFlatIP
  Total vectors: 14257
  Vector dimension: 2048


## 6. Test Search (Optional)

Test the search functionality with a sample image from the dataset.

In [14]:
# Smoke-test the index by querying it with the first indexed image
if not search_engine.image_paths:
    print("No images in dataset to test")
else:
    test_image_path = search_engine.image_paths[0]
    print(f"Testing search with image: {test_image_path}")

    results = search_engine.search(test_image_path, top_k=5)

    print(f"\n🔍 Top {len(results)} similar products:")
    for hit in results:
        # One block per hit, followed by a blank separator line
        print(
            f"  Rank {hit['rank']}: {hit['product_id']}",
            f"    Similarity: {hit['similarity']:.4f}",
            f"    Distance: {hit['distance']:.4f}",
            f"    Image: {hit['image_path']}",
            "",
            sep="\n",
        )

Testing search with image: /home/nedogeek/Documents/code/smartdiet/data/index_images/ee052f13-487e-447a-a671-9bbf7d814e6b/000001.jpg

🔍 Top 5 similar products:
  Rank 1: f4bf2482-0f8b-4a38-b0fa-7b4ad9019dc9
    Similarity: 1.0000
    Distance: 1.0000
    Image: /home/nedogeek/Documents/code/smartdiet/data/index_images/f4bf2482-0f8b-4a38-b0fa-7b4ad9019dc9/000001.jpg

  Rank 2: 1a016a22-44d5-4786-9560-d6ef2b3cdbb0
    Similarity: 1.0000
    Distance: 1.0000
    Image: /home/nedogeek/Documents/code/smartdiet/data/index_images/1a016a22-44d5-4786-9560-d6ef2b3cdbb0/000001.jpg

  Rank 3: ee052f13-487e-447a-a671-9bbf7d814e6b
    Similarity: 1.0000
    Distance: 1.0000
    Image: /home/nedogeek/Documents/code/smartdiet/data/index_images/ee052f13-487e-447a-a671-9bbf7d814e6b/000001.jpg

  Rank 4: 1a016a22-44d5-4786-9560-d6ef2b3cdbb0
    Similarity: 0.9046
    Distance: 0.9046
    Image: /home/nedogeek/Documents/code/smartdiet/data/index_images/1a016a22-44d5-4786-9560-d6ef2b3cdbb0/000005.jpg

  Ra

# Report whether the embeddings cache exists in the configured cache directory
# and, if so, how large it is.
cache_file = Path(settings.ml.cache_dir).joinpath("embeddings.pkl")

if not cache_file.exists():
    print(f"⚠️ Cache file not found: {cache_file}")
else:
    cache_size_mb = cache_file.stat().st_size / (1024 * 1024)
    print(
        f"✅ Cache file saved: {cache_file}",
        f"  Size: {cache_size_mb:.2f} MB",
        "\n💡 The API will load from this cache on startup.",
        "   To rebuild, delete the cache file or use force_rebuild=True",
        sep="\n",
    )

# Export the FAISS index and its lookup metadata to files in the working directory
import faiss
import pickle

index_file = Path("smart_diet_v0.1.index")
metadata_file = Path("metadata.pkl")

# Write the FAISS index; skipped with a warning when the engine has no index
if search_engine.index is None:
    print("\n⚠️ No index to save")
else:
    faiss.write_index(search_engine.index, str(index_file))
    print(f"\n✅ FAISS index saved to: {index_file}")
    index_size_mb = index_file.stat().st_size / (1024 * 1024)
    print(f"  Size: {index_size_mb:.2f} MB")

# Metadata: image paths and product IDs, plus the model settings used to
# produce the embeddings
metadata = {
    'image_paths': search_engine.image_paths,
    'product_ids': search_engine.product_ids,
    'config': {
        field: getattr(settings.ml, field)
        for field in ('model_name', 'embedding_dim', 'similarity_metric', 'img_size')
    },
}

with metadata_file.open('wb') as f:
    pickle.dump(metadata, f)

print(f"\n✅ Metadata saved to: {metadata_file}")
metadata_size_mb = metadata_file.stat().st_size / (1024 * 1024)
print(
    f"  Size: {metadata_size_mb:.2f} MB",
    f"  Contains: {len(metadata['image_paths'])} image paths, {len(set(metadata['product_ids']))} unique products",
    sep="\n",
)

In [15]:
# Check cache file
# The cache directory is read from `settings`, as the rest of the notebook does,
# instead of from `config`, which is only assigned by the legacy Config cell.
cache_file = Path(settings.ml.cache_dir) / "embeddings.pkl"

if cache_file.exists():
    cache_size_mb = cache_file.stat().st_size / (1024 * 1024)
    print(f"✅ Cache file saved: {cache_file}")
    print(f"  Size: {cache_size_mb:.2f} MB")
    print("\n💡 The API will load from this cache on startup.")
    print("   To rebuild, delete the cache file or use force_rebuild=True")
else:
    print(f"⚠️ Cache file not found: {cache_file}")

✅ Cache file saved: cache/embeddings.pkl
  Size: 113.01 MB

💡 The API will load from this cache on startup.
   To rebuild, delete the cache file or use force_rebuild=True


In [None]:
# Save index and metadata to specified files (inside `data_dir`)
import faiss
import pickle

index_file = data_dir / "smart_diet_v0.1.index"
metadata_file = data_dir / "metadata.pkl"

# Save FAISS index; nothing is written when the engine has no index
if search_engine.index is not None:
    faiss.write_index(search_engine.index, str(index_file))
    print(f"✅ FAISS index saved to: {index_file}")
    index_size_mb = index_file.stat().st_size / (1024 * 1024)
    print(f"  Size: {index_size_mb:.2f} MB")
else:
    print("⚠️ No index to save")

# Save metadata (image paths and product IDs).
# The model parameters are read from `settings`, as the rest of the notebook
# does, instead of from `config`, which is only assigned by the legacy Config cell.
metadata = {
    'image_paths': search_engine.image_paths,
    'product_ids': search_engine.product_ids,
    'config': {
        'model_name': settings.ml.model_name,
        'embedding_dim': settings.ml.embedding_dim,
        'similarity_metric': settings.ml.similarity_metric,
        'img_size': settings.ml.img_size
    }
}

with open(metadata_file, 'wb') as f:
    pickle.dump(metadata, f)

print(f"\n✅ Metadata saved to: {metadata_file}")
metadata_size_mb = metadata_file.stat().st_size / (1024 * 1024)
print(f"  Size: {metadata_size_mb:.2f} MB")
print(f"  Contains: {len(metadata['image_paths'])} image paths, {len(set(metadata['product_ids']))} unique products")

✅ FAISS index saved to: smart_diet_v0.1.index
  Size: 111.38 MB

✅ Metadata saved to: metadata.pkl
  Size: 1.63 MB
  Contains: 14257 image paths, 2893 unique products


## ✅ Done!

The FAISS index has been built and cached. You can now start the API server:

```bash
python app.py
```

The API will automatically load the cached embeddings on startup.