# VLM-RAG Test Notebook

## System Components:
1. **Vector Database** - ChromaDB for storing past interactions
2. **Visual Interpreter** - OpenRouter API for vision models 
3. **Validation Judge** - LLM judge for comparing responses

## Research Questions:
1. To what degree do BLV users visual needs change across similar visual contexts?
2. Can we leverage past users interactions to provide more relevant future visual interpretations?


In [1]:
# Setup and imports
import sys
import os

# Add the parent directory to path so we can import our modules
sys.path.append('..')

# Now we can import from the src directory
from src.vector_db import SimpleVectorDB

# Step 1: Initialize the Vector Database
print("🚀 Initializing Vector Database for Image Embeddings...")

# Initialize the simplified vector database
db = SimpleVectorDB()

print("\n✅ Vector Database initialized!")



🚀 Initializing Vector Database for Image Embeddings...


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


Vector DB initialized at: ./data/chroma_db
Persistence enabled: Data will be saved to disk

✅ Vector Database initialized!


# Load Collection or Create a new One

In [3]:

print("🗄️ Creating fresh collection for this experiment...")

experiment_name = "vizwiz_experiment_v1"

# Create new database instance with fresh collection
db.use_collection("vizwiz_500_sample", "500 random VizWiz samples")
# Check it's empty
stats = db.get_collection_stats()
print(f"📊 Fresh collection stats: {stats}")

print("✅ Fresh collection created for controlled experiment!")


Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


🗄️ Creating fresh collection for this experiment...
Collection 'vizwiz_500_sample' ready
Now using collection: vizwiz_500_sample
📊 Fresh collection stats: {'total_images': 500, 'collection_name': 'vizwiz_500_sample', 'persisted': True}
✅ Fresh collection created for controlled experiment!


# Load 500 entries from vizwiz

In [3]:
# Step 2: Load VizWiz data and generate random sample
import json
import random
import pandas as pd
from IPython.display import display
from pathlib import Path

print("📝 Loading VizWiz data and generating random sample...")

# Load the original JSON data
data_path = Path('./data/original/all.json')
try:
    with open(data_path, 'r') as f:
        all_data = json.load(f)
    print(f"✅ Loaded {len(all_data)} total entries from all.json")
except FileNotFoundError:
    print(f"❌ File '{data_path}' not found!")
    print("Please make sure you have the VizWiz data in the correct location.")
    all_data = {}

# Check if collection already has data
stats = db.get_collection_stats()
if stats.get("total_images", 0) > 0:
    print(f"📊 Found existing collection with {stats['total_images']} entries")
    
    # Get all IDs from the existing collection
    results = db.current_collection.get()
    existing_ids = results["ids"]
    print(f"🔄 Using {len(existing_ids)} existing IDs from collection")
    
    # Extract sample data from all_data using existing IDs
    sample_data = {}
    for key in existing_ids:
        if key in all_data:
            sample_data[key] = all_data[key]
    
    print(f"✅ Extracted {len(sample_data)} entries from existing collection")
else:
    # Generate 500 random IDs
    if all_data:
        # Get all available keys from the JSON
        available_keys = list(all_data.keys())
        print(f"Available keys range: {min(available_keys)} to {max(available_keys)}")
        
        # Generate 500 random keys (without replacement)
        sample_size = min(500, len(available_keys))
        random_keys = random.sample(available_keys, sample_size)
        
        print(f"🎲 Generated {len(random_keys)} random keys for sampling")
        print(f"Sample keys: {random_keys[:10]}...")  # Show first 10
        
        # Extract sample data
        sample_data = {}
        for key in random_keys:
            if key in all_data:
                sample_data[key] = all_data[key]
        
        print(f"✅ Extracted {len(sample_data)} entries for processing")
    else:
        sample_data = {}
        print("⚠️ No data loaded - will use dummy data for testing")

# Show example of what we extracted
if sample_data:
    example_key = list(sample_data.keys())[0]
    example_entry = sample_data[example_key]
    print(f"\n📋 Example entry (ID: {example_key}):")
    for field in ['question', 'answerability', 'question_type', 'crowd_majority', 'image_url']:
        if field in example_entry:
            print(f"  {field}: {example_entry[field]}")
    
    # Display first 10 rows as a table
    print("\n📊 First 10 sampled entries:")
    sample_rows = []
    for i, (key, data) in enumerate(list(sample_data.items())[:10]):
        sample_rows.append({
            'ID': key,
            'Question': data.get('question', 'N/A'),
            'Answerability': data.get('answerability', 'N/A'),
            'Question Type': data.get('question_type', 'N/A'),
            'Crowd Majority': data.get('crowd_majority', 'N/A'),
            'Image URL': data.get('image_url', 'N/A')
        })
    
    # Create and display DataFrame
    df = pd.DataFrame(sample_rows)
    display(df)


Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given


📝 Loading VizWiz data and generating random sample...
✅ Loaded 600 total entries from all.json
📊 Found existing collection with 500 entries
🔄 Using 500 existing IDs from collection
✅ Extracted 500 entries from existing collection

📋 Example entry (ID: 596):
  question: what is the scientific name of this leaf?
  answerability: unanswerable
  question_type: Others
  crowd_majority: unanswerable
  image_url: https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_val_00002308.jpg

📊 First 10 sampled entries:


Unnamed: 0,ID,Question,Answerability,Question Type,Crowd Majority,Image URL
0,596,what is the scientific name of this leaf?,unanswerable,Others,unanswerable,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...
1,39,What is this bottle?,answerable,Identification,febreze,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...
2,416,For how long do I cook this in the microwave?,unanswerable,Reading,unanswerable,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...
3,56,What is this?,answerable,Identification,2 water bottles,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...
4,31,What is this?,answerable,Identification,jambalaya mix,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...
5,230,What color is this highlighter?,answerable,Description,yellow,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...
6,372,when did this expire?,unanswerable,Reading,unanswerable,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...
7,433,"What time does it say, on my set top box? Than...",answerable,Reading,12:10,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...
8,308,What does the label say?,answerable,Reading,chap stick,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...
9,377,What's the warning label?,answerable,Reading,may cause drowsiness,https://vizwiz.cs.colorado.edu/VizWiz_visualiz...


# Check Collection Currently loaded

In [4]:
stats = db.get_collection_stats()
print(f"📊 Fresh collection stats: {stats}")

📊 Fresh collection stats: {'total_images': 500, 'collection_name': 'vizwiz_500_sample', 'persisted': True}


## Define embedding generation functions

In [5]:

from chromadb.utils import embedding_functions
import cohere
import requests
import base64
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

def cohere_generate_image_embedding(image_path):
    """
    Generate embeddings for an image using Cohere's multimodal embedding model
    
    Args:
        image_path: Path to the image file
        
    Returns:
        List of float embeddings for the image
    """
    # Initialize Cohere client with API key from environment variables
    co = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))
    # Read image file or download from URL
    if image_path.startswith(('http://', 'https://')):
        image = requests.get(image_path)
        image_content = image.content
        content_type = image.headers.get("Content-Type", "image/jpeg")
    else:
        with open(image_path, 'rb') as f:
            image_content = f.read()
        content_type = "image/jpeg"  # Default assumption
    
    # Convert image to base64 format
    stringified_buffer = base64.b64encode(image_content).decode("utf-8")
    image_base64 = f"data:{content_type};base64,{stringified_buffer}"
    
    # Generate embedding using Cohere's API
    response = co.embed(
        model="embed-v4.0",
        input_type="image",
        embedding_types=["float"],
        images=[image_base64],
    )
    
    # Return the embedding vector
    return response.embeddings.float


## Test embedding functions

In [7]:
# Extract and print the first item from sample_data with focus on image_url
first_item = list(sample_data.items())[:1]
if first_item:
    entry_id, entry_data = first_item[0]
    image_url = entry_data.get('image_url', 'No image URL found')
    print(f"Entry ID: {entry_id}, Image URL: {image_url}")

    print("test embedding generation")
    print("embedding generated", cohere_generate_image_embedding(image_url))
else:
    print("No data found in sample_data")

Entry ID: 126, Image URL: https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_train_00019874.jpg
test embedding generation
MBKKF3QGUZWfr6O02WQfoclbqv6URt20FHbeAHCu
embedding generated [[0.01373291, -0.025146484, 0.02368164, -0.036621094, 0.08544922, 0.018432617, 0.00680542, 0.006866455, -0.03857422, -0.03112793, -0.01928711, 0.08105469, -0.015319824, -0.013244629, 0.010314941, 0.048583984, 0.03125, 0.015075684, 0.020874023, -0.014770508, -0.015075684, -0.010070801, 0.045898438, 0.02722168, -0.028076172, -0.027832031, 0.030639648, 0.020385742, 0.02746582, 0.025512695, -0.036621094, -0.003616333, -0.0006828308, -0.048339844, -0.03125, 0.0017776489, 0.03564453, 0.0013198853, 0.020141602, -0.024169922, 0.028442383, -0.011657715, 0.01184082, 0.010192871, 0.027832031, 0.03930664, 0.021850586, 0.017822266, -0.041259766, -0.019897461, -0.027954102, 0.05102539, -0.0006980896, -0.018676758, -0.009277344, 0.019042969, 0.021362305, 0.008117676, 0.0012435913, -0.022460938, 0.011230469, 0

## Upload data into collection

In [5]:

# Step 3: Process sample data and add to vector database
print("🔍 Processing sample data and adding to vector database...")

# Check current stats
stats = db.get_collection_stats()
print(f"Current database stats: {stats}")

# Process the sample data if we have it
if sample_data:
    print(f"\n📊 Processing {len(sample_data)} VizWiz entries...")
    
    added_count = 0
    for entry_id, entry_data in list(sample_data.items())[:]:  # Process first 2 for demo
        try:
            image_url = entry_data.get('image_url', '')
            embedding = cohere_generate_image_embedding(image_url)
            
            # Extract required fields from VizWiz data
            question = entry_data.get('question', 'No question provided')
            answerability = entry_data.get('answerability', 'unknown')
            question_type = entry_data.get('question_type', 'unknown')
            crowd_answers = entry_data.get('crowd_answers', [])
            crowd_majority = entry_data.get('crowd_majority', 'unknown')
            
            
            # Add to vector database
            db.add_image_embedding(
                embedding_id=entry_id,
                image_embedding=embedding,
                question=question,
                answerability=answerability,
                question_type=question_type,
                image_url=image_url,
                crowd_answers=crowd_answers,
                crowd_majority=crowd_majority
            )
            
            added_count += 1
            
        except Exception as e:
            print(f"❌ Error processing entry {entry_id}: {e}")
    
    print(f"✅ Successfully added {added_count} entries to vector database")
    
# Check updated stats
stats = db.get_collection_stats()
print(f"\n📊 Updated database stats: {stats}")


🔍 Processing sample data and adding to vector database...
Current database stats: {'total_images': 2, 'collection_name': 'vizwiz_500_sample', 'persisted': True}

📊 Processing 500 VizWiz entries...


Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given


Added embedding 416 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 56 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 31 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 230 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 372 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 433 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 308 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 377 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 157 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 272 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 418 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 84 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 218 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 13 to collection 'vizwiz_

Add of existing embedding ID: 596
Insert of existing embedding ID: 596


Added embedding 596 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 161 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 293 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 310 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 524 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 332 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 481 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 567 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 78 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 536 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 463 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 220 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 562 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 115 to collection 'vizw

Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given


Added embedding 186 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 217 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 322 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 187 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 91 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 176 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 148 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 374 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 208 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 37 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 319 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 352 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 438 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 45 to collection 'vizwiz

Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given


Added embedding 173 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 400 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 106 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 139 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 126 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 348 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 420 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 192 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 476 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 144 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 122 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 304 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 17 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 81 to collection 'vizwi

Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given


Added embedding 71 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 38 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 499 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 109 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 119 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 395 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 465 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 320 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 314 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 275 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 22 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 225 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 592 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 100 to collection 'vizwiz

Add of existing embedding ID: 39
Insert of existing embedding ID: 39


Added embedding 39 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 388 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 128 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 264 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 419 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 232 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 368 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 541 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 51 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 508 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 29 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 577 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 268 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 386 to collection 'vizwiz

Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given


Added embedding 274 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 590 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 525 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 543 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 302 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 445 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 485 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 188 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 255 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 8 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 265 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 277 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 282 to collection 'vizwiz_500_sample' (persisted to disk)
Added embedding 110 to collection 'vizwi

# Testing Search

In [7]:

# Load and use random validation embedding for similarity search
print("\n🎲 Loading validation embeddings and selecting random query...")

try:
    # Load the validation embeddings file
    validation_file = "./data/embeddings/lf_vqa_validation_embeddings_cohere.json"
    with open(validation_file, 'r') as f:
        validation_data = json.load(f)
    
    print(f"📁 Loaded validation file with {validation_data['count']} embeddings")
    
    # Randomly select one validation embedding
    random_validation = random.choice(validation_data['items'])
    query_embedding = random_validation['embedding'][0]  # Extract the embedding vector
    query_metadata = random_validation['metadata']
    query_id = random_validation['id']
    
    print(f"🎯 Selected random validation image:")
    print(f"   ID: {query_id}")
    print(f"   Question: {query_metadata['question']}")
    print(f"   Answer type: {query_metadata['answerability']}")
    print(f"   Question type: {query_metadata['question_type']}")
    print(f"   Crowd majority: {query_metadata['crowd_majority']}")
    print(f"   Image URL: {query_metadata['image_url']}")
    
    # Perform similarity search using the random validation embedding
    results = db.search_similar_images(query_embedding, n_results=3)
    
    print(f"\n🔎 Found {results['count']} similar images:")
    for i, result in enumerate(results['similar_images']):
        metadata = result['metadata']
        print(f"  {i+1}. ID: {result['id']}")
        print(f"     Question: {metadata['question']}")
        print(f"     Answer type: {metadata['answerability']}")
        print(f"     Question type: {metadata['question_type']}")
        print(f"     Crowd majority: {metadata['crowd_majority']}")
        print(f"     Image URL: {metadata['image_url']}")
        print(f"     Distance: {result['distance']:.3f}")
        print()

except FileNotFoundError:
    print(f"❌ Validation embeddings file not found: {validation_file}")
    print("Please make sure the validation embeddings have been generated first.")
except Exception as e:
    print(f"❌ Error loading validation embeddings: {e}")




🎲 Loading validation embeddings and selecting random query...
📁 Loaded validation file with 100 embeddings
🎯 Selected random validation image:
   ID: 342
   Question: What's this CD called?
   Answer type: unanswerable
   Question type: Reading
   Crowd majority: unanswerable
   Image URL: https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_train_00004093.jpg

🔎 Found 3 similar images:
  1. ID: 84
     Question: Exactly is on my shirt?
     Answer type: unanswerable
     Question type: Identification
     Crowd majority: unanswerable
     Image URL: https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_train_00013869.jpg
     Distance: 0.840

  2. ID: 328
     Question: What does the display say?
     Answer type: answerable
     Question type: Reading
     Crowd majority: unsuitable image
     Image URL: https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_train_00019947.jpg
     Distance: 0.886

  3. ID: 408
     Question: What is, what does this, that,

# Precalculate Embeddings and Store

In [19]:
# Generate and save embeddings for validation images
import json
import os
from pathlib import Path

print("\n💾 Generating and saving validation embeddings to disk...")

# Create embeddings directory
os.makedirs("../data/embeddings", exist_ok=True)

# Prepare filename for validation embeddings
validation_filename = "./data/embeddings/lf_vqa_validation_embeddings.json"

# Generate and save validation embeddings
try:
    validation_embeddings = []
    
    print(f"Generating embeddings for {len(validation_sample)} validation images...")
    
    # Extract validation entries from all_data using validation_sample IDs
    for val_id in validation_sample:
        if val_id in all_data:  # Only check if ID exists in original data
            val_entry = all_data[val_id]
            image_url = val_entry.get('image_url', '')
            
            if image_url:
                # Generate embedding for this validation image using original data
                embedding = cohere_generate_image_embedding(image_url)
                
                validation_embeddings.append({
                    "id": val_id,
                    "embedding": embedding,
                    "metadata": {
                        "question": val_entry.get('question', ''),
                        "answerability": val_entry.get('answerability', ''),
                        "question_type": val_entry.get('question_type', ''),
                        "crowd_majority": val_entry.get('crowd_majority', ''),
                        "image_url": image_url
                    }
                })
                print(f"  Generated embedding for validation ID: {val_id} (from original data)")
            else:
                print(f"  ⚠️ No image URL found for validation ID: {val_id}")
        else:
            print(f"  ⚠️ Validation ID {val_id} not found in original data")
    
    # Save validation embeddings to file
    validation_data = {
        "count": len(validation_embeddings),
        "items": validation_embeddings,
    }
    
    with open(validation_filename, 'w') as f:
        json.dump(validation_data, f)
    
    print(f"✅ Saved {len(validation_embeddings)} validation embeddings to {validation_filename}")
    print(f"📊 Success rate: {len(validation_embeddings)}/{len(validation_sample)} validation images processed")
except Exception as e:
    print(f"❌ Error generating validation embeddings: {str(e)}")



💾 Generating and saving validation embeddings to disk...
Generating embeddings for 100 validation images...
  Generated embedding for validation ID: 396 (from original data)
  Generated embedding for validation ID: 60 (from original data)
  Generated embedding for validation ID: 519 (from original data)
  Generated embedding for validation ID: 77 (from original data)
  Generated embedding for validation ID: 391 (from original data)
  Generated embedding for validation ID: 583 (from original data)
  Generated embedding for validation ID: 41 (from original data)
  Generated embedding for validation ID: 453 (from original data)
  Generated embedding for validation ID: 35 (from original data)
  Generated embedding for validation ID: 464 (from original data)
  Generated embedding for validation ID: 61 (from original data)
  Generated embedding for validation ID: 305 (from original data)
  Generated embedding for validation ID: 74 (from original data)
  Generated embedding for validation ID

# Testing Query