# 02 - Vector Database Setup and Indexing

This notebook covers:
- Setting up FAISS and ChromaDB vector databases (plus an optional, commented-out Pinecone setup)
- Indexing embeddings from the previous notebook
- Implementing similarity search
- Performance comparison
- Hybrid search combining text and image embeddings

In [None]:
import os
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from dotenv import load_dotenv
import time

# Read variables from a local .env file (the optional Pinecone cell looks up PINECONE_API_KEY)
load_dotenv()

# Directory layout: processed inputs live in PROCESSED_DIR, indexes are written beneath it
PROCESSED_DIR = Path('..') / 'data' / 'processed'
INDEX_DIR = PROCESSED_DIR / 'indexes'
INDEX_DIR.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

## 1. Load Processed Embeddings

In [None]:
# Load the embedding matrices saved under PROCESSED_DIR
text_embeddings = np.load(PROCESSED_DIR / 'text_embeddings.npy')
image_embeddings = np.load(PROCESSED_DIR / 'image_embeddings.npy')
clip_text_embeddings = np.load(PROCESSED_DIR / 'clip_text_embeddings.npy')

# Load metadata
metadata_df = pd.read_csv(PROCESSED_DIR / 'metadata_processed.csv')

# Later cells pair embeddings, ids and metadata rows purely by position
# (FAISS hit i -> metadata_df.iloc[i]), so every matrix must be 2-D with exactly
# one row per metadata row. Fail here instead of returning mismatched results later.
for _name, _matrix in (
    ('text_embeddings', text_embeddings),
    ('image_embeddings', image_embeddings),
    ('clip_text_embeddings', clip_text_embeddings),
):
    if _matrix.ndim != 2:
        raise ValueError(f"{_name} must be a 2-D array, got shape {_matrix.shape}")
    if len(_matrix) != len(metadata_df):
        raise ValueError(
            f"{_name} has {len(_matrix)} rows but metadata has {len(metadata_df)} rows"
        )

print(f"Loaded {len(text_embeddings)} text embeddings of dimension {text_embeddings.shape[1]}")
print(f"Loaded {len(image_embeddings)} image embeddings of dimension {image_embeddings.shape[1]}")
print(f"Loaded {len(clip_text_embeddings)} CLIP text embeddings of dimension {clip_text_embeddings.shape[1]}")
print(f"\nMetadata: {len(metadata_df)} items")
metadata_df.head()

## 2. FAISS Vector Database Setup

In [None]:
import faiss

class FAISSIndex:
    """Thin wrapper around a FAISS index with add / search / save / load helpers.

    Attributes:
        dimension: Length of the vectors the index stores.
        index: The underlying ``faiss`` index object.
    """

    def __init__(self, dimension, index_type='flatl2', nlist=100):
        """
        Initialize FAISS index

        Args:
            dimension: Length of each vector.
            index_type: 'flatl2', 'flatip' (inner product), 'ivf' (inverted file)
            nlist: Number of IVF clusters; only used when index_type is 'ivf'.

        Raises:
            ValueError: If index_type is not one of the names above.
        """
        self.dimension = dimension

        if index_type == 'flatl2':
            self.index = faiss.IndexFlatL2(dimension)
        elif index_type == 'flatip':
            self.index = faiss.IndexFlatIP(dimension)
        elif index_type == 'ivf':
            # IVF partitions the vectors into nlist clusters for faster search on large datasets
            quantizer = faiss.IndexFlatL2(dimension)
            self.index = faiss.IndexIVFFlat(quantizer, dimension, nlist)
        else:
            raise ValueError(f"Unknown index type: {index_type}")

    def add_vectors(self, vectors):
        """Add vectors to the index, training the index first if it still needs it.

        Args:
            vectors: Array-like of shape (n, dimension); a single 1-D vector is
                treated as one row.
        """
        # FAISS needs C-contiguous float32 input; astype() alone keeps the source
        # memory layout, so Fortran-ordered inputs would not be made contiguous.
        vectors = np.ascontiguousarray(vectors, dtype='float32')
        if vectors.ndim == 1:
            vectors = vectors.reshape(1, -1)

        # Check is_trained rather than the concrete index class, so any index
        # that reports it is untrained gets trained before vectors are added.
        if not self.index.is_trained:
            self.index.train(vectors)

        self.index.add(vectors)
        print(f"Added {len(vectors)} vectors. Total: {self.index.ntotal}")

    def search(self, query_vector, k=5):
        """Search for k nearest neighbors of a single query vector.

        Args:
            query_vector: Array-like with ``dimension`` values.
            k: Maximum number of neighbours to return.

        Returns:
            (distances, indices): 1-D arrays in the order FAISS returned them.
            For a 'flatip' index the "distances" are inner products (larger is
            closer). FAISS pads the result with index -1 when it finds fewer
            than k neighbours; those padded entries are dropped, so fewer than
            k results may come back.
        """
        query = np.ascontiguousarray(query_vector, dtype='float32').reshape(1, -1)
        distances, indices = self.index.search(query, k)
        found = indices[0] >= 0
        return distances[0][found], indices[0][found]

    def save(self, path):
        """Save index to disk at ``path`` (converted to str for FAISS)."""
        faiss.write_index(self.index, str(path))
        print(f"Index saved to {path}")

    @classmethod
    def load(cls, path):
        """Load index from disk and wrap it without calling ``__init__``.

        ``dimension`` is taken from the loaded index's ``d`` attribute.
        """
        obj = cls.__new__(cls)
        obj.index = faiss.read_index(str(path))
        obj.dimension = obj.index.d
        print(f"Index loaded from {path}")
        return obj

In [None]:
# Create FAISS indexes for different embedding types
print("Creating FAISS indexes...\n")


def _l2_normalize(vectors):
    """Scale every row to unit L2 norm so that inner product equals cosine similarity.

    All-zero rows are left as zeros: dividing them by their zero norm would
    put NaN rows into the index.
    """
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1  # dividing by 1 leaves an all-zero row unchanged
    return vectors / norms


def _build_cosine_index(embeddings):
    """Return (index, normalized_embeddings) for an inner-product index over unit vectors."""
    normalized = _l2_normalize(embeddings)
    index = FAISSIndex(embeddings.shape[1], index_type='flatip')
    index.add_vectors(normalized)
    return index, normalized


# Text embeddings index (cosine similarity via inner product)
text_index, text_embeddings_norm = _build_cosine_index(text_embeddings)

# Image embeddings index
image_index, image_embeddings_norm = _build_cosine_index(image_embeddings)

# CLIP text embeddings index (multimodal)
clip_index, clip_embeddings_norm = _build_cosine_index(clip_text_embeddings)

# Save indexes
text_index.save(INDEX_DIR / 'faiss_text.index')
image_index.save(INDEX_DIR / 'faiss_image.index')
clip_index.save(INDEX_DIR / 'faiss_clip.index')

## 3. ChromaDB Setup

In [None]:
import chromadb
from chromadb.config import Settings

# Initialize ChromaDB client (persisted under INDEX_DIR/chromadb)
chroma_client = chromadb.PersistentClient(
    path=str(INDEX_DIR / 'chromadb'),
    settings=Settings(anonymized_telemetry=False)
)

# Create collections for different modalities
print("Creating ChromaDB collections...\n")

# Delete existing collections if they exist, so re-running this cell starts clean.
# Each name is checked on its own: with a single try/except around all three
# deletes, a missing first collection skipped the remaining deletes and the
# create_collection calls below then failed on the leftovers.
# list_collections() yields plain names in some chromadb releases and
# Collection objects in others, so both forms are handled.
existing_collections = {
    entry if isinstance(entry, str) else entry.name
    for entry in chroma_client.list_collections()
}
for collection_name in ("text_embeddings", "image_embeddings", "multimodal_embeddings"):
    if collection_name in existing_collections:
        chroma_client.delete_collection(collection_name)

# "hnsw:space": "cosine" makes ChromaDB rank by cosine distance, matching the
# cosine-similarity FAISS indexes; ChromaDB's default space is L2.

# Text collection
text_collection = chroma_client.create_collection(
    name="text_embeddings",
    metadata={"description": "OpenAI text embeddings", "hnsw:space": "cosine"}
)

# Image collection
image_collection = chroma_client.create_collection(
    name="image_embeddings",
    metadata={"description": "CLIP image embeddings", "hnsw:space": "cosine"}
)

# Multimodal collection (CLIP text + image aligned)
multimodal_collection = chroma_client.create_collection(
    name="multimodal_embeddings",
    metadata={"description": "CLIP multimodal embeddings", "hnsw:space": "cosine"}
)

print("Collections created successfully")

In [None]:
# Add documents to ChromaDB collections
print("Adding documents to ChromaDB...\n")

# Prepare documents and metadata
# ChromaDB only accepts string ids; pd.read_csv parses numeric id columns as numbers.
ids = metadata_df['id'].astype(str).tolist()
# Vectorized "<title>. <description>"; missing values become empty strings
# instead of the literal text "nan".
documents = (
    metadata_df['title'].fillna('').astype(str)
    + '. '
    + metadata_df['description'].fillna('').astype(str)
).tolist()
metadatas = metadata_df[['title', 'category', 'image_path']].to_dict('records')

# ChromaDB rejects a single add() call above its maximum batch size, so the
# data is sent in slices small enough for any dataset size.
CHROMA_BATCH_SIZE = 1000


def _add_in_batches(collection, embeddings, batch_size=CHROMA_BATCH_SIZE):
    """Add ids/documents/metadatas with the matching rows of `embeddings`, one slice at a time."""
    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        collection.add(
            ids=ids[start:stop],
            embeddings=embeddings[start:stop].tolist(),
            documents=documents[start:stop],
            metadatas=metadatas[start:stop]
        )


# Add to text collection
_add_in_batches(text_collection, text_embeddings)
print(f"Added {len(ids)} items to text collection")

# Add to image collection
_add_in_batches(image_collection, image_embeddings)
print(f"Added {len(ids)} items to image collection")

# Add to multimodal collection
_add_in_batches(multimodal_collection, clip_text_embeddings)
print(f"Added {len(ids)} items to multimodal collection")

## 4. Pinecone Setup (Optional)

In [None]:
# Uncomment to use Pinecone

# from pinecone import Pinecone, ServerlessSpec

# # Initialize Pinecone
# pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

# # Create index
# index_name = "multimodal-rag"

# # Delete existing index if it exists
# if index_name in pc.list_indexes().names():
#     pc.delete_index(index_name)

# # Create new index
# pc.create_index(
#     name=index_name,
#     dimension=text_embeddings.shape[1],
#     metric='cosine',
#     spec=ServerlessSpec(
#         cloud='aws',
#         region='us-east-1'
#     )
# )

# # Connect to index
# index = pc.Index(index_name)

# # Prepare vectors for upsert
# vectors_to_upsert = []
# for i, (idx, row) in enumerate(metadata_df.iterrows()):
#     vectors_to_upsert.append({
#         'id': row['id'],
#         'values': text_embeddings[i].tolist(),
#         'metadata': {
#             'title': row['title'],
#             'description': row['description'],
#             'category': row['category']
#         }
#     })

# # Upsert vectors
# index.upsert(vectors=vectors_to_upsert)
# print(f"Upserted {len(vectors_to_upsert)} vectors to Pinecone")

## 5. Test Similarity Search

In [None]:
def display_search_results(query, distances, indices, metadata_df, k=5):
    """Display search results in a readable format

    Args:
        query: Text printed as the query header.
        distances: Per-result distances, best match first. Each one is printed
            as ``1 - distance`` under the "Similarity" label.
        indices: Row positions into metadata_df, aligned with ``distances``.
        metadata_df: DataFrame providing 'title', 'description' and 'category'.
        k: Maximum number of results to print.
    """
    print(f"\nQuery: {query}")
    print("=" * 80)

    n_rows = len(metadata_df)
    for rank, (dist, idx) in enumerate(zip(distances[:k], indices[:k]), start=1):
        # FAISS pads missing neighbours with index -1. Without the lower bound,
        # iloc[-1] would print the last metadata row as if it were a hit.
        if not 0 <= idx < n_rows:
            continue
        item = metadata_df.iloc[int(idx)]
        print(f"\nRank {rank} | Similarity: {1-dist:.4f}")
        print(f"Title: {item['title']}")
        print(f"Description: {item['description']}")
        print(f"Category: {item['category']}")
        print("-" * 80)

# Test with FAISS
print("\n" + "="*80)
print("TESTING FAISS SEARCH")
print("="*80)

# Use first item as query
test_query_idx = 0
test_query_embedding = text_embeddings_norm[test_query_idx]
query_row = metadata_df.iloc[test_query_idx]
test_query_text = f"{query_row['title']}. {query_row['description']}"

# Search
similarities, indices = text_index.search(test_query_embedding, k=5)

# text_index is an inner-product index over unit vectors, so the values it
# returns are cosine similarities. display_search_results prints 1 - value as
# the similarity, so it has to be given cosine distances.
distances = 1 - similarities
display_search_results(test_query_text, distances, indices, metadata_df)

In [None]:
# Test with ChromaDB
print("\n" + "="*80)
print("TESTING CHROMADB SEARCH")
print("="*80)

# Query ChromaDB
# The collection was filled with the raw text_embeddings, so the query uses the
# raw vector as well. test_query_embedding is the L2-normalized copy built for
# the FAISS index and is not in the same scale as the stored vectors.
results = text_collection.query(
    query_embeddings=[text_embeddings[test_query_idx].tolist()],
    n_results=5
)

print(f"\nQuery: {test_query_text}")
print("=" * 80)

# query() returns one result list per query embedding; only one was sent
hits = zip(
    results['documents'][0],
    results['metadatas'][0],
    results['distances'][0]
)
for rank, (doc, metadata, distance) in enumerate(hits, start=1):
    print(f"\nRank {rank} | Distance: {distance:.4f}")
    print(f"Title: {metadata['title']}")
    print(f"Document: {doc}")
    print(f"Category: {metadata['category']}")
    print("-" * 80)

## 6. Performance Comparison

In [None]:
import time

def benchmark_search(index_func, query_embedding, n_queries=100, k=5):
    """Benchmark search performance

    Args:
        index_func: Callable invoked as ``index_func(query_embedding, k)``.
        query_embedding: Query passed unchanged to every call.
        n_queries: Number of times to call ``index_func``; must be at least 1.
        k: Number of neighbours requested per call.

    Returns:
        Average wall-clock time per call, in milliseconds.

    Raises:
        ValueError: If n_queries is smaller than 1.
    """
    if n_queries < 1:
        raise ValueError(f"n_queries must be at least 1, got {n_queries}")

    # perf_counter is monotonic and high-resolution; time.time() can report
    # 0 elapsed for a short run on platforms with a coarse system clock.
    started = time.perf_counter()

    for _ in range(n_queries):
        index_func(query_embedding, k)

    elapsed = time.perf_counter() - started
    return elapsed / n_queries * 1000  # ms

print("\nBenchmarking search performance...\n")

# FAISS benchmark
faiss_time = benchmark_search(text_index.search, test_query_embedding, n_queries=100)
print(f"FAISS average query time: {faiss_time:.3f} ms")

# ChromaDB benchmark
def chroma_search(query_emb, k):
    """Adapter giving the ChromaDB query the (embedding, k) call shape benchmark_search uses."""
    return text_collection.query(query_embeddings=[query_emb.tolist()], n_results=k)

chroma_time = benchmark_search(chroma_search, test_query_embedding, n_queries=100)
print(f"ChromaDB average query time: {chroma_time:.3f} ms")

# Report the ratio from the faster backend's side, and never divide by a
# timing that was measured as zero.
if faiss_time > 0 and chroma_time > 0:
    if faiss_time <= chroma_time:
        print(f"\nFAISS is {chroma_time/faiss_time:.2f}x faster than ChromaDB")
    else:
        print(f"\nChromaDB is {faiss_time/chroma_time:.2f}x faster than FAISS")
else:
    print("\nTimings too small to compare; increase n_queries")

## 7. Hybrid Search (Text + Image)

In [None]:
def hybrid_search(text_query_emb, image_query_emb, text_weight=0.5, k=5):
    """
    Perform hybrid search combining text and image embeddings

    Searches the notebook-level ``text_index`` and ``image_index``. Both are
    built as inner-product indexes over L2-normalized vectors, so the values
    they return are cosine similarities (larger is better) and are used
    directly as scores. Subtracting them from 1 would invert the ranking.

    Args:
        text_query_emb: Query vector for ``text_index``.
        image_query_emb: Query vector for ``image_index``.
        text_weight: Weight of the text score; the image score gets
            ``1 - text_weight``.
        k: Number of results to return.

    Returns:
        (scores, indices): arrays of at most k entries, sorted by descending
        combined score. An item found by only one index gets no contribution
        from the other modality.
    """
    # Over-fetch 2k candidates per modality so an item ranked highly in only
    # one of them can still reach the merged top k.
    text_scores, text_indices = text_index.search(text_query_emb, k=k*2)
    image_scores, image_indices = image_index.search(image_query_emb, k=k*2)

    # Aggregate the weighted similarity per item id
    combined_scores = {}
    weighted_hits = (
        (text_weight, text_indices, text_scores),
        (1 - text_weight, image_indices, image_scores),
    )
    for weight, hit_indices, hit_scores in weighted_hits:
        for idx, score in zip(hit_indices, hit_scores):
            if idx < 0:
                # FAISS pads short result lists with -1; not a real item
                continue
            key = int(idx)
            combined_scores[key] = combined_scores.get(key, 0.0) + weight * float(score)

    # Sort by combined score, best first
    sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:k]

    indices = np.array([idx for idx, _ in sorted_results], dtype='int64')
    scores = np.array([score for _, score in sorted_results], dtype='float64')

    return scores, indices

# Run the hybrid search using item 0's own text and image embeddings as the query
banner = "=" * 80
print("\n" + banner)
print("TESTING HYBRID SEARCH (Text + Image)")
print(banner)

hybrid_text_query = text_embeddings_norm[0]
hybrid_image_query = image_embeddings_norm[0]
scores, indices = hybrid_search(hybrid_text_query, hybrid_image_query, text_weight=0.6, k=5)

# display_search_results prints 1 - value, so hand it 1 - score
distances = 1 - scores
display_search_results("Hybrid query (text + image)", distances, indices, metadata_df)

## Summary

In this notebook, we:
1. Set up FAISS vector database for fast similarity search
2. Set up ChromaDB for managed vector storage
3. Created indexes for text, image, and multimodal embeddings
4. Implemented and tested similarity search
5. Benchmarked performance across different vector databases
6. Implemented hybrid search combining text and image modalities

Next step: Notebook 03 - RAG Pipeline Implementation