# FAISS Retrieval Benchmarks CPU vs GPU

In [6]:
import numpy as np
import faiss
import time
from typing import Tuple, List
import pandas as pd


# CPU Benchmark

> To run this section, you need to create a new environment with `faiss-cpu` installed

In [7]:
class FAISSBenchmark:
    """Benchmark class for FAISS index creation and retrieval"""
    
    def __init__(self, embedding_dim: int = 4096):
        self.embedding_dim = embedding_dim
        self.index = None
        self.embeddings = None
        self.benchmark_results = {
            'index_creation_times': [],
            'search_times': [],
            'num_embeddings': [],
            'num_queries': [],
            'k_values': []
        }
    
    def generate_dummy_embeddings(self, num_embeddings: int) -> np.ndarray:
        """Generate random embeddings for benchmarking"""
        # Generate random embeddings normalized to unit length
        embeddings = np.random.randn(num_embeddings, self.embedding_dim).astype('float32')
        # Normalize embeddings
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        embeddings = embeddings / norms
        return embeddings
    
    def create_index(self, embeddings: np.ndarray, index_type: str = 'flat') -> Tuple[faiss.Index, float]:
        """Create FAISS index and measure time"""
        start_time = time.time()
        

        index = faiss.IndexFlatL2(self.embedding_dim)
        index.add(embeddings)
        
        creation_time = time.time() - start_time
        print(f"{index_type.upper()} index created with {index.ntotal} entries in {creation_time:.3f} seconds")
        
        return index, creation_time
    
    def benchmark_search(self, index: faiss.Index, query_embeddings: np.ndarray, k: int = 10) -> Tuple[np.ndarray, np.ndarray, float]:
        """Benchmark search performance"""
        start_time = time.time()
        distances, indices = index.search(query_embeddings, k)
        search_time = time.time() - start_time
        
        avg_time_per_query = search_time / len(query_embeddings)
        print(f"Search completed: {len(query_embeddings)} queries, k={k}, total time: {search_time:.3f}s, avg: {avg_time_per_query*1000:.2f}ms/query")
        
        return distances, indices, search_time


In [8]:
def run_comprehensive_benchmark():
    """Run comprehensive benchmarks for different dataset sizes and index types"""
    benchmark = FAISSBenchmark(embedding_dim=4096)
    
    # Test different dataset sizes
    dataset_sizes = [1000, 5000, 10000, 50000]
    index_types = ['flat']
    k_values = [1, 5, 10, 50]
    num_queries = 100
    
    results = []
    
    for num_embeddings in dataset_sizes:
        print(f"\n{'='*60}")
        print(f"Testing with {num_embeddings} embeddings (dim={benchmark.embedding_dim})")
        print(f"{'='*60}")
        
        # Generate dummy data
        print(f"\nGenerating {num_embeddings} dummy embeddings...")
        embeddings = benchmark.generate_dummy_embeddings(num_embeddings)
        query_embeddings = benchmark.generate_dummy_embeddings(num_queries)
        
        for index_type in index_types:
            print(f"\n--- Testing {index_type.upper()} index ---")
            
            # Create index
            index, creation_time = benchmark.create_index(embeddings, index_type)
            
            # Set search parameters for IVF
            if index_type == 'ivf':
                nprobe = min(64, index.nlist // 2)  # number of clusters to search
                index.nprobe = nprobe
                print(f"IVF index: nprobe set to {nprobe}")
            
            # Test different k values
            for k in k_values:
                if k <= num_embeddings:  # Only test k values that make sense
                    distances, indices, search_time = benchmark.benchmark_search(index, query_embeddings, k)
                    
                    results.append({
                        'num_embeddings': num_embeddings,
                        'index_type': index_type,
                        'k': k,
                        'num_queries': num_queries,
                        'creation_time': creation_time,
                        'total_search_time': search_time,
                        'avg_search_time_ms': (search_time / num_queries) * 1000,
                        'embedding_dim': benchmark.embedding_dim
                    })
    
    return pd.DataFrame(results)


In [None]:
# Simple benchmark example with smaller dataset
def simple_benchmark_example():
    """Run a simple benchmark example with visualization"""
    print("Running Simple FAISS Benchmark Example")
    print("=" * 60)
    
    # Initialize benchmark with 4096-dimensional embeddings
    benchmark = FAISSBenchmark(embedding_dim=4096)
    
    # Test parameters
    num_embeddings = 10000
    num_queries = 100
    k = 10
    
    # Generate dummy embeddings
    print(f"\nGenerating {num_embeddings} embeddings with dimension {benchmark.embedding_dim}...")
    embeddings = benchmark.generate_dummy_embeddings(num_embeddings)
    query_embeddings = benchmark.generate_dummy_embeddings(num_queries)
    
    print(f"Embeddings shape: {embeddings.shape}")
    print(f"Query embeddings shape: {query_embeddings.shape}")
    print(f"Memory usage: ~{embeddings.nbytes / (1024**2):.1f} MB")
    
    # Test different index types
    results = {}
    index_types = ['flat']
    
    for index_type in index_types:
        print(f"\n{'-'*40}")
        print(f"Testing {index_type.upper()} Index")
        print(f"{'-'*40}")
        
        # Create index
        index, creation_time = benchmark.create_index(embeddings, index_type)
        
        # Perform search
        distances, indices, search_time = benchmark.benchmark_search(index, query_embeddings, k)
        
        # Store results
        results[index_type] = {
            'creation_time': creation_time,
            'search_time': search_time,
            'avg_search_time_ms': (search_time / num_queries) * 1000,
            'distances': distances,
            'indices': indices
        }
        
        # Show sample results
        print(f"\nSample results for first query:")
        print(f"Top {min(5, k)} nearest neighbors: {indices[0][:5]}")
        print(f"Distances: {distances[0][:5]}")
    
    # Print comparison
    print(f"\n{'='*60}")
    print("PERFORMANCE COMPARISON")
    print(f"{'='*60}")
    print(f"{'Index Type':<15} {'Creation Time':<15} {'Avg Search Time':<20}")
    print(f"{'-'*50}")
    for index_type, result in results.items():
        print(f"{index_type.upper():<15} {result['creation_time']:.3f}s {' '*9} {result['avg_search_time_ms']:.2f}ms/query")
    
    return results

# Run the simple benchmark
simple_results = simple_benchmark_example()


Running Simple FAISS Benchmark Example

Generating 10000 embeddings with dimension 4096...
Embeddings shape: (10000, 4096)
Query embeddings shape: (100, 4096)
Memory usage: ~156.2 MB

----------------------------------------
Testing FLAT Index
----------------------------------------
FLAT index created with 10000 entries in 0.028 seconds
Search completed: 100 queries, k=10, total time: 0.106s, avg: 1.06ms/query

Sample results for first query:
Top 5 nearest neighbors: [2073  632 4796 3997  888]
Distances: [1.8831728 1.8926971 1.8950738 1.896204  1.8965175]

PERFORMANCE COMPARISON
Index Type      Creation Time   Avg Search Time     
--------------------------------------------------
FLAT            0.028s           1.06ms/query


# GPU benchmark

> To run this section, you need to create a new evnrionment with faiss-gpu-cuvs installed

```bash
conda install -c pytorch -c nvidia -c rapidsai -c conda-forge libnvjitlink faiss-gpu-cuvs=1.11.0
```


In [5]:
import numpy as np
import faiss
import time
from typing import Tuple, List
import pandas as pd

In [6]:
class CuVSFAISSBenchmark:
    """Benchmark class for cuVS GPU FAISS flat index creation and retrieval"""
    
    def __init__(self, embedding_dim: int = 4096, device: int = 0):
        self.embedding_dim = embedding_dim
        self.device = device
        # GPU resource handle
        self.res = faiss.StandardGpuResources()
        self.benchmark_results = {
            'index_creation_times': [],
            'search_times': [],
            'num_embeddings': [],
            'num_queries': [],
            'k_values': []
        }

    def generate_dummy_embeddings(self, num_embeddings: int) -> np.ndarray:
        """Generate random unit‐normalized embeddings"""
        emb = np.random.randn(num_embeddings, self.embedding_dim).astype('float32')
        emb /= np.linalg.norm(emb, axis=1, keepdims=True)
        return emb

    def create_gpu_flat_index(self, embeddings: np.ndarray) -> Tuple[faiss.Index, float]:
        """Build a cuVS flat L2 index on GPU and measure creation time."""
        t0 = time.time()

        # Configure cuVS
        cfg = faiss.GpuIndexFlatConfig()
        cfg.device = self.device
        cfg.useFloat16 = False
        cfg.use_cuvs = True

        # Create GPU flat index
        index_gpu = faiss.GpuIndexFlatL2(self.res, self.embedding_dim, cfg)
        index_gpu.add(embeddings)

        creation_time = time.time() - t0
        print(f"cuVS FLAT GPU index with {index_gpu.ntotal} entries created in {creation_time:.3f}s")
        return index_gpu, creation_time

    def benchmark_search(self, index: faiss.Index, queries: np.ndarray, k: int = 10) -> Tuple[np.ndarray, np.ndarray, float]:
        """Time a GPU‐based search."""
        t0 = time.time()
        D, I = index.search(queries, k)
        total = time.time() - t0
        avg_ms = total / len(queries) * 1000
        print(f"GPU Search: {len(queries)} queries, k={k}, total={total:.3f}s, avg={avg_ms:.2f}ms/query")
        return D, I, total

def run_gpu_benchmark():
    bm = CuVSFAISSBenchmark(embedding_dim=4096, device=0)
    dataset_sizes = [1000, 5000, 10000, 50000]
    k_values       = [1, 5, 10, 50]
    num_queries    = 100
    
    records = []
    for N in dataset_sizes:
        print(f"\n=== Benchmarking {N} embeddings ===")
        xb = bm.generate_dummy_embeddings(N)
        xq = bm.generate_dummy_embeddings(num_queries)

        # build index
        index, t_create = bm.create_gpu_flat_index(xb)

        for k in k_values:
            if k <= N:
                D, I, t_search = bm.benchmark_search(index, xq, k)
                records.append({
                    'num_embeddings': int(N),
                    'creation_time_s': float(t_create),
                    'k': int(k),
                    'avg_search_time_ms': float(t_search / num_queries * 1000)
                })

    # Now it's safe to make a DataFrame of pure Python scalars:
    return records

In [7]:
records = run_gpu_benchmark()


=== Benchmarking 1000 embeddings ===
cuVS FLAT GPU index with 1000 entries created in 0.363s
GPU Search: 100 queries, k=1, total=0.029s, avg=0.29ms/query
GPU Search: 100 queries, k=5, total=0.001s, avg=0.01ms/query
GPU Search: 100 queries, k=10, total=0.001s, avg=0.01ms/query
GPU Search: 100 queries, k=50, total=0.003s, avg=0.03ms/query

=== Benchmarking 5000 embeddings ===
cuVS FLAT GPU index with 5000 entries created in 0.009s
GPU Search: 100 queries, k=1, total=0.002s, avg=0.02ms/query
GPU Search: 100 queries, k=5, total=0.001s, avg=0.01ms/query
GPU Search: 100 queries, k=10, total=0.001s, avg=0.01ms/query
GPU Search: 100 queries, k=50, total=0.001s, avg=0.01ms/query

=== Benchmarking 10000 embeddings ===
cuVS FLAT GPU index with 10000 entries created in 0.015s
GPU Search: 100 queries, k=1, total=0.003s, avg=0.03ms/query
GPU Search: 100 queries, k=5, total=0.002s, avg=0.02ms/query
GPU Search: 100 queries, k=10, total=0.002s, avg=0.02ms/query
GPU Search: 100 queries, k=50, total=0.0

In [12]:
import pandas as pd
import numpy as np


records

[{'num_embeddings': 1000,
  'creation_time_s': 0.36345696449279785,
  'k': 1,
  'avg_search_time_ms': 0.2937960624694824},
 {'num_embeddings': 1000,
  'creation_time_s': 0.36345696449279785,
  'k': 5,
  'avg_search_time_ms': 0.010058879852294922},
 {'num_embeddings': 1000,
  'creation_time_s': 0.36345696449279785,
  'k': 10,
  'avg_search_time_ms': 0.00934600830078125},
 {'num_embeddings': 1000,
  'creation_time_s': 0.36345696449279785,
  'k': 50,
  'avg_search_time_ms': 0.03309488296508789},
 {'num_embeddings': 5000,
  'creation_time_s': 0.008816242218017578,
  'k': 1,
  'avg_search_time_ms': 0.018193721771240234},
 {'num_embeddings': 5000,
  'creation_time_s': 0.008816242218017578,
  'k': 5,
  'avg_search_time_ms': 0.014033317565917969},
 {'num_embeddings': 5000,
  'creation_time_s': 0.008816242218017578,
  'k': 10,
  'avg_search_time_ms': 0.014102458953857422},
 {'num_embeddings': 5000,
  'creation_time_s': 0.008816242218017578,
  'k': 50,
  'avg_search_time_ms': 0.01432657241821289

In [61]:
print("Average creation time:", np.round(np.average([r['creation_time_s'] for r in records]), 3), " s")
print("Average search time:", np.round(np.average([r['avg_search_time_ms'] for r in records]), 3), " ms/query ")

Average creation time: 0.065  s
Average search time: 0.04  ms/query 


- CPU search time: 1.06ms/query
- GPU-cuVS search time (1xA100-80G): 0.04  ms/query  -> **26.5x speed up**