# Improved Multi-GPU RAG Testing on Google Colab A100

This notebook tests the improved parallel GPU implementation with:
- Parallel GPU execution
- Top-2K retrieval for better recall
- Enhanced error handling
- FAISS comparison benchmarks

## 1. Setup and GPU Check

First, make sure an A100 GPU is selected under Runtime > Change runtime type.

In [None]:
# Check GPU availability and type
!nvidia-smi

import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Count: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f} GB")

## 2. Install Required Packages

In [None]:
# Install cuVS and dependencies
!pip install --upgrade pip
!pip install pylibraft-cu12 cuvs-cu12
!pip install sentence-transformers torch numpy pandas matplotlib seaborn psutil
!pip install faiss-gpu  # For comparison benchmarks

## 3. Load Improved Implementation

In [None]:
# Import the improved implementation
import os
import sys
import time
import torch
import numpy as np
import pandas as pd
from typing import List, Dict, Any, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
import logging
from enum import Enum
import gc
import psutil
from contextlib import contextmanager
import matplotlib.pyplot as plt
import seaborn as sns

# Send INFO-and-above records through the root logger using a
# "time - logger - level - message" layout
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Notebook-wide logger, named after the executing module
logger = logging.getLogger(__name__)

In [None]:
# Copy the improved implementation classes here
# (Include all classes from improved_multi_gpu_rag.py)

class IndexType(Enum):
    """Index back-ends that the benchmarks in this notebook can be asked for.

    Each member's value is a lowercase string; the cuVS benchmark uses
    ``index_type.value`` as the key under which it stores its results.
    """
    # Built through the cuvs.neighbors modules of the same name
    IVF_FLAT = "ivf_flat"
    IVF_PQ = "ivf_pq"
    CAGRA = "cagra"
    # Values match the result keys written by the FAISS comparison cell
    FAISS_FLAT = "faiss_flat"
    FAISS_IVF = "faiss_ivf"

@dataclass
class SearchConfig:
    """Configuration for search operations.

    Raises:
        ValueError: if ``top_k`` or ``search_batch_size`` is smaller than 1.
    """
    # Number of neighbours requested per query (raised from 5 to 2000 for better recall)
    top_k: int = 2000
    # presumably the number of queries sent per search call - not used in the visible cells
    search_batch_size: int = 100
    # presumably the number of evaluation queries - not used in the visible cells
    num_queries: int = 100
    # presumably toggles recall evaluation - not used in the visible cells
    enable_recall_eval: bool = True
    # k values for recall reporting; None selects the default ladder below
    recall_k_values: Optional[List[int]] = None

    def __post_init__(self):
        """Validate the sizes and fill in the default recall ladder."""
        # Fail here with a readable message instead of deep inside a search call
        if self.top_k < 1:
            raise ValueError(f"top_k must be >= 1, got {self.top_k}")
        if self.search_batch_size < 1:
            raise ValueError(
                f"search_batch_size must be >= 1, got {self.search_batch_size}"
            )

        if self.recall_k_values is None:
            self.recall_k_values = [1, 5, 10, 50, 100, 500, 1000, 2000]
        else:
            # Own copy: later edits must not leak into the caller's sequence
            self.recall_k_values = list(self.recall_k_values)

# Include rest of the classes from improved_multi_gpu_rag.py
# ... (GPUConfig, CUDAMemoryManager, ParallelIndexBuilder, ParallelSearchEngine, RecallEvaluator)

## 4. Generate Test Dataset

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model used for both the corpus and the queries.
# A small model is used here to keep test runs quick.
model = SentenceTransformer('all-MiniLM-L6-v2')  # Start with smaller model for testing
# Vector width reported by the model; reused later as the FAISS index dimension
embedding_dim = model.get_sentence_embedding_dimension()
print(f"Embedding dimension: {embedding_dim}")

# Synthetic corpus generator
def generate_synthetic_texts(num_texts: int) -> List[str]:
    """Build ``num_texts`` deterministic placeholder sentences.

    Sentence ``i`` combines topic ``i % 10`` with template ``i % 5``; every
    other slot of the template is filled with a tag that ends in ``_i``.

    Args:
        num_texts: Number of sentences to produce. Zero or a negative
            value yields an empty list.

    Returns:
        The sentences, in index order.
    """
    subject_pool = ("AI", "ML", "DL", "NLP", "CV", "RL", "Data", "Cloud", "IoT", "Blockchain")
    sentence_patterns = (
        "Research in {topic} shows {finding} with {application}",
        "Advanced {topic} techniques enable {capability} for {domain}",
        "The future of {topic} involves {trend} and {innovation}",
        "Understanding {topic} requires {skill} and {knowledge}",
        "Applications of {topic} include {use1} and {use2}",
    )
    # Template slot -> prefix of the tag that fills it
    slot_prefixes = {
        "finding": "finding",
        "application": "app",
        "capability": "cap",
        "domain": "domain",
        "trend": "trend",
        "innovation": "innovation",
        "skill": "skill",
        "knowledge": "knowledge",
        "use1": "use1",
        "use2": "use2",
    }

    def render(position: int) -> str:
        """Return the sentence for one corpus position."""
        pattern = sentence_patterns[position % len(sentence_patterns)]
        slot_values = {
            slot: f"{prefix}_{position}" for slot, prefix in slot_prefixes.items()
        }
        # str.format ignores the slots a given pattern does not mention
        return pattern.format(
            topic=subject_pool[position % len(subject_pool)], **slot_values
        )

    return [render(position) for position in range(num_texts)]

# Corpus size and encoder batch size for the first test run (500K vectors)
num_vectors = 500000
batch_size = 1000

print(f"Generating {num_vectors:,} synthetic texts...")
texts = generate_synthetic_texts(num_vectors)

print("Encoding texts to embeddings...")
# Encode on the GPU when there is one, otherwise fall back to the CPU
encode_device = 'cuda' if torch.cuda.is_available() else 'cpu'
embeddings = model.encode(
    texts,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_tensor=True,
    device=encode_device,
)

print(f"Generated embeddings shape: {embeddings.shape}")
embedding_bytes = embeddings.nelement() * embeddings.element_size()
print(f"Memory used: {embedding_bytes / 1024**3:.2f} GB")

## 5. Test Parallel cuVS Implementation

In [None]:
import pylibraft
from cuvs.neighbors import ivf_flat, ivf_pq, cagra

# Convert every pylibraft/cuVS output through copy_to_host() before it is returned
pylibraft.config.set_output_as(lambda device_ndarray: device_ndarray.copy_to_host())

# Colab usually exposes a single GPU; in that case multi-GPU is simulated by
# splitting the data into two shards that live on the same device.
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs: {num_gpus}")

if num_gpus == 0:
    # Without this guard the code would reach torch.chunk(embeddings, 0) and
    # fail with an error that says nothing about the missing GPU.
    raise RuntimeError(
        "No CUDA device detected - select a GPU runtime "
        "(Runtime > Change runtime type) before running the cuVS tests"
    )

if num_gpus == 1:
    # Simulate 2 GPUs by splitting the data in half
    print("Single GPU detected - simulating 2-GPU setup by splitting data")
    split_size = len(embeddings) // 2
    embedding_parts = [
        embeddings[:split_size].cuda(),
        embeddings[split_size:].cuda()
    ]
    simulated_gpus = 2
else:
    # True multi-GPU setup: one shard per device
    chunks = torch.chunk(embeddings, num_gpus, dim=0)
    embedding_parts = [chunk.to(f'cuda:{i}') for i, chunk in enumerate(chunks)]
    simulated_gpus = num_gpus

print(f"Data split into {len(embedding_parts)} parts")
for i, part in enumerate(embedding_parts):
    print(f"  Part {i}: {part.shape}")

In [None]:
# Benchmark setup for the cuVS index types.
# One metrics dict per index type is collected here.
results_comparison = {}

# Search settings shared by every benchmark
search_config = SearchConfig(
    enable_recall_eval=True,
    num_queries=100,
    search_batch_size=10,
    top_k=2000,  # 2K neighbours per query for better recall
)

# cuVS index types to benchmark, in run order
index_types = [IndexType[member] for member in ("IVF_FLAT", "IVF_PQ", "CAGRA")]

# Encode the benchmark queries once, before any index work. They do not depend
# on the index type, and the FAISS comparison cell reuses `query_embeddings`,
# so the name must exist even if every cuVS index type fails.
test_query_texts = [
    "Advanced machine learning techniques",
    "Deep learning applications in healthcare",
    "Natural language processing research",
    "Computer vision for autonomous vehicles",
    "Reinforcement learning in robotics"
]
query_embeddings = model.encode(
    test_query_texts,
    convert_to_tensor=True,
    device='cuda'
)

# First global row id of each shard, derived from the real shard sizes so that
# unevenly sized shards (torch.chunk rounds the chunk size up) still map back
# to the right rows of `embeddings`.
part_sizes = [part.shape[0] for part in embedding_parts]
part_offsets = np.concatenate(([0], np.cumsum(part_sizes)[:-1])).astype(np.int64)

# cuVS module behind each index type; build/search/SearchParams are used from each
cuvs_modules = {
    IndexType.IVF_FLAT: ivf_flat,
    IndexType.IVF_PQ: ivf_pq,
    IndexType.CAGRA: cagra,
}


def make_index_params(index_type, part):
    """Return the cuVS IndexParams used to build `index_type` over one shard.

    Raises:
        ValueError: if `index_type` is not one of the cuVS index types.
    """
    n_rows, dim = part.shape
    if index_type == IndexType.IVF_FLAT:
        return ivf_flat.IndexParams(n_lists=min(256, n_rows // 1000 + 1))
    if index_type == IndexType.IVF_PQ:
        return ivf_pq.IndexParams(
            n_lists=min(512, n_rows // 500 + 1),
            pq_dim=min(96, dim // 2),
            pq_bits=8
        )
    if index_type == IndexType.CAGRA:
        return cagra.IndexParams(
            intermediate_graph_degree=64,  # Reduced for memory
            graph_degree=32
        )
    raise ValueError(f"Unsupported index type: {index_type}")


# NOTE(review): with real multi-GPU shards the queries live on the default CUDA
# device while shard i lives on cuda:i - confirm cuVS handles that, or select
# the device / move the query per shard.
for index_type in index_types:
    print(f"\n{'='*60}")
    print(f"Testing {index_type.value.upper()}")
    print('='*60)

    indices = []
    index = None

    try:
        # Build one index per shard (sequentially; simulated parallel on a single GPU)
        print(f"Building {index_type.value} indices...")
        build_times = []

        for i, part in enumerate(embedding_parts):
            start_time = time.time()
            params = make_index_params(index_type, part)
            cuvs_module = cuvs_modules[index_type]
            index = cuvs_module.build(params, part)
            build_time = time.time() - start_time

            indices.append(index)
            build_times.append(build_time)
            print(f"  Part {i} index built in {build_time:.2f}s")

        total_build_time = sum(build_times)
        print(f"Total build time: {total_build_time:.2f}s")

        # Search every shard for each query and merge the per-shard results
        print(f"\nTesting search with top-{search_config.top_k} retrieval...")

        search_times = []
        all_results = []

        for q_idx, query in enumerate(query_embeddings):
            start_time = time.time()
            query_2d = query.unsqueeze(0)

            shard_distances = []
            shard_row_ids = []
            for index, offset in zip(indices, part_offsets):
                distances, indices_res = cuvs_module.search(
                    cuvs_module.SearchParams(), index, query_2d, search_config.top_k
                )
                shard_distances.append(np.asarray(distances).ravel())
                # Widen to int64 before shifting so unsigned id types cannot
                # wrap, then translate shard-local ids to global row ids.
                shard_row_ids.append(
                    np.asarray(indices_res).ravel().astype(np.int64) + offset
                )

            # Merge the shards and keep the global top-k by distance
            all_distances = np.concatenate(shard_distances)
            all_indices_results = np.concatenate(shard_row_ids)

            sorted_idx = np.argsort(all_distances)[:search_config.top_k]
            final_distances = all_distances[sorted_idx]
            final_indices = all_indices_results[sorted_idx]

            search_time = time.time() - start_time
            search_times.append(search_time)
            all_results.append((final_distances, final_indices))

            print(f"  Query {q_idx+1}: {search_time*1000:.2f}ms, retrieved {len(final_indices)} results")

        avg_search_time = np.mean(search_times) * 1000
        print(f"\nAverage search time: {avg_search_time:.2f}ms")

        results_comparison[index_type.value] = {
            'build_time': total_build_time,
            'avg_search_time_ms': avg_search_time,
            'num_results': len(final_indices),
            'success': True
        }

    except Exception as e:
        # Record the failure and move on to the next index type
        print(f"Error with {index_type.value}: {e}")
        results_comparison[index_type.value] = {
            'build_time': None,
            'avg_search_time_ms': None,
            'num_results': None,
            'success': False,
            'error': str(e)
        }

    finally:
        # Runs after success and failure alike. Drop every reference to the
        # shard indices, including the loop variable, before collecting.
        indices = []
        index = None
        gc.collect()
        torch.cuda.empty_cache()

## 6. FAISS Comparison Benchmark

In [None]:
import faiss

print("\n" + "="*60)
print("FAISS COMPARISON BENCHMARK")
print("="*60)


def _mean_search_ms(index, queries, k):
    """Search `index` once per row of `queries`; return the mean latency in ms."""
    latencies = []
    for single_query in queries:
        tick = time.time()
        index.search(single_query.reshape(1, -1), k)
        latencies.append(time.time() - tick)
    return np.mean(latencies) * 1000


use_gpu = torch.cuda.is_available()

# FAISS is fed a host-side float32 copy of the corpus
embeddings_np = embeddings.cpu().numpy().astype('float32')

# --- FAISS Flat (exact search) ---
print("\nTesting FAISS Flat Index...")
start_time = time.time()

faiss_index = faiss.IndexFlatL2(embedding_dim)
if use_gpu:
    # Clone the empty index onto GPU 0, then fill it there
    res = faiss.StandardGpuResources()
    faiss_index = faiss.index_cpu_to_gpu(res, 0, faiss_index)
faiss_index.add(embeddings_np)

build_time = time.time() - start_time
print(f"FAISS Flat build time: {build_time:.2f}s")

# Same queries as the cuVS benchmark, as host float32
query_embeddings_np = query_embeddings.cpu().numpy().astype('float32')

avg_search_time = _mean_search_ms(faiss_index, query_embeddings_np, search_config.top_k)
print(f"FAISS Flat average search time: {avg_search_time:.2f}ms")

results_comparison['faiss_flat'] = {
    'build_time': build_time,
    'avg_search_time_ms': avg_search_time,
    'num_results': search_config.top_k,
    'success': True
}

# --- FAISS IVF ---
print("\nTesting FAISS IVF Index...")
start_time = time.time()

nlist = 256
quantizer = faiss.IndexFlatL2(embedding_dim)
faiss_ivf_index = faiss.IndexIVFFlat(quantizer, embedding_dim, nlist)
if use_gpu:
    faiss_ivf_index = faiss.index_cpu_to_gpu(res, 0, faiss_ivf_index)
# IVF has to be trained on the data before vectors can be added
faiss_ivf_index.train(embeddings_np)
faiss_ivf_index.add(embeddings_np)

build_time = time.time() - start_time
print(f"FAISS IVF build time: {build_time:.2f}s")

# Number of inverted lists probed per query
faiss_ivf_index.nprobe = 16

avg_search_time = _mean_search_ms(faiss_ivf_index, query_embeddings_np, search_config.top_k)
print(f"FAISS IVF average search time: {avg_search_time:.2f}ms")

results_comparison['faiss_ivf'] = {
    'build_time': build_time,
    'avg_search_time_ms': avg_search_time,
    'num_results': search_config.top_k,
    'success': True
}

## 7. Results Visualization and Analysis

In [None]:
# One row per benchmarked index; the dict key becomes the 'Index Type' column
results_df = (
    pd.DataFrame.from_dict(results_comparison, orient='index')
    .rename_axis('Index Type')
    .reset_index()
)

# Plain-text results table
banner = "=" * 60
print("\n" + banner)
print("PERFORMANCE COMPARISON RESULTS")
print(banner)
print(results_df.to_string(index=False))

# Only runs that completed are plotted and summarised
succeeded = results_df['success'] == True
successful_results = results_df.loc[succeeded].copy()

In [None]:
# Plot and summarise the successful benchmark runs
if len(successful_results) > 0:
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

    # One colour per successful index type
    palette = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6'][:len(successful_results)]
    index_names = successful_results['Index Type']
    build_seconds = successful_results['build_time']
    search_millis = successful_results['avg_search_time_ms']

    def _labelled_bars(axis, heights, y_label, panel_title, fmt_value):
        """Draw one bar per index type on `axis` and write each value above its bar."""
        rects = axis.bar(index_names, heights, color=palette)
        axis.set_xlabel('Index Type')
        axis.set_ylabel(y_label)
        axis.set_title(panel_title)
        axis.tick_params(axis='x', rotation=45)
        for rect in rects:
            top = rect.get_height()
            axis.text(rect.get_x() + rect.get_width()/2., top,
                      fmt_value(top), ha='center', va='bottom')

    # Panel 1: build time per index type
    _labelled_bars(
        ax1, build_seconds,
        'Build Time (seconds)', 'Index Build Time Comparison',
        lambda height: f'{height:.1f}s',
    )

    # Panel 2: mean search latency per index type
    _labelled_bars(
        ax2, search_millis,
        'Average Search Time (ms)', f'Search Time Comparison (top-{search_config.top_k})',
        lambda height: f'{height:.1f}ms',
    )

    # Panel 3: build time against search latency, one labelled point per index type
    ax3.scatter(build_seconds, search_millis, s=200, c=palette, alpha=0.6)
    for _, row in successful_results.iterrows():
        ax3.annotate(row['Index Type'],
                     (row['build_time'], row['avg_search_time_ms']),
                     xytext=(5, 5), textcoords='offset points')
    ax3.set_xlabel('Build Time (seconds)')
    ax3.set_ylabel('Search Time (ms)')
    ax3.set_title('Build Time vs Search Time Trade-off')
    ax3.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Text summary
    print("\n" + "="*60)
    print("PERFORMANCE SUMMARY")
    print("="*60)

    # Rows holding the smallest build time / search latency
    fastest_build = successful_results.loc[build_seconds.idxmin()]
    fastest_search = successful_results.loc[search_millis.idxmin()]

    print(f"Fastest Build: {fastest_build['Index Type']} ({fastest_build['build_time']:.2f}s)")
    print(f"Fastest Search: {fastest_search['Index Type']} ({fastest_search['avg_search_time_ms']:.2f}ms)")

    # Group the rows by library
    cuvs_indices = ['ivf_flat', 'ivf_pq', 'cagra']
    faiss_indices = ['faiss_flat', 'faiss_ivf']
    cuvs_results = successful_results[index_names.isin(cuvs_indices)]
    faiss_results = successful_results[index_names.isin(faiss_indices)]

    # The comparison needs at least one successful run from each library
    if len(cuvs_results) > 0 and len(faiss_results) > 0:
        cuvs_avg_search = cuvs_results['avg_search_time_ms'].mean()
        faiss_avg_search = faiss_results['avg_search_time_ms'].mean()

        print("\ncuVS vs FAISS:")
        print(f"  cuVS avg build time: {cuvs_results['build_time'].mean():.2f}s")
        print(f"  FAISS avg build time: {faiss_results['build_time'].mean():.2f}s")
        print(f"  cuVS avg search time: {cuvs_avg_search:.2f}ms")
        print(f"  FAISS avg search time: {faiss_avg_search:.2f}ms")

        # Report the ratio from the point of view of the faster library
        if cuvs_avg_search < faiss_avg_search:
            speedup = faiss_avg_search / cuvs_avg_search
            print(f"\ncuVS is {speedup:.2f}x faster than FAISS in search")
        else:
            speedup = cuvs_avg_search / faiss_avg_search
            print(f"\nFAISS is {speedup:.2f}x faster than cuVS in search")

## 8. Memory Usage Analysis

In [None]:
def get_gpu_memory_info():
    """Print a memory report for every visible GPU.

    Prints nothing when CUDA is unavailable. "Allocated" and "Reserved" are
    PyTorch's own counters. "Free" is the device-wide figure from
    ``torch.cuda.mem_get_info``, so it also reflects memory held by code that
    does not allocate through PyTorch (the cuVS and FAISS indexes in this
    notebook) and by PyTorch's cache.

    Returns:
        None. The report is written to stdout.
    """
    if not torch.cuda.is_available():
        return

    gib = 1024**3
    for i in range(torch.cuda.device_count()):
        total_bytes = torch.cuda.get_device_properties(i).total_memory
        allocated_bytes = torch.cuda.memory_allocated(i)

        if hasattr(torch.cuda, "mem_get_info"):
            free_bytes, _ = torch.cuda.mem_get_info(i)
        else:
            # PyTorch builds without mem_get_info: fall back to the previous
            # estimate, which only accounts for PyTorch's own tensors
            free_bytes = total_bytes - allocated_bytes

        print(f"\nGPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Total memory: {total_bytes / gib:.2f} GB")
        print(f"  Allocated: {allocated_bytes / gib:.2f} GB")
        print(f"  Reserved: {torch.cuda.memory_reserved(i) / gib:.2f} GB")
        print(f"  Free: {free_bytes / gib:.2f} GB")

print("Current GPU Memory Status:")
get_gpu_memory_info()

# Collect garbage first, then empty PyTorch's cache: tensors that are only
# freed by the collector must be back in the cache before it is released.
# This is the same order the cuVS benchmark loop uses.
gc.collect()
torch.cuda.empty_cache()

print("\nAfter cleanup:")
get_gpu_memory_info()

## 9. Scaling Test with Different Dataset Sizes

In [None]:
# Test scaling with different dataset sizes
scaling_sizes = [100000, 250000, 500000, 750000, 1000000]
scaling_results = []

print("\n" + "="*60)
print("SCALING TEST")
print("="*60)

for size in scaling_sizes:
    print(f"\nTesting with {size:,} vectors...")

    # Pre-bind so the cleanup below is valid wherever a failure happens
    test_texts = test_embeddings = index = None

    try:
        # Generate and encode a corpus of exactly `size` texts
        test_texts = generate_synthetic_texts(size)
        test_embeddings = model.encode(
            test_texts,
            batch_size=1000,
            convert_to_tensor=True,
            show_progress_bar=False,
            device='cuda'
        )

        # Build IVF-Flat index (fastest)
        start_time = time.time()
        params = ivf_flat.IndexParams(n_lists=min(256, test_embeddings.shape[0] // 1000 + 1))
        index = ivf_flat.build(params, test_embeddings)
        build_time = time.time() - start_time

        # Time a single search, using the first corpus vector as the query
        query = test_embeddings[0].unsqueeze(0)
        start_time = time.time()
        search_params = ivf_flat.SearchParams()
        distances, indices = ivf_flat.search(search_params, index, query, min(2000, size // 10))
        search_time = time.time() - start_time

        scaling_results.append({
            'size': size,
            'build_time': build_time,
            'search_time_ms': search_time * 1000,
            'success': True
        })

        print(f"  Build: {build_time:.2f}s, Search: {search_time*1000:.2f}ms")

    except Exception as e:
        # Record the failing size and stop: larger sizes are not attempted
        print(f"  Failed at {size:,} vectors: {e}")
        scaling_results.append({
            'size': size,
            'build_time': None,
            'search_time_ms': None,
            'success': False
        })
        break

    finally:
        # Runs after success and failure alike (also before the `break`):
        # drop the large objects, collect, then empty PyTorch's cache
        test_texts = test_embeddings = index = None
        gc.collect()
        torch.cuda.empty_cache()

# Tabulate the scaling runs and keep the ones that completed
scaling_df = pd.DataFrame(scaling_results)
successful_scaling = scaling_df[scaling_df['success'] == True]

if len(successful_scaling) > 0:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Shared x axis: dataset size in thousands of vectors
    size_in_thousands = successful_scaling['size']/1000

    # (axis, column, line/marker style, colour, y label, title) for each panel
    panels = [
        (ax1, 'build_time', 'o-', '#3498db', 'Build Time (seconds)', 'Build Time Scaling'),
        (ax2, 'search_time_ms', 's-', '#e74c3c', 'Search Time (ms)', 'Search Time Scaling (top-2K)'),
    ]
    for axis, column, line_style, line_color, y_label, panel_title in panels:
        axis.plot(size_in_thousands, successful_scaling[column],
                  line_style, linewidth=2, markersize=8, color=line_color)
        axis.set_xlabel('Dataset Size (thousands)')
        axis.set_ylabel(y_label)
        axis.set_title(panel_title)
        axis.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    largest_ok = successful_scaling['size'].max()
    print(f"\nMaximum successful size: {largest_ok:,} vectors")

## 10. Export Results for SLURM Job

In [None]:
# Write both result tables to CSV for later analysis
csv_exports = (
    (results_df, 'colab_a100_results.csv'),
    (scaling_df, 'colab_a100_scaling_results.csv'),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)

print("Results saved to:")
print("  - colab_a100_results.csv")
print("  - colab_a100_scaling_results.csv")

# Batch script that runs improved_multi_gpu_rag.py on two A100 GPUs under SLURM.
# NOTE(review): the job runs `pip install` at start-up - confirm that the
# cluster's compute nodes can reach a package index.
slurm_script = '''#!/bin/bash
#SBATCH --job-name=rag_multi_gpu_test
#SBATCH --nodes=1
#SBATCH --gres=gpu:a100:2
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=02:00:00
#SBATCH --output=rag_test_%j.out
#SBATCH --error=rag_test_%j.err

# Load modules
module --force purge
module load StdEnv/2023 gcc/12.3 cuda/12.2 python/3.11

# Activate virtual environment
source ~/rag_env/bin/activate

# Install dependencies
pip install pylibraft-cu12 cuvs-cu12
pip install sentence-transformers torch numpy pandas matplotlib

# Run the improved implementation
python improved_multi_gpu_rag.py

echo "Job completed at $(date)"
'''

# Save the script next to the CSV files
with open('submit_rag_test.sh', 'w') as script_file:
    script_file.write(slurm_script)

print("\nSLURM job script saved to: submit_rag_test.sh")
print("To submit on Narval: sbatch submit_rag_test.sh")