In [None]:
# Step 1: Check if GPU is available
!nvidia-smi


Sun Apr 20 09:22:54 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:

# Step 2: Write the CUDA code into a file
cuda_code = r"""
// Tiled nearest-neighbour search over 100K vectors; the notebook writes this source to vector_search.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define N 100000       // Number of database vectors (100K)
#define D 64           // Dimensions (floats) per vector
#define TILE_SIZE 256  // Vectors covered by one block; the kernel maps one thread to one tile entry, so keep equal to THREADS
#define THREADS 256    // Threads per block (launch blockDim.x); must be a power of two for the halving reduction

/*
 * Per-block nearest-neighbour search using squared Euclidean distance.
 *
 * Launch layout: 1-D grid of ceil(N / TILE_SIZE) blocks with exactly THREADS
 * threads per block. Thread `tid` of block `b` owns database vector
 * b * TILE_SIZE + tid, so THREADS must equal TILE_SIZE.
 *
 * database  : N * D floats, row-major (vector i occupies [i * D, i * D + D)).
 * query     : D floats.
 * best_idx  : out, one entry per block - index of the closest vector in the
 *             block's tile (-1 if the tile is empty).
 * best_dist : out, one entry per block - squared distance of that vector.
 *
 * Static shared memory: D floats for the query plus THREADS * (float + int)
 * for the reduction. The previous version also staged a TILE_SIZE x D float
 * tile (64 KB), which exceeds the 48 KB static shared-memory limit, and had
 * every thread rescan the whole tile; each thread now scores only its own
 * vector.
 */
__global__ void vector_search_tiled(const float * __restrict__ database, const float * __restrict__ query, int *best_idx, float *best_dist) {
    static_assert(THREADS == TILE_SIZE, "one thread per tile entry: THREADS must equal TILE_SIZE");
    static_assert((THREADS & (THREADS - 1)) == 0, "tree reduction requires a power-of-two THREADS");

    __shared__ float shared_query[D];
    __shared__ float block_min_dist[THREADS];
    __shared__ int block_min_idx[THREADS];

    int tid = threadIdx.x;
    int vec = blockIdx.x * TILE_SIZE + tid;

    // Stage the query once per block; the strided loop also covers D > blockDim.x.
    for (int d = tid; d < D; d += blockDim.x) {
        shared_query[d] = query[d];
    }
    __syncthreads();

    // Threads past the end of the database contribute a sentinel candidate.
    float min_dist = 1e30f;
    int min_idx = -1;

    if (vec < N) {
        const float *row = database + (size_t)vec * D;
        float dist = 0.0f;
        for (int d = 0; d < D; d++) {
            float diff = row[d] - shared_query[d];
            dist += diff * diff;
        }
        min_dist = dist;
        min_idx = vec;
    }

    block_min_dist[tid] = min_dist;
    block_min_idx[tid] = min_idx;
    __syncthreads();

    // Tree reduction over (distance, index) pairs. Ties go to the lower index
    // so the result is deterministic and matches a sequential first-minimum scan.
    for (int stride = THREADS / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            float other_dist = block_min_dist[tid + stride];
            int other_idx = block_min_idx[tid + stride];
            float my_dist = block_min_dist[tid];
            int my_idx = block_min_idx[tid];

            bool closer = other_dist < my_dist;
            bool tie_lower = (other_dist == my_dist) && (my_idx < 0 || other_idx < my_idx);
            // Never let a sentinel (-1) displace a real candidate.
            if (other_idx >= 0 && (closer || tie_lower)) {
                block_min_dist[tid] = other_dist;
                block_min_idx[tid] = other_idx;
            }
        }
        __syncthreads();
    }

    if (tid == 0) {
        best_idx[blockIdx.x] = block_min_idx[0];
        best_dist[blockIdx.x] = block_min_dist[0];
    }
}

/*
 * Collapses the per-block candidates into one global best match.
 *
 * Only thread 0 of a block does any work: it scans the num_blocks entries of
 * best_dist / best_idx sequentially and keeps the first strictly smallest
 * distance. The winning index is written to *final_idx (-1 if no entry is
 * below the 1e30 sentinel) and the square root of the winning value is
 * written to *final_dist.
 */
__global__ void final_reduce(int *best_idx, float *best_dist, int *final_idx, float *final_dist, int num_blocks) {
    if (threadIdx.x != 0) {
        return;
    }

    int winner = -1;
    float winner_dist = 1e30f;

    for (int b = 0; b < num_blocks; ++b) {
        const float candidate = best_dist[b];
        if (!(candidate < winner_dist)) {
            continue;  // not strictly better than what we already have
        }
        winner_dist = candidate;
        winner = best_idx[b];
    }

    *final_idx = winner;
    *final_dist = sqrtf(winner_dist);
}

// Report a failed CUDA runtime call on stderr and terminate the program.
// Kernel launches return nothing, so pass cudaGetLastError() right after a launch.
static void cuda_check(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Benchmark driver: builds a synthetic N x D database and a constant query,
 * uploads both, runs the two-stage search (per-block search + final reduce)
 * and prints the best match together with the elapsed upload + search time.
 *
 * Every CUDA call is checked; previously a failed launch went unnoticed and
 * the program printed whatever happened to be in the result buffers.
 */
int main() {
    const size_t db_bytes = (size_t)N * D * sizeof(float);
    const size_t query_bytes = D * sizeof(float);

    float *h_database = (float*)malloc(db_bytes);
    float *h_query = (float*)malloc(query_bytes);
    if (h_database == NULL || h_query == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_database);
        free(h_query);
        return 1;
    }
    int h_final_idx = -1;
    float h_final_dist = 0.0f;

    // Synthetic data: database values cycle through 0.00 .. 0.99, query is all 0.5.
    for (int i = 0; i < N * D; i++) h_database[i] = (float)(i % 100) / 100.0f;
    for (int i = 0; i < D; i++) h_query[i] = 0.5f;

    float *d_database, *d_query;
    int *d_best_idx, *d_final_idx;
    float *d_best_dist, *d_final_dist;

    cuda_check(cudaMalloc(&d_database, db_bytes), "cudaMalloc(d_database)");
    cuda_check(cudaMalloc(&d_query, query_bytes), "cudaMalloc(d_query)");

    // One block per tile of TILE_SIZE vectors (ceil-div so the tail is covered).
    int num_blocks = (N + TILE_SIZE - 1) / TILE_SIZE;
    cuda_check(cudaMalloc(&d_best_idx, num_blocks * sizeof(int)), "cudaMalloc(d_best_idx)");
    cuda_check(cudaMalloc(&d_best_dist, num_blocks * sizeof(float)), "cudaMalloc(d_best_dist)");
    cuda_check(cudaMalloc(&d_final_idx, sizeof(int)), "cudaMalloc(d_final_idx)");
    cuda_check(cudaMalloc(&d_final_dist, sizeof(float)), "cudaMalloc(d_final_dist)");

    // Timing
    cudaEvent_t start, stop;
    cuda_check(cudaEventCreate(&start), "cudaEventCreate(start)");
    cuda_check(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    cuda_check(cudaEventRecord(start), "cudaEventRecord(start)");

    // Upload
    cuda_check(cudaMemcpy(d_database, h_database, db_bytes, cudaMemcpyHostToDevice), "upload of database");
    cuda_check(cudaMemcpy(d_query, h_query, query_bytes, cudaMemcpyHostToDevice), "upload of query");

    // Search
    vector_search_tiled<<<num_blocks, THREADS>>>(d_database, d_query, d_best_idx, d_best_dist);
    cuda_check(cudaGetLastError(), "vector_search_tiled launch");
    final_reduce<<<1, 1>>>(d_best_idx, d_best_dist, d_final_idx, d_final_dist, num_blocks);
    cuda_check(cudaGetLastError(), "final_reduce launch");

    cuda_check(cudaEventRecord(stop), "cudaEventRecord(stop)");
    // Asynchronous kernel faults surface here, at the first synchronizing call.
    cuda_check(cudaEventSynchronize(stop), "kernel execution");

    float milliseconds = 0;
    cuda_check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    cuda_check(cudaMemcpy(&h_final_idx, d_final_idx, sizeof(int), cudaMemcpyDeviceToHost), "download of final index");
    cuda_check(cudaMemcpy(&h_final_dist, d_final_dist, sizeof(float), cudaMemcpyDeviceToHost), "download of final distance");

    printf("Best match index: %d\n", h_final_idx);
    printf("Best match distance: %.5f\n", h_final_dist);
    printf("Total CUDA time (upload + search): %.5f ms\n", milliseconds);

    cuda_check(cudaEventDestroy(start), "cudaEventDestroy(start)");
    cuda_check(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    cuda_check(cudaFree(d_database), "cudaFree(d_database)");
    cuda_check(cudaFree(d_query), "cudaFree(d_query)");
    cuda_check(cudaFree(d_best_idx), "cudaFree(d_best_idx)");
    cuda_check(cudaFree(d_best_dist), "cudaFree(d_best_dist)");
    cuda_check(cudaFree(d_final_idx), "cudaFree(d_final_idx)");
    cuda_check(cudaFree(d_final_dist), "cudaFree(d_final_dist)");
    free(h_database);
    free(h_query);

    return 0;
}


"""

# Persist the CUDA source to disk so nvcc can compile it
with open('vector_search.cu', mode='w') as f:
    f.write(cuda_code)



In [None]:
# Step 3: Compile
!nvcc vector_search.cu -o vector_search

# Step 4: Run!
!./vector_search


Best match index: 0
Best match distance: 0.00000
Total CUDA time (upload + search): 13.10003 ms


In [None]:
!pip install chromadb




In [None]:

import chromadb
import numpy as np
import time

# Create the ChromaDB client used for this benchmark
client = chromadb.Client()

index_name = "vector-benchmark-100k"

# Remove a leftover collection with the same name before creating a new one
existing_collections = [coll.name for coll in client.list_collections()]
if existing_collections.count(index_name) > 0:
    client.delete_collection(name=index_name)

collection = client.create_collection(name=index_name)

# Benchmark data: N random float32 vectors of dimension D, ids "0" .. str(N - 1)
N, D = 100000, 64
vectors = np.random.rand(N, D).astype(np.float32)
ids = list(map(str, range(N)))

# Wall-clock start of the measured section (upload + query)
start = time.time()

# Upload in batches (because Chroma has batch limits)
def batched_add(collection, vectors, ids, batch_size=5000):
    """Add `vectors` to `collection` in consecutive chunks of `batch_size` rows.

    Each chunk is sent with one `collection.add(...)` call carrying:
      * embeddings - the chunk converted with `.tolist()`
      * metadatas  - one `{"dummy": "data"}` dict per row
      * documents  - "doc<j>" where j is the row's position in `vectors`
      * ids        - the matching slice `ids[start:start + batch_size]`
    """
    total = len(vectors)
    for lo in range(0, total, batch_size):
        hi = lo + batch_size
        chunk = vectors[lo:hi]
        count = len(chunk)
        collection.add(
            embeddings=chunk.tolist(),
            metadatas=[dict(dummy="data") for _ in range(count)],
            documents=[f"doc{row}" for row in range(lo, lo + count)],
            ids=ids[lo:hi],
        )

# Upload every vector (this is part of the timed section)
batched_add(collection, vectors, ids)

# Query with a single random vector and ask for the one nearest result
query_vector = np.random.rand(D).astype(np.float32)
results = collection.query(query_embeddings=[query_vector.tolist()], n_results=1)

# Stop the clock and convert seconds to milliseconds
end = time.time()
latency_ms = 1000 * (end - start)

print(f"ChromaDB total time (upload + search) for 100K vectors: {latency_ms:.5f} ms")
print(f"Top match: {results['ids'][0][0]}")



ChromaDB total time (upload + search) for 100K vectors: 45751.05643 ms
Top match: 42864
