In [1]:
import numpy as np
import faiss
import time
import json

In [7]:
# Load embeddings
# FAISS works on row-major (C-contiguous) float32 matrices. ascontiguousarray
# enforces both properties in one step and avoids an extra copy when the file
# on disk already satisfies them.
embeddings = np.ascontiguousarray(
    np.load("data/spotify_vectors_10d.npy"), dtype=np.float32
)
if embeddings.ndim != 2:
    # Fail early with a readable message instead of an unpacking error below.
    raise ValueError(
        f"Expected a 2-D (n_samples, dim) embedding matrix, got shape {embeddings.shape}"
    )
print(f"Loaded embeddings: {embeddings.shape}")

# Rows are vectors, columns are embedding dimensions.
n_samples, d = embeddings.shape

# ============================================
# 1. Build Brute-Force Index (Ground Truth)
# ============================================
# The flat L2 index is used later as the reference when measuring the recall
# of the approximate (IVF) index.
print("\n=== Building Brute-Force Index ===")
index_flat = faiss.IndexFlatL2(d)
index_flat.add(embeddings)
faiss.write_index(index_flat, "data/index_flat_l2.faiss")
print(f"Flat index built with {index_flat.ntotal} vectors")

Loaded embeddings: (169776, 10)

=== Building Brute-Force Index ===
Flat index built with 169776 vectors


In [8]:
# 2. Build IVF-Flat Index (ANN with Clustering)
# ============================================
print("\n=== Building IVF-Flat Index ===")

# Parameters
# For 169,776 tracks: sqrt(169776) ≈ 412
nlist = int(np.sqrt(n_samples))  # Number of clusters (rule of thumb)
nlist = min(max(nlist, 100), 4096)  # Clamp between 100 and 4096
# Never request more clusters than there are vectors: the lower clamp of 100
# would otherwise exceed the dataset size on very small inputs.
nlist = min(nlist, n_samples)

print(f"Dataset size: {n_samples} tracks")
print(f"Using nlist={nlist} clusters")

# Create quantizer (flat index for cluster centroids)
quantizer = faiss.IndexFlatL2(d)

# Create IVF index
index_ivf = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

# Train the index on embeddings
# perf_counter is monotonic and high-resolution, which makes it the right
# clock for measuring short intervals (time.time can jump with wall-clock
# adjustments).
print("Training IVF index...")
start = time.perf_counter()
index_ivf.train(embeddings)
print(f"Training completed in {time.perf_counter() - start:.2f}s")

# Add vectors to index
print("Adding vectors to IVF index...")
start = time.perf_counter()
index_ivf.add(embeddings)
print(f"Added {index_ivf.ntotal} vectors in {time.perf_counter() - start:.2f}s")

# Save IVF index
faiss.write_index(index_ivf, "data/index_ivf_flat.faiss")
print("Saved IVF index to data/index_ivf_flat.faiss")



=== Building IVF-Flat Index ===
Dataset size: 169776 tracks
Using nlist=412 clusters
Training IVF index...
Training completed in 0.14s
Adding vectors to IVF index...
Added 169776 vectors in 0.02s
Saved IVF index to data/index_ivf_flat.faiss


In [9]:
# 3. Test with Different nprobe Values
# ============================================
print("\n=== Testing nprobe Parameter ===")

# Load a test query
# NOTE(review): this single query is itself a member of both indexes, so the
# sweep below is a smoke test rather than a representative recall estimate;
# consider a batch of held-out queries before trusting the numbers.
test_query = embeddings[1000:1001]  # Use track 1000 as test

nprobe_values = [1, 5, 10, 20, 50, 100, 200]
k = 10

print(f"\nSearching for top-{k} neighbors with different nprobe values:")
# Build the header label separately: a quoted literal nested inside an
# f-string replacement field is NOT itself interpolated, which previously
# printed "Recall@{k}" verbatim.
recall_header = f"Recall@{k}"
print(f"{'nprobe':<10} {'Latency (ms)':<15} {recall_header:<15}")
print("-" * 40)

# Get ground truth from flat index
D_true, I_true = index_flat.search(test_query, k)
true_neighbors = set(I_true[0])

for nprobe in nprobe_values:
    # Skip settings that would probe more clusters than the index has.
    if nprobe > nlist:
        continue

    index_ivf.nprobe = nprobe

    # perf_counter is the monotonic, high-resolution clock intended for
    # timing short intervals such as a single sub-millisecond search.
    start = time.perf_counter()
    D, I = index_ivf.search(test_query, k)
    latency_ms = (time.perf_counter() - start) * 1000

    # Calculate recall: fraction of the exact top-k that the IVF search found
    retrieved = set(I[0])
    recall = len(retrieved & true_neighbors) / k

    print(f"{nprobe:<10} {latency_ms:<15.3f} {recall:<15.3f}")


=== Testing nprobe Parameter ===

Searching for top-10 neighbors with different nprobe values:
nprobe     Latency (ms)    Recall@{k}     
----------------------------------------
1          0.254           1.000          
5          0.090           1.000          
10         0.067           1.000          
20         0.084           1.000          
50         0.187           1.000          
100        0.499           1.000          
200        0.807           1.000          


In [10]:
# 4. Save Index Configuration
# ============================================
# Persist the parameters the indexes above were built with, together with a
# few suggested nprobe settings, as a JSON file in the data directory.
config = {
    "embedding_dim": d,
    "n_samples": n_samples,
    "nlist": nlist,
    "index_type": "IVFFlat",
    "metric": "L2",
    # For different recall requirements
    "recommended_nprobe": [10, 20, 50],
}

# Serialize first, then write the finished text in one call.
config_text = json.dumps(config, indent=2)
with open("data/index_config.json", "w") as config_file:
    config_file.write(config_text)

# Final summary of every artifact written by this notebook.
summary_lines = (
    "\n=== Index Construction Complete ===",
    "Flat index: data/index_flat_l2.faiss",
    "IVF index: data/index_ivf_flat.faiss",
    "Config: data/index_config.json",
)
print(*summary_lines, sep="\n")


=== Index Construction Complete ===
Flat index: data/index_flat_l2.faiss
IVF index: data/index_ivf_flat.faiss
Config: data/index_config.json
