### Installation

```bash
conda install -c pytorch -c nvidia faiss-gpu=1.8.0
```


### Dataset

* SIFT http://corpus-texmex.irisa.fr/ 

```bash
# Download the data
wget http://ann-benchmarks.com/sift-128-euclidean.hdf5
```


In [2]:
import h5py

# Local HDF5 file holding the SIFT vectors (fetched with the wget command above).
data_file = "sift-128-euclidean.hdf5"

with h5py.File(data_file, mode='r') as f:
    keys = list(f.keys())
    print("Keys in the file:", keys)

    # Report the shape and dtype of every entry before loading anything.
    for key in keys:
        print(f"Dataset '{key}' shape: {f[key].shape}")
        print(f"Dataset '{key}' dtype: {f[key].dtype}")

    # Slice with [:] so the vectors are read into memory while the file is
    # still open; both arrays are used by later cells after the file closes.
    corpus = f['train'][:]
    query = f['test'][:]

Keys in the file: ['distances', 'neighbors', 'test', 'train']
Dataset 'distances' shape: (10000, 100)
Dataset 'distances' dtype: float32
Dataset 'neighbors' shape: (10000, 100)
Dataset 'neighbors' dtype: int32
Dataset 'test' shape: (10000, 128)
Dataset 'test' dtype: float32
Dataset 'train' shape: (1000000, 128)
Dataset 'train' dtype: float32


In [3]:
# Vector dimensionality, taken from the length of the first corpus vector.
d = len(corpus[0])
# Number of neighbours requested per query in every search below.
k = 100

In [18]:
import numpy as np

# helper function to calculate recall
def calc_recall(res, gt):
    """Return the mean per-query recall of `res` measured against `gt`.

    For each query row ``i`` the recall is the number of distinct ids that
    occur in both ``res[i]`` and ``gt[i]`` divided by ``len(gt[i])``; the
    per-query values are then averaged over ``len(res)``.

    Args:
        res: Sequence of per-query id arrays produced by the index under test.
        gt: Sequence of per-query reference id arrays; row ``i`` is compared
            with row ``i`` of ``res``.

    Returns:
        The mean recall as a float, or 0.0 when ``res`` contains no rows.
    """
    n_queries = len(res)
    if n_queries == 0:
        # Nothing to score: report 0.0 instead of dividing by zero.
        return 0.0

    # Summed in row order so the floating-point result matches a plain
    # running total.
    total = sum(
        len(np.intersect1d(res[i], gt[i])) / len(gt[i])
        for i in range(n_queries)
    )
    return total / n_queries


### FAISS Flat Index

In [7]:
import faiss

In [None]:
# Flat L2 index over the whole corpus. The neighbour ids it returns are kept in
# I_truth and passed as the reference result to calc_recall in the cells below.
index = faiss.IndexFlatL2(d)
index.add(corpus)

# Fetch k neighbours for every query vector; search returns (distances, ids).
D, I_truth = index.search(query, k)

### IVF Index

In [26]:
# IVF settings: number of inverted lists, and how many of them a query visits.
nlist = 5
nprob = 3

# A flat L2 index is handed to the IVF index as its quantizer.
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

# The index has to be trained on the corpus before vectors are added.
index.train(corpus)
index.add(corpus)

# nprobe is a search-time setting, so it is applied right before querying.
index.nprobe = nprob
D, I = index.search(query, k)

# Compare against the flat-index result.
recall = calc_recall(I, I_truth)
print(f"Recall (IVF) = {recall:.4f}")

Recall (IVF) = 0.9999


### HNSW Index

In [24]:
# HNSW settings (M is the graph connectivity argument of IndexHNSWFlat).
M = 64
ef_search = 32
ef_construction = 64

index = faiss.IndexHNSWFlat(d, M)

# efConstruction governs graph building, so it must be in place before add().
index.hnsw.efConstruction = ef_construction
index.add(corpus)

# efSearch only influences querying; set it just ahead of the search call.
index.hnsw.efSearch = ef_search
D, I = index.search(query, k)

# Compare against the flat-index result.
recall = calc_recall(I, I_truth)
print(f"Recall (HNSW): {recall}")

Recall (HNSW): 0.8968109999999735


### LSH

In [25]:
# Hash length passed to IndexLSH: 8 bits per vector dimension.
# NOTE: `nbits` is a notebook-level name that the Product Quantizer cell also
# assigns, with a different value.
nbits = d * 8

index = faiss.IndexLSH(d, nbits)
index.train(corpus)
index.add(corpus)

# search
D, I = index.search(query, k)

# calculate recall against the flat-index result (I_truth)
recall = calc_recall(I, I_truth)
print(f"Recall (LSH): {recall}")

Recall (LSH): 0.5856730000000039


### Scalar Quantizer Index

In [23]:
# Quantizer type and distance metric handed to IndexScalarQuantizer.
# QT_8bit presumably stores each vector component in 8 bits -- confirm in the
# FAISS ScalarQuantizer documentation.
qtype = faiss.ScalarQuantizer.QT_8bit
metric = faiss.METRIC_L2

index = faiss.IndexScalarQuantizer(d, qtype, metric)
index.train(corpus)
index.add(corpus)

# search
D, I = index.search(query, k)

# calculate recall against the flat-index result (I_truth)
recall = calc_recall(I, I_truth)
print(f"Recall (Scalar Quantizer Index): {recall}")

Recall (Scalar Quantizer Index): 0.990444999999872


### Product Quantizer Index

In [27]:
# NOTE: `M` and `nbits` are reused notebook-level names; this cell overwrites
# the values assigned in the HNSW and LSH cells.
# In IndexPQ(d, M, nbits, metric) these are presumably the number of
# sub-quantizers and the bits per sub-quantizer code -- confirm in FAISS docs.
M = 16
nbits = 8
metric = faiss.METRIC_L2

index = faiss.IndexPQ(d, M, nbits, metric)

index.train(corpus)
index.add(corpus)

# search
D, I = index.search(query, k)

# calculate recall against the flat-index result (I_truth)
recall = calc_recall(I, I_truth)
print(f"Recall (PQ): {recall}")

Recall (PQ): 0.630898999999999
