Based on https://www.pinecone.io/learn/series/faiss/vector-indexes/

# Set up runtime

In [13]:
%pip install faiss-cpu
%pip install humanize

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp311-cp311-macosx_11_0_arm64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import humanize
import shutil
import urllib.request as request
from contextlib import closing

# Load and prepare demo data

In [2]:
# first we download the Sift1M dataset
with closing(request.urlopen('ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz')) as r:
    with open('sift.tar.gz', 'wb') as f:
        shutil.copyfileobj(r, f)

In [2]:
import tarfile

# the download leaves us with a tar.gz file, we unzip it
tar = tarfile.open('sift.tar.gz', "r:gz")
tar.extractall()

In [3]:
import numpy as np

# now define a function to read the fvecs file format of Sift1M dataset
def read_fvecs(fp):
    a = np.fromfile(fp, dtype='int32')
    d = a[0]
    return a.reshape(-1, d + 1)[:, 1:].copy().view('float32')

In [5]:
# data we will search through
xb = read_fvecs('./sift/sift_base.fvecs')  # 1M samples
# also get some query vectors to search with
xq = read_fvecs('./sift/sift_query.fvecs')
# take just one query (there are many in sift_learn.fvecs)
xq = xq[0].reshape(1, xq.shape[1])

In [6]:
# The query vector
xq.shape

(1, 128)

In [8]:
# The vector search space
xb.shape

(1000000, 128)

# Flat Index

In [16]:
d = 128  # dimensionality of Sift1M data
k = 10  # number of nearest neighbors to return

import faiss

index = faiss.IndexFlatIP(d)
index.add(xb)

In [17]:
%%time
D, I = index.search(xq, k)

CPU times: user 27.4 ms, sys: 6.38 ms, total: 33.7 ms
Wall time: 29.2 ms


In [18]:
# Save as baseline. This is our 100% correct recall set.
baseline = I[0].tolist()

In [88]:
# Measure index size
faiss.write_index(index, "flat_index.index")
flat_index_size = os.path.getsize("flat_index.index")
os.remove("flat_index.index")
humanize.naturalsize(flat_index_size)

'512.0 MB'

# LSH Index
Locality Sensitive Hashing

In [79]:
nbits = d*8  # resolution of bucketed vectors
# initialize index and add vectors
lsh_index = faiss.IndexLSH(d, nbits)
lsh_index.add(xb)

In [80]:
%%time
D, I = lsh_index.search(xq, k)

CPU times: user 4.89 ms, sys: 2.57 ms, total: 7.46 ms
Wall time: 4.57 ms


In [81]:
# Calculate Recall Percentage:
np.array(baseline)[np.in1d(baseline, I).tolist()].size / np.array(baseline).size * 100

80.0

In [87]:
# Measure index size
faiss.write_index(lsh_index, "lsh_index.index")
lsh_index_size = os.path.getsize("lsh_index.index")
os.remove("lsh_index.index")
humanize.naturalsize(lsh_index_size)

'128.5 MB'

# HNSW Index
Hierarchical Navigable Small World

In [44]:
# set HNSW index parameters
M = 64  # number of connections each vertex will have
ef_search = 32  # depth of layers explored during search
ef_construction = 64  # depth of layers explored during index construction

# initialize index (d == 128)
hnsw_index = faiss.IndexHNSWFlat(d, M)
# set efConstruction and efSearch parameters
hnsw_index.hnsw.efConstruction = ef_construction
hnsw_index.hnsw.efSearch = ef_search
# add data to index
hnsw_index.add(xb)

In [64]:
%%time
D, I = hnsw_index.search(xq, k)

CPU times: user 2.9 ms, sys: 17.8 ms, total: 20.7 ms
Wall time: 20.1 ms


In [65]:
# Calculate Recall Percentage:
np.array(baseline)[np.in1d(baseline, I).tolist()].size / np.array(baseline).size * 100

90.0

In [86]:
# Measure index size
faiss.write_index(hnsw_index, "hnsw_index.index")
hnsw_index_size = os.path.getsize("hnsw_index.index")
os.remove("hnsw_index.index")
humanize.naturalsize(hnsw_index_size)

'1.0 GB'

# IVF Index
Inverted File Index

In [107]:
nlist = 128  # number of cells/clusters to partition data into

quantizer = faiss.IndexFlatIP(d)  # how the vectors will be stored/compared
ivf_index = faiss.IndexIVFFlat(quantizer, d, nlist)
ivf_index.train(xb)  # we must train the index to cluster into cells
ivf_index.add(xb)

ivf_index.nprobe = 4  # set how many of nearest cells to search

In [108]:
%%time
D, I = ivf_index.search(xq, k)

CPU times: user 6.41 ms, sys: 2.32 ms, total: 8.73 ms
Wall time: 8.14 ms


In [109]:
# Calculate Recall Percentage:
np.array(baseline)[np.in1d(baseline, I).tolist()].size / np.array(baseline).size * 100

100.0

In [110]:
# Measure index size
faiss.write_index(ivf_index, "ivf_index.index")
ivf_index_size = os.path.getsize("ivf_index.index")
os.remove("ivf_index.index")
humanize.naturalsize(ivf_index_size)

'520.1 MB'