In [None]:
!pip install faiss-cpu



In [None]:
import shutil
import urllib.request as request
from contextlib import closing

import os

# first we download the Sift1M dataset (skipped when the archive is already
# on disk, so re-running the notebook does not fetch it again)
if not os.path.exists('sift.tar.gz'):
    # stream into a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated 'sift.tar.gz' behind
    with closing(request.urlopen('ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz')) as r:
        with open('sift.tar.gz.part', 'wb') as f:
            shutil.copyfileobj(r, f)
    os.replace('sift.tar.gz.part', 'sift.tar.gz')

In [None]:
import tarfile

# the download leaves us with a tar.gz file, we unzip it into the current
# directory; the with-block closes the archive handle once extraction is done
with tarfile.open('sift.tar.gz', "r:gz") as tar:
    tar.extractall()

In [None]:
import numpy as np

def read_fvecs(fp):
    """Load an ``.fvecs`` file into a 2-D ``float32`` array.

    The file is read as a flat run of 4-byte words grouped into records of
    ``d + 1`` words: one int32 header followed by ``d`` values that are
    reinterpreted as float32. ``d`` is taken from the first word of the file.

    Args:
        fp: Path or open binary file, as accepted by ``np.fromfile``.

    Returns:
        A new ``(n_vectors, d)`` ``float32`` array, or an empty ``(0, 0)``
        ``float32`` array when the file contains no data.
    """
    words = np.fromfile(fp, dtype='int32')
    if words.size == 0:
        # an empty file has no header word to read the dimension from
        return np.empty((0, 0), dtype='float32')
    dim = int(words[0])
    # drop the per-record header column, then reinterpret the remaining
    # int32 bit patterns as float32 (the copy makes the slice contiguous)
    return words.reshape(-1, dim + 1)[:, 1:].copy().view('float32')

In [None]:
# database vectors that every index below is built from (1M samples)
xb = read_fvecs('./sift/sift_base.fvecs')
# query vectors are loaded from the separate sift_query.fvecs file
xq = read_fvecs('./sift/sift_query.fvecs')
# keep only the first query, shaped as a one-row batch
xq = xq[0].reshape(1, -1)

In [None]:
# shape of the single-query batch
xq.shape

(1, 128)

In [None]:
# shape of the database matrix loaded from sift_base.fvecs
xb.shape

(1000000, 128)

In [None]:
# inspect the raw values of the query vector
xq

array([[  1.,   3.,  11., 110.,  62.,  22.,   4.,   0.,  43.,  21.,  22.,
         18.,   6.,  28.,  64.,   9.,  11.,   1.,   0.,   0.,   1.,  40.,
        101.,  21.,  20.,   2.,   4.,   2.,   2.,   9.,  18.,  35.,   1.,
          1.,   7.,  25., 108., 116.,  63.,   2.,   0.,   0.,  11.,  74.,
         40., 101., 116.,   3.,  33.,   1.,   1.,  11.,  14.,  18., 116.,
        116.,  68.,  12.,   5.,   4.,   2.,   2.,   9., 102.,  17.,   3.,
         10.,  18.,   8.,  15.,  67.,  63.,  15.,   0.,  14., 116.,  80.,
          0.,   2.,  22.,  96.,  37.,  28.,  88.,  43.,   1.,   4.,  18.,
        116.,  51.,   5.,  11.,  32.,  14.,   8.,  23.,  44.,  17.,  12.,
          9.,   0.,   0.,  19.,  37.,  85.,  18.,  16., 104.,  22.,   6.,
          2.,  26.,  12.,  58.,  67.,  82.,  25.,  12.,   2.,   2.,  25.,
         18.,   8.,   2.,  19.,  42.,  48.,  11.]], dtype=float32)

## Flat

In [None]:
import faiss

d = xb.shape[1]  # vector dimensionality, taken from the data (128 for SIFT)
k = 10  # number of nearest neighbours returned per query

# Exhaustive (exact) index whose results serve as the reference for the
# approximate indexes below. L2 is used so the reference ranks by the same
# metric that IndexHNSWFlat and IndexIVFFlat use when no metric is passed.
index = faiss.IndexFlatL2(d)
index.add(xb)

In [None]:
%%time
# search the flat index for the k best matches of the query;
# D receives the scores/distances and I the ids of the matching vectors
D, I = index.search(xq, k)
print(I)

[[932085 934876 561813 708177 706771 695756 435345 701258 872728 455537]]
CPU times: user 65.2 ms, sys: 0 ns, total: 65.2 ms
Wall time: 129 ms


In [None]:
# keep the flat-index ids of the single query as a plain Python list;
# the later cells compare every other index against it
baseline = I.tolist()[0]
baseline

[932085,
 934876,
 561813,
 708177,
 706771,
 695756,
 435345,
 701258,
 872728,
 455537]

# Locality Sensitive Hashing (LSH)
https://www.pinecone.io/learn/series/faiss/vector-indexes/

In [None]:
# hash resolution: four bits per input dimension
nbits = 4 * d

# build the LSH index and load the database vectors into it
index = faiss.IndexLSH(d, nbits)
index.add(xb)

In [None]:
%%time
# same query and k as the flat search, now against the LSH index
D, I = index.search(xq, k)
print(I)

[[435345 931632 708177 813701 934876 455537 932085 561813 248185 361496]]
CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 37.6 ms


In [None]:
# one flag per baseline id: True when the LSH result also returned it
# (np.isin replaces the deprecated np.in1d)
np.isin(baseline, I)

array([ True,  True,  True,  True, False, False,  True, False, False,
        True])

# Hierarchical Navigable Small World (HNSW)

In [None]:
M = 16  # number of connections each vertex has
ef_search = 8  # search-time setting, assigned to index.hnsw.efSearch below
ef_construction = 64  # build-time setting, set on index.hnsw before add()


In [None]:
index = faiss.IndexHNSWFlat(d, M)

# FAISS names these fields efSearch / efConstruction; assigning to a
# differently spelled attribute such as `ef_construction` does not change
# the real setting. efConstruction has to be set before add(), because it
# is used while the graph is being built.
index.hnsw.efConstruction = ef_construction
index.hnsw.efSearch = ef_search

index.add(xb)

In [None]:
%%time
# same query and k as before, now against the HNSW index
D, I = index.search(xq, k)

CPU times: user 883 µs, sys: 0 ns, total: 883 µs
Wall time: 952 µs


In [None]:
# one flag per baseline id: True when the HNSW result also returned it
# (np.isin replaces the deprecated np.in1d)
np.isin(baseline, I)

array([False, False, False,  True,  True, False, False, False, False,
       False])

# IVF (Inverted file index)

In [None]:
nlist = 128  # number of cells (inverted lists) the vectors are split into

# IndexIVFFlat is built without a metric argument, i.e. with FAISS's default
# L2 metric, so the coarse quantizer that assigns vectors to cells uses L2
# as well; an inner-product quantizer would assign by a different metric
# than the one used to scan the lists.
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [None]:
# check the trained flag before train() has been called
index.is_trained

False

In [None]:
# train the index on the database vectors before anything is added
index.train(xb)

In [None]:
# re-check the trained flag now that train() has run
index.is_trained

True

In [None]:
# load the database vectors into the trained index
index.add(xb)

In [None]:
# number of cells visited per query; 1 is the cheapest setting
index.nprobe = 1

In [None]:
%%time
D, I = index.search(xq, k)

np.in1d(baseline, I)

CPU times: user 3.75 ms, sys: 19 µs, total: 3.77 ms
Wall time: 2.87 ms


array([ True, False, False,  True,  True, False, False,  True, False,
        True])

In [None]:
# visit more cells per query than the earlier nprobe = 1 run
index.nprobe = 4


In [None]:
%%time
D, I = index.search(xq, k)

np.in1d(baseline, I)

CPU times: user 9.55 ms, sys: 0 ns, total: 9.55 ms
Wall time: 9.12 ms


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])