In [2]:
import numpy as np
from faiss import write_index, read_index

In [7]:
import faiss

# --- synthetic data -----------------------------------------------------
d = 64                  # vector dimensionality
nb = 100000             # number of database vectors
nq = 10000              # number of query vectors
np.random.seed(1234)    # fixed seed so the random draws repeat

# Database is drawn before the queries; the draw order determines the values.
xb = np.random.random((nb, d)).astype(np.float32)
xb[:, 0] += np.arange(nb) / 1000.    # row-dependent offset on the first component
xq = np.random.random((nq, d)).astype(np.float32)
xq[:, 0] += np.arange(nq) / 1000.    # same offset scheme for the queries

# --- IVF index ----------------------------------------------------------
nlist = 100             # nlist argument of IndexIVFFlat
k = 4                   # neighbors requested per query
quantizer = faiss.IndexFlatL2(d)     # flat L2 index handed to the IVF index
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)  # metric passed explicitly

# The index reports itself untrained until train() has been called.
assert not index.is_trained
index.train(xb)
assert index.is_trained

index.add(xb)

# --- search: first with the initial nprobe, then with nprobe = 10 --------
D, I = index.search(xq, k)
print(I[-5:])           # neighbor ids of the last 5 queries
index.nprobe = 10       # default nprobe is 1, try a few more
D, I = index.search(xq, k)
print(I[-5:])           # neighbor ids of the last 5 queries

[[86961 18490 51010 50060]
 [93235  2701 82748  7165]
 [ 3101 97719 79955 19401]
 [47783 78973 87995 45546]
 [24639 14606 92706 58343]]
[[17596 38599 86961 56555]
 [74163  3875 93235 73702]
 [99723 63071  3101 31980]
 [ 3893  8372 47783 71890]
 [96343 73543 53301 69441]]


In [9]:
# Display the database vectors; NumPy abbreviates the repr of a large array.
xb

array([[0.19151945, 0.62210876, 0.43772775, ..., 0.62491673, 0.4780938 ,
        0.19567518],
       [0.38231745, 0.05387368, 0.4516484 , ..., 0.15139526, 0.33517465,
        0.65755177],
       [0.07334255, 0.0550064 , 0.3231948 , ..., 0.34441698, 0.6408804 ,
        0.12620533],
       ...,
       [0.81447345, 0.5902453 , 0.7988935 , ..., 0.339859  , 0.3019495 ,
        0.85385454],
       [0.6714615 , 0.9160688 , 0.9550788 , ..., 0.59536433, 0.03849181,
        0.10563799],
       [0.85663575, 0.59113413, 0.67890793, ..., 0.21897699, 0.06530159,
        0.21753833]], dtype=float32)

In [4]:
# Round-trip the index through a file on disk.
index_path = "large.index"
write_index(index, index_path)

# Load it back under a separate name so both copies stay available.
indexb = read_index(index_path)

In [9]:
# Query the index that was reloaded from disk (indexb), not the in-memory one.
D, I = indexb.search(xq, k)    # actual search
print(I[-5:])                  # neighbors of the 5 last queries

# Set nprobe on the index that is actually searched below. The earlier code
# assigned it to `index`, which has no effect on searches run through indexb.
# NOTE(review): nprobe appears to be stored in the serialized index, so indexb
# may already carry the value `index` had when it was written — confirm, and
# reset indexb.nprobe before the first search if a real comparison is wanted.
indexb.nprobe = 10             # default nprobe is 1, try a few more
D, I = indexb.search(xq, k)
print(I[-5:])                  # neighbors of the 5 last queries

[[ 9900 10500  9309  9831]
 [11055 10895 10812 11321]
 [11353 11103 10164  9787]
 [10571 10664 10632  9638]
 [ 9628  9554 10036  9582]]
[[ 9900 10500  9309  9831]
 [11055 10895 10812 11321]
 [11353 11103 10164  9787]
 [10571 10664 10632  9638]
 [ 9628  9554 10036  9582]]


In [10]:
def apx_num_dp_from_idx_size_gb(idx_size_gb, dim, bytes_per_value=4):
    """Approximate the number of data points that fit in an index of a given size.

    The estimate divides the index size by the size of one stored vector,
    ``dim * bytes_per_value`` bytes; any other per-index or per-vector
    overhead is not accounted for.

    Args:
        idx_size_gb: Index size in GB, where 1 GB is taken as 1024**3 bytes.
        dim: Number of components per vector.
        bytes_per_value: Bytes used by one vector component. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        The estimated number of data points, truncated to an int.

    Raises:
        ZeroDivisionError: If ``dim * bytes_per_value`` is zero.
    """
    bytes_per_vector = dim * bytes_per_value
    return int(idx_size_gb * 1024**3 / bytes_per_vector)

In [11]:
# Example: estimated data-point count for a 1 GB index of 512-dimensional vectors.
apx_num_dp_from_idx_size_gb(1, 512)

524288