In [39]:
import numpy as np
import os
from typing import Dict, Any
import faiss


def read_query_embeddings(fp, d, dt):
    """
    Memory-map a headerless binary file of vectors as a read-only 2-D array.

    Args:
        fp: path of the file to map.
        d: number of components per vector (row length).
        dt: numpy dtype object of the stored components; its ``itemsize`` is
            used to derive the number of rows.

    Returns:
        A read-only ``np.memmap`` of shape ``(n_rows, d)`` and dtype ``dt``.

    Raises:
        AssertionError: if the file size is not an exact multiple of
            ``d * dt.itemsize`` (i.e. the file has a header or trailing bytes).
    """
    bytes_per_row = d * dt.itemsize
    n_rows, leftover = divmod(os.path.getsize(fp), bytes_per_row)
    assert leftover == 0  # no header
    return np.memmap(fp, shape=(n_rows, d), dtype=dt, mode="r")


def read_embeddings(root: str, fn: str, fmt: str, dt: "str | np.dtype", size: int, input_d: int) -> np.ndarray:
    """
    Memory-map an embeddings file and validate it against the expected layout.

    Args:
        root: directory that contains the file.
        fn: file name, relative to ``root``.
        fmt: ``"raw"`` for a headerless binary dump (file size must be exactly
            ``size * input_d * itemsize`` bytes) or ``"npy"`` for a NumPy
            ``.npy`` file.
        dt: expected element dtype, either a ``np.dtype`` or anything
            ``np.dtype()`` accepts (e.g. ``"float16"``).
        size: expected number of vectors (rows).
        input_d: expected dimensionality (columns).

    Returns:
        A read-only memory-mapped array of shape ``(size, input_d)``.

    Raises:
        AssertionError: if the file is missing or its row count, dimensionality
            or dtype do not match the expected values.
        ValueError: if ``fmt`` is neither ``"raw"`` nor ``"npy"``.
    """
    # Accept dtype names as well as dtype objects; np.dtype() is idempotent.
    dt = np.dtype(dt)
    fn = f"{root}/{fn}"
    assert os.path.exists(fn), f"file not found: {fn}"
    if fmt == "raw":
        fl = os.path.getsize(fn)
        nb = fl // input_d // dt.itemsize
        assert nb == size, f"size:{size},shape {nb}"
        assert fl == input_d * dt.itemsize * nb  # no header
        return np.memmap(fn, shape=(nb, input_d), dtype=dt, mode="r")
    elif fmt == "npy":
        vecs = np.load(fn, mmap_mode="r")
        assert vecs.shape[0] == size, f"size:{size},shape {vecs.shape[0]}"
        assert vecs.shape[1] == input_d
        assert vecs.dtype == dt
        return vecs
    else:
        # The exception used to be constructed but never raised, so an unknown
        # format silently returned None.
        raise ValueError("The file cannot be loaded in the current format.")


def iterate_input(
    config_for_dataset: Dict[str, Any],
    batch_size: int,
    input_d: int,
    is_normalised: bool = False,
) -> Any:
    """
    Stream the vectors of a dataset in batches of ``batch_size`` rows.

    The files listed in ``config_for_dataset["files"]`` are memory-mapped one
    after another with ``read_embeddings`` and treated as one long sequence of
    vectors; batches may span file boundaries. Every yielded batch has
    ``batch_size`` rows except possibly the last one.

    Args:
        config_for_dataset: dataset entry with a ``"root"`` directory and a
            ``"files"`` list whose items provide ``"name"``, ``"format"``,
            ``"dtype"`` and ``"size"``.
        batch_size: number of vectors per yielded batch.
        input_d: dimensionality of the vectors.
        is_normalised: when True, every batch is L2-normalised before being
            yielded (despite its name, the flag *requests* normalisation).

    Yields:
        2-D arrays with ``input_d`` columns. Batches that span files, the final
        partial batch, and all batches when ``is_normalised`` is True are
        float32. Otherwise batches that lie entirely inside one file are slices
        of the memory-mapped file in its on-disk dtype. The staging buffer is
        reused between yields, so a batch must be consumed (or copied) before
        the generator is advanced.
    """

    buffer = np.empty(shape=(batch_size, input_d), dtype=np.float32)
    rem = 0  # number of valid rows currently staged in ``buffer``
    for f in config_for_dataset["files"]:
        fn = f["name"]
        print(f"processing: {fn}...")
        xb = read_embeddings(
            config_for_dataset["root"],
            fn,
            f["format"],
            np.dtype(f["dtype"]),
            f["size"],
            input_d,
        )
        # Top up the staging buffer with the head of this file.
        req = min(batch_size - rem, xb.shape[0])
        buffer[rem : rem + req] = xb[:req]
        rem += req
        if rem == batch_size:
            # Normalise at yield time: every staged row was written since the
            # previous yield, so no row is normalised twice.
            if is_normalised:
                faiss.normalize_L2(buffer)
            yield buffer
            rem = 0
        for i in range(req, xb.shape[0], batch_size):
            j = i + batch_size
            if j <= xb.shape[0]:
                if is_normalised:
                    # normalize_L2 works in place on float32 data, and the
                    # memmap is read-only, so normalise a private copy of the
                    # batch. Normalising ``xb.astype(...)`` as before only
                    # touched a temporary that was immediately discarded.
                    batch = np.array(xb[i:j], dtype=np.float32, order="C")
                    faiss.normalize_L2(batch)
                    yield batch
                else:
                    yield xb[i:j]
            else:
                # Tail of the file: stage it and complete it from the next file.
                rem = xb.shape[0] - i
                buffer[:rem] = xb[i:j]
    if rem > 0:
        tail = buffer[:rem]
        if is_normalised:
            faiss.normalize_L2(tail)
        yield tail

In [None]:
from faiss.contrib.exhaustive_search import knn_ground_truth
import sys
import faiss
import numpy as np

sys.path.append("/data/home/marialomeli/faiss_improvements/offline_ivf/")
from utils import load_config

# Exact (brute-force) k-NN ground truth for a small sample of queries.
# NOTE: this cell relies on read_query_embeddings / iterate_input defined in an
# earlier cell, and its results (query_vectors, I) are reused by the next cell.
queries_file = "/checkpoint/kevinheffernan/SONAR/denoise_autoencode_reg2_100k/embed.23h1/eng0/encf.000.eng0"
d = 1024  # vector dimensionality used to interpret the raw query file
dt = np.dtype(np.float16)  # on-disk element type of the query file
all_queries = read_query_embeddings(queries_file, d, dt)
# Keep only the first 10 queries; the cast produces an in-memory float32 copy
# of the memory-mapped rows.
query_vectors = all_queries[
    0:10,
].astype(np.float32)
faiss.normalize_L2(query_vectors)  # in-place L2 normalisation of the queries
config = load_config("/data/home/marialomeli/faiss_improvements/offline_ivf/config_sonar.yaml")
config_for_dataset = config["datasets"]["test_glg_0"]
embeddings_bs = 10000000  # database vectors per batch handed to the search
# The last argument (is_normalised=True) asks iterate_input to normalise the
# database vectors as well.
db_iterator = iterate_input(config_for_dataset, embeddings_bs, d, True)
# D / I: distances and database ids of the config["k"] nearest neighbours of
# each query under inner product.
D, I = knn_ground_truth(query_vectors, db_iterator, config["k"], metric_type=faiss.METRIC_INNER_PRODUCT)

processing: encf.000.glg...


In [None]:
# load index
from faiss.contrib.evaluation import knn_intersection_measure

# Approximate search with a pre-built index, compared against the exact result
# ``I`` computed in the previous cell.
index_path = "/checkpoint/marialomeli/offline_faiss/seamless/sonar/test_glg_0/IVF32768_PQ256.faissindex"
index = faiss.read_index(index_path)
# Reach the IVF component of the loaded index to set its search-time nprobe.
index_ivf = faiss.extract_index_ivf(index)
index_ivf.nprobe = 64
k = 16

n_query_vectors, dim = query_vectors.shape  # not used further in this cell
# Search with the same (normalised) query vectors used for the ground truth.
# NOTE: this overwrites the ground-truth distances ``D`` from the previous cell.
D, I_approx = index.search(query_vectors, k)

# NOTE(review): ``I`` has config["k"] columns while ``I_approx`` has k=16;
# the comparison presumably needs both tables to have the same shape -- confirm
# that config["k"] == 16.
knn_intersection_measure(I, I_approx)

In [6]:
#compare to the one computed with files created by evaluate
import numpy as np
class Object:
    """Empty attribute container: instances accept arbitrary attributes."""


# Module-level stand-in named ``self`` that carries the root / eval_dir
# attributes; the next cell reads ``self.eval_dir``.
# (presumably so snippets written as methods can be pasted unchanged -- confirm)
self = Object()
# self.root = '/scratch/gsz/seamless/oivf'
self.root = "/checkpoint/marialomeli/offline_faiss/seamless/sonar"
self.eval_dir = self.root + "/test_eng_0_in_test_glg_0/eval"
# Ground-truth neighbour ids stored in the eval directory.
I_a_gt_file = f"{self.eval_dir}/I_a_gt.npy"
I_a_gt = np.load(I_a_gt_file)
I_a_gt

array([[14019300, 46986579, 37087857, ..., 44311120, 37147197, 32623153],
       [37485005, 22463736, 22463735, ..., 22463737, 25191417, 18075494],
       [48182428, 25703316, 48182406, ..., 41806355, 25746524, 25690238],
       ...,
       [ 9039955,  1897574,  4750893, ...,  3183195,  8682394, 10579718],
       [33682250, 39297822,  6301227, ..., 39297841, 41449743, 26146029],
       [18379010, 28135988, 23964199, ..., 21093975, 17022948, 16775086]])

In [11]:
from faiss.contrib.evaluation import knn_intersection_measure
# Index factory strings whose stored ANN results are compared against the
# ground truth. Currently disabled variants: "OPQ64_IVF32768_PQ64",
# "OPQ128_IVF32768_PQ128", "OPQ256_IVF32768_PQ256", "OPQ512_IVF32768_PQ512".
index_strings = [
    "IVF32768_PQ64",
    "IVF32768_PQ128",
    "IVF32768_PQ256",
    "IVF32768_PQ512",
]
# nprobe sweep: 1, 2, 4, ..., 128 (use [1] for a quick check).
nprobes = [1 << p for p in range(8)]
for fact_str in index_strings:
    for nprobe in nprobes:
        # One result file per (index, nprobe) pair in the eval directory.
        I_a_ann_file = f"{self.eval_dir}/I_a_ann_{fact_str}_np{nprobe}.npy"
        I_a_ann = np.load(I_a_ann_file)
        print(knn_intersection_measure(I_a_gt, I_a_ann))

0.0852875
0.11111875
0.13233125
0.1513625
0.16830625
0.180325
0.18860625
0.195225
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.12641875
0.1753875
0.22060625
0.2653625
0.30508125
0.33740625
0.36406875
0.38504375
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.15670625
0.22640625
0.29500625
0.36715625
0.4357625
0.49804375
0.55141875
0.5951875
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.16715625
0.2432
0.3222125
0.4070625
0.49170625
0.57290625
0.6455125
0.7089375


In [37]:
 import faiss
    
def index_stats(index_file):
    """
    Print inverted-list size statistics for the IVF index stored in a file.

    The index is read with ``faiss.IO_FLAG_ONDISK_SAME_DIR``, its IVF component
    is extracted and downcast, and the size of every inverted list is
    collected. Prints the largest list size, the mean list size, the id of the
    largest list, and finally the inverted lists' own ``print_stats()`` report.

    Args:
        index_file: path of the faiss index file to inspect.
    """
    loaded = faiss.read_index(index_file, faiss.IO_FLAG_ONDISK_SAME_DIR)
    ivf = faiss.downcast_index(faiss.extract_index_ivf(loaded))
    invlists = ivf.invlists
    sizes = np.array([invlists.list_size(list_no) for list_no in range(invlists.nlist)])
    print("max list size:", sizes.max())
    print("mean list size", sizes.mean())
    print("argmax:", sizes.argmax())
    invlists.print_stats()
#index stats



In [38]:
import os
# Print inverted-list statistics for every index file that exists on disk.
root = "/checkpoint/marialomeli/offline_faiss/seamless/sonar"
indexes = [
    "IVF32768_PQ128",
    "IVF32768_PQ512",
    "OPQ128_IVF32768_PQ128",
    "OPQ64_IVF32768_PQ64",
    "IVF32768_PQ256",
    "IVF32768_PQ64",
    "OPQ512_IVF32768_PQ512",
]

for index_str in indexes:
    index_f = root + f"/test_eng_0/{index_str}.faissindex"
    if not os.path.exists(index_f):
        print(f"skipping non-existent index file {index_f}")
        continue
    index_stats(index_f)


max list size: 596741
mean list size 1525.3014221191406
argmax: 29030
max list size: 596741
mean list size 1525.3014221191406
argmax: 29030
max list size: 671081
mean list size 1525.3014221191406
argmax: 9018
max list size: 453696
mean list size 1525.3014221191406
argmax: 11336
0/OPQ512_IVF32768_PQ512.faissindex.ivfdata
list size in < 1: 1199 instances
list size in < 2: 777 instances
list size in < 4: 1091 instances
list size in < 8: 1386 instances
list size in < 16: 1811 instances
list size in < 32: 2215 instances
list size in < 64: 2736 instances
list size in < 128: 3329 instances
list size in < 256: 3695 instances
list size in < 512: 3723 instances
list size in < 1024: 3424 instances
list size in < 2048: 2759 instances
list size in < 4096: 2065 instances
list size in < 8192: 1345 instances
list size in < 16384: 677 instances
list size in < 32768: 358 instances
list size in < 65536: 123 instances
list size in < 131072: 43 instances
list size in < 262144: 10 instances
list size in < 5

In [6]:
import os

# Check whether the index file path currently held in ``index_f`` exists.
# NOTE: ``index_f`` is not defined in this cell; it must already be set in the
# kernel (e.g. by the loop over ``indexes``), so this fails on a fresh kernel.
os.path.exists(index_f)

True