In [48]:
import numpy as np
import os


def read_query_embeddings(fp, d, dt):
    """Memory-map a headerless binary file of embeddings as a 2-D array.

    Args:
        fp: Path to the raw embeddings file (vectors stored back to back,
            no header).
        d: Dimension of each embedding vector.
        dt: Element dtype. Anything accepted by ``np.dtype`` works, e.g.
            ``np.dtype(np.float16)``, ``np.float16`` or ``"float16"``.

    Returns:
        A read-only ``np.memmap`` of shape ``(nb, d)``, where ``nb`` is the
        number of vectors implied by the file size.

    Raises:
        AssertionError: If the file size is not a whole number of vectors.
    """
    # Normalise dtype-likes so that ``itemsize`` is always a plain int.
    dt = np.dtype(dt)
    fl = os.path.getsize(fp)
    row_bytes = d * dt.itemsize
    nb = fl // row_bytes
    # The file must hold whole vectors only: no header, no trailing bytes.
    assert fl == row_bytes * nb, (
        f"{fp}: size {fl} is not a multiple of {row_bytes} bytes per vector"
    )
    return np.memmap(fp, shape=(nb, d), dtype=dt, mode="r")

In [4]:
import numpy as np
import magic
import os

# Inspect one shard of the eng0 data; shard files are numbered from 000 to 074.
# NOTE(review): text_file is assigned here but not used anywhere in this notebook.
text_file = "/fsx-nllb/schwenk/mini-mine5/sent-segm.22h1/mm5_p5.text.eng0.000.gz"
embeddings_file = "/fsx-nllb-big/schwenk/mini-mine5/embed.22h1/mm5_p5.encf.000.eng0"
print(magic.from_file(embeddings_file))  # type of file
fl = os.path.getsize(embeddings_file)
# NOTE(review): bare expression in the middle of the cell -- its value is
# neither stored nor displayed (only the last expression of a cell is shown).
fl / 1024
d = 1024  # embedding dimension
dt = np.dtype(np.float16)  # embeddings are read as fp16
# memory map the embeddings instead of loading them
emb = read_query_embeddings(embeddings_file, d, dt)
# (number of vectors in the shard, d)
emb.shape

data


(49981077, 1024)

In [None]:
# Index/search specification copied from /fsx-nllb-big/schwenk/mini-mine5/script/mm5-mine.sh
# NOTE(review): the search cells further down hard-code nprobe=128 and k=16
# instead of reading these variables -- keep them in sync by hand.
index_type = "OPQ64,IVFauto,PQ64"
nprobe = 128
knn = 16
margin_norm = "mean"
# margin_norm="last"
# d=1024
# fp16
# Cosine similarity: build the index with the inner-product (MIP) metric and
# normalise both the database and the query vectors with faiss.normalize_L2()
# before adding / searching them.

In [None]:
import magic
import faiss

# OPQ64,IVF262144,PQ64 for Eng, one train file (empty index) and 0-5 indices with data : mm5_p5.OPQ64,IVF262144,PQ64.eng0.data.idx, ... , mm5_p5.OPQ64,IVF262144,PQ64.eng5.data.idx
indexdatapath = "/fsx-nllb-big/schwenk/mini-mine5/index.22h1/mm5_p5.OPQ64,IVF262144,PQ64.eng0.data.idx"
indexpath = "/fsx-nllb-big/schwenk/mini-mine5/index.22h1/mm5_p5.OPQ64,IVF262144,PQ64.eng.train.idx"
# NOTE(review): the next three bare expressions are evaluated but never shown;
# a notebook cell only displays its final expression (index.ntotal below).
magic.from_file(indexdatapath)

# NOTE(review): `os` is not imported in this cell; it relies on an earlier cell.
os.path.getsize(indexpath)
os.path.getsize(indexdatapath)
# IO_FLAG_MMAP asks faiss to memory-map the index file instead of loading it fully.
index = faiss.read_index(indexdatapath, faiss.IO_FLAG_MMAP)
# number of vectors stored in the index
index.ntotal

In [1]:
# Files where NLLB built their indexes in https://github.com/fairinternal/nllb/blob/main/stopes/modules/bitext/indexing/populate_faiss_index.py#L289
import numpy as np

# Synthetic data for the toy example: two shards of uniform random float32
# vectors, written to zero-padded file names (my_data00.npy, my_data01.npy).
root = "/checkpoint/marialomeli/offline_faiss/notebook_data"
dimension = 1024  # size of each vector
n = 100000  # vectors per shard
for i in range(2):
    # One seed per shard, so every file can be regenerated independently.
    np.random.seed(i)
    db_vectors = np.random.random(size=(n, dimension)).astype(np.float32)
    filename = f"{root}/my_data{i:02}.npy"
    print(filename)
    np.save(filename, db_vectors)

In [7]:
from faiss.contrib.exhaustive_search import knn_ground_truth
import sys
import faiss
import numpy as np

# Make the project's offline_ivf helpers importable (iterate_input, load_config).
sys.path.append("/data/home/marialomeli/faiss_improvements/offline_ivf/")
from utils import iterate_input, load_config

# Exact (brute-force) nearest neighbours of a few real query embeddings against
# the database described by config["datasets"]["my_test_data"].
queries_file = "/fsx-nllb-big/schwenk/mini-mine5/embed.22h1/mm5_p5.encf.000.eng3"
d = 1024
dt = np.dtype(np.float16)
all_queries = read_query_embeddings(queries_file, d, dt)
# First 10 queries only; astype() copies them out of the read-only memmap as
# float32, which is what the in-place normalisation below operates on.
query_vectors = all_queries[
    0:10,
].astype(np.float32)
faiss.normalize_L2(query_vectors)
config = load_config("/data/home/marialomeli/faiss_improvements/offline_ivf/config_notebook.yaml")
config_for_dataset = config["datasets"]["my_test_data"]
embeddings_bs = 50000
# NOTE(review): the meaning of the trailing `True` is defined in utils.iterate_input
# (not visible here); embeddings_bs is presumably the batch size -- confirm.
db_iterator = iterate_input(config_for_dataset, embeddings_bs, d, True)
# D: distances / similarities, I: neighbour ids, config["k"] results per query.
D, I = knn_ground_truth(query_vectors, db_iterator, config["k"], metric_type=faiss.METRIC_INNER_PRODUCT)

I

processing: my_data00.npy...
processing: my_data01.npy...


array([[153533, 137037,  76294, 168146,  49402],
       [162864,  73532, 139046,  80387,  39155],
       [147019,  82451, 122784, 111552, 182277],
       [122606,   3653,  90046,  21803, 192614],
       [ 94371,  13212,  51721,  44944,  89895],
       [193542,  40921, 121298, 126615, 121670],
       [183514,  40835, 116571,  48067,  41249],
       [123010, 193542,  74027,  46896,  92839],
       [157735,  99249,  12692,  77095,   6906],
       [143567,  44240,  57692, 183495,  28970]])

In [9]:
# Toy example: build an approximate index over the toy database and search it
# with the same queries that were used for the exact ground truth.
from faiss.contrib.evaluation import knn_intersection_measure
import sys

sys.path.append("/data/home/marialomeli/faiss_improvements/offline_ivf/")
from utils import compute_recalls_at

idx_str = config["index"]
d = config["d"]
# NOTE(review): index_factory defaults to the L2 metric, while the ground truth
# above was computed with METRIC_INNER_PRODUCT -- confirm the two are comparable
# (they are only rank-equivalent when the database vectors are L2-normalised).
index = faiss.index_factory(d, idx_str)
index.nprobe = config["nprobe"]
config_for_dataset = config["datasets"]["my_test_data"]
embeddings_bs = 50000
db_iterator = iterate_input(config_for_dataset, embeddings_bs, d, True)
# Collect every batch so the data can be used for both training and adding.
all_data = []
for xbi in db_iterator:
    # Fixed: the original called the list (`all_data(xbi)`), which raises TypeError.
    all_data.append(xbi)
# Train on the first batch only, then add all batches to the index.
index.train(all_data[0])
for data in all_data:
    index.add(data)
# Fixed: faiss search returns (distances, labels) in that order; the original
# unpacked them as `I, D`, leaving the neighbour ids in `D`.
# Note that this overwrites the ground-truth `D`/`I` from the previous cell.
D, I = index.search(query_vectors, config["k"])
# knn_intersection_measure(I, I1)
# compute_recalls_at(I1, I, 1, 5)

False

In [1]:
# compute intersection measure for the ground truth
import numpy as np
from faiss.contrib.evaluation import knn_intersection_measure

# Precomputed neighbour-id matrix, one row per query; the row displayed below
# has 16 entries. NOTE(review): presumably the exact k=16 neighbours for the
# eng0 database (going by the path) -- confirm which query file produced it.
res = "/checkpoint/marialomeli/offline_faiss/seamless/groundtruth_eng_0/I_16.npy"
I_gt = np.load(res)
# ground-truth ids of the first query
I_gt[0, :]

array([1758956109, 2056088817,   62256939, 2578129083, 3651003285,
       2700622931, 3385624923, 1587849508, 3695874821, 1908539966,
       3072515321, 3182456442, 3600190096, 1298444975,  629240733,
        275486473])

In [49]:
# load the NLLB ENG0 index

import faiss
import numpy as np
import sys

# Make the project's offline_ivf helpers importable (iterate_input, load_config).
sys.path.append("/data/home/marialomeli/faiss_improvements/offline_ivf/")
from utils import iterate_input, load_config

indexdatapath = "/fsx-nllb-big/schwenk/mini-mine5/index.22h1/mm5_p5.OPQ64,IVF262144,PQ64.eng0.data.idx"
# IO_FLAG_MMAP asks faiss to memory-map the index file instead of loading it fully.
index = faiss.read_index(indexdatapath, faiss.IO_FLAG_MMAP)
print(index.ntotal)  # number of indexed vectors
print(index.d)  # vector dimension expected by the index
cfg = load_config("/data/home/marialomeli/faiss_improvements/offline_ivf/config_seamless.yaml")
# Queries come from the eng3 embedding shard and are searched in the eng0 index.
queries_file = "/fsx-nllb-big/schwenk/mini-mine5/embed.22h1/mm5_p5.encf.000.eng3"
d = cfg["d"]
dt = np.dtype(np.float16)
all_queries = read_query_embeddings(queries_file, d, dt)
# First 1000 queries, copied out of the read-only memmap as float32 so that
# they can be L2-normalised in place.
query_vectors = all_queries[0:1000].astype(np.float32)
faiss.normalize_L2(query_vectors)

3748056364
1024


In [50]:
def get_intersection_cardinality_frequencies(I, gt=None):
    """Histogram of per-query overlap sizes between results and ground truth.

    For every query row, counts how many distinct ids appear both in the
    search result and in the ground truth, then tallies how often each
    overlap size occurs.

    Args:
        I: 2-D array of neighbour ids returned by a search, one row per query.
        gt: 2-D array of ground-truth neighbour ids with at least as many rows
            as ``I``. Defaults to the notebook-level ``I_gt`` so that existing
            single-argument calls keep working.

    Returns:
        ``(values, counts)``: the sorted distinct overlap sizes and the number
        of queries that have each size.
    """
    if gt is None:
        # Backward-compatible fallback to the global ground truth.
        gt = I_gt
    nq = I.shape[0]
    # np.intersect1d returns the unique common ids, so duplicates count once.
    overlap_sizes = [
        len(np.intersect1d(I[ell, :], gt[ell, :])) for ell in range(nq)
    ]
    values, counts = np.unique(overlap_sizes, return_counts=True)
    return values, counts

In [51]:
import numpy as np
import faiss

# nprobe is set on the IVF index that extract_index_ivf pulls out of `index`,
# not on `index` itself.
index_ivf = faiss.extract_index_ivf(index)
index_ivf.nprobe = 128
k = 16  # neighbours per query; the displayed I_gt row also has 16 entries
D, I = index.search(query_vectors, k)
# knn_intersection_measure(I_gt, I)
# How many queries share 0, 1, 2, ... ids with the ground truth.
val, counts = get_intersection_cardinality_frequencies(I)
print(val, counts, type(val))

[ 0  1  2  3  4  5  6  7  8 10] [822 102  35  12  14   9   2   1   2   1] <class 'numpy.ndarray'>


In [61]:
# Persist the overlap histogram as {overlap size: number of queries}.
# NOTE(review): np.save wraps a dict in a pickled 0-d object array and writes
# "freqs.npy"; reading it back needs np.load("freqs.npy", allow_pickle=True).item().
np.save("freqs", dict(zip(val, counts)))

In [36]:
import numpy as np

# Baseline run: nprobe=128, the value from the search specification above.
# NOTE(review): knn_intersection_measure is imported in an earlier cell.
faiss.extract_index_ivf(index).nprobe = 128
k = 16
D, I = index.search(query_vectors, k)
# Overall intersection measure between ground-truth and returned ids.
print(knn_intersection_measure(I_gt, I))
# Per-query overlap histogram (displayed as the cell result).
get_intersection_cardinality_frequencies(I)

0.022125


(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 10]),
 array([822, 102,  35,  12,  14,   9,   2,   1,   2,   1]))

In [45]:
import numpy as np

# Same experiment with 10x the nprobe (1280 instead of 128) to see whether
# probing more lists improves the overlap with the ground truth.
faiss.extract_index_ivf(index).nprobe = 1280
k = 16
D, I = index.search(query_vectors, k)
# Overall intersection measure between ground-truth and returned ids.
print(knn_intersection_measure(I_gt, I))
# Per-query overlap histogram (displayed as the cell result).
get_intersection_cardinality_frequencies(I)

0.0230625


(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([814, 108,  36,  12,  13,   9,   4,   1,   2,   1]))

In [None]:
# Read nlist from the IVF index extracted out of `index`
# (expected to match the 262144 in the index file name).
index_ivf = faiss.extract_index_ivf(index)
index_ivf.nlist

In [37]:
# Average number of vectors per centroid: ntotal (3748056364, printed when the
# index was loaded) divided by the number of centroids (262144, from the
# IVF262144 part of the index name).
3748056364 / 262144

14297.70036315918

In [38]:
# `math` is not imported anywhere else in this notebook, so import it here to
# keep the cell runnable on a fresh kernel.
import math

# Square root of the number of indexed vectors (index.ntotal printed above).
math.sqrt(3748056364)

61221.371791229896

In [40]:
# 4 * floor(sqrt(ntotal)), using the square root computed in the previous cell.
# NOTE(review): presumably a rule-of-thumb centroid count to compare with the
# 262144 lists actually used -- confirm.
4 * 61221

244884

In [46]:
# nlist / nprobe: 262144 is the IVF list count from the index name and 128 the
# nprobe used in the baseline search, i.e. each query visits 1 in 2048 lists.
262144 / 128

2048.0

In [None]:
# knn intersection measure against I_gt, keyed by the nprobe used for the search.
# The 128 and 1280 values match the outputs printed above; the nprobe=3000 run
# is not shown in this notebook.
results = {
    128: 0.022125,
    1280: 0.0230625,
    3000: 0.023375,
}