In [4]:
import numpy as np
import os


def read_query_embeddings(fp, d, dt):
    """Memory-map a headerless binary file of embeddings as a 2-D array.

    Parameters
    ----------
    fp : str or path-like
        Path to the raw embeddings file (no header, values stored back to back).
    d : int
        Number of components per embedding (columns of the result).
    dt : numpy.dtype or anything accepted by ``numpy.dtype``
        Element type stored in the file, e.g. ``np.dtype(np.float16)``,
        ``np.float16`` or ``"float16"``.

    Returns
    -------
    numpy.memmap
        Read-only array of shape ``(nb, d)``, where ``nb`` is the number of
        vectors implied by the file size.

    Raises
    ------
    AssertionError
        If the file size is not an exact multiple of ``d * dt.itemsize``.
    """
    # Accept scalar types and dtype strings as well as dtype instances;
    # np.dtype() is a no-op on an existing dtype.
    dt = np.dtype(dt)
    fl = os.path.getsize(fp)
    row_bytes = d * dt.itemsize
    nb = fl // row_bytes
    # The file has no header, so its size must be a whole number of rows.
    assert fl == row_bytes * nb, (
        f"{fp}: size {fl} is not a multiple of {row_bytes} bytes per vector"
    )
    return np.memmap(fp, shape=(nb, d), dtype=dt, mode="r")

In [None]:
# for exact search:
# Computes the exhaustive (ground-truth) neighbours of the first NUM_QUERIES
# query vectors against the database described in the config file, using
# inner product as the similarity.
from faiss.contrib.exhaustive_search import knn_ground_truth
import sys
import faiss
import numpy as np

# Make the offline_ivf helpers importable; must run before `from utils import ...`.
sys.path.append("/data/home/marialomeli/faiss_improvements/offline_ivf/")
from utils import iterate_input, load_config

CONFIG_PATH: str = "/data/home/marialomeli/faiss_improvements/offline_ivf/config_seamless.yaml"
NUM_QUERIES: int = 10

# The query file is read as raw float16 values with d = 1024 components per vector.
queries_file = "/fsx-nllb-big/schwenk/mini-mine5/embed.22h1/mm5_p5.encf.000.eng3"
d = 1024
dt = np.dtype(np.float16)
all_queries = read_query_embeddings(queries_file, d, dt)
# Take the first NUM_QUERIES rows and copy them to float32 before handing them to faiss.
query_vectors = all_queries[
    0:NUM_QUERIES,
].astype(np.float32)
# Normalises query_vectors in place (the return value is not used).
faiss.normalize_L2(query_vectors)
config = load_config(CONFIG_PATH)
config_for_database = config["datasets"]["test_eng_0"]
# Batch size passed to iterate_input for streaming the database embeddings.
embeddings_bs = 50000
# NOTE(review): the final `True` is presumably a "normalise" flag for the
# database vectors -- confirm against utils.iterate_input.
db_iterator = iterate_input(config_for_database, embeddings_bs, d, True)
# D / I: ground-truth similarities and database ids of the config["k"] nearest neighbours.
D, I = knn_ground_truth(query_vectors, db_iterator, config["k"], metric_type=faiss.METRIC_INNER_PRODUCT)

processing: mm5_p5.encf.000.eng0...


In [None]:
# for approximate search:
# Queries a pre-built index with the same query vectors used for the exact
# search and stores the neighbour ids on disk for the next cell.
indexdatapath = "/fsx-nllb-big/schwenk/mini-mine5/index.22h1/mm5_p5.OPQ64,IVF262144,PQ64.eng0.data.idx"
# IO_FLAG_MMAP: memory-map the index file instead of loading it fully into RAM.
index = faiss.read_index(indexdatapath, faiss.IO_FLAG_MMAP)
# nprobe lives on the IVF sub-index, so reach through any wrapping index to set it.
faiss.extract_index_ivf(index).nprobe = 128
_, I_approx = index.search(query_vectors, config["k"])
# np.save needs the target file as its first argument; "I_approx.npy" is the
# path the NeighbourVectorFetcher cell reads the indices from.
np.save("I_approx.npy", I_approx)

In [None]:
# Build a fetcher from the approximate-search neighbour ids saved on disk and
# compute the distances for those neighbours.
# NOTE(review): NeighbourVectorFetcher is not imported in any visible cell, so
# this cell relies on an import made elsewhere -- add the import before running
# on a fresh kernel.
nvf = NeighbourVectorFetcher(
    queries_neighbour_indices_path="I_approx.npy",
    config=CONFIG_PATH,
    xb="test_eng_0",
    num_queries=NUM_QUERIES,
)
D_neigh = nvf.compute_neighbours_distances()

**Half-margin metric computations**

In [1]:
import numpy as np

# 1000 query vectors from eng3, index built with eng0

# Load the precomputed neighbour-distance arrays used by the half-margin cells:
#   D_gt  -- loaded from the ground-truth ("I_16") file
#   D_ann -- loaded from the approximate-search file (k = 16, nprobe = 128 per its name)
# NOTE(review): both are assumed to have one row per query and one column per
# neighbour -- confirm against the code that wrote these files.
root = "/checkpoint/marialomeli/offline_faiss/seamless/groundtruth_eng_0/"
D_gt = np.load(root + "D_neighbours_xb_eng_0_with_xq_eng_3I_16.npy")
D_ann = np.load(root + "D_neighbours_xb_eng_0_with_xq_eng_3_k_16_nprobe_128.npy")

In [4]:
# Alex
# Half-margin score per query: one ground-truth distance divided by the mean of
# that query's approximate-neighbour distances. Results are keyed "pair_<row>".
half_margin_metric = {}
num_queries = D_gt.shape[0]

for row in range(num_queries):
    # Denominator: mean over all approximate neighbours of this query.
    average_distance_approx_nn = np.mean(D_ann[row, :])
    # NOTE(review): `neigh` is not defined in any visible cell, so this line
    # depends on leftover kernel state and raises NameError on a fresh kernel.
    # Define the intended neighbour column explicitly before this loop.
    half_margin_metric["pair_" + str(row)] = D_gt[row, neigh] / average_distance_approx_nn
half_margin_metric

{'pair_0': 0.9917174113537912,
 'pair_1': 0.9912894094302099,
 'pair_2': 1.004360972763744,
 'pair_3': 1.0136605424967122,
 'pair_4': 0.9919090506680333,
 'pair_5': 1.0129877870999677,
 'pair_6': 1.0004452911933766,
 'pair_7': 0.9992348014005987,
 'pair_8': 1.0093907423558148,
 'pair_9': 0.9932339406331132,
 'pair_10': 0.9910132369763556,
 'pair_11': 1.0101614520101454,
 'pair_12': 1.0001612046590025,
 'pair_13': 0.9882463028297722,
 'pair_14': 1.0188616768939058,
 'pair_15': 1.0077730107652874,
 'pair_16': 1.01810421566373,
 'pair_17': 0.9910488868644602,
 'pair_18': 1.0085490801937342,
 'pair_19': 0.9814326894829613,
 'pair_20': 0.9980957918878929,
 'pair_21': 1.0056919865437621,
 'pair_22': 0.9891175448377969,
 'pair_23': 1.0029387345131933,
 'pair_24': 0.9840871967797766,
 'pair_25': 1.0053534392088221,
 'pair_26': 0.9924421402655006,
 'pair_27': 0.9873139118800038,
 'pair_28': 0.9693456431156411,
 'pair_29': 0.997738719214537,
 'pair_30': 0.9901280524286731,
 'pair_31': 1.00438233

In [5]:
# Display the (query, nearest-neighbour) pairs ordered by ascending half-margin
# score, so the lowest-scoring pairs come first. A threshold is meant to be
# applied to this ordering; none is applied in this cell.
dict(sorted(half_margin_metric.items(), key=lambda pair_and_score: pair_and_score[1]))

{'pair_320': 0.8918414521648356,
 'pair_728': 0.8996673833902615,
 'pair_276': 0.9000503773635906,
 'pair_623': 0.9105503575834537,
 'pair_328': 0.9116078397922601,
 'pair_636': 0.913732918698074,
 'pair_301': 0.9296217851831574,
 'pair_813': 0.9304954557957722,
 'pair_602': 0.9336913146557931,
 'pair_652': 0.9356364256642621,
 'pair_389': 0.9361855091992165,
 'pair_331': 0.9364191181880617,
 'pair_393': 0.9421322881219596,
 'pair_585': 0.9513723661012671,
 'pair_525': 0.9516644743904236,
 'pair_308': 0.9546982346007211,
 'pair_283': 0.9561036719985428,
 'pair_683': 0.9563742438291482,
 'pair_739': 0.9566594941854633,
 'pair_478': 0.9567863798583984,
 'pair_209': 0.9583264971064824,
 'pair_902': 0.958764851599518,
 'pair_859': 0.9588899485392361,
 'pair_342': 0.9607265313668973,
 'pair_593': 0.9627252495205321,
 'pair_362': 0.9628976775486018,
 'pair_163': 0.9639193847119234,
 'pair_423': 0.9646353132975748,
 'pair_851': 0.9650238848765917,
 'pair_279': 0.965373206748879,
 'pair_611': 

1. Run ground truth - exhaustive search and true distances - and compute the margin criterion (there is a margin threshold for the ground truth, which gives us the maximum values we can achieve). We can use IVFFlat with nprobe < nlist (~1,2%) for bigger datasets.

2. Run approximate search and use approx distances - and compute the margin criteria (there is a margin threshold for approx search)
3. Get the pairs from 2 and re-evaluate them with the margin score using their true distances (even for the numerator, where you pick the pseudo-first nearest neighbour).

4. We can penalise with respect to the ground-truth threshold by subtracting it from (3). If it adds something that is well below the threshold, then we penalise it more.

Note: we want to compute it backward and forward.

In [None]:
# Language pair: 1 file of eng0 and 1 file of French