In [83]:
from io_utils import read_fbin, read_ibin
import faiss
print(faiss.__version__)
import numpy as np
# Locations of the dataset files used by this notebook. All three live in the
# same directory, so the shared prefix is spelled once.
DATA_DIR = "/root/code/vectordbindexing/Text2Image"
file_path = f"{DATA_DIR}/base.1M.fbin"
query_path = f"{DATA_DIR}/query.public.100K.fbin"
ground_truth_path = f"{DATA_DIR}/groundtruth.public.100K.ibin"

1.11.0


In [94]:
# Load the base (image) vectors and report their type, rank, shape, dtype and size.
print("\n\nreading image vector: ---")
data_vector = read_fbin(file_path)
print(type(data_vector))
print(data_vector.ndim, data_vector.shape, data_vector.dtype, data_vector.size)
# print(data_vector[:1])  # uncomment to eyeball the first row

# Row boundaries of the consecutive slices taken from data_vector.
# Rows [0, 500000) form the training slice. The following slices are insertion
# batches, named after the cumulative share inserted relative to those 500000
# rows: five batches of 5000 rows (1% .. 5%), then one batch of 25000 rows
# that brings the inserted total to 50000 rows (10%).
batch_bounds = [0, 500000, 505000, 510000, 515000, 520000, 525000, 550000]
(
    train_data_vector,
    insert_1_percent,
    insert_2_percent,
    insert_3_percent,
    insert_4_percent,
    insert_5_percent,
    insert_10_percent,
) = (data_vector[lo:hi] for lo, hi in zip(batch_bounds, batch_bounds[1:]))

# Load the query vectors and report the same properties.
print("\n\nreading querys: ---")
query_vector = read_fbin(query_path)
print(type(query_vector))
print(query_vector.ndim, query_vector.shape, query_vector.dtype, query_vector.size)
# print(query_vector[0])  # uncomment to eyeball the first query



reading image vector: ---
<class 'numpy.ndarray'>
2 (1000000, 200) float32 200000000


reading querys: ---
<class 'numpy.ndarray'>
2 (100000, 200) float32 20000000


In [110]:
import time
import hnsw_cosine
import simple_sim_hash
import importlib
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
# Reload hnsw_cosine so that re-running this cell uses the module's current source.
importlib.reload(hnsw_cosine)

index = hnsw_cosine.HNSWIndex(M=32, ef_construction=128, ef_search=64, random_seed=1)
simHash = simple_sim_hash.SimpleSimHash(dim=200)

# Ids of the vectors that have been added to the index so far.
IMAGE_IDX_SET = set()

# train_data_vector has shape [N, 200] (prototype on the 1M subset or a smaller
# slice first). Possible follow-ups: a tqdm progress bar, batched flushing.
# NOTE(review): add_item_fast10k is not given an explicit id, so recording the
# row number here assumes the index numbers items 0..N-1 in insertion order;
# confirm against hnsw_cosine.
for img_id in range(len(train_data_vector)):
    index.add_item_fast10k(train_data_vector[img_id], lsh=simHash, limit=500)
    IMAGE_IDX_SET.add(img_id)

In [105]:
# Read the saved faiss search results and split the queries, keyed by integer
# query index, into a train set and a test set.
import json

train_query_list = {}
test_query_list = {}

# Disabled alternative: take the neighbour lists from the ground-truth file instead.
# ground_truth = read_ibin(ground_truth_path)
# print(type(ground_truth))
# print(ground_truth.ndim, ground_truth.shape, ground_truth.dtype, ground_truth.size)
# for query_idx in range(ground_truth.shape[0]):
#     if int(query_idx) % 6 != 0:
#         train_query_list[query_idx] = ground_truth[query_idx]
#     else:
#         test_query_list[query_idx] = ground_truth[query_idx]

with open("./TempResults/search_results_100K.json", "r", encoding="utf-8") as f:
    data = json.load(f)

for raw_query_idx, vec_list in data.items():
    query_id = int(raw_query_idx)
    # Every stored id has the query's own index subtracted from it.
    # NOTE(review): presumably this undoes an offset applied when the JSON was
    # written; confirm against the code that produced the file.
    shifted_ids = [stored_id - query_id for stored_id in vec_list]
    # Every sixth query (index divisible by 6) goes to the test set, the rest to train.
    destination = test_query_list if query_id % 6 == 0 else train_query_list
    destination[query_id] = shifted_ids

print(f"num of train: {len(train_query_list)}")
print(f"num of test: {len(test_query_list)}")

num of train: 83333
num of test: 16667


In [97]:
# Reset added links: cut every neighbour list back to its first 32 entries.
# Only layers 0..99 and node ids 0..499999 are visited; layers or ids missing
# from index.neighbours are skipped.
for layer in range(100):
    if layer in index.neighbours:
        layer_links = index.neighbours[layer]
        for idx in range(500000):
            if idx in layer_links:
                layer_links[idx] = layer_links[idx][:32]

In [111]:
# OOD search steps: for every test query, take its first 10 listed targets,
# keep those whose id is in IMAGE_IDX_SET, and record the length of the search
# trace returned for each (query, target) pair.
NUM_STEPS = [
    len(index.search_steps_to_target(query_vector[qid], target_id, k=10, ef=64)["trace"])
    for qid, target_list in test_query_list.items()
    for target_id in target_list[:10]
    if target_id in IMAGE_IDX_SET
]

# Sorted float64 copy of the step counts.
arr = np.sort(np.array(NUM_STEPS, dtype=np.float64))

mean_steps = arr.mean()
p99_steps, p995_steps, p999_steps = np.percentile(arr, [99, 99.5, 99.9])

for label, value in (
    ("mean", mean_steps),
    ("p99", p99_steps),
    ("p995", p995_steps),
    ("p999", p999_steps),
):
    print(f"{label} steps: {value}")

mean steps: 644.5876808548959
p99 steps: 976.0
p995 steps: 1020.0
p999 steps: 1098.0


In [112]:
# Add links to the graph, derived from the per-query target lists, and print
# the statistics the index reports back.
# NOTE(review): the links are built from test_query_list, the same set the
# step measurements are evaluated on, while train_query_list is never used in
# this notebook; confirm this is intended.
augment_options = {
    "strategy": "clique",
    "layer": 0,
    "max_new_edges_per_node": 1,
}
stats = index.augment_from_query_topk(test_query_list, **augment_options)
print(stats)


{'pairs_considered': 20604027, 'pairs_added': 200577, 'skipped_missing': 833773, 'skipped_existing': 1, 'pruned_by_cap': 20403449, 'skipped_occluded': 0}


In [113]:
# OOD search steps after adding links: same measurement as before the
# augmentation. For each test query, the first 10 listed targets that are in
# IMAGE_IDX_SET are searched for, and the trace length of each search is recorded.
NUM_STEPS = []
for qid, target_list in test_query_list.items():
    reachable_targets = [t for t in target_list[:10] if t in IMAGE_IDX_SET]
    for target_id in reachable_targets:
        trace = index.search_steps_to_target(query_vector[qid], target_id, k=10, ef=64)["trace"]
        NUM_STEPS.append(len(trace))

# Sorted float64 copy of the step counts.
arr = np.sort(np.array(NUM_STEPS, dtype=np.float64))

mean_steps = arr.mean()
p99_steps, p995_steps, p999_steps = (np.percentile(arr, pct) for pct in (99, 99.5, 99.9))

print(
    f"mean steps: {mean_steps}",
    f"p99 steps: {p99_steps}",
    f"p995 steps: {p995_steps}",
    f"p999 steps: {p999_steps}",
    sep="\n",
)

mean steps: 637.8591703187849
p99 steps: 1190.0
p995 steps: 1258.0
p999 steps: 1426.0


In [114]:
# Insert extra data in stages; after each stage, search the test set again and
# report the number of steps the searches need.
insert_data_vectors = {
    "insert_1%": insert_1_percent,
    "insert_2%": insert_2_percent,
    "insert_3%": insert_3_percent,
    "insert_4%": insert_4_percent,
    "insert_5%": insert_5_percent,
    "insert_10%": insert_10_percent,
}
img_id = 500000  # next id to assign to an inserted vector
for name, insert_vectors in insert_data_vectors.items():
    # Insert this stage's new nodes (a tqdm bar / batched flush could be added).
    for vec in insert_vectors:
        index.add_item(vec, id=img_id)
        # Register the id that was just inserted BEFORE advancing the counter.
        # Incrementing first would record id+1, so the first inserted id would
        # never be marked as present while an id that was never inserted
        # would be.
        IMAGE_IDX_SET.add(img_id)
        img_id += 1

    # For every test query, take its first 10 listed targets, keep those whose
    # id is in IMAGE_IDX_SET, and record the search trace length for each.
    NUM_STEPS = []
    for qid, target_list in test_query_list.items():
        q = query_vector[qid]
        for target_id in target_list[:10]:
            if target_id not in IMAGE_IDX_SET:
                continue
            out = index.search_steps_to_target(q, target_id, k=10, ef=64)
            NUM_STEPS.append(len(out["trace"]))

    arr = np.array(NUM_STEPS, dtype=np.float64)
    arr.sort()

    mean_steps = arr.mean()
    p99_steps = np.percentile(arr, 99)
    p995_steps = np.percentile(arr, 99.5)
    p999_steps = np.percentile(arr, 99.9)
    # Label the block so the six stages can be told apart in the output.
    print(f"--- {name} ---")
    print(f"mean steps: {mean_steps}")
    print(f"p99 steps: {p99_steps}")
    print(f"p995 steps: {p995_steps}")
    print(f"p999 steps: {p999_steps}")