In [5]:
from io_utils import read_fbin, read_ibin
import faiss
print(faiss.__version__)
import numpy as np
# Locations of the Text2Image base vectors, the public query set and its
# ground-truth neighbour ids. All three files live in the same directory.
file_path, query_path, ground_truth_path = (
    "/root/code/vectordbindexing/Text2Image/" + fname
    for fname in (
        "base.1M.fbin",
        "query.public.100K.fbin",
        "groundtruth.public.100K.ibin",
    )
)

1.11.0


In [82]:
# Load the base vectors and print their container type and layout.
print("\n\nreading image vector: ---")
data_vector = read_fbin(file_path)
print(type(data_vector))
print(data_vector.ndim, data_vector.shape, data_vector.dtype, data_vector.size)

# Rows [0, 500000) form the initial split. Rows [500000, 550000) are cut into
# consecutive insertion batches: five batches of 5000 rows each, followed by
# one batch of 25000 rows.
train_data_vector = data_vector[:500000]
(
    insert_1_percent,
    insert_2_percent,
    insert_3_percent,
    insert_4_percent,
    insert_5_percent,
    insert_10_percent,
) = (
    data_vector[start:stop]
    for start, stop in (
        (500000, 505000),
        (505000, 510000),
        (510000, 515000),
        (515000, 520000),
        (520000, 525000),
        (525000, 550000),
    )
)

# Load the query vectors and print the same layout summary.
print("\n\nreading querys: ---")
query_vector = read_fbin(query_path)
print(type(query_vector))
print(query_vector.ndim, query_vector.shape, query_vector.dtype, query_vector.size)



reading image vector: ---
<class 'numpy.ndarray'>
2 (1000000, 200) float32 200000000


reading querys: ---
<class 'numpy.ndarray'>
2 (100000, 200) float32 20000000


In [83]:
import time
import hnsw_cosine
import simple_sim_hash
import importlib
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
importlib.reload(hnsw_cosine)

# Bookkeeping shared by the insertion and evaluation cells.
MAX_RETRIES = 3          # attempts insert_one() makes before giving up
IMAGE_IDX_SET = set()    # dataset row numbers that have been inserted so far

# Graph index and the hash helper passed to add_item_fast10k(). The index is
# built before the hash object, in the same order as before.
index = hnsw_cosine.HNSWIndex(
    M=32,
    ef_construction=128,
    ef_search=64,
    random_seed=1,
)
simHash = simple_sim_hash.SimpleSimHash(dim=200)

def insert_one(args):
    """Insert one vector into the global ``index``, retrying on failure.

    Args:
        args: ``(img_id, vec)`` pair. ``vec`` is passed to
            ``index.add_item_fast10k``; ``img_id`` is not given to the index
            and is only echoed back so the caller can record it.

    Returns:
        ``img_id`` once an insertion call has returned without raising.

    Raises:
        Exception: whatever the final attempt raised, re-raised unchanged.
    """
    img_id, vec = args
    # Always make at least one attempt: with a non-positive MAX_RETRIES the
    # loop would otherwise be skipped and success reported without inserting.
    attempts = max(1, MAX_RETRIES)
    for attempt in range(1, attempts + 1):
        try:
            index.add_item_fast10k(vec, lsh=simHash, limit=100)
        except Exception:
            if attempt == attempts:
                # Retry budget exhausted: surface the last error to the caller.
                raise
            time.sleep(1)  # wait one second before trying again
        else:
            return img_id

# Build the initial index from the training split, one row at a time, and
# remember each inserted row number. (Prototype on a subset first; tqdm or
# batched flushing could be added here.)
for img_id in range(len(train_data_vector)):
    vec = train_data_vector[img_id]
    index.add_item_fast10k(vec, lsh=simHash, limit=100)
    IMAGE_IDX_SET.add(img_id)

# Disabled alternative: the same insertion driven through insert_one() on a
# thread pool.
# with ThreadPoolExecutor(max_workers=4) as executor:
#     futures = [executor.submit(insert_one, (img_id, vec))
#                for img_id, vec in enumerate(train_data_vector)]
#     for future in tqdm(as_completed(futures), total=len(futures)):
#         try:
#             img_id = future.result()
#             IMAGE_IDX_SET.add(img_id)
#         except Exception as e:
#             print(f"[ERROR] Failed after retries: {e}")


In [93]:
# Load the saved Faiss search results (query id -> list of result ids) and
# split them: ids divisible by 6 go to the test set, all others to training.
import json

train_query_list = {}
test_query_list = {}
with open("./search_results_1M_100.json", "r", encoding="utf-8") as f:
    data = json.load(f)

for query_idx, vec_list in data.items():
    parsed_idx = int(query_idx)  # JSON object keys arrive as strings
    bucket = test_query_list if parsed_idx % 6 == 0 else train_query_list
    bucket[parsed_idx] = vec_list

print(f"num of train: {len(train_query_list)}")
print(f"num of test: {len(test_query_list)}")



num of train: 10000
num of test: 2000


In [113]:
# Reset added links by cutting every neighbour list back to its first 32
# entries, for node ids 0..499999 on each layer 0..99 that exists.
#
# Disabled alternative that goes through the index API instead:
# stats = index.augment_from_query_topk(
#     train_query_list,
#     strategy="projection",
#     layer=0,
#     max_new_edges_per_node=0,
#     reset_ = True
# )
# print(stats)

for level in range(100):
    if level in index.neighbours:
        adjacency = index.neighbours[level]
        for node in range(500000):
            if node in adjacency:
                adjacency[node] = adjacency[node][:32]

In [114]:
# OOD search steps: for each test query, look at its first 10 result ids,
# keep those already inserted, and record the length of the search trace.
# NOTE(review): the target is passed as target_id + 1; presumably index ids
# are offset by one from dataset rows. Confirm against hnsw_cosine.
NUM_STEPS = []
for qid, target_list in test_query_list.items():
    q = query_vector[qid]
    NUM_STEPS.extend(
        len(index.search_steps_to_target(q, target_id + 1, k=10, ef=64)["trace"])
        for target_id in target_list[:10]
        if target_id in IMAGE_IDX_SET
    )

arr = np.sort(np.array(NUM_STEPS, dtype=np.float64))

# Mean and tail percentiles of the recorded step counts.
mean_steps = arr.mean()
p99_steps, p995_steps, p999_steps = np.percentile(arr, [99, 99.5, 99.9])
print(f"mean steps: {mean_steps}")
print(f"p99 steps: {p99_steps}")
print(f"p995 steps: {p995_steps}")
print(f"p999 steps: {p999_steps}")

mean steps: 418.1821557607387
p99 steps: 798.9600000000064
p995 steps: 861.0
p999 steps: 900.0


In [119]:
# Add links to layer 0 of the graph from the query result lists, using the
# "clique" strategy with at most one new edge per node, then show the counters
# returned by the index.
# NOTE(review): the links are derived from test_query_list, the same set used
# for the step measurements, and train_query_list is not used. Confirm this
# is the intended experiment.
stats = index.augment_from_query_topk(
    test_query_list, strategy="clique", layer=0, max_new_edges_per_node=1
)
print(stats)


{'pairs_considered': 2519132, 'pairs_added': 39917, 'skipped_missing': 99122, 'skipped_existing': 598, 'pruned_by_cap': 2478617, 'skipped_occluded': 0}


In [120]:
# OOD search steps after adding links: same measurement as before the
# augmentation, over the first 10 result ids of every test query that are
# already inserted.
NUM_STEPS = []
for qid, target_list in test_query_list.items():
    q = query_vector[qid]
    NUM_STEPS.extend(
        len(index.search_steps_to_target(q, target_id + 1, k=10, ef=64)["trace"])
        for target_id in target_list[:10]
        if target_id in IMAGE_IDX_SET
    )

arr = np.sort(np.array(NUM_STEPS, dtype=np.float64))

# Mean and tail percentiles of the recorded step counts.
mean_steps = arr.mean()
p99_steps, p995_steps, p999_steps = np.percentile(arr, [99, 99.5, 99.9])
print(f"mean steps: {mean_steps}")
print(f"p99 steps: {p99_steps}")
print(f"p995 steps: {p995_steps}")
print(f"p999 steps: {p999_steps}")

mean steps: 537.1999801311346
p99 steps: 945.0
p995 steps: 987.0
p999 steps: 1210.0


In [118]:
# Insert the extra data in stages. After each stage, search the test query set
# again and report how many steps the searches need.
insert_data_vectors = {
    "insert_1%": insert_1_percent,
    "insert_2%": insert_2_percent,
    "insert_3%": insert_3_percent,
    "insert_4%": insert_4_percent,
    "insert_5%": insert_5_percent,
    "insert_10%": insert_10_percent,
}
img_id = 500000  # dataset row of the first vector inserted by this cell
for name, insert_vectors in insert_data_vectors.items():
    # Label the stage so the four statistics below can be told apart.
    print(f"--- {name} ---")

    # Insert the new nodes.
    for vec in insert_vectors:
        # NOTE(review): searches below use target_id + 1 as the index id while
        # this call registers the vector under id=img_id (the dataset row).
        # Confirm the id convention against hnsw_cosine.
        index.add_item(vec, id=img_id)
        # Record the row that was just inserted *before* advancing the counter;
        # adding after the increment marked row+1 as present instead.
        IMAGE_IDX_SET.add(img_id)
        img_id += 1

    # Re-measure: first 10 result ids per test query, inserted rows only.
    NUM_STEPS = []
    for qid, target_list in test_query_list.items():
        q = query_vector[qid]
        for target_id in target_list[:10]:
            if target_id not in IMAGE_IDX_SET:
                continue
            out = index.search_steps_to_target(q, target_id+1, k=10, ef=64)
            NUM_STEPS.append(len(out["trace"]))

    arr = np.array(NUM_STEPS, dtype=np.float64)
    arr.sort()

    # Mean and tail percentiles of the recorded step counts for this stage.
    mean_steps = arr.mean()
    p99_steps = np.percentile(arr, 99)
    p995_steps = np.percentile(arr, 99.5)
    p999_steps = np.percentile(arr, 99.9)
    print(f"mean steps: {mean_steps}")
    print(f"p99 steps: {p99_steps}")
    print(f"p995 steps: {p995_steps}")
    print(f"p999 steps: {p999_steps}")

KeyboardInterrupt: 