In [1]:
# Project-local readers for the .fbin / .ibin dataset files.
# NOTE(review): read_ibin is only referenced from commented-out code in the
# visible part of this notebook.
from io_utils import read_fbin, read_ibin
import faiss
# Record the installed faiss version in the cell output.
print(faiss.__version__)
import numpy as np
# Dataset locations. These are absolute paths on the original machine, so they
# must be adjusted before running the notebook anywhere else.
# Base vectors that get indexed.
file_path = "/root/code/vectordbindexing/Text2Image/base.1M.fbin"
# Query vectors.
query_path = "/root/code/vectordbindexing/Text2Image/query.public.100K.fbin"
# Ground-truth neighbor ids (currently only used by commented-out code).
ground_truth_path = "/root/code/vectordbindexing/Text2Image/groundtruth.public.100K.ibin"

1.11.0


In [37]:
# Load the base vectors from disk and report their basic array properties.
print("\n\nreading image vector: ---")
data_vector = read_fbin(file_path)
print(type(data_vector))
print(data_vector.ndim, data_vector.shape, data_vector.dtype, data_vector.size)
# print(data_vector[:1])  # uncomment to eyeball the first row

# Row boundaries of the consecutive slices taken from data_vector.
# The first 500000 rows are the initial index content. The "percent" names are
# cumulative relative to those 500000 rows: five batches of 5000 rows (1% each)
# followed by one batch of 25000 rows, which brings the total inserted to
# 50000 rows (10%).
_SPLIT_BOUNDS = (0, 500000, 505000, 510000, 515000, 520000, 525000, 550000)
(
    train_data_vector,
    insert_1_percent,
    insert_2_percent,
    insert_3_percent,
    insert_4_percent,
    insert_5_percent,
    insert_10_percent,
) = (data_vector[lo:hi] for lo, hi in zip(_SPLIT_BOUNDS, _SPLIT_BOUNDS[1:]))

# Load the query vectors the same way.
print("\n\nreading querys: ---")
query_vector = read_fbin(query_path)
print(type(query_vector))
print(query_vector.ndim, query_vector.shape, query_vector.dtype, query_vector.size)
# print(query_vector[0])  # uncomment to eyeball the first query vector



reading image vector: ---
<class 'numpy.ndarray'>
2 (1000000, 200) float32 200000000


reading querys: ---
<class 'numpy.ndarray'>
2 (100000, 200) float32 20000000


In [34]:
import importlib
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm

import hnsw_cosine_status as hnsw_cosine
import simple_sim_hash

# Reload the project module so that edits to it are picked up without
# restarting the kernel.
importlib.reload(hnsw_cosine)

# Original author's note: M=64 (or even wider) seems appropriate; this is an
# empirical value that gradually stabilises as the width grows.
# NOTE(review): the index below is nevertheless created with M=32 - confirm
# which value is intended.
index = hnsw_cosine.HNSWIndex(M=32, ef_construction=128, ef_search=64, random_seed=1)
simHash = simple_sim_hash.SimpleSimHash(dim=200)

# Ids of every vector that has been put into the index so far.
IMAGE_IDX_SET = set()

# train_data_vector has shape [N, 200] (prototype with the 1M subset or a
# smaller slice first). A tqdm progress bar or batched flushing could be added.
# NOTE(review): the row position is recorded as the id; presumably
# add_item_fast10k assigns ids in insertion order - verify.
for row_id, row_vec in enumerate(train_data_vector):
    index.add_item_fast10k(row_vec, lsh=simHash, limit=100)
    IMAGE_IDX_SET.add(row_id)

In [35]:
# Load the saved faiss search results (query index -> list of result ids) and
# split the queries into a train part and a test part.
import json

train_query_list = {}
test_query_list = {}

# Disabled earlier variant that derived the split from the ground-truth file:
# ground_truth = read_ibin(ground_truth_path)
# print(type(ground_truth))
# print(ground_truth.ndim, ground_truth.shape, ground_truth.dtype, ground_truth.size)
# for query_idx in range(ground_truth.shape[0]):
#     actual_groundtruth = []
#     for idx in ground_truth[query_idx]:
#         if idx > 500000:
#             continue
#         actual_groundtruth.append(idx)
#     if len(actual_groundtruth) < 1:
#         continue
#     if int(query_idx) % 6 != 0:
#         train_query_list[query_idx] = ground_truth[query_idx]
#     else:
#         test_query_list[query_idx] = ground_truth[query_idx]

with open("./TempResults/search_results_100K.json", "r", encoding="utf-8") as f:
    data = json.load(f)

for raw_qid, result_ids in data.items():
    query_no = int(raw_qid)  # JSON object keys are strings
    # NOTE(review): every stored id has the query index subtracted from it;
    # presumably the ids were written with that offset - verify against the
    # code that produced the JSON file.
    shifted_ids = [rid - query_no for rid in result_ids]
    # Every 6th query (index divisible by 6) goes to the test split.
    bucket = test_query_list if query_no % 6 == 0 else train_query_list
    bucket[query_no] = shifted_ids

print(f"num of train: {len(train_query_list)}")
print(f"num of test: {len(test_query_list)}")

num of train: 83333
num of test: 16667


In [36]:
# OOD search steps: for every test query, record the trace length that
# search_steps_to_target reports for each of the query's first 10 listed
# targets that are present in IMAGE_IDX_SET.
NUM_STEPS = []
PHASE_ANALYSIS = []
for qid, target_list in test_query_list.items():
    q = query_vector[qid]
    indexed_targets = [tid for tid in target_list[:10] if tid in IMAGE_IDX_SET]
    for target_id in indexed_targets:
        # Request the per-phase breakdown as well.
        out = index.search_steps_to_target(q, target_id, k=10, ef=64, analyze_phases=True, verbose=False)
        NUM_STEPS.append(len(out["trace"]))
        if "phase_analysis" in out:
            PHASE_ANALYSIS.append(out["phase_analysis"])


# Per-phase statistics (only when at least one search returned a breakdown).
if PHASE_ANALYSIS:
    print("\n=== 阶段分析统计 ===")
    phase_1_steps, phase_1_accel_edges = [], []
    phase_2_steps, phase_2_accel_edges = [], []
    total_accel_edges, total_steps = [], []
    for pa in PHASE_ANALYSIS:
        phase_1_steps.append(pa["phase_1"]["step_count"])
        phase_2_steps.append(pa["phase_2"]["step_count"])
        phase_1_accel_edges.append(pa["phase_1"]["accel_edges"])
        phase_2_accel_edges.append(pa["phase_2"]["accel_edges"])
        total_accel_edges.append(pa["total_accel_edges"])
        total_steps.append(pa["total_steps"])

    print(f"第一阶段 (快速靠近) - 平均步数: {np.mean(phase_1_steps):.2f}, 平均加速边: {np.mean(phase_1_accel_edges):.2f}")
    print(f"第二阶段 (Beam Search) - 平均步数: {np.mean(phase_2_steps):.2f}, 平均加速边: {np.mean(phase_2_accel_edges):.2f}")

    # Share of steps that used an acceleration edge, per search
    # (0 when a search reports no steps).
    accel_edge_ratios = []
    for accel, steps in zip(total_accel_edges, total_steps):
        accel_edge_ratios.append(accel / steps if steps > 0 else 0)

    print(f"整体加速边使用比例: {np.mean(accel_edge_ratios):.2%}")

    # Show the search with the highest acceleration-edge ratio.
    best_benefit_idx = np.argmax(accel_edge_ratios)
    best_benefit = PHASE_ANALYSIS[best_benefit_idx]
    print("\n加速边受益最多的查询:")
    print(f"  第一阶段: {best_benefit['phase_1']['step_count']} 步, {best_benefit['phase_1']['accel_edges']} 条加速边")
    print(f"  第二阶段: {best_benefit['phase_2']['step_count']} 步, {best_benefit['phase_2']['accel_edges']} 条加速边")
    print(f"  总步数: {best_benefit['total_steps']}, 总加速边: {best_benefit['total_accel_edges']}")
    print(f"  加速边比例: {best_benefit['overall_accel_edge_ratio']:.2%}")


# Summary of the trace lengths over all measured (query, target) pairs.
arr_ori_bak = np.array(NUM_STEPS, dtype=np.float64)  # measurement order
arr_ori = np.sort(arr_ori_bak)                        # sorted copy

mean_steps = arr_ori.mean()
P50_steps = np.percentile(arr_ori, 50)
p99_steps = np.percentile(arr_ori, 99)
print("\n原始搜索统计:")
print(f"mean steps: {mean_steps}")
print(f"middle steps: {P50_steps}")
print(f"p99 steps: {p99_steps}")


=== 阶段分析统计 ===
第一阶段 (快速靠近) - 平均步数: 268.97, 平均加速边: 0.00
第二阶段 (Beam Search) - 平均步数: 218.23, 平均加速边: 0.00
整体加速边使用比例: 0.00%

加速边受益最多的查询:
  第一阶段: 293 步, 0 条加速边
  第二阶段: 369 步, 0 条加速边
  总步数: 663, 总加速边: 0
  加速边比例: 0.00%

原始搜索统计:
mean steps: 488.1981869484301
middle steps: 503.0
p99 steps: 712.0


In [38]:
# Build the new RoarGraph-style cross-distribution edges on the index.
# NOTE(review): the edges are built from test_query_list, the same queries the
# step-count cells evaluate on, while train_query_list is never used in the
# visible cells - confirm that this is intended.
print("\n=== 构建 RoarGraph 风格的 Cross Distribution 边 ===")
edge_build_options = dict(
    layer=0,                   # build on layer 0 only
    max_new_edges_per_node=4,
    occlude_alpha=1.0,         # occlusion threshold
    use_metric=True,
    chain_extra=1,             # extra chained connections
)
stats = index.build_cross_distribution_edges(test_query_list, **edge_build_options)
print("Cross distribution 边构建统计:")
print(stats)

# Counters the index keeps about cross-distribution edges.
cross_stats = index.get_cross_distribution_stats()
print("\nCross distribution 边统计:")
print(f"总添加的 cross distribution 边: {cross_stats['total_cross_edges']}")
print(f"被删除的 cross distribution 边: {cross_stats['deleted_cross_edges']}")
print(f"活跃的 cross distribution 边: {cross_stats['active_cross_edges']}")



=== 构建 RoarGraph 风格的 Cross Distribution 边 ===


Cross distribution 边构建统计:
{'pairs_considered': 81702, 'pairs_added': 80205, 'skipped_missing': 833773, 'skipped_existing': 1, 'pruned_by_cap': 1555, 'skipped_occluded': 1497}

Cross distribution 边统计:
总添加的 cross distribution 边: 160407
被删除的 cross distribution 边: 570
活跃的 cross distribution 边: 159837


In [39]:
# OOD search steps, measured again with the same procedure as the earlier
# baseline cell: for every test query, record the trace length that
# search_steps_to_target reports for each of the query's first 10 listed
# targets that are present in IMAGE_IDX_SET.
NUM_STEPS = []
PHASE_ANALYSIS = []
for qid, target_list in test_query_list.items():
    q = query_vector[qid]
    indexed_targets = [tid for tid in target_list[:10] if tid in IMAGE_IDX_SET]
    for target_id in indexed_targets:
        # Request the per-phase breakdown as well.
        out = index.search_steps_to_target(q, target_id, k=10, ef=64, analyze_phases=True, verbose=False)
        NUM_STEPS.append(len(out["trace"]))
        if "phase_analysis" in out:
            PHASE_ANALYSIS.append(out["phase_analysis"])


# Per-phase statistics (only when at least one search returned a breakdown).
if PHASE_ANALYSIS:
    print("\n=== 阶段分析统计 ===")
    phase_1_steps, phase_1_accel_edges = [], []
    phase_2_steps, phase_2_accel_edges = [], []
    total_accel_edges, total_steps = [], []
    for pa in PHASE_ANALYSIS:
        phase_1_steps.append(pa["phase_1"]["step_count"])
        phase_2_steps.append(pa["phase_2"]["step_count"])
        phase_1_accel_edges.append(pa["phase_1"]["accel_edges"])
        phase_2_accel_edges.append(pa["phase_2"]["accel_edges"])
        total_accel_edges.append(pa["total_accel_edges"])
        total_steps.append(pa["total_steps"])

    print(f"第一阶段 (快速靠近) - 平均步数: {np.mean(phase_1_steps):.2f}, 平均加速边: {np.mean(phase_1_accel_edges):.2f}")
    print(f"第二阶段 (Beam Search) - 平均步数: {np.mean(phase_2_steps):.2f}, 平均加速边: {np.mean(phase_2_accel_edges):.2f}")

    # Share of steps that used an acceleration edge, per search
    # (0 when a search reports no steps).
    accel_edge_ratios = []
    for accel, steps in zip(total_accel_edges, total_steps):
        accel_edge_ratios.append(accel / steps if steps > 0 else 0)

    print(f"整体加速边使用比例: {np.mean(accel_edge_ratios):.2%}")

    # Show the search with the highest acceleration-edge ratio.
    best_benefit_idx = np.argmax(accel_edge_ratios)
    best_benefit = PHASE_ANALYSIS[best_benefit_idx]
    print("\n加速边受益最多的查询:")
    print(f"  第一阶段: {best_benefit['phase_1']['step_count']} 步, {best_benefit['phase_1']['accel_edges']} 条加速边")
    print(f"  第二阶段: {best_benefit['phase_2']['step_count']} 步, {best_benefit['phase_2']['accel_edges']} 条加速边")
    print(f"  总步数: {best_benefit['total_steps']}, 总加速边: {best_benefit['total_accel_edges']}")
    print(f"  加速边比例: {best_benefit['overall_accel_edge_ratio']:.2%}")


# Summary of the trace lengths over all measured (query, target) pairs.
arr_ori_bak = np.array(NUM_STEPS, dtype=np.float64)  # measurement order
arr_ori = np.sort(arr_ori_bak)                        # sorted copy

mean_steps = arr_ori.mean()
P50_steps = np.percentile(arr_ori, 50)
p99_steps = np.percentile(arr_ori, 99)
print("\n原始搜索统计:")
print(f"mean steps: {mean_steps}")
print(f"middle steps: {P50_steps}")
print(f"p99 steps: {p99_steps}")


=== 阶段分析统计 ===
第一阶段 (快速靠近) - 平均步数: 271.03, 平均加速边: 14.96
第二阶段 (Beam Search) - 平均步数: 169.62, 平均加速边: 11.55
整体加速边使用比例: 6.76%

加速边受益最多的查询:
  第一阶段: 114 步, 20 条加速边
  第二阶段: 79 步, 33 条加速边
  总步数: 194, 总加速边: 53
  加速边比例: 27.32%

原始搜索统计:
mean steps: 441.6526985651678
middle steps: 448.0
p99 steps: 785.0


In [8]:
# Insert additional base vectors batch by batch and, after every batch, repeat
# the step-count measurement over the test queries.
#
# Fix: the id of each inserted vector is now added to IMAGE_IDX_SET *before*
# the counter is advanced. Previously the counter was incremented first, so the
# set recorded id+1 for every insert: id 500000 was never registered, and each
# batch registered one id that had not been inserted into the index yet.
#
# NOTE(review): re-running this cell calls add_item again with the same ids;
# confirm how the index handles duplicates, or rebuild the index first.
insert_data_vectors = {
    "insert_1%": insert_1_percent,
    "insert_2%": insert_2_percent,
    "insert_3%": insert_3_percent,
    "insert_4%": insert_4_percent,
    "insert_5%": insert_5_percent,
    "insert_10%": insert_10_percent,
}
# Ids continue directly after the 500000 rows used to build the index
# (those rows were registered as ids 0..499999).
img_id = 500000
for name, insert_vectors in insert_data_vectors.items():
    print(f"-------------{name}--------------")
    # Insert the new nodes (a tqdm progress bar or batched flushing could be added).
    for vec in insert_vectors:
        index.add_item(vec, id=img_id)
        IMAGE_IDX_SET.add(img_id)  # register the id that was just inserted
        img_id += 1

    # Cross-distribution edge counters after this incremental insert.
    cross_stats_after = index.get_cross_distribution_stats()
    print(f"增量插入后 cross distribution 边统计:")
    print(f"总添加的 cross distribution 边: {cross_stats_after['total_cross_edges']}")
    print(f"被删除的 cross distribution 边: {cross_stats_after['deleted_cross_edges']}")
    print(f"活跃的 cross distribution 边: {cross_stats_after['active_cross_edges']}")

    # For every test query, record the trace length reported for each of its
    # first 10 listed targets that are present in IMAGE_IDX_SET.
    NUM_STEPS = []
    PHASE_ANALYSIS = []
    for qid, target_list in test_query_list.items():
        q = query_vector[qid]
        for target_id in target_list[:10]:
            if target_id not in IMAGE_IDX_SET:
                continue
            # Request the per-phase breakdown as well.
            out = index.search_steps_to_target(q, target_id, k=10, ef=64, analyze_phases=True, verbose=False)
            NUM_STEPS.append(len(out["trace"]))
            if "phase_analysis" in out:
                PHASE_ANALYSIS.append(out["phase_analysis"])

    # Per-phase statistics (only when at least one search returned a breakdown).
    if PHASE_ANALYSIS:
        print("\n=== 阶段分析统计 ===")
        phase_1_steps = [pa["phase_1"]["step_count"] for pa in PHASE_ANALYSIS]
        phase_2_steps = [pa["phase_2"]["step_count"] for pa in PHASE_ANALYSIS]
        phase_1_accel_edges = [pa["phase_1"]["accel_edges"] for pa in PHASE_ANALYSIS]
        phase_2_accel_edges = [pa["phase_2"]["accel_edges"] for pa in PHASE_ANALYSIS]

        print(f"第一阶段 (快速靠近) - 平均步数: {np.mean(phase_1_steps):.2f}, 平均加速边: {np.mean(phase_1_accel_edges):.2f}")
        print(f"第二阶段 (Beam Search) - 平均步数: {np.mean(phase_2_steps):.2f}, 平均加速边: {np.mean(phase_2_accel_edges):.2f}")

        # Share of steps that used an acceleration edge, per search
        # (0 when a search reports no steps).
        total_accel_edges = [pa["total_accel_edges"] for pa in PHASE_ANALYSIS]
        total_steps = [pa["total_steps"] for pa in PHASE_ANALYSIS]
        accel_edge_ratios = [accel/steps if steps > 0 else 0 for accel, steps in zip(total_accel_edges, total_steps)]

        print(f"整体加速边使用比例: {np.mean(accel_edge_ratios):.2%}")

        # Show the search with the highest acceleration-edge ratio.
        # PHASE_ANALYSIS is known to be non-empty here, so no extra length
        # check is needed.
        best_benefit_idx = np.argmax(accel_edge_ratios)
        best_benefit = PHASE_ANALYSIS[best_benefit_idx]
        print(f"\n加速边受益最多的查询:")
        print(f"  第一阶段: {best_benefit['phase_1']['step_count']} 步, {best_benefit['phase_1']['accel_edges']} 条加速边")
        print(f"  第二阶段: {best_benefit['phase_2']['step_count']} 步, {best_benefit['phase_2']['accel_edges']} 条加速边")
        print(f"  总步数: {best_benefit['total_steps']}, 总加速边: {best_benefit['total_accel_edges']}")
        print(f"  加速边比例: {best_benefit['overall_accel_edge_ratio']:.2%}")

    # Summary of the trace lengths over all measured (query, target) pairs.
    arr_ori_bak = np.array(NUM_STEPS, dtype=np.float64)  # measurement order
    arr_ori = arr_ori_bak.copy()
    arr_ori.sort()

    mean_steps = arr_ori.mean()
    P50_steps = np.percentile(arr_ori, 50)
    p99_steps = np.percentile(arr_ori, 99)
    print(f"\n原始搜索统计:")
    print(f"mean steps: {mean_steps}")
    print(f"middle steps: {P50_steps}")
    print(f"p99 steps: {p99_steps}")

-------------insert_1%--------------


增量插入后 cross distribution 边统计:
总添加的 cross distribution 边: 96334
被删除的 cross distribution 边: 3212
活跃的 cross distribution 边: 93122
新增删除的 cross distribution 边: 97
33266 item got the biggest steps reduction: 741.0
mean steps: 894.7256470923942
middle steps: 869.0
p99 steps: 1612.820000000007
-------------insert_2%--------------
增量插入后 cross distribution 边统计:
总添加的 cross distribution 边: 96334
被删除的 cross distribution 边: 3299
活跃的 cross distribution 边: 93035
新增删除的 cross distribution 边: 87
33266 item got the biggest steps reduction: 712.0
mean steps: 973.5208882818388
middle steps: 950.0
p99 steps: 1707.0
-------------insert_3%--------------
增量插入后 cross distribution 边统计:
总添加的 cross distribution 边: 96334
被删除的 cross distribution 边: 3435
活跃的 cross distribution 边: 92899
新增删除的 cross distribution 边: 136
33266 item got the biggest steps reduction: 809.0
mean steps: 1033.9566215901143
middle steps: 1010.0
p99 steps: 1801.0
-------------insert_4%--------------
增量插入后 cross distribution 边统计:
总添加的 cross distri

## 观测一下瓶颈 profile
## 具体的distribution (img + text gap distance)
## insert, search 流程，以及存储方式/cache