In [1]:
# Imports: standard library, third-party, then project-local helpers.
import os

import faiss
import numpy as np

from io_utils import read_fbin, read_ibin

# Record the FAISS version in the cell output.
print(faiss.__version__)

# Directory holding the dataset files. Defaults to the original absolute
# location; set the T2I_DATA_DIR environment variable to run elsewhere.
DATA_DIR = os.environ.get("T2I_DATA_DIR", "/root/code/vectordbindexing/Text2Image")

file_path = os.path.join(DATA_DIR, "base.1M.fbin")
query_path = os.path.join(DATA_DIR, "query.public.100K.fbin")
ground_truth_path = os.path.join(DATA_DIR, "groundtruth.public.100K.ibin")

1.11.0


In [None]:
# Load the base vectors and print a short summary of the returned array.
print("\n\nreading image vector: ---")
data_vector = read_fbin(file_path)
print(type(data_vector))
print(data_vector.ndim, data_vector.shape, data_vector.dtype, data_vector.size)

# Rows [0, 500000) are used to build the index. The rows after that are cut
# into consecutive hold-out batches: five batches of 5,000 rows followed by
# one batch of 25,000 rows.
insert_bounds = [500000, 505000, 510000, 515000, 520000, 525000, 550000]
train_data_vector = data_vector[:insert_bounds[0]]
(
    insert_1_percent,
    insert_2_percent,
    insert_3_percent,
    insert_4_percent,
    insert_5_percent,
    insert_10_percent,
) = (data_vector[lo:hi] for lo, hi in zip(insert_bounds, insert_bounds[1:]))

# Load the query vectors and print the same summary.
print("\n\nreading querys: ---")
query_vector = read_fbin(query_path)
print(type(query_vector))
print(query_vector.ndim, query_vector.shape, query_vector.dtype, query_vector.size)



reading image vector: ---
<class 'numpy.ndarray'>
2 (1000000, 200) float32 200000000


reading querys: ---
<class 'numpy.ndarray'>
2 (100000, 200) float32 20000000


In [34]:
import importlib
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm

import hnsw_cosine_status as hnsw_cosine
import simple_sim_hash

# Re-import the module so edits made since the first import take effect.
importlib.reload(hnsw_cosine)

# Author's tuning note: M=64 (or even wider) looked more suitable; this is an
# empirical value, and results level off as the width keeps growing.
# NOTE(review): the index below is created with M=32, not 64 - confirm.
index = hnsw_cosine.HNSWIndex(M=32, ef_construction=128, ef_search=64, random_seed=1)
simHash = simple_sim_hash.SimpleSimHash(dim=200)

# Positions of the training rows that were added to the index.
# NOTE(review): later cells treat these positions as index ids, which
# presumably relies on add_item_fast10k numbering items in insertion order -
# confirm against the index implementation.
IMAGE_IDX_SET = set()

# Rows have 200 dimensions (dim=200 above). Possible follow-ups noted by the
# author: a tqdm progress bar and batched flushing.
for img_id in range(len(train_data_vector)):
    vec = train_data_vector[img_id]
    index.add_item_fast10k(vec, lsh=simHash, limit=100)
    IMAGE_IDX_SET.add(img_id)

In [35]:
# Load the saved search results and split the queries into train and test.
import json

train_query_list = {}
test_query_list = {}

# An earlier variant (now disabled) filled these dicts from the ground-truth
# .ibin file instead of the saved search results.

with open("./TempResults/search_results_100K.json", "r", encoding="utf-8") as f:
    data = json.load(f)

for query_key, vec_list in data.items():
    query_id = int(query_key)
    # Every stored id has the query index subtracted from it.
    # NOTE(review): presumably this undoes an offset applied when the file was
    # written - confirm against the code that produced the JSON.
    shifted_ids = [stored_id - query_id for stored_id in vec_list]
    # Query ids divisible by 6 form the test split; all others are train.
    destination = test_query_list if query_id % 6 == 0 else train_query_list
    destination[query_id] = shifted_ids

print(f"num of train: {len(train_query_list)}")
print(f"num of test: {len(test_query_list)}")

num of train: 83333
num of test: 16667


In [36]:
# OOD search steps: for every test query, count the steps the search takes to
# reach each of its known targets.
NUM_STEPS = []
PHASE_ANALYSIS = []
for qid, target_list in test_query_list.items():
    q = query_vector[qid]
    # Probe only the first 10 targets, and only those present in the index.
    indexed_targets = [t for t in target_list[:10] if t in IMAGE_IDX_SET]
    for target_id in indexed_targets:
        # Request the per-phase breakdown together with the trace.
        out = index.search_steps_to_target(q, target_id, k=10, ef=64, analyze_phases=True, verbose=False)
        NUM_STEPS.append(len(out["trace"]))
        if "phase_analysis" in out:
            PHASE_ANALYSIS.append(out["phase_analysis"])


# Per-phase statistics, printed only when at least one record was collected.
if PHASE_ANALYSIS:
    print("\n=== 阶段分析统计 ===")

    def _phase_field(phase, field):
        """Collect one per-phase field from every phase-analysis record."""
        return [pa[phase][field] for pa in PHASE_ANALYSIS]

    phase_1_steps = _phase_field("phase_1", "step_count")
    phase_2_steps = _phase_field("phase_2", "step_count")
    phase_1_accel_edges = _phase_field("phase_1", "accel_edges")
    phase_2_accel_edges = _phase_field("phase_2", "accel_edges")

    for phase_label, step_counts, accel_counts in (
        ("第一阶段 (快速靠近)", phase_1_steps, phase_1_accel_edges),
        ("第二阶段 (Beam Search)", phase_2_steps, phase_2_accel_edges),
    ):
        print(f"{phase_label} - 平均步数: {np.mean(step_counts):.2f}, 平均加速边: {np.mean(accel_counts):.2f}")

    # Share of steps that used an acceleration edge, per search (0 when a
    # search recorded no steps).
    total_accel_edges = [pa["total_accel_edges"] for pa in PHASE_ANALYSIS]
    total_steps = [pa["total_steps"] for pa in PHASE_ANALYSIS]
    accel_edge_ratios = []
    for n_accel, n_steps in zip(total_accel_edges, total_steps):
        accel_edge_ratios.append(n_accel / n_steps if n_steps > 0 else 0)

    print(f"整体加速边使用比例: {np.mean(accel_edge_ratios):.2%}")

    # Show the search with the highest acceleration-edge share.
    best_benefit_idx = np.argmax(accel_edge_ratios)
    best_benefit = PHASE_ANALYSIS[best_benefit_idx]
    print(f"\n加速边受益最多的查询:")
    for phase_key, phase_name in (("phase_1", "第一阶段"), ("phase_2", "第二阶段")):
        phase_info = best_benefit[phase_key]
        print(f"  {phase_name}: {phase_info['step_count']} 步, {phase_info['accel_edges']} 条加速边")
    print(f"  总步数: {best_benefit['total_steps']}, 总加速边: {best_benefit['total_accel_edges']}")
    print(f"  加速边比例: {best_benefit['overall_accel_edge_ratio']:.2%}")


# Overall step-count distribution; arr_ori_bak keeps the unsorted order.
arr_ori_bak = np.array(NUM_STEPS, dtype=np.float64)
arr_ori = arr_ori_bak.copy()
arr_ori.sort()

mean_steps = np.mean(arr_ori)
P50_steps, p99_steps = np.percentile(arr_ori, [50, 99])
print(f"\n原始搜索统计:")
for stat_label, stat_value in (
    ("mean steps", mean_steps),
    ("middle steps", P50_steps),
    ("p99 steps", p99_steps),
):
    print(f"{stat_label}: {stat_value}")


=== 阶段分析统计 ===
第一阶段 (快速靠近) - 平均步数: 268.97, 平均加速边: 0.00
第二阶段 (Beam Search) - 平均步数: 218.23, 平均加速边: 0.00
整体加速边使用比例: 0.00%

加速边受益最多的查询:
  第一阶段: 293 步, 0 条加速边
  第二阶段: 369 步, 0 条加速边
  总步数: 663, 总加速边: 0
  加速边比例: 0.00%

原始搜索统计:
mean steps: 488.1981869484301
middle steps: 503.0
p99 steps: 712.0


In [38]:
# Add RoarGraph-style cross-distribution edges to the index and report the
# resulting edge statistics.
print("\n=== 构建 RoarGraph 风格的 Cross Distribution 边 ===")

# NOTE(review): the edges are built from test_query_list, the same queries the
# step-count cells iterate over, while train_query_list is not used in the
# visible cells - confirm this is intended and not train/test leakage.
cross_edge_options = dict(
    layer=0,                    # build on layer 0 only
    max_new_edges_per_node=4,
    occlude_alpha=1.0,          # occlusion threshold
    use_metric=True,
    chain_extra=1,              # extra chain-style connections
)
stats = index.build_cross_distribution_edges(test_query_list, **cross_edge_options)
print("Cross distribution 边构建统计:")
print(stats)

# Edge counters reported by the index.
cross_stats = index.get_cross_distribution_stats()
print("\nCross distribution 边统计:")
for stat_label, stat_key in (
    ("总添加的 cross distribution 边", "total_cross_edges"),
    ("被删除的 cross distribution 边", "deleted_cross_edges"),
    ("活跃的 cross distribution 边", "active_cross_edges"),
):
    print(f"{stat_label}: {cross_stats[stat_key]}")



=== 构建 RoarGraph 风格的 Cross Distribution 边 ===


Cross distribution 边构建统计:
{'pairs_considered': 81702, 'pairs_added': 80205, 'skipped_missing': 833773, 'skipped_existing': 1, 'pruned_by_cap': 1555, 'skipped_occluded': 1497}

Cross distribution 边统计:
总添加的 cross distribution 边: 160407
被删除的 cross distribution 边: 570
活跃的 cross distribution 边: 159837


In [39]:
# OOD search steps (repeat measurement): same procedure as the earlier
# step-count cell, run against the index in its current state.
NUM_STEPS = []
PHASE_ANALYSIS = []
for qid, target_list in test_query_list.items():
    q = query_vector[qid]
    # Probe only the first 10 targets, and only those present in the index.
    indexed_targets = [t for t in target_list[:10] if t in IMAGE_IDX_SET]
    for target_id in indexed_targets:
        # Request the per-phase breakdown together with the trace.
        out = index.search_steps_to_target(q, target_id, k=10, ef=64, analyze_phases=True, verbose=False)
        NUM_STEPS.append(len(out["trace"]))
        if "phase_analysis" in out:
            PHASE_ANALYSIS.append(out["phase_analysis"])


# Per-phase statistics, printed only when at least one record was collected.
if PHASE_ANALYSIS:
    print("\n=== 阶段分析统计 ===")

    def _phase_field(phase, field):
        """Collect one per-phase field from every phase-analysis record."""
        return [pa[phase][field] for pa in PHASE_ANALYSIS]

    phase_1_steps = _phase_field("phase_1", "step_count")
    phase_2_steps = _phase_field("phase_2", "step_count")
    phase_1_accel_edges = _phase_field("phase_1", "accel_edges")
    phase_2_accel_edges = _phase_field("phase_2", "accel_edges")

    for phase_label, step_counts, accel_counts in (
        ("第一阶段 (快速靠近)", phase_1_steps, phase_1_accel_edges),
        ("第二阶段 (Beam Search)", phase_2_steps, phase_2_accel_edges),
    ):
        print(f"{phase_label} - 平均步数: {np.mean(step_counts):.2f}, 平均加速边: {np.mean(accel_counts):.2f}")

    # Share of steps that used an acceleration edge, per search (0 when a
    # search recorded no steps).
    total_accel_edges = [pa["total_accel_edges"] for pa in PHASE_ANALYSIS]
    total_steps = [pa["total_steps"] for pa in PHASE_ANALYSIS]
    accel_edge_ratios = []
    for n_accel, n_steps in zip(total_accel_edges, total_steps):
        accel_edge_ratios.append(n_accel / n_steps if n_steps > 0 else 0)

    print(f"整体加速边使用比例: {np.mean(accel_edge_ratios):.2%}")

    # Show the search with the highest acceleration-edge share.
    best_benefit_idx = np.argmax(accel_edge_ratios)
    best_benefit = PHASE_ANALYSIS[best_benefit_idx]
    print(f"\n加速边受益最多的查询:")
    for phase_key, phase_name in (("phase_1", "第一阶段"), ("phase_2", "第二阶段")):
        phase_info = best_benefit[phase_key]
        print(f"  {phase_name}: {phase_info['step_count']} 步, {phase_info['accel_edges']} 条加速边")
    print(f"  总步数: {best_benefit['total_steps']}, 总加速边: {best_benefit['total_accel_edges']}")
    print(f"  加速边比例: {best_benefit['overall_accel_edge_ratio']:.2%}")


# Overall step-count distribution; arr_ori_bak keeps the unsorted order.
arr_ori_bak = np.array(NUM_STEPS, dtype=np.float64)
arr_ori = arr_ori_bak.copy()
arr_ori.sort()

mean_steps = np.mean(arr_ori)
P50_steps, p99_steps = np.percentile(arr_ori, [50, 99])
print(f"\n原始搜索统计:")
for stat_label, stat_value in (
    ("mean steps", mean_steps),
    ("middle steps", P50_steps),
    ("p99 steps", p99_steps),
):
    print(f"{stat_label}: {stat_value}")


=== 阶段分析统计 ===
第一阶段 (快速靠近) - 平均步数: 271.03, 平均加速边: 14.96
第二阶段 (Beam Search) - 平均步数: 169.62, 平均加速边: 11.55
整体加速边使用比例: 6.76%

加速边受益最多的查询:
  第一阶段: 114 步, 20 条加速边
  第二阶段: 79 步, 33 条加速边
  总步数: 194, 总加速边: 53
  加速边比例: 27.32%

原始搜索统计:
mean steps: 441.6526985651678
middle steps: 448.0
p99 steps: 785.0


In [None]:
# Load the FAISS baseline artifacts (top-100 results and effort percentiles)
# that the recall tests below are based on.
import json

print("\n=== 读取FAISS Baseline结果 ===")

# Ground-truth array read from disk.
print("读取ground truth数据...")
ground_truth = read_ibin(ground_truth_path)
print(f"Ground truth形状: {ground_truth.shape}")

# Baseline results previously written to JSON.
print("读取FAISS baseline结果...")
with open('/root/code/vectordbindexing/faiss_top100_results.json', 'r') as f:
    faiss_top100_results = json.load(f)

with open('/root/code/vectordbindexing/faiss_effort_percentiles.json', 'r') as f:
    effort_percentiles_data = json.load(f)

print(f"FAISS top100结果: {len(faiss_top100_results)} 个查询")
print(f"Effort分位数数据: {len(effort_percentiles_data)} 个分位数")

# Map percentile -> query id. Two file layouts are accepted: a record holding
# a 'query_id' field, or a bare integer id (the layout produced when a
# {percentile: id} dict is dumped directly). JSON object keys are strings, so
# the percentile is converted back to int.
effort_query_ids = {}
for p_str, entry in effort_percentiles_data.items():
    raw_query_id = entry['query_id'] if isinstance(entry, dict) else entry
    effort_query_ids[int(p_str)] = int(raw_query_id)

print(f"Effort分位数对应的query id:")
for p, qid in effort_query_ids.items():
    print(f"  P{p}: query_id={qid}")

def calculate_recall_at_k(predicted_ids, ground_truth_ids, k, valid_ids=None):
    """
    Compute recall@k against the ground-truth ids that are present in the index.

    Args:
        predicted_ids: Ranked ids returned by a search; only the first ``k``
            entries are used.
        ground_truth_ids: Iterable of ground-truth ids for the query.
        k: Cut-off applied to ``predicted_ids``.
        valid_ids: Container of ids that exist in the index; ground-truth ids
            outside it are ignored. Defaults to the notebook-level
            ``IMAGE_IDX_SET`` when omitted.

    Returns:
        ``|top-k predictions ∩ valid ground truth| / |valid ground truth|``,
        or ``0.0`` when no ground-truth id is present in the index. The
        denominator is the number of valid ground-truth ids and is not
        capped at ``k``.
    """
    if valid_ids is None:
        # Fall back to the ids registered while the index was built.
        valid_ids = IMAGE_IDX_SET

    top_k_pred = set(predicted_ids[:k])

    # Keep only the ground-truth ids that can actually be retrieved.
    valid_gt = {gt_id for gt_id in ground_truth_ids if gt_id in valid_ids}
    if not valid_gt:
        return 0.0

    return len(top_k_pred & valid_gt) / len(valid_gt)


In [None]:
# Compare the methods on the queries covered by the FAISS baseline file.
print("\n=== 不同方法的测试 ===")

# Split those queries: the first 9/10 are used for training, the last 1/10
# for testing.
total_queries = len(faiss_top100_results)
train_size = int(total_queries * 0.9)
train_queries = query_vector[:train_size]
train_ground_truth = ground_truth[:train_size]
# Bound the test slice by total_queries. An open-ended slice would sweep in
# every remaining query whenever the baseline file covers only a subset of
# query_vector, making the test split far larger than the intended 1/10.
test_queries_final = query_vector[train_size:total_queries]
test_ground_truth_final = ground_truth[train_size:total_queries]

print(f"训练集大小: {len(train_queries)}")
print(f"测试集大小: {len(test_queries_final)}")

def test_method_recall(index, queries, ground_truths, method_name, ef_search=32):
    """Run every query through ``index`` and summarise recall@100 and steps.

    Args:
        index: Object exposing ``query_with_steps(query, k=..., ef=...)``,
            which returns a ``(results, search_steps)`` pair.
        queries: Query vectors, iterated in order.
        ground_truths: Ground-truth ids, aligned with ``queries``.
        method_name: Label used in the progress messages.
        ef_search: ``ef`` value forwarded to every search.

    Returns:
        Dict with the mean and standard deviation of the per-query recall
        (``mean_recall``, ``std_recall``) and of the per-query step counts
        (``mean_steps``, ``std_steps``).
    """
    print(f"测试{method_name}方法 (ef_search={ef_search})...")

    per_query = []
    for position, (query, gt) in enumerate(zip(queries, ground_truths), start=1):
        # Progress line for the 1st, 51st, 101st, ... query.
        if position % 50 == 1:
            print(f"  处理查询 {position}/{len(queries)}")

        results, search_steps = index.query_with_steps(query, k=100, ef=ef_search)
        per_query.append((calculate_recall_at_k(results, gt, 100), search_steps))

    recalls = [recall for recall, _ in per_query]
    search_steps_list = [steps for _, steps in per_query]

    summary = {}
    summary['mean_recall'] = np.mean(recalls)
    summary['std_recall'] = np.std(recalls)
    summary['mean_steps'] = np.mean(search_steps_list)
    summary['std_steps'] = np.std(search_steps_list)
    return summary

# 1. "Status" method (RoarGraph approach): evaluate the existing index, which
#    already carries the cross-distribution edges.
print("\n=== 1. Status方法 (RoarGraph方法) 测试 ===")
status_results = test_method_recall(index, test_queries_final, test_ground_truth_final, "Status", ef_search=32)
print(f"Status方法结果: mean_recall={status_results['mean_recall']:.3f}, mean_steps={status_results['mean_steps']:.1f}")

# 2. "High" method: a second index with a modified upper-layer edge build.
print("\n=== 2. High方法测试 ===")

import hnsw_cosine_status_high as hnsw_cosine_high
importlib.reload(hnsw_cosine_high)

index_high = hnsw_cosine_high.HNSWIndex(M=32, ef_construction=128, ef_search=32, random_seed=1)
simHash_high = simple_sim_hash.SimpleSimHash(dim=200)

# Rebuild the index from the same training vectors.
print("重新构建High方法索引...")
for img_id in range(len(train_data_vector)):
    index_high.add_item_fast10k(train_data_vector[img_id], lsh=simHash_high, limit=100)

# Add edges driven by the FAISS top-100 results of the training queries.
print("使用FAISS top100结果构建高层边...")
for query_id, query_vec in enumerate(train_queries):
    if query_id % 100 == 0:
        print(f"  处理查询 {query_id+1}/{len(train_queries)}")

    # The results were loaded from JSON, so they are keyed by string.
    faiss_top100 = faiss_top100_results[str(query_id)]

    # Of the first 50 results, keep those that exist in the index at level >= 1.
    layer1_nodes = [
        node_id
        for node_id in faiss_top100[:50]
        if node_id in index_high.items and index_high.items[node_id].level >= 1
    ]

    # At least two such nodes are required before edges are added.
    # NOTE(review): only len(layer1_nodes) reaches the call below (via top_k);
    # the node ids themselves are never passed - confirm that
    # build_cross_distribution_edges is meant to re-derive them from the query.
    if len(layer1_nodes) < 2:
        continue
    stats = index_high.build_cross_distribution_edges(
        query=query_vec,
        top_k=min(10, len(layer1_nodes)),
        max_new_edges_per_node=4
    )

print("High方法索引构建完成")

# Evaluate the High method on the held-out queries.
high_results = test_method_recall(index_high, test_queries_final, test_ground_truth_final, "High", ef_search=32)
print(f"High方法结果: mean_recall={high_results['mean_recall']:.3f}, mean_steps={high_results['mean_steps']:.1f}")


In [None]:
# 3. "Norm" method: build and evaluate a third index.
print("\n=== 3. Norm方法测试 ===")

import hnsw_cosine_norm
importlib.reload(hnsw_cosine_norm)

index_norm = hnsw_cosine_norm.HNSWIndex(M=32, ef_construction=128, ef_search=32, random_seed=1)
simHash_norm = simple_sim_hash.SimpleSimHash(dim=200)

# Rebuild the index from the same training vectors.
print("重新构建Norm方法索引...")
for img_id in range(len(train_data_vector)):
    index_norm.add_item_fast10k(train_data_vector[img_id], lsh=simHash_norm, limit=100)

# Author's note: the Norm method remaps the index into a higher-dimensional
# space, so no second edge-building pass is needed.
print("Norm方法索引构建完成（使用高维空间映射）")

# Evaluate the Norm method on the held-out queries.
norm_results = test_method_recall(index_norm, test_queries_final, test_ground_truth_final, "Norm", ef_search=32)
print(f"Norm方法结果: mean_recall={norm_results['mean_recall']:.3f}, mean_steps={norm_results['mean_steps']:.1f}")

# 4. Steps needed at recall 0.90 for the effort-percentile queries.
print("\n=== 4. Effort分位数测试 (Recall90) ===")

def find_ef_for_recall90(index, query, ground_truth, k=100,
                         target_recall=0.90, ef_candidates=(16, 32, 64, 128, 256)):
    """Find the smallest candidate ``ef`` whose recall@k reaches the target.

    Args:
        index: Object exposing ``query_with_steps(query, k=..., ef=...)``,
            which returns a ``(results, steps)`` pair.
        query: Query vector.
        ground_truth: Ground-truth ids for ``query``.
        k: Number of results requested and recall cut-off.
        target_recall: Recall that must be reached (default 0.90).
        ef_candidates: ``ef`` values to try, in the order given.

    Returns:
        ``(ef, recall, steps)`` for the first candidate that reaches
        ``target_recall``. If none does, the measurements of the last
        candidate are returned, so callers see the recall and step count
        actually achieved instead of placeholder zeros.

    Raises:
        ValueError: If ``ef_candidates`` is empty.
    """
    if not ef_candidates:
        raise ValueError("ef_candidates must not be empty")

    for ef in ef_candidates:
        results, steps = index.query_with_steps(query, k=k, ef=ef)
        recall = calculate_recall_at_k(results, ground_truth, k)
        if recall >= target_recall:
            break
    # Either the first successful candidate or, failing that, the last one tried.
    return ef, recall, steps

# Evaluate each effort-percentile query with all three methods.
method_indexes = (('status', index), ('high', index_high), ('norm', index_norm))
method_labels = {'status': 'Status', 'high': 'High', 'norm': 'Norm'}

effort_results = {}
for percentile, query_id in effort_query_ids.items():
    # Shift the query id into the test split; skip ids that fall outside it.
    test_query_id = query_id - train_size
    if not (0 <= test_query_id < len(test_queries_final)):
        continue
    query = test_queries_final[test_query_id]
    gt = test_ground_truth_final[test_query_id]

    record = {'query_id': query_id, 'test_query_id': test_query_id}
    for method_key, method_index in method_indexes:
        ef, recall, steps = find_ef_for_recall90(method_index, query, gt)
        record[method_key] = {'ef': ef, 'recall': recall, 'steps': steps}
    effort_results[percentile] = record

    print(f"P{percentile} (query_id={query_id}, test_id={test_query_id}):")
    for method_key, _ in method_indexes:
        outcome = record[method_key]
        print(f"  {method_labels[method_key]}: ef={outcome['ef']}, recall={outcome['recall']:.3f}, steps={outcome['steps']}")

# Summary table: mean recall and mean steps per method.
print(f"\n=== 方法对比结果汇总 ===")
print(f"{'方法':<10} {'Mean Recall':<12} {'Mean Steps':<12}")
print("-" * 35)
for label, summary in (('Status', status_results), ('High', high_results), ('Norm', norm_results)):
    print(f"{label:<10} {summary['mean_recall']:<12.3f} {summary['mean_steps']:<12.1f}")

# Table of step counts per effort percentile.
print(f"\n=== Recall90下的Effort分位数步长 ===")
print(f"{'Percentile':<12} {'Status Steps':<15} {'High Steps':<15} {'Norm Steps':<15}")
print("-" * 60)
for percentile, row in effort_results.items():
    step_cells = " ".join(f"{row[method_key]['steps']:<15.1f}" for method_key in ('status', 'high', 'norm'))
    print(f"P{percentile:<10} {step_cells}")


In [None]:
# 5. Persist the comparison results and print a summary report.
print("\n=== 5. 保存最终结果 ===")


def _method_entry(label, metrics):
    """Build the saved record for one method from its recall/steps metrics."""
    entry = {'method': label}
    for field in ('mean_recall', 'std_recall', 'mean_steps', 'std_steps'):
        entry[field] = metrics[field]
    entry['ef_search'] = 32
    return entry


def _print_section(title, bullets):
    """Print a report heading followed by its indented bullet lines."""
    print(title)
    for bullet in bullets:
        print(f"   - {bullet}")


# Results of the method comparison, keyed by method.
final_results = {
    'faiss_baseline': {
        'method': 'FAISS Baseline',
        'description': '使用FAISS IndexFlatIP进行宽beam search'
    },
    'status_method': _method_entry('Status (RoarGraph)', status_results),
    'high_method': _method_entry('High (Modified)', high_results),
    'norm_method': _method_entry('Norm (High-dim mapping)', norm_results),
    'effort_percentiles': effort_results
}

# Write everything to a JSON file.
results_path = '/root/code/vectordbindexing/final_test_results.json'
with open(results_path, 'w') as f:
    json.dump(final_results, f, indent=2)

print(f"最终结果已保存到: {results_path}")

# Summary report.
print("\n=== 完整测试总结报告 ===")
_print_section("1. FAISS Baseline:", [
    "使用FAISS IndexFlatIP进行宽beam search",
    "生成了top100结果和effort分位数",
    "结果已从文件加载",
])

_print_section("\n2. 方法对比测试 (ef_search=32):", [
    f"{method_label}: mean_recall={metrics['mean_recall']:.3f}, mean_steps={metrics['mean_steps']:.1f}"
    for method_label, metrics in (
        ("Status方法 (RoarGraph)", status_results),
        ("High方法 (修改版)", high_results),
        ("Norm方法 (高维映射)", norm_results),
    )
])

_print_section("\n3. Effort分位数测试 (Recall90):", [
    f"P{percentile}: Status={row['status']['steps']:.1f}步, High={row['high']['steps']:.1f}步, Norm={row['norm']['steps']:.1f}步"
    for percentile, row in effort_results.items()
])

_print_section("\n4. 关键改进:", [
    "使用FAISS baseline作为ground truth",
    "High方法改为使用FAISS top100结果构建高层边",
    "Norm方法使用高维空间映射",
    "测试了effort分位数对应的查询在不同方法下的性能",
    "所有结果都基于ef_search=32进行测试",
])

_print_section("\n5. 文件输出:", [
    "final_test_results.json: 完整测试结果",
])

# Closing checklist.
for closing_line in (
    "\n=== 测试完成 ===",
    "所有测试功能已实现:",
    "1. FAISS Baseline测试 - 从文件加载top100结果和effort分位数",
    "2. Status方法测试 - RoarGraph方法",
    "3. High方法测试 - 使用FAISS top100结果构建高层边",
    "4. Norm方法测试 - 高维空间映射",
    "5. Effort分位数测试 - 测试不同effort水平下的recall90步长",
    "6. 结果保存 - 所有结果保存到JSON文件",
    "\n可以运行notebook进行完整测试！",
):
    print(closing_line)


In [None]:
# Added section: FAISS baseline run followed by the full recall test flow.
print("\n=== 新增：FAISS Baseline测试 ===")

# Read the ground-truth array from disk and report its shape.
print("读取ground truth数据...")
ground_truth = read_ibin(ground_truth_path)
ground_truth_shape = ground_truth.shape
print("Ground truth形状: {}".format(ground_truth_shape))

def calculate_recall_at_k(predicted_ids, ground_truth_ids, k, valid_ids=None):
    """
    Compute recall@k against the ground-truth ids that are present in the index.

    Args:
        predicted_ids: Ranked ids returned by a search; only the first ``k``
            entries are used.
        ground_truth_ids: Iterable of ground-truth ids for the query.
        k: Cut-off applied to ``predicted_ids``.
        valid_ids: Container of ids that exist in the index; ground-truth ids
            outside it are ignored. Defaults to the notebook-level
            ``IMAGE_IDX_SET`` when omitted.

    Returns:
        ``|top-k predictions ∩ valid ground truth| / |valid ground truth|``,
        or ``0.0`` when no ground-truth id is present in the index. The
        denominator is the number of valid ground-truth ids and is not
        capped at ``k``.
    """
    if valid_ids is None:
        # Fall back to the ids registered while the index was built.
        valid_ids = IMAGE_IDX_SET

    top_k_pred = set(predicted_ids[:k])

    # Keep only the ground-truth ids that can actually be retrieved.
    valid_gt = {gt_id for gt_id in ground_truth_ids if gt_id in valid_ids}
    if not valid_gt:
        return 0.0

    return len(top_k_pred & valid_gt) / len(valid_gt)

# 1. FAISS baseline: top-100 inner-product neighbours of each test query among
#    the training vectors.
print("\n=== 1. FAISS Baseline测试 ===")
import faiss

# Build a flat inner-product index over the training vectors.
print("创建FAISS索引...")
faiss_index = faiss.IndexFlatIP(200)
faiss_index.add(train_data_vector.astype('float32'))

# Number of queries used for the baseline.
test_query_count = 1000
test_queries = query_vector[:test_query_count]
test_ground_truth = ground_truth[:test_query_count]

print(f"测试查询数量: {test_query_count}")

print("使用FAISS进行宽beam search...")
faiss_top100_results = {}
faiss_effort_stats = []

for i, query in enumerate(test_queries):
    if i % 100 == 0:
        print(f"  处理查询 {i+1}/{test_query_count}")

    # FAISS expects a 2-D float32 batch, here a batch of one query.
    query_reshaped = query.reshape(1, 200).astype('float32')
    distances, indices = faiss_index.search(query_reshaped, k=100)

    # Top-100 ids for this query.
    faiss_top100_results[i] = indices[0].tolist()

    # Effort proxy: the mean of the 100 returned scores.
    effort = np.mean(distances[0])
    faiss_effort_stats.append(effort)

# For each percentile, pick the query whose effort is closest to that
# percentile of the effort distribution.
effort_percentiles = [10, 25, 50, 75, 90, 95, 99]
effort_query_ids = {}

effort_array = np.array(faiss_effort_stats)
for p in effort_percentiles:
    percentile_value = np.percentile(effort_array, p)
    closest_idx = np.argmin(np.abs(effort_array - percentile_value))
    # np.argmin returns a NumPy integer, which json.dump cannot serialise;
    # store a plain Python int instead.
    effort_query_ids[p] = int(closest_idx)

print(f"\nFAISS Baseline测试完成:")
print(f"Top100结果已保存到变量faiss_top100_results")
print(f"Effort分位数对应的query id:")
for p, qid in effort_query_ids.items():
    print(f"  P{p}: query_id={qid}, effort={effort_array[qid]:.4f}")

# Persist the results to file A and file B.
import json

# File A: FAISS top-100 results (integer keys become strings in JSON).
with open('/root/code/vectordbindexing/faiss_top100_results.json', 'w') as f:
    json.dump(faiss_top100_results, f, indent=2)

# File B: one record per percentile. The 'query_id' field matches what the
# loading cell reads back (data['query_id']); 'effort' is stored for reference.
effort_percentile_records = {
    p: {'query_id': qid, 'effort': float(effort_array[qid])}
    for p, qid in effort_query_ids.items()
}
with open('/root/code/vectordbindexing/faiss_effort_percentiles.json', 'w') as f:
    json.dump(effort_percentile_records, f, indent=2)

print(f"\n结果已保存:")
print(f"  文件A: /root/code/vectordbindexing/faiss_top100_results.json")
print(f"  文件B: /root/code/vectordbindexing/faiss_effort_percentiles.json")


In [None]:
# 2. Compare the methods on the queries used for the FAISS baseline run.
print("\n=== 2. 不同方法的测试 ===")

# First 9/10 of those queries go to training, the last 1/10 to testing.
train_size = int(len(test_queries) * 0.9)
train_queries, test_queries_final = test_queries[:train_size], test_queries[train_size:]
train_ground_truth, test_ground_truth_final = test_ground_truth[:train_size], test_ground_truth[train_size:]

for split_label, split_queries in (("训练集大小", train_queries), ("测试集大小", test_queries_final)):
    print(f"{split_label}: {len(split_queries)}")

# 2.1 "Status" method (RoarGraph approach): uses the existing
#     cross-distribution edges.
print("\n=== 2.1 Status方法 (RoarGraph方法) 测试 ===")

def test_method_recall(index, queries, ground_truths, method_name, ef_search=32):
    """Run every query through ``index`` and summarise recall@100 and steps.

    Args:
        index: Object exposing ``query_with_steps(query, k=..., ef=...)``,
            which returns a ``(results, search_steps)`` pair.
        queries: Query vectors, iterated in order.
        ground_truths: Ground-truth ids, aligned with ``queries``.
        method_name: Label used in the progress messages.
        ef_search: ``ef`` value forwarded to every search.

    Returns:
        Dict with the mean and standard deviation of the per-query recall
        (``mean_recall``, ``std_recall``) and of the per-query step counts
        (``mean_steps``, ``std_steps``).
    """
    print(f"测试{method_name}方法 (ef_search={ef_search})...")

    per_query = []
    for position, (query, gt) in enumerate(zip(queries, ground_truths), start=1):
        # Progress line for the 1st, 51st, 101st, ... query.
        if position % 50 == 1:
            print(f"  处理查询 {position}/{len(queries)}")

        results, search_steps = index.query_with_steps(query, k=100, ef=ef_search)
        per_query.append((calculate_recall_at_k(results, gt, 100), search_steps))

    recalls = [recall for recall, _ in per_query]
    search_steps_list = [steps for _, steps in per_query]

    summary = {}
    summary['mean_recall'] = np.mean(recalls)
    summary['std_recall'] = np.std(recalls)
    summary['mean_steps'] = np.mean(search_steps_list)
    summary['std_steps'] = np.std(search_steps_list)
    return summary

# Evaluate the Status method on the held-out queries.
status_results = test_method_recall(index, test_queries_final, test_ground_truth_final, "Status", ef_search=32)
print(f"Status方法结果: mean_recall={status_results['mean_recall']:.3f}, mean_steps={status_results['mean_steps']:.1f}")

# 2.2 "High" method: a second index with a modified upper-layer edge build.
print("\n=== 2.2 High方法测试 ===")

import hnsw_cosine_status_high as hnsw_cosine_high
importlib.reload(hnsw_cosine_high)

index_high = hnsw_cosine_high.HNSWIndex(M=32, ef_construction=128, ef_search=32, random_seed=1)
simHash_high = simple_sim_hash.SimpleSimHash(dim=200)

# Rebuild the index from the same training vectors.
print("重新构建High方法索引...")
for img_id in range(len(train_data_vector)):
    index_high.add_item_fast10k(train_data_vector[img_id], lsh=simHash_high, limit=100)

# Add edges driven by the FAISS top-100 results of the training queries.
print("使用FAISS top100结果构建高层边...")
for query_id, query_vec in enumerate(train_queries):
    if query_id % 100 == 0:
        print(f"  处理查询 {query_id+1}/{len(train_queries)}")

    # Here the results are looked up by integer query position.
    faiss_top100 = faiss_top100_results[query_id]

    # Of the first 50 results, keep those that exist in the index at level >= 1.
    layer1_nodes = [
        node_id
        for node_id in faiss_top100[:50]
        if node_id in index_high.items and index_high.items[node_id].level >= 1
    ]

    # At least two such nodes are required before edges are added.
    # NOTE(review): only len(layer1_nodes) reaches the call below (via top_k);
    # the node ids themselves are never passed - confirm that
    # build_cross_distribution_edges is meant to re-derive them from the query.
    if len(layer1_nodes) < 2:
        continue
    stats = index_high.build_cross_distribution_edges(
        query=query_vec,
        top_k=min(10, len(layer1_nodes)),
        max_new_edges_per_node=4
    )

print("High方法索引构建完成")

# Evaluate the High method on the held-out queries.
high_results = test_method_recall(index_high, test_queries_final, test_ground_truth_final, "High", ef_search=32)
print(f"High方法结果: mean_recall={high_results['mean_recall']:.3f}, mean_steps={high_results['mean_steps']:.1f}")

# 2.3 Steps needed at recall 0.90 for the effort-percentile queries.
print("\n=== 2.3 Effort分位数测试 ===")

def find_ef_for_recall90(index, query, ground_truth, k=100,
                         target_recall=0.90, ef_candidates=(16, 32, 64, 128, 256)):
    """Find the smallest candidate ``ef`` whose recall@k reaches the target.

    Args:
        index: Object exposing ``query_with_steps(query, k=..., ef=...)``,
            which returns a ``(results, steps)`` pair.
        query: Query vector.
        ground_truth: Ground-truth ids for ``query``.
        k: Number of results requested and recall cut-off.
        target_recall: Recall that must be reached (default 0.90).
        ef_candidates: ``ef`` values to try, in the order given.

    Returns:
        ``(ef, recall, steps)`` for the first candidate that reaches
        ``target_recall``. If none does, the measurements of the last
        candidate are returned, so callers see the recall and step count
        actually achieved instead of placeholder zeros.

    Raises:
        ValueError: If ``ef_candidates`` is empty.
    """
    if not ef_candidates:
        raise ValueError("ef_candidates must not be empty")

    for ef in ef_candidates:
        results, steps = index.query_with_steps(query, k=k, ef=ef)
        recall = calculate_recall_at_k(results, ground_truth, k)
        if recall >= target_recall:
            break
    # Either the first successful candidate or, failing that, the last one tried.
    return ef, recall, steps

# Evaluate each effort-percentile query with the Status and High methods.
effort_results = {}
for percentile, query_id in effort_query_ids.items():
    # effort_query_ids holds positions in test_queries, while
    # test_queries_final starts at position train_size. Apply that offset and
    # skip ids outside the held-out split; indexing test_queries_final with
    # the raw id would evaluate a different query than the one selected.
    query_id = int(query_id)  # plain int, so the record stays JSON-serialisable
    test_query_id = query_id - train_size
    if not (0 <= test_query_id < len(test_queries_final)):
        continue

    query = test_queries_final[test_query_id]
    gt = test_ground_truth_final[test_query_id]

    # Status method.
    ef_status, recall_status, steps_status = find_ef_for_recall90(index, query, gt)

    # High method.
    ef_high, recall_high, steps_high = find_ef_for_recall90(index_high, query, gt)

    effort_results[percentile] = {
        'query_id': query_id,
        'test_query_id': test_query_id,
        'status': {'ef': ef_status, 'recall': recall_status, 'steps': steps_status},
        'high': {'ef': ef_high, 'recall': recall_high, 'steps': steps_high}
    }

    print(f"P{percentile} (query_id={query_id}):")
    print(f"  Status: ef={ef_status}, recall={recall_status:.3f}, steps={steps_status}")
    print(f"  High: ef={ef_high}, recall={recall_high:.3f}, steps={steps_high}")

# Summary table: mean recall and mean steps per method.
print(f"\n=== 方法对比结果汇总 ===")
print(f"{'方法':<10} {'Mean Recall':<12} {'Mean Steps':<12}")
print("-" * 35)
print(f"{'Status':<10} {status_results['mean_recall']:<12.3f} {status_results['mean_steps']:<12.1f}")
print(f"{'High':<10} {high_results['mean_recall']:<12.3f} {high_results['mean_steps']:<12.1f}")

# Table of step counts per effort percentile.
print(f"\n=== Recall90下的Effort分位数步长 ===")
print(f"{'Percentile':<12} {'Status Steps':<15} {'High Steps':<15}")
print("-" * 45)
for percentile, results in effort_results.items():
    print(f"P{percentile:<10} {results['status']['steps']:<15.1f} {results['high']['steps']:<15.1f}")


In [None]:
# 3. Persist the comparison results and print a summary report.
print("\n=== 3. 保存最终结果 ===")


def _json_default(value):
    """Convert NumPy scalars and arrays to built-in types for json.dump.

    json.dump calls this only for objects it cannot serialise by itself. The
    query ids picked with np.argmin are NumPy integers and would otherwise
    make the dump fail with a TypeError.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Results of the method comparison, keyed by method.
final_results = {
    'faiss_baseline': {
        'method': 'FAISS Baseline',
        'description': '使用FAISS IndexFlatIP进行宽beam search'
    },
    'status_method': {
        'method': 'Status (RoarGraph)',
        'mean_recall': status_results['mean_recall'],
        'std_recall': status_results['std_recall'],
        'mean_steps': status_results['mean_steps'],
        'std_steps': status_results['std_steps'],
        'ef_search': 32
    },
    'high_method': {
        'method': 'High (Modified)',
        'mean_recall': high_results['mean_recall'],
        'std_recall': high_results['std_recall'],
        'mean_steps': high_results['mean_steps'],
        'std_steps': high_results['std_steps'],
        'ef_search': 32
    },
    'effort_percentiles': effort_results
}

# Write everything to a JSON file; NumPy values go through _json_default.
with open('/root/code/vectordbindexing/final_test_results.json', 'w') as f:
    json.dump(final_results, f, indent=2, default=_json_default)

print("最终结果已保存到: /root/code/vectordbindexing/final_test_results.json")

# Summary report.
print("\n=== 完整测试总结报告 ===")
print(f"1. FAISS Baseline测试:")
print(f"   - 测试了 {test_query_count} 个查询")
print(f"   - 生成了top100结果和effort分位数")
print(f"   - 结果保存到文件A和文件B")

print(f"\n2. 方法对比测试:")
print(f"   - Status方法 (RoarGraph): mean_recall={status_results['mean_recall']:.3f}, mean_steps={status_results['mean_steps']:.1f}")
print(f"   - High方法 (修改版): mean_recall={high_results['mean_recall']:.3f}, mean_steps={high_results['mean_steps']:.1f}")

print(f"\n3. Effort分位数测试 (Recall90):")
for percentile, results in effort_results.items():
    print(f"   - P{percentile}: Status={results['status']['steps']:.1f}步, High={results['high']['steps']:.1f}步")

print(f"\n4. 关键改进:")
print(f"   - 使用FAISS baseline作为ground truth")
print(f"   - High方法改为使用FAISS top100结果构建高层边")
print(f"   - 测试了effort分位数对应的查询在不同方法下的性能")
print(f"   - 所有结果都基于ef_search=32进行测试")

print(f"\n5. 文件输出:")
print(f"   - 文件A: faiss_top100_results.json (FAISS top100结果)")
print(f"   - 文件B: faiss_effort_percentiles.json (Effort分位数query id)")
print(f"   - 文件C: final_test_results.json (完整测试结果)")


In [None]:
# Closing banner: lists the test stages covered by this notebook.
for closing_line in (
    "\n=== 测试完成 ===",
    "所有测试功能已实现:",
    "1. FAISS Baseline测试 - 生成top100结果和effort分位数",
    "2. Status方法测试 - RoarGraph方法",
    "3. High方法测试 - 修改后的高层边构建",
    "4. Effort分位数测试 - 测试不同effort水平下的recall90步长",
    "5. 结果保存 - 所有结果保存到JSON文件",
    "\n可以运行notebook进行完整测试！",
):
    print(closing_line)
