# OOD分层图索引Demo

本Notebook实现一个"面向OOD的分层图索引 + 在线增量维护"的演示系统。

## 设计思路
1. **核心图**: 包含常见的ID（In-Distribution，分布内）节点，使用高效的图结构进行kNN查询
2. **边缘图**: 包含OOD（Out-of-Distribution，分布外）节点，通过长边连接到核心图
3. **OOD-score机制**: 动态评估节点是否为OOD，决定图结构策略
4. **在线增量维护**: 支持新节点的异步插入和图结构优化

## 功能特点
- 分层图结构：核心图 + 边缘图
- OOD检测与自适应处理
- 在线增量更新
- 查询路径可视化
- 性能评估与测试


In [None]:
# Step 0: 文件与环境准备
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import faiss
import logging
from typing import List, Tuple, Dict, Optional
import time
import random
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Seed both random number generators so repeated runs give the same data.
random.seed(42)
np.random.seed(42)

# Logging: timestamped INFO-level messages through a module-level logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Matplotlib: prefer a CJK-capable font and keep the minus sign renderable.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})

# Report the versions of the libraries that were imported above.
print("✅ Step 0: 环境准备完成")
print("已导入的库:")
print("- numpy:", np.__version__)
print("- networkx:", nx.__version__)
print("- matplotlib:", plt.matplotlib.__version__)
print("- faiss:", getattr(faiss, '__version__', "已安装"))


## Step 1: 数据生成与预处理

生成模拟数据：
- 库向量：1000个128维向量，模拟正常ID数据
- 查询向量：包含ID查询和OOD查询（其中约20%为存在分布偏移的OOD查询）


In [None]:
# Data-generation parameters
VECTOR_DIM = 128
LIBRARY_SIZE = 1000
QUERY_SIZE = 200
OOD_RATIO = 0.2  # fraction of the queries that are out-of-distribution (OOD)

print(f"数据参数:")
print(f"- 向量维度: {VECTOR_DIM}")
print(f"- 库大小: {LIBRARY_SIZE}")
print(f"- 查询数量: {QUERY_SIZE}")
print(f"- OOD比例: {OOD_RATIO}")

# Library vectors (in-distribution data): a mixture of Gaussian clusters.
# The cluster centres are kept so that ID queries can later be drawn from
# the very same distribution.
np.random.seed(42)
n_clusters = 5
vectors_per_cluster = LIBRARY_SIZE // n_clusters

cluster_centers = []
library_vectors = []
for _ in range(n_clusters):
    # Centre of this cluster
    center = np.random.randn(VECTOR_DIM) * 2
    cluster_centers.append(center)
    # Points scattered around the centre
    library_vectors.append(center + np.random.randn(vectors_per_cluster, VECTOR_DIM) * 0.5)

# Vectors left over when LIBRARY_SIZE is not a multiple of n_clusters go
# into one extra cluster.
remaining = LIBRARY_SIZE - n_clusters * vectors_per_cluster
if remaining > 0:
    center = np.random.randn(VECTOR_DIM) * 2
    cluster_centers.append(center)
    library_vectors.append(center + np.random.randn(remaining, VECTOR_DIM) * 0.5)

library_vectors = np.vstack(library_vectors)
cluster_centers = np.vstack(cluster_centers)
print(f"✅ 生成库向量: {library_vectors.shape}")

# ID queries: sampled around the library's own cluster centres (cycling
# through the clusters) with the same spread as the library vectors.
# Previously a brand-new random centre was drawn for every query, which made
# the "ID" queries effectively out-of-distribution.
id_query_count = int(QUERY_SIZE * (1 - OOD_RATIO))
id_queries = np.array([
    cluster_centers[i % len(cluster_centers)] + np.random.randn(VECTOR_DIM) * 0.5
    for i in range(id_query_count)
]).reshape(-1, VECTOR_DIM)

# OOD queries: zero-centred with a much larger variance than the library.
ood_query_count = QUERY_SIZE - id_query_count
ood_queries = np.array([
    np.random.randn(VECTOR_DIM) * 3 + np.random.randn(VECTOR_DIM) * 1.5
    for _ in range(ood_query_count)
]).reshape(-1, VECTOR_DIM)

print(f"✅ 生成ID查询向量: {id_queries.shape}")
print(f"✅ 生成OOD查询向量: {ood_queries.shape}")

# Concatenate all queries and keep a parallel list of ground-truth labels.
all_queries = np.vstack([id_queries, ood_queries])
query_labels = ['ID'] * id_query_count + ['OOD'] * ood_query_count

print(f"✅ 总查询向量: {all_queries.shape}")
print(f"ID查询: {id_query_count}, OOD查询: {ood_query_count}")


In [None]:
# Visualise the vector distribution (PCA projection down to 2D)
from sklearn.decomposition import PCA

# Fit one PCA on library vectors and queries together so that both are
# projected into the same 2D space
pca = PCA(n_components=2)
all_data = np.vstack([library_vectors, all_queries])
pca_result = pca.fit_transform(all_data)

# Split the projected points back into library part and query part
lib_pca = pca_result[:LIBRARY_SIZE]
query_pca = pca_result[LIBRARY_SIZE:]

plt.figure(figsize=(12, 5))

# Subplot 1: library vector distribution
plt.subplot(1, 2, 1)
plt.scatter(lib_pca[:, 0], lib_pca[:, 1], c='blue', alpha=0.6, s=20, label='库向量')
plt.title('库向量分布 (ID数据)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.grid(True, alpha=0.3)

# Subplot 2: query vector distribution, split by ground-truth label
plt.subplot(1, 2, 2)
id_mask = np.array(query_labels) == 'ID'
ood_mask = np.array(query_labels) == 'OOD'

plt.scatter(query_pca[id_mask, 0], query_pca[id_mask, 1], 
           c='green', alpha=0.7, s=30, label='ID查询', marker='o')
plt.scatter(query_pca[ood_mask, 0], query_pca[ood_mask, 1], 
           c='red', alpha=0.7, s=30, label='OOD查询', marker='^')
plt.title('查询向量分布')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics of the raw (un-projected) data
print("数据统计:")
print(f"库向量范围: [{library_vectors.min():.3f}, {library_vectors.max():.3f}]")
print(f"库向量均值: {library_vectors.mean():.3f}, 标准差: {library_vectors.std():.3f}")
print(f"ID查询范围: [{id_queries.min():.3f}, {id_queries.max():.3f}]")
print(f"OOD查询范围: [{ood_queries.min():.3f}, {ood_queries.max():.3f}]")

print("✅ Step 1: 数据生成与预处理完成")


## Step 2: 构建核心图（ID节点）

使用NetworkX构建核心图，包含所有ID节点：
- 基于余弦相似度构建邻接关系
- 实现kNN查询功能
- 测试核心图的基本查询能力


In [None]:
class CoreGraph:
    """Core graph over the in-distribution (ID) vectors with cosine kNN search.

    Every vector becomes a node of a NetworkX graph; edges connect each node
    to its nearest neighbours by cosine similarity. Queries are answered by
    an exact scan over the L2-normalised vectors, so all similarities
    returned by this class are true cosine similarities in [-1, 1].
    """

    def __init__(self, vectors: np.ndarray, k_neighbors: int = 20,
                 min_similarity: float = 0.5):
        """
        Build the core graph.

        Args:
            vectors: 2-D array with one vector per row.
            k_neighbors: number of nearest neighbours considered per node.
            min_similarity: an edge is only created when the cosine
                similarity of its endpoints exceeds this value.
        """
        self.vectors = vectors.astype(np.float32)
        # Unit-length copy used for every cosine computation.
        self.unit_vectors = self._normalize_rows(self.vectors)
        self.n_vectors = len(vectors)
        self.k_neighbors = k_neighbors
        self.min_similarity = min_similarity
        self.graph = nx.Graph()

        self._build_graph()

    @staticmethod
    def _normalize_rows(matrix: np.ndarray) -> np.ndarray:
        """Return *matrix* with every row scaled to unit L2 norm.

        All-zero rows are left as zeros instead of producing NaNs.
        """
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / np.maximum(norms, 1e-12)

    def _build_graph(self):
        """Add one node per vector and connect each node to its kNN."""
        logger.info(f"开始构建核心图，节点数: {self.n_vectors}, 邻居数: {self.k_neighbors}")

        for i in range(self.n_vectors):
            self.graph.add_node(i, vector=self.vectors[i])

        # With fewer than two vectors there is nothing to connect.
        if self.n_vectors >= 2 and self.k_neighbors > 0:
            # Inner product on unit vectors equals cosine similarity.
            index = faiss.IndexFlatIP(self.vectors.shape[1])
            unit = np.ascontiguousarray(self.unit_vectors, dtype=np.float32)
            index.add(unit)

            # +1 because each node normally retrieves itself as well.
            k_search = min(self.k_neighbors + 1, self.n_vectors)
            distances, indices = index.search(unit, k_search)

            for i in range(self.n_vectors):
                # Drop the self-match by id rather than by position: with
                # duplicate vectors the node itself is not guaranteed to be
                # ranked first. Negative ids mark "no result".
                neighbors = [
                    (int(n), float(s))
                    for n, s in zip(indices[i], distances[i])
                    if n >= 0 and n != i
                ][:self.k_neighbors]
                for neighbor_idx, similarity in neighbors:
                    if similarity > self.min_similarity:
                        self.graph.add_edge(i, neighbor_idx, weight=similarity)

        logger.info(f"核心图构建完成，节点数: {self.graph.number_of_nodes()}, 边数: {self.graph.number_of_edges()}")

    def knn_search(self, query_vector: np.ndarray, k: int = 10) -> List[Tuple[int, float]]:
        """
        Exact cosine kNN search over the core vectors.

        Args:
            query_vector: 1-D query vector (need not be normalised).
            k: number of neighbours to return.

        Returns:
            List of (node_id, cosine_similarity) tuples, best match first.
        """
        query = np.asarray(query_vector, dtype=np.float32)
        query = query / max(float(np.linalg.norm(query)), 1e-12)

        # Both sides are unit length, so the dot product is the cosine.
        similarities = self.unit_vectors @ query

        top_k_indices = np.argsort(similarities)[::-1][:max(k, 0)]
        return [(int(idx), float(similarities[idx])) for idx in top_k_indices]

    def get_graph_stats(self) -> Dict:
        """Return node/edge counts, average degree and connectivity info.

        An empty graph reports an average degree of 0.0 and is treated as
        not connected instead of raising.
        """
        n_nodes = self.graph.number_of_nodes()
        degree_sum = sum(degree for _, degree in self.graph.degree())
        return {
            'nodes': n_nodes,
            'edges': self.graph.number_of_edges(),
            'avg_degree': degree_sum / n_nodes if n_nodes else 0.0,
            'is_connected': nx.is_connected(self.graph) if n_nodes else False,
            'num_components': nx.number_connected_components(self.graph)
        }

# Build the core graph over all library vectors.
print("构建核心图...")
core_graph = CoreGraph(library_vectors, k_neighbors=20)

# Print every structural statistic reported by the graph.
stats = core_graph.get_graph_stats()
print("核心图统计信息:")
for stat_name, stat_value in stats.items():
    print(f"  {stat_name}: {stat_value}")

print("✅ Step 2: 核心图构建完成")


In [None]:
# Exercise the core-graph query path
print("测试核心图查询功能...")

# kNN search with an ID query
test_id_query = id_queries[0]
id_results = core_graph.knn_search(test_id_query, k=5)
print(f"ID查询测试结果 (前5个):")
for i, (node_id, similarity) in enumerate(id_results):
    print(f"  {i+1}. 节点{node_id}, 相似度: {similarity:.4f}")

# kNN search with an OOD query
test_ood_query = ood_queries[0]
ood_results = core_graph.knn_search(test_ood_query, k=5)
print(f"\nOOD查询测试结果 (前5个):")
for i, (node_id, similarity) in enumerate(ood_results):
    print(f"  {i+1}. 节点{node_id}, 相似度: {similarity:.4f}")

# Plot structural properties of the core graph
plt.figure(figsize=(15, 5))

# Subplot 1: node degree distribution
degrees = [d for n, d in core_graph.graph.degree()]
plt.subplot(1, 3, 1)
plt.hist(degrees, bins=20, alpha=0.7, color='blue')
plt.title('核心图度分布')
plt.xlabel('节点度数')
plt.ylabel('频次')
plt.grid(True, alpha=0.3)

# Subplot 2: edge weight distribution (weights are the similarities stored
# on the edges when the graph was built)
weights = [core_graph.graph[u][v]['weight'] for u, v in core_graph.graph.edges()]
plt.subplot(1, 3, 2)
plt.hist(weights, bins=20, alpha=0.7, color='green')
plt.title('边权重分布')
plt.xlabel('余弦相似度')
plt.ylabel('频次')
plt.grid(True, alpha=0.3)

# Subplot 3: size distribution of the connected components
components = list(nx.connected_components(core_graph.graph))
component_sizes = [len(comp) for comp in components]
plt.subplot(1, 3, 3)
plt.hist(component_sizes, bins=20, alpha=0.7, color='red')
plt.title('连通组件大小分布')
plt.xlabel('组件大小')
plt.ylabel('频次')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Connectivity summary
print(f"核心图连通性分析:")
print(f"  连通组件数量: {len(components)}")
print(f"  最大组件大小: {max(component_sizes) if component_sizes else 0}")
print(f"  平均组件大小: {np.mean(component_sizes):.2f}")

print("✅ Step 2: 核心图构建与测试完成")


## Step 3: 构建边缘OOD图

构建边缘图用于处理OOD节点：
- 初始化边缘图结构
- 为OOD节点计算到核心图的距离
- 添加跨簇长边连接
- 实现在线插入新OOD节点功能


In [None]:
class OODGraph:
    """Edge graph that stores OOD nodes and links them to the core graph.

    OOD nodes live in their own NetworkX graph (`ood_graph`). Each OOD node
    additionally keeps a list of "long edges" to its most similar core
    nodes in `ood_to_core_edges`. All similarities are cosine similarities,
    so the OOD score stays meaningful regardless of vector norms.
    """

    def __init__(self, core_graph: "CoreGraph", max_long_edges: int = 5):
        """
        Create an empty OOD graph attached to *core_graph*.

        Args:
            core_graph: the core graph instance (its `vectors` and
                `n_vectors` attributes are read).
            max_long_edges: maximum number of long edges from one OOD node
                to the core graph.
        """
        self.core_graph = core_graph
        self.max_long_edges = max_long_edges
        self.ood_graph = nx.Graph()  # graph among OOD nodes only
        self.ood_to_core_edges = {}  # ood_id -> [(core_id, similarity), ...]
        self.ood_vectors = {}  # ood_id -> vector
        # OOD ids continue after the core ids so the two id ranges never clash.
        self.next_ood_id = core_graph.n_vectors

        logger.info(f"初始化OOD图，核心图节点数: {core_graph.n_vectors}")

    @staticmethod
    def _cosine_similarities(matrix: np.ndarray, vector: np.ndarray) -> np.ndarray:
        """Return the cosine similarity between every row of *matrix* and *vector*.

        Zero-norm rows or a zero-norm vector yield similarity 0 rather than NaN.
        """
        matrix = np.asarray(matrix, dtype=np.float64)
        vector = np.asarray(vector, dtype=np.float64)
        denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
        return (matrix @ vector) / np.maximum(denom, 1e-12)

    def compute_ood_score(self, vector: np.ndarray) -> float:
        """
        Compute the OOD score of *vector* relative to the core graph.

        The score is ``1 - max cosine similarity`` to any core vector,
        clipped to [0, 1]; higher means more likely OOD.

        Args:
            vector: vector to evaluate.

        Returns:
            OOD score in [0, 1].
        """
        similarities = self._cosine_similarities(self.core_graph.vectors, vector)
        max_similarity = float(np.max(similarities))
        return min(max(1 - max_similarity, 0), 1)

    def add_ood_node(self, vector: np.ndarray, ood_threshold: float = 0.3) -> Optional[int]:
        """
        Insert *vector* as a new OOD node if its OOD score is high enough.

        Args:
            vector: vector of the new node.
            ood_threshold: minimum OOD score required for insertion.

        Returns:
            The id of the new node, or None when the score is below the
            threshold and nothing was inserted.
        """
        ood_score = self.compute_ood_score(vector)

        if ood_score < ood_threshold:
            logger.info(f"向量OOD-score {ood_score:.3f} < 阈值 {ood_threshold}，不添加为OOD节点")
            return None

        ood_id = self.next_ood_id
        self.next_ood_id += 1

        self.ood_graph.add_node(ood_id, vector=vector, ood_score=ood_score)
        self.ood_vectors[ood_id] = vector.copy()

        # Long edges into the core graph, then edges to earlier OOD nodes.
        self._add_long_edges_to_core(ood_id, vector)
        self._add_ood_to_ood_edges(ood_id, vector)

        logger.info(f"添加OOD节点 {ood_id}，OOD-score: {ood_score:.3f}")
        return ood_id

    def _add_long_edges_to_core(self, ood_id: int, vector: np.ndarray):
        """Record long edges from *ood_id* to its most similar core nodes."""
        similarities = self._cosine_similarities(self.core_graph.vectors, vector)

        top_k = min(self.max_long_edges, len(similarities))
        top_indices = np.argsort(similarities)[::-1][:top_k]

        # The threshold is deliberately low: OOD nodes are by definition
        # only weakly similar to the core, but still need an entry point.
        self.ood_to_core_edges[ood_id] = [
            (int(core_id), float(similarities[core_id]))
            for core_id in top_indices
            if similarities[core_id] > 0.1
        ]

    def _add_ood_to_ood_edges(self, new_ood_id: int, vector: np.ndarray):
        """Connect *new_ood_id* to every other OOD node that is similar enough."""
        others = [(oid, vec) for oid, vec in self.ood_vectors.items() if oid != new_ood_id]
        if not others:
            return

        other_ids = [oid for oid, _ in others]
        other_vectors = np.array([vec for _, vec in others])
        similarities = self._cosine_similarities(other_vectors, vector)

        for other_id, similarity in zip(other_ids, similarities):
            if similarity > 0.2:  # connection threshold between OOD nodes
                self.ood_graph.add_edge(new_ood_id, other_id, weight=float(similarity))

    def get_ood_stats(self) -> Dict:
        """Return summary statistics of the OOD graph.

        `avg_ood_score` is NaN while the graph has no nodes.
        """
        scores = [data['ood_score'] for _, data in self.ood_graph.nodes(data=True)]
        n_nodes = len(self.ood_graph.nodes())
        return {
            'ood_nodes': n_nodes,
            'ood_edges': len(self.ood_graph.edges()),
            'long_edges_to_core': sum(len(edges) for edges in self.ood_to_core_edges.values()),
            'avg_ood_score': float(np.mean(scores)) if scores else float('nan'),
            'is_connected': nx.is_connected(self.ood_graph) if n_nodes > 1 else True
        }

    def visualize_ood_structure(self):
        """Plot the OOD nodes (2-D PCA), their scores and long-edge counts."""
        node_ids = list(self.ood_graph.nodes())
        if not node_ids:
            print("没有OOD节点可可视化")
            return

        ood_scores = [self.ood_graph.nodes[node]['ood_score'] for node in node_ids]

        plt.figure(figsize=(15, 5))

        # Subplot 1: OOD node distribution (2-D PCA). A 2-component PCA needs
        # at least two samples, so a single node is not projected.
        plt.subplot(1, 3, 1)
        if len(node_ids) >= 2:
            ood_vectors_array = np.array([self.ood_vectors[node] for node in node_ids])
            ood_pca = PCA(n_components=2).fit_transform(ood_vectors_array)
            scatter = plt.scatter(ood_pca[:, 0], ood_pca[:, 1], c=ood_scores,
                                  cmap='Reds', s=50, alpha=0.7)
            plt.colorbar(scatter, label='OOD Score')
        plt.title('OOD节点分布')
        plt.xlabel('PC1')
        plt.ylabel('PC2')
        plt.grid(True, alpha=0.3)

        # Subplot 2: OOD score distribution
        plt.subplot(1, 3, 2)
        plt.hist(ood_scores, bins=10, alpha=0.7, color='red')
        plt.title('OOD-score分布')
        plt.xlabel('OOD Score')
        plt.ylabel('频次')
        plt.grid(True, alpha=0.3)

        # Subplot 3: number of long edges per OOD node
        plt.subplot(1, 3, 3)
        long_edge_counts = [len(edges) for edges in self.ood_to_core_edges.values()]
        max_count = max(long_edge_counts, default=0)
        plt.hist(long_edge_counts, bins=range(max_count + 2), alpha=0.7, color='orange')
        plt.title('长边连接数分布')
        plt.xlabel('连接到核心图的边数')
        plt.ylabel('频次')
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

# Instantiate the OOD graph on top of the core graph.
print("创建OOD图...")
ood_graph = OODGraph(core_graph, max_long_edges=3)

# Feed the first ten OOD queries into the graph as example OOD nodes and
# remember the ids of those that were actually accepted.
print("添加OOD查询作为OOD节点...")
ood_nodes_added = []
for ood_query in ood_queries[:10]:
    ood_id = ood_graph.add_ood_node(ood_query, ood_threshold=0.2)
    if ood_id is None:
        continue
    ood_nodes_added.append(ood_id)

print(f"成功添加 {len(ood_nodes_added)} 个OOD节点")

# Report the statistics of the resulting OOD graph.
ood_stats = ood_graph.get_ood_stats()
print("OOD图统计信息:")
for stat_name, stat_value in ood_stats.items():
    print(f"  {stat_name}: {stat_value}")

print("✅ Step 3: 边缘OOD图构建完成")


In [None]:
# Plot the current structure of the OOD graph.
ood_graph.visualize_ood_structure()

# Exercise the online-insertion path with one freshly generated vector.
print("\n测试在线插入新OOD节点...")

# The sample uses a larger variance than the OOD queries, i.e. it is meant
# to be a more extreme outlier.
test_ood_vector = np.random.randn(VECTOR_DIM) * 4 + np.random.randn(VECTOR_DIM) * 2
test_ood_score = ood_graph.compute_ood_score(test_ood_vector)
print(f"测试向量OOD-score: {test_ood_score:.3f}")

# Try to insert it; None means the score was below the threshold.
new_ood_id = ood_graph.add_ood_node(test_ood_vector, ood_threshold=0.15)
if new_ood_id is None:
    print("测试向量不够OOD，未添加为OOD节点")
else:
    print(f"成功添加新的OOD节点: {new_ood_id}")

    # List the long edges that were created for the new node.
    long_edges = ood_graph.ood_to_core_edges[new_ood_id]
    print(f"新节点连接到核心图的边数: {len(long_edges)}")
    for core_id, similarity in long_edges:
        print(f"  连接到核心节点 {core_id}, 相似度: {similarity:.4f}")

# Statistics after the insertion attempt.
updated_stats = ood_graph.get_ood_stats()
print("\n更新后的OOD图统计信息:")
for stat_name, stat_value in updated_stats.items():
    print(f"  {stat_name}: {stat_value}")

print("✅ Step 3: 边缘OOD图构建与测试完成")


## Step 4: OOD-score机制

实现更完善的OOD-score计算和节点策略决策：
- 基于局部密度的OOD-score计算
- 根据OOD-score动态调整图结构策略
- 测试不同阈值下的查询可达性


In [None]:
class EnhancedOODGraph(OODGraph):
    """OOD graph with a density-aware OOD score and an adaptive insert policy."""

    def __init__(self, core_graph: CoreGraph, max_long_edges: int = 5):
        super().__init__(core_graph, max_long_edges)
        self.ood_score_cache = {}  # reserved for caching OOD scores

    @staticmethod
    def _cosine_similarities(matrix: np.ndarray, vector: np.ndarray) -> np.ndarray:
        """Return the cosine similarity between every row of *matrix* and *vector*.

        Zero-norm rows or a zero-norm vector yield similarity 0 rather than NaN.
        """
        matrix = np.asarray(matrix, dtype=np.float64)
        vector = np.asarray(vector, dtype=np.float64)
        denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
        return (matrix @ vector) / np.maximum(denom, 1e-12)

    def compute_enhanced_ood_score(self, vector: np.ndarray, k_neighbors: int = 20) -> float:
        """
        Compute the enhanced OOD score (max similarity + local density).

        Args:
            vector: vector to evaluate.
            k_neighbors: number of nearest neighbours used for the density term.

        Returns:
            Enhanced OOD score in [0, 1]; higher means more likely OOD.
        """
        # 1. Highest cosine similarity to any core vector.
        core_similarities = self._cosine_similarities(self.core_graph.vectors, vector)
        max_core_similarity = float(np.max(core_similarities))

        # 2. Local density: mean cosine similarity of the k nearest
        #    neighbours among core vectors and already stored OOD vectors.
        all_similarities = core_similarities
        if self.ood_vectors:
            ood_vectors_array = np.array(list(self.ood_vectors.values()))
            ood_similarities = self._cosine_similarities(ood_vectors_array, vector)
            all_similarities = np.concatenate([core_similarities, ood_similarities])

        k = min(k_neighbors, len(all_similarities))
        top_k_similarities = np.sort(all_similarities)[::-1][:k]
        local_density = float(np.mean(top_k_similarities))

        # 3. Weighted combination of both terms, clipped to [0, 1].
        similarity_score = 1 - max_core_similarity
        density_score = 1 - local_density
        enhanced_score = 0.6 * similarity_score + 0.4 * density_score

        return min(max(enhanced_score, 0), 1)

    def adaptive_node_strategy(self, vector: np.ndarray,
                               low_ood_threshold: float = 0.2,
                               high_ood_threshold: float = 0.6) -> Dict:
        """
        Decide how a vector should be handled based on its enhanced OOD score.

        Args:
            vector: vector to evaluate.
            low_ood_threshold: below this score the vector is skipped.
            high_ood_threshold: at or above this score the vector gets
                enhanced connectivity.

        Returns:
            Dict with keys 'ood_score', 'action' ('skip', 'add_standard' or
            'add_enhanced'), 'max_long_edges', 'connectivity_level' and
            'reason'.
        """
        ood_score = self.compute_enhanced_ood_score(vector)

        strategy = {
            'ood_score': ood_score,
            'action': None,
            'max_long_edges': self.max_long_edges,
            'connectivity_level': 'normal'
        }

        if ood_score < low_ood_threshold:
            # Low score: probably ID data, no special handling.
            strategy['action'] = 'skip'
            strategy['reason'] = 'Low OOD score, likely ID data'
        elif ood_score < high_ood_threshold:
            # Medium score: standard OOD handling.
            strategy['action'] = 'add_standard'
            strategy['reason'] = 'Moderate OOD score, standard handling'
        else:
            # High score: double the long edges so the node stays reachable.
            strategy['action'] = 'add_enhanced'
            strategy['max_long_edges'] = self.max_long_edges * 2
            strategy['connectivity_level'] = 'high'
            strategy['reason'] = 'High OOD score, enhanced connectivity needed'

        return strategy

    def add_ood_node_with_strategy(self, vector: np.ndarray,
                                   low_threshold: float = 0.2,
                                   high_threshold: float = 0.6) -> Optional[Dict]:
        """
        Insert *vector* according to the adaptive strategy.

        Args:
            vector: vector to insert.
            low_threshold: forwarded to `adaptive_node_strategy`.
            high_threshold: forwarded to `adaptive_node_strategy`.

        Returns:
            Dict with 'ood_id', 'strategy', 'success' and
            'extra_edges_added', or None when the strategy is 'skip'.
        """
        strategy = self.adaptive_node_strategy(vector, low_threshold, high_threshold)

        if strategy['action'] == 'skip':
            return None

        # Temporarily apply the strategy's edge budget; try/finally ensures
        # the original value is restored even if insertion raises.
        original_max_edges = self.max_long_edges
        self.max_long_edges = strategy['max_long_edges']
        try:
            # The enhanced score already decided that the node is added, so
            # the basic-score gate of add_ood_node is disabled (threshold 0)
            # instead of re-judging the vector with a different score.
            ood_id = self.add_ood_node(vector, ood_threshold=0.0)
        finally:
            self.max_long_edges = original_max_edges

        if ood_id is None:
            return None

        extra_edges_added = strategy['action'] == 'add_enhanced'
        if extra_edges_added:
            self._add_extra_long_edges(ood_id, vector)

        return {
            'ood_id': ood_id,
            'strategy': strategy,
            'success': True,
            'extra_edges_added': extra_edges_added
        }

    def _add_extra_long_edges(self, ood_id: int, vector: np.ndarray,
                              max_extra_edges: int = 10,
                              min_similarity: float = 0.05):
        """Add up to *max_extra_edges* further long edges for a high-score node.

        Core nodes that are already linked to *ood_id* are skipped so that
        no edge is recorded twice.
        """
        similarities = self._cosine_similarities(self.core_graph.vectors, vector)
        already_linked = {int(core_id) for core_id, _ in self.ood_to_core_edges.get(ood_id, [])}

        extra_edges = []
        for core_id in np.argsort(similarities)[::-1]:
            if len(extra_edges) >= max_extra_edges:
                break
            similarity = float(similarities[core_id])
            if similarity <= min_similarity:
                # Sorted in descending order: nothing better follows.
                break
            if int(core_id) in already_linked:
                continue
            extra_edges.append((int(core_id), similarity))

        self.ood_to_core_edges.setdefault(ood_id, []).extend(extra_edges)

        logger.info(f"为OOD节点 {ood_id} 添加了 {len(extra_edges)} 条额外长边")

    def test_different_thresholds(self, test_vectors: List[np.ndarray]) -> Dict:
        """Return the OOD detection rate on *test_vectors* for a set of thresholds.

        Returns:
            Dict mapping each threshold to 'detection_rate',
            'detected_count' and 'total_count'. With no test vectors the
            detection rate is reported as 0.0.
        """
        thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

        # The score does not depend on the threshold: compute it once.
        scores = [self.compute_enhanced_ood_score(vector) for vector in test_vectors]
        total_count = len(scores)

        results = {}
        for threshold in thresholds:
            detected_count = sum(1 for score in scores if score >= threshold)
            results[threshold] = {
                'detection_rate': detected_count / total_count if total_count else 0.0,
                'detected_count': detected_count,
                'total_count': total_count
            }

        return results

# Instantiate the enhanced OOD graph on top of the core graph.
print("创建增强版OOD图...")
enhanced_ood_graph = EnhancedOODGraph(core_graph, max_long_edges=3)

# Compare the basic and the enhanced OOD score on one ID query and two
# OOD queries (only the first entry of test_vectors is an ID query).
print("测试增强版OOD-score计算...")
test_vectors = [id_queries[0], ood_queries[0], ood_queries[1]]

for i, vector in enumerate(test_vectors):
    if i == 0:
        vector_type = "ID"
    else:
        vector_type = "OOD"

    basic_score = ood_graph.compute_ood_score(vector)
    enhanced_score = enhanced_ood_graph.compute_enhanced_ood_score(vector)
    print(f"{vector_type}向量 {i}: 基础OOD-score: {basic_score:.3f}, 增强OOD-score: {enhanced_score:.3f}")

print("✅ Step 4: OOD-score机制实现完成")


## Step 5: 局部扰动增强

为高OOD-score节点增加少量随机长边，提升OOD查询的召回效果。


## Step 7: 查询测试与性能验证

实现统一的查询接口，测试不同分布查询的召回和可达性：
- 实现分层图查询接口
- 测试ID、OOD、混合查询的召回率
- 可视化查询路径和性能对比


In [None]:
class HierarchicalGraphQuery:
    """Unified query front end over the core graph and the OOD graph."""

    _STRATEGIES = ("core_only", "ood_enhanced", "adaptive")

    def __init__(self, core_graph: "CoreGraph", ood_graph):
        """
        Create the query front end.

        Args:
            core_graph: core graph instance; must provide
                ``knn_search(query_vector, k)``.
            ood_graph: OOD graph instance; must provide ``ood_vectors`` and
                ``compute_enhanced_ood_score(query_vector)``.
        """
        self.core_graph = core_graph
        self.ood_graph = ood_graph
        self.query_stats = {
            'total_queries': 0,
            'core_only_queries': 0,
            'ood_enhanced_queries': 0,
            'avg_query_time': 0
        }

    def _search_ood_nodes(self, query_vector: np.ndarray, k: int) -> List[Tuple]:
        """Return the k OOD nodes most cosine-similar to *query_vector*."""
        ood_vectors = self.ood_graph.ood_vectors
        if not ood_vectors or k <= 0:
            return []

        ood_ids = list(ood_vectors)
        matrix = np.array([ood_vectors[oid] for oid in ood_ids], dtype=np.float64)
        query = np.asarray(query_vector, dtype=np.float64)
        # Normalise both sides so the scores are comparable with the cosine
        # similarities of the core results.
        denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
        similarities = (matrix @ query) / np.maximum(denom, 1e-12)

        order = np.argsort(similarities)[::-1][:k]
        return [(ood_ids[i], float(similarities[i]), 'ood') for i in order]

    def hierarchical_search(self, query_vector: np.ndarray, k: int = 10,
                            search_strategy: str = "adaptive",
                            ood_threshold: float = 0.4) -> Dict:
        """
        Search the hierarchical index.

        Args:
            query_vector: query vector.
            k: number of results to return.
            search_strategy: "core_only", "ood_enhanced" or "adaptive".
                "adaptive" picks "ood_enhanced" when the query's enhanced
                OOD score exceeds *ood_threshold*, else "core_only".
            ood_threshold: score threshold used by the adaptive strategy.

        Returns:
            Dict with 'query_vector', 'k', 'search_strategy' (the strategy
            actually used), 'results' (list of (node_id, similarity,
            'core' | 'ood')), 'ood_score' (None only for an explicit
            "core_only" search), 'query_time' and 'search_path'.

        Raises:
            ValueError: if *search_strategy* is not one of the three
                supported values.
        """
        if search_strategy not in self._STRATEGIES:
            raise ValueError(
                f"Unknown search_strategy {search_strategy!r}; expected one of {self._STRATEGIES}"
            )

        start_time = time.perf_counter()

        # The score is computed once and reused both for the routing
        # decision and for the report.
        ood_score = None
        if search_strategy != "core_only":
            ood_score = self.ood_graph.compute_enhanced_ood_score(query_vector)
        if search_strategy == "adaptive":
            search_strategy = "ood_enhanced" if ood_score > ood_threshold else "core_only"

        results = {
            'query_vector': query_vector,
            'k': k,
            'search_strategy': search_strategy,
            'results': [],
            'ood_score': ood_score,
            'query_time': 0,
            'search_path': []
        }

        core_results = self.core_graph.knn_search(query_vector, k)
        core_candidates = [(node_id, sim, 'core') for node_id, sim in core_results]

        if search_strategy == "core_only":
            results['results'] = core_candidates
            results['search_path'] = ['core_graph']
            self.query_stats['core_only_queries'] += 1
        else:
            # Take k candidates from each layer and keep the global top-k.
            # Taking only k // 2 from each side would return fewer than k
            # results (none at all for k == 1).
            merged = core_candidates + self._search_ood_nodes(query_vector, k)
            merged.sort(key=lambda item: item[1], reverse=True)
            results['results'] = merged[:k]
            results['search_path'] = ['core_graph', 'ood_graph']
            self.query_stats['ood_enhanced_queries'] += 1

        query_time = time.perf_counter() - start_time
        results['query_time'] = query_time

        # Running average of the query time.
        self.query_stats['total_queries'] += 1
        total_queries = self.query_stats['total_queries']
        previous_total_time = self.query_stats['avg_query_time'] * (total_queries - 1)
        self.query_stats['avg_query_time'] = (previous_total_time + query_time) / total_queries

        return results

    def batch_query_test(self, query_vectors: List[np.ndarray],
                         query_labels: List[str], k: int = 10) -> Dict:
        """
        Run every query with the adaptive strategy and group the results.

        Args:
            query_vectors: list of query vectors.
            query_labels: label per query; 'ID' and 'OOD' get their own
                bucket, anything else goes to 'mixed_queries'.
            k: number of results per query.

        Returns:
            Dict with 'id_queries', 'ood_queries', 'mixed_queries' and
            'performance_stats' (empty when no query was run).
        """
        batch_results = {
            'id_queries': [],
            'ood_queries': [],
            'mixed_queries': [],
            'performance_stats': {}
        }
        bucket_for_label = {'ID': 'id_queries', 'OOD': 'ood_queries'}

        for vector, label in zip(query_vectors, query_labels):
            result = self.hierarchical_search(vector, k, "adaptive")
            batch_results[bucket_for_label.get(label, 'mixed_queries')].append(result)

        all_results = (batch_results['id_queries'] +
                       batch_results['ood_queries'] +
                       batch_results['mixed_queries'])

        if all_results:
            n_results = len(all_results)
            n_core_only = sum(1 for r in all_results if r['search_strategy'] == 'core_only')
            n_ood_enhanced = sum(1 for r in all_results if r['search_strategy'] == 'ood_enhanced')
            batch_results['performance_stats'] = {
                'avg_query_time': np.mean([r['query_time'] for r in all_results]),
                'total_queries': n_results,
                'avg_results_per_query': np.mean([len(r['results']) for r in all_results]),
                'core_only_ratio': n_core_only / n_results,
                'ood_enhanced_ratio': n_ood_enhanced / n_results
            }

        return batch_results

    @staticmethod
    def _safe_mean(values) -> float:
        """Return the mean of *values*, or 0.0 when *values* is empty."""
        values = list(values)
        return float(np.mean(values)) if values else 0.0

    def visualize_query_performance(self, batch_results: Dict):
        """Plot timing, score, result-count and similarity statistics of a batch."""
        id_runs = batch_results['id_queries']
        ood_runs = batch_results['ood_queries']
        if not id_runs and not ood_runs:
            print("没有查询结果可可视化")
            return

        perf = batch_results.get('performance_stats', {})

        id_times = [r['query_time'] for r in id_runs]
        ood_times = [r['query_time'] for r in ood_runs]
        id_result_counts = [len(r['results']) for r in id_runs]
        ood_result_counts = [len(r['results']) for r in ood_runs]
        # Mean similarity per query; a query without results counts as 0.0.
        id_avg_sims = [self._safe_mean(sim for _, sim, _ in r['results']) for r in id_runs]
        ood_avg_sims = [self._safe_mean(sim for _, sim, _ in r['results']) for r in ood_runs]

        plt.figure(figsize=(15, 10))

        # Subplot 1: query time distribution
        plt.subplot(2, 3, 1)
        plt.hist([id_times, ood_times], bins=10, alpha=0.7,
                 label=['ID查询', 'OOD查询'], color=['green', 'red'])
        plt.title('查询时间分布')
        plt.xlabel('查询时间 (秒)')
        plt.ylabel('频次')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Subplot 2: OOD score distribution of the OOD queries
        plt.subplot(2, 3, 2)
        ood_scores = [r['ood_score'] for r in ood_runs if r['ood_score'] is not None]
        if ood_scores:
            plt.hist(ood_scores, bins=10, alpha=0.7, color='red')
            plt.title('OOD查询的OOD-score分布')
            plt.xlabel('OOD Score')
            plt.ylabel('频次')
            plt.grid(True, alpha=0.3)

        # Subplot 3: number of results per query
        plt.subplot(2, 3, 3)
        plt.hist([id_result_counts, ood_result_counts], bins=10, alpha=0.7,
                 label=['ID查询', 'OOD查询'], color=['green', 'red'])
        plt.title('搜索结果数量分布')
        plt.xlabel('结果数量')
        plt.ylabel('频次')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Subplot 4: share of each search strategy
        plt.subplot(2, 3, 4)
        strategies = ['core_only', 'ood_enhanced']
        counts = [perf.get('core_only_ratio', 0.0), perf.get('ood_enhanced_ratio', 0.0)]
        plt.bar(strategies, counts, color=['blue', 'orange'], alpha=0.7)
        plt.title('搜索策略使用比例')
        plt.ylabel('比例')
        plt.grid(True, alpha=0.3)

        # Subplot 5: mean similarity per query
        plt.subplot(2, 3, 5)
        plt.hist([id_avg_sims, ood_avg_sims], bins=10, alpha=0.7,
                 label=['ID查询', 'OOD查询'], color=['green', 'red'])
        plt.title('平均相似度分布')
        plt.xlabel('平均相似度')
        plt.ylabel('频次')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Subplot 6: side-by-side comparison of the aggregated metrics
        plt.subplot(2, 3, 6)
        metrics = ['查询时间', '结果数量', '平均相似度']
        id_values = [
            self._safe_mean(id_times),
            self._safe_mean(id_result_counts),
            self._safe_mean(id_avg_sims)
        ]
        ood_values = [
            self._safe_mean(ood_times),
            self._safe_mean(ood_result_counts),
            self._safe_mean(ood_avg_sims)
        ]

        x = np.arange(len(metrics))
        width = 0.35

        plt.bar(x - width/2, id_values, width, label='ID查询', alpha=0.7, color='green')
        plt.bar(x + width/2, ood_values, width, label='OOD查询', alpha=0.7, color='red')

        plt.title('性能指标对比')
        plt.xlabel('指标')
        plt.ylabel('值')
        plt.xticks(x, metrics, rotation=45)
        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

# Create the hierarchical graph query instance.
# It is constructed from `core_graph` and `enhanced_ood_graph`, both of which
# are defined outside this cell (earlier in the notebook).
print("创建分层图查询实例...")
hierarchical_query = HierarchicalGraphQuery(core_graph, enhanced_ood_graph)

print("✅ Step 7: 查询接口实现完成")


In [None]:
# Step 7 test cell: exercise the hierarchical query interface.
# It runs (1) one ID query and one OOD query individually, (2) a labelled
# batch of queries, (3) the performance visualisation, and (4) prints the
# counters stored on `hierarchical_query.query_stats`.
print("测试分层图查询功能...")

# --- 1. Single-query test -------------------------------------------------
print("1. 单个查询测试")
test_id_query = id_queries[0]
test_ood_query = ood_queries[0]

# In-distribution (ID) query using the adaptive search strategy.
id_result = hierarchical_query.hierarchical_search(test_id_query, k=5, search_strategy="adaptive")
print("ID查询结果:")
print(f"  搜索策略: {id_result['search_strategy']}")
print(f"  查询时间: {id_result['query_time']:.4f}秒")
print(f"  结果数量: {len(id_result['results'])}")
print(f"  前3个结果: {id_result['results'][:3]}")

# Out-of-distribution (OOD) query using the adaptive search strategy.
ood_result = hierarchical_query.hierarchical_search(test_ood_query, k=5, search_strategy="adaptive")
print("\nOOD查询结果:")
print(f"  搜索策略: {ood_result['search_strategy']}")
print(f"  OOD-score: {ood_result['ood_score']:.3f}")
print(f"  查询时间: {ood_result['query_time']:.4f}秒")
print(f"  结果数量: {len(ood_result['results'])}")
print(f"  前3个结果: {ood_result['results'][:3]}")

# --- 2. Batch test --------------------------------------------------------
print("\n2. 批量查询测试")
# Take up to 10 queries of each kind. The labels are derived from the slices
# actually obtained, so queries and labels stay aligned even when fewer than
# 10 ID or OOD queries exist (slicing past the end silently returns fewer).
batch_id_queries = list(id_queries[:10])
batch_ood_queries = list(ood_queries[:10])
test_queries = batch_id_queries + batch_ood_queries
test_labels = ['ID'] * len(batch_id_queries) + ['OOD'] * len(batch_ood_queries)

batch_results = hierarchical_query.batch_query_test(test_queries, test_labels, k=8)

print("批量测试结果:")
print(f"  ID查询数量: {len(batch_results['id_queries'])}")
print(f"  OOD查询数量: {len(batch_results['ood_queries'])}")
print(f"  平均查询时间: {batch_results['performance_stats']['avg_query_time']:.4f}秒")
print(f"  核心图搜索比例: {batch_results['performance_stats']['core_only_ratio']:.3f}")
print(f"  增强搜索比例: {batch_results['performance_stats']['ood_enhanced_ratio']:.3f}")

# --- 3. Visualise query performance ---------------------------------------
print("\n3. 可视化查询性能")
hierarchical_query.visualize_query_performance(batch_results)

# --- 4. Query statistics --------------------------------------------------
print("\n4. 查询统计信息")
query_stats = hierarchical_query.query_stats
print(f"  总查询次数: {query_stats['total_queries']}")
print(f"  核心图查询次数: {query_stats['core_only_queries']}")
print(f"  增强查询次数: {query_stats['ood_enhanced_queries']}")
print(f"  平均查询时间: {query_stats['avg_query_time']:.4f}秒")

print("✅ Step 7: 查询测试与性能验证完成")


## Step 8: Notebook记录与总结

总结整个OOD分层图索引系统的设计思路、实现效果和测试结果。


In [None]:
# Step 8: system summary and evaluation report.
# Every section is emitted with a single print(..., sep="\n"); the resulting
# output is line-for-line the same as printing each entry on its own.

print("=" * 60, "OOD分层图索引系统 - 完整总结", "=" * 60, sep="\n")

# 1. Architecture overview
print(
    "\n1. 系统架构设计:",
    "   ✓ 核心图 (CoreGraph): 存储ID节点，基于余弦相似度构建邻接关系",
    "   ✓ 边缘OOD图 (OODGraph): 存储OOD节点，通过长边连接核心图",
    "   ✓ 增强OOD图 (EnhancedOODGraph): 包含OOD-score机制和自适应策略",
    "   ✓ 分层查询 (HierarchicalGraphQuery): 统一的查询接口",
    sep="\n",
)

# 2. Data scale
print(
    "\n2. 数据规模:",
    f"   ✓ 库向量: {LIBRARY_SIZE} 个 {VECTOR_DIM} 维向量",
    f"   ✓ ID查询: {len(id_queries)} 个",
    f"   ✓ OOD查询: {len(ood_queries)} 个",
    f"   ✓ OOD比例: {OOD_RATIO:.1%}",
    sep="\n",
)

# 3. Core graph statistics (fetched right before they are reported)
core_stats = core_graph.get_graph_stats()
print(
    "\n3. 核心图统计:",
    f"   ✓ 节点数: {core_stats['nodes']}",
    f"   ✓ 边数: {core_stats['edges']}",
    f"   ✓ 平均度数: {core_stats['avg_degree']:.2f}",
    f"   ✓ 连通性: {'连通' if core_stats['is_connected'] else '不连通'}",
    f"   ✓ 连通组件数: {core_stats['num_components']}",
    sep="\n",
)

# 4. OOD graph statistics
ood_stats = enhanced_ood_graph.get_ood_stats()
print(
    "\n4. OOD图统计:",
    f"   ✓ OOD节点数: {ood_stats['ood_nodes']}",
    f"   ✓ OOD边数: {ood_stats['ood_edges']}",
    f"   ✓ 长边总数: {ood_stats['long_edges_to_core']}",
    f"   ✓ 平均OOD-score: {ood_stats['avg_ood_score']:.3f}",
    f"   ✓ OOD图连通性: {'连通' if ood_stats['is_connected'] else '不连通'}",
    sep="\n",
)

# 5. Query performance counters
query_stats = hierarchical_query.query_stats
print(
    "\n5. 查询性能统计:",
    f"   ✓ 总查询次数: {query_stats['total_queries']}",
    f"   ✓ 核心图查询: {query_stats['core_only_queries']}",
    f"   ✓ 增强查询: {query_stats['ood_enhanced_queries']}",
    f"   ✓ 平均查询时间: {query_stats['avg_query_time']:.4f}秒",
    sep="\n",
)

# 6. System characteristics
print(
    "\n6. 系统特性:",
    "   ✓ 自适应OOD检测: 基于局部密度和相似度的OOD-score计算",
    "   ✓ 分层图结构: 核心图处理ID查询，边缘图处理OOD查询",
    "   ✓ 长边连接: OOD节点通过长边连接到核心图，保证可达性",
    "   ✓ 在线增量: 支持新OOD节点的动态插入",
    "   ✓ 查询优化: 根据OOD-score自适应选择搜索策略",
    sep="\n",
)

# 7. Technical highlights
print(
    "\n7. 技术亮点:",
    "   ✓ 使用FAISS加速kNN计算",
    "   ✓ NetworkX构建图结构和图算法",
    "   ✓ PCA降维可视化高维向量分布",
    "   ✓ 多策略OOD-score计算",
    "   ✓ 自适应图结构优化",
    sep="\n",
)

# 8. Visualisation recap
print(
    "\n8. 可视化总结:",
    "   ✓ 向量分布可视化 (PCA降维)",
    "   ✓ 图结构统计 (度分布、边权重分布)",
    "   ✓ OOD-score分布和长边连接分析",
    "   ✓ 查询性能对比 (时间、相似度、策略使用)",
    sep="\n",
)

# Closing box with the design philosophy
print(
    "\n" + "=" * 60,
    "系统设计理念:",
    "  本系统通过分层图结构有效处理OOD查询，核心思想是:",
    "  1. ID数据用高效的核心图处理",
    "  2. OOD数据用边缘图+长边连接处理",
    "  3. 通过OOD-score自适应调整图结构策略",
    "  4. 保证OOD查询的可达性和召回效果",
    "=" * 60,
    sep="\n",
)

print("\n✅ Step 8: Notebook记录与总结完成", "🎉 OOD分层图索引Demo全部完成！", sep="\n")


In [None]:
# Exercise the adaptive node strategy on three representative vectors.
print("测试自适应节点策略...")

# Pair each case label with its vector: an ID query, an unmodified OOD query,
# and an OOD query scaled by 2 to exaggerate its OOD characteristics.
test_cases = list(zip(
    ("ID查询", "轻度OOD", "重度OOD"),
    (id_queries[0], ood_queries[0], ood_queries[1] * 2),
))

for case_name, vector in test_cases:
    strategy = enhanced_ood_graph.adaptive_node_strategy(vector)
    # One print per case; the trailing empty string produces the blank
    # separator line between cases.
    print(
        f"{case_name}:",
        f"  OOD-score: {strategy['ood_score']:.3f}",
        f"  策略: {strategy['action']}",
        f"  原因: {strategy['reason']}",
        "",
        sep="\n",
    )

print("✅ Step 4: OOD-score机制测试完成")


In [None]:
# Step 4 test cell: adaptive strategy checks, threshold sweep with plots,
# strategy-driven node insertion, and the OOD structure visualisation.
print("测试自适应节点策略...")

# Pair each case label with its vector: an ID query, an unmodified OOD query,
# and an OOD query scaled by 2 to exaggerate its OOD characteristics.
test_cases = list(zip(
    ("ID查询", "轻度OOD", "重度OOD"),
    (id_queries[0], ood_queries[0], ood_queries[1] * 2),
))

for case_name, vector in test_cases:
    strategy = enhanced_ood_graph.adaptive_node_strategy(vector)
    # One print per case; the trailing empty string produces the blank
    # separator line between cases.
    print(
        f"{case_name}:",
        f"  OOD-score: {strategy['ood_score']:.3f}",
        f"  策略: {strategy['action']}",
        f"  原因: {strategy['reason']}",
        f"  连接级别: {strategy['connectivity_level']}",
        "",
        sep="\n",
    )

# Sweep OOD-score thresholds over the first 20 OOD queries.
print("测试不同阈值下的OOD检测效果...")
threshold_results = enhanced_ood_graph.test_different_thresholds(ood_queries[:20])

# Derive all plotted series up front; iteration order follows the result dict.
thresholds = list(threshold_results)
detection_rates = [entry['detection_rate'] for entry in threshold_results.values()]
detected_counts = [entry['detected_count'] for entry in threshold_results.values()]

# Two side-by-side panels drawn through the object-oriented Axes API.
fig, (rate_ax, count_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: detection rate as a function of the threshold.
rate_ax.plot(thresholds, detection_rates, 'bo-', linewidth=2, markersize=8)
rate_ax.set_title('OOD检测率 vs 阈值')
rate_ax.set_xlabel('OOD-score阈值')
rate_ax.set_ylabel('检测率')
rate_ax.grid(True, alpha=0.3)
rate_ax.set_ylim(0, 1.1)

# Right panel: number of detected OOD queries as a function of the threshold.
count_ax.plot(thresholds, detected_counts, 'ro-', linewidth=2, markersize=8)
count_ax.set_title('OOD检测数量 vs 阈值')
count_ax.set_xlabel('OOD-score阈值')
count_ax.set_ylabel('检测到的OOD数量')
count_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Per-threshold detail lines.
print("阈值测试详细结果:")
for threshold, results in threshold_results.items():
    print(f"  阈值 {threshold}: 检测率 {results['detection_rate']:.3f}, "
          f"检测数量 {results['detected_count']}/{results['total_count']}")

# Insert the first five OOD queries using the adaptive strategy.
print("\n使用自适应策略添加OOD节点...")
added_nodes_info = []

for i, ood_vector in enumerate(ood_queries[:5]):
    result = enhanced_ood_graph.add_ood_node_with_strategy(ood_vector)
    if result is None:
        # A None result is not recorded or reported for this vector.
        continue
    added_nodes_info.append(result)
    print(f"OOD向量 {i}: 成功添加节点 {result['ood_id']}, "
          f"策略: {result['strategy']['action']}, "
          f"额外边: {result['extra_edges_added']}")

print(f"\n成功添加了 {len(added_nodes_info)} 个OOD节点")

# Visualise the enhanced OOD graph structure.
enhanced_ood_graph.visualize_ood_structure()

print("✅ Step 4: OOD-score机制测试完成")
