<a href="https://colab.research.google.com/github/wangyiyang/RAG-Cookbook-Code/blob/main/ch03/reranking_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install sentence-transformers torch transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-m

In [8]:
"""
重排序算法系统
包含Cross-Encoder、ColBERT等重排序方法
"""

import numpy as np
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass
import torch
import re


def simple_chinese_tokenizer(text: str) -> List[str]:
    """简单的中英文分词器"""
    # 分离中英文和标点符号
    pattern = r'[a-zA-Z]+|[0-9]+|[\u4e00-\u9fff]'
    tokens = re.findall(pattern, text.lower())
    return tokens


@dataclass
class RerankResult:
    """重排序结果"""
    doc_id: str
    content: str
    original_score: float
    rerank_score: float
    final_rank: int


class MockCrossEncoder:
    """模拟Cross-Encoder（实际使用sentence-transformers库）"""

    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"):
        self.model_name = model_name
        self.max_length = 512

    def predict(self, query_doc_pairs: List[Tuple[str, str]]) -> List[float]:
        """预测查询-文档对的相关性分数"""
        scores = []
        for query, doc in query_doc_pairs:
            # 模拟相关性计算（实际使用预训练模型）
            score = self._mock_relevance_score(query, doc)
            scores.append(score)
        return scores

    def _mock_relevance_score(self, query: str, doc: str) -> float:
        """模拟相关性分数计算"""
        # 使用改进的分词器
        query_words = set(simple_chinese_tokenizer(query))
        doc_words = set(simple_chinese_tokenizer(doc))

        if len(query_words) == 0:
            return 0.0

        overlap_ratio = len(query_words & doc_words) / len(query_words)

        # 计算精确匹配关键词的权重（重要改进）
        exact_match_bonus = 0.0
        important_matches = 0
        for q_word in query_words:
            if q_word.lower() in [w.lower() for w in doc_words]:
                # 重要关键词（如RAG）给予更高权重
                if len(q_word) >= 2:  # 降低长度要求，适应中文
                    if q_word.lower() == 'rag':  # RAG是核心关键词
                        exact_match_bonus += 0.4
                        important_matches += 1
                    elif q_word in ['检索', '技术', '原理']:  # 其他重要词
                        exact_match_bonus += 0.2
                        important_matches += 1
                    else:
                        exact_match_bonus += 0.1

        # 考虑文档长度因子
        length_factor = min(1.0, max(0.3, len(simple_chinese_tokenizer(doc)) / 30))

        # 模拟语义相似度（基于内容相关性）
        semantic_factor = 0.3
        # 技术词汇匹配奖励
        tech_words = {'技术', '算法', '检索', '生成', '嵌入', '相似度', '匹配'}
        query_tech = len(set(simple_chinese_tokenizer(query)) & tech_words)
        doc_tech = len(set(simple_chinese_tokenizer(doc)) & tech_words)
        if query_tech > 0 and doc_tech > 0:
            semantic_factor += min(0.2, (query_tech + doc_tech) * 0.05)

        # 主题相关性（根据文档内容判断）
        if 'rag' in doc.lower():
            semantic_factor += 0.3  # RAG相关内容
        elif '检索' in doc and '算法' in doc:
            semantic_factor += 0.2  # 检索算法相关
        elif '向量' in doc and '检索' in doc:
            semantic_factor += 0.15  # 向量检索相关

        # 最终分数计算
        base_score = 0.1
        final_score = base_score + (overlap_ratio * 0.2 + exact_match_bonus + length_factor * 0.15 + semantic_factor)

        return min(1.0, final_score)


class CrossEncoderReRanker:
    """Cross-Encoder重排序器"""

    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"):
        self.model = MockCrossEncoder(model_name)
        self.max_length = 512

    def rerank(self, query: str, documents: List[Dict], top_k: int = 5) -> List[RerankResult]:
        """执行重排序"""
        if not documents:
            return []

        # 1. 构建查询-文档对
        query_doc_pairs = []
        for doc in documents:
            content = doc.get('content', str(doc))
            # 智能截断
            truncated_content = self._intelligent_truncate(content, query)
            query_doc_pairs.append((query, truncated_content))

        # 2. 批量计算相关性分数
        rerank_scores = self.model.predict(query_doc_pairs)

        # 3. 构建结果并排序
        results = []
        for i, (doc, score) in enumerate(zip(documents, rerank_scores)):
            result = RerankResult(
                doc_id=doc.get('doc_id', str(i)),
                content=doc.get('content', str(doc)),
                original_score=doc.get('score', 0.0),
                rerank_score=score,
                final_rank=0  # 将在排序后设置
            )
            results.append(result)

        # 4. 按重排序分数排序
        results.sort(key=lambda x: x.rerank_score, reverse=True)

        # 5. 设置最终排名
        for rank, result in enumerate(results[:top_k], 1):
            result.final_rank = rank

        return results[:top_k]

    def _intelligent_truncate(self, content: str, query: str, max_tokens: int = 400) -> str:
        """智能截断策略：保留最相关的部分"""
        if len(content.split()) <= max_tokens:
            return content

        # 分句处理
        sentences = self._split_sentences(content)
        if not sentences:
            return content[:max_tokens * 4]  # 估算字符数

        # 计算每个句子的相关性
        sentence_scores = []
        query_words = set(simple_chinese_tokenizer(query))

        for sentence in sentences:
            sentence_words = set(simple_chinese_tokenizer(sentence))
            # 词汇重叠度
            overlap = len(query_words & sentence_words) / max(len(query_words), 1)
            # 句子长度因子（偏向中等长度句子）
            length_factor = max(0.1, 1 - abs(len(simple_chinese_tokenizer(sentence)) - 10) / 20)
            score = overlap * length_factor
            sentence_scores.append((sentence, score))

        # 选择最相关的句子
        sentence_scores.sort(key=lambda x: x[1], reverse=True)

        selected_sentences = []
        current_length = 0

        for sentence, score in sentence_scores:
            sentence_length = len(simple_chinese_tokenizer(sentence))
            if current_length + sentence_length <= max_tokens:
                selected_sentences.append(sentence)
                current_length += sentence_length
            else:
                break

        # 保持原文顺序
        result_sentences = []
        for sentence in sentences:
            if sentence in selected_sentences:
                result_sentences.append(sentence)

        return '。'.join(result_sentences) if result_sentences else content[:max_tokens * 4]

    def _split_sentences(self, text: str) -> List[str]:
        """分句处理"""
        # 中文分句
        sentences = re.split(r'[。！？；]', text)
        # 过滤空句子和过短句子
        sentences = [s.strip() for s in sentences if len(s.strip()) > 5]
        return sentences


class ColBERTReRanker:
    """ColBERT重排序器（模拟实现）"""

    def __init__(self, model_name: str = "colbert-ir/colbertv2.0"):
        self.model_name = model_name
        self.dim = 128  # ColBERT向量维度

    def rerank(self, query: str, documents: List[Dict], top_k: int = 5) -> List[RerankResult]:
        """ColBERT重排序"""
        if not documents:
            return []

        # 1. 查询编码
        query_embeddings = self._encode_query(query)

        results = []
        for i, doc in enumerate(documents):
            content = doc.get('content', str(doc))

            # 2. 文档编码
            doc_embeddings = self._encode_document(content)

            # 3. 计算ColBERT分数
            score = self._compute_colbert_score(query_embeddings, doc_embeddings)

            result = RerankResult(
                doc_id=doc.get('doc_id', str(i)),
                content=content,
                original_score=doc.get('score', 0.0),
                rerank_score=score,
                final_rank=0
            )
            results.append(result)

        # 4. 排序
        results.sort(key=lambda x: x.rerank_score, reverse=True)

        for rank, result in enumerate(results[:top_k], 1):
            result.final_rank = rank

        return results[:top_k]

    def _encode_query(self, query: str) -> np.ndarray:
        """编码查询（模拟）"""
        tokens = simple_chinese_tokenizer(query)
        # 模拟每个token的向量表示
        embeddings = []
        for i, token in enumerate(tokens):
            # 基于token生成确定性向量，重要词汇特殊处理
            base_seed = abs(hash(token)) % (2**31)

            # 为重要关键词（如RAG）生成更具区分性的向量
            if token.lower() == 'rag':
                # RAG使用特殊的向量表示
                np.random.seed(base_seed)
                embedding = np.random.randn(self.dim) * 1.5  # 增强幅度
            elif token in ['检索', '技术', '原理']:
                # 其他重要词汇也给予特殊处理
                np.random.seed(base_seed + 1000)
                embedding = np.random.randn(self.dim) * 1.2
            else:
                np.random.seed(base_seed)
                embedding = np.random.randn(self.dim)

            embedding = embedding / np.linalg.norm(embedding)  # 单位化
            embeddings.append(embedding)

        return np.array(embeddings)  # [query_len, dim]

    def _encode_document(self, document: str) -> np.ndarray:
        """编码文档（模拟）"""
        tokens = simple_chinese_tokenizer(document)[:50]  # 限制长度
        embeddings = []

        for token in tokens:
            base_seed = abs(hash(token)) % (2**31)

            # 为重要关键词生成特殊向量
            if token.lower() == 'rag':
                # RAG词汇使用与查询中相同的种子，增加相似度
                np.random.seed(base_seed)
                embedding = np.random.randn(self.dim) * 1.5
            elif token in ['检索', '技术', '算法', '生成', '嵌入']:
                np.random.seed(base_seed + 1000)
                embedding = np.random.randn(self.dim) * 1.2
            else:
                np.random.seed(base_seed)
                embedding = np.random.randn(self.dim)

            embedding = embedding / np.linalg.norm(embedding)
            embeddings.append(embedding)

        return np.array(embeddings)  # [doc_len, dim]

    def _compute_colbert_score(self, query_embs: np.ndarray, doc_embs: np.ndarray) -> float:
        """计算ColBERT分数：MaxSim操作"""
        if len(query_embs) == 0 or len(doc_embs) == 0:
            return 0.0

        # 计算相似度矩阵
        similarity_matrix = np.dot(query_embs, doc_embs.T)  # [query_len, doc_len]

        # 每个查询token找到最相似的文档token
        max_similarities = np.max(similarity_matrix, axis=1)  # [query_len]

        # 加权求和：重要token获得更高权重
        query_tokens = query_embs  # 获取查询token信息
        weighted_score = 0.0
        total_weight = 0.0

        for i, sim in enumerate(max_similarities):
            # 为重要位置的token分配更高权重
            if i < len(query_embs):  # 确保索引有效
                # 这里我们可以根据token的重要性来分配权重
                # 假设第一个token（通常是关键词）更重要
                weight = 2.0 if i == 0 else 1.0  # 简单的权重分配
                weighted_score += sim * weight
                total_weight += weight

        if total_weight == 0:
            return 0.0

        # 计算加权平均分数
        avg_score = weighted_score / total_weight

        # 转换到[0, 1]范围：(score + 1) / 2，并增加正向偏置
        final_score = (avg_score + 1) / 2

        # 如果有强匹配（分数>0.8），给予额外奖励
        if final_score > 0.8:
            final_score = min(1.0, final_score * 1.1)

        return max(0.0, min(1.0, final_score))


class LearningToRankReRanker:
    """Learning-to-Rank重排序器"""

    def __init__(self):
        self.feature_weights = {
            'bm25_score': 0.3,
            'semantic_score': 0.4,
            'length_factor': 0.1,
            'position_factor': 0.2
        }

    def rerank(self, query: str, documents: List[Dict], top_k: int = 5) -> List[RerankResult]:
        """基于特征的重排序"""
        if not documents:
            return []

        results = []
        for i, doc in enumerate(documents):
            content = doc.get('content', str(doc))

            # 提取特征
            features = self._extract_features(query, content, i)

            # 计算综合分数
            score = sum(
                features[feature] * weight
                for feature, weight in self.feature_weights.items()
                if feature in features
            )

            result = RerankResult(
                doc_id=doc.get('doc_id', str(i)),
                content=content,
                original_score=doc.get('score', 0.0),
                rerank_score=score,
                final_rank=0
            )
            results.append(result)

        # 排序
        results.sort(key=lambda x: x.rerank_score, reverse=True)

        for rank, result in enumerate(results[:top_k], 1):
            result.final_rank = rank

        return results[:top_k]

    def _extract_features(self, query: str, document: str, position: int) -> Dict[str, float]:
        """提取特征"""
        query_words = set(simple_chinese_tokenizer(query))
        doc_words = simple_chinese_tokenizer(document)
        doc_words_set = set(doc_words)

        # BM25相似特征（改进的词汇重叠计算）
        bm25_score = len(query_words & doc_words_set) / max(len(query_words), 1)

        # 精确匹配奖励
        exact_match_bonus = 0.0
        for q_word in query_words:
            if q_word in doc_words_set and len(q_word) >= 2:  # 适应中文
                exact_match_bonus += 0.15

        # 语义相似度特征（基于内容相关性）
        semantic_score = 0.4  # 基础语义分数
        tech_words = {'技术', '算法', '检索', '生成', '嵌入', '相似度', '匹配', 'rag', 'bm25'}
        query_tech = len(set(simple_chinese_tokenizer(query)) & tech_words)
        doc_tech = len(set(simple_chinese_tokenizer(document)) & tech_words)
        if query_tech > 0 and doc_tech > 0:
            semantic_score += min(0.4, (query_tech + doc_tech) * 0.1)

        # 文档长度特征
        length_factor = min(1.0, len(doc_words) / 100)

        # 位置特征（早期结果有优势）
        position_factor = 1.0 / (1 + position * 0.1)

        return {
            'bm25_score': bm25_score + exact_match_bonus,
            'semantic_score': semantic_score,
            'length_factor': length_factor,
            'position_factor': position_factor
        }


class ReRankingSystem:
    """重排序系统集成类"""

    def __init__(self):
        self.cross_encoder = CrossEncoderReRanker()
        self.colbert = ColBERTReRanker()
        self.learning_to_rank = LearningToRankReRanker()

    def rerank(
        self,
        query: str,
        documents: List[Dict],
        method: str = "cross_encoder",
        top_k: int = 5
    ) -> List[RerankResult]:
        """统一重排序接口"""

        if method == "cross_encoder":
            return self.cross_encoder.rerank(query, documents, top_k)
        elif method == "colbert":
            return self.colbert.rerank(query, documents, top_k)
        elif method == "learning_to_rank":
            return self.learning_to_rank.rerank(query, documents, top_k)
        else:
            raise ValueError(f"Unsupported reranking method: {method}")

    def compare_methods(
        self,
        query: str,
        documents: List[Dict],
        top_k: int = 5
    ) -> Dict[str, List[RerankResult]]:
        """比较不同重排序方法"""
        methods = ["cross_encoder", "colbert", "learning_to_rank"]
        results = {}

        for method in methods:
            try:
                results[method] = self.rerank(query, documents, method, top_k)
            except Exception as e:
                print(f"Error with {method}: {e}")
                results[method] = []

        return results

    def analyze_ranking_decisions(
        self,
        query: str,
        documents: List[Dict],
        top_k: int = 3
    ) -> None:
        """分析不同重排序方法的决策过程"""
        print(f"=== 重排序决策分析 ===")
        print(f"查询: {query}\n")

        # 获取各方法的排序结果
        results = self.compare_methods(query, documents, top_k)

        # 分析每个文档的特征
        print("文档特征分析:")
        for i, doc in enumerate(documents):
            content = doc.get('content', '')
            doc_words = set(simple_chinese_tokenizer(content))
            query_words = set(simple_chinese_tokenizer(query))

            print(f"查询分词: {list(query_words)}")  # 调试信息
            print(f"文档{i+1}分词: {list(doc_words)[:10]}...")  # 显示前10个词

            overlap = len(query_words & doc_words)
            overlap_words = query_words & doc_words
            print(f"文档{i+1}: {content[:30]}...")
            print(f"  - 词汇重叠: {overlap}/{len(query_words)} = {overlap/len(query_words):.2f}")
            print(f"  - 重叠词汇: {list(overlap_words)}")
            print(f"  - 包含'RAG': {'RAG' in content}")
            print(f"  - 包含'检索': {'检索' in content}")
            print(f"  - 包含'技术': {'技术' in content}")
            print()

        # 显示排序结果对比
        print("排序结果对比:")
        for method, method_results in results.items():
            print(f"{method.upper()}:")
            for result in method_results:
                print(f"  排名{result.final_rank}: [分数: {result.rerank_score:.3f}] 文档{result.doc_id}")

        print("\n=== 分析总结 ===")
        rag_rankings = {}
        for method, method_results in results.items():
            for result in method_results:
                if 'RAG' in result.content:
                    rag_rankings[method] = result.final_rank
                    break

        print("RAG文档在各算法中的排名:")
        for method, rank in rag_rankings.items():
            status = "✅" if rank == 1 else "❌"
            print(f"  {method}: 第{rank}名 {status}")

# 使用示例
if __name__ == "__main__":
    # 模拟文档数据
    documents = [
        {
            "doc_id": "1",
            "content": "RAG是检索增强生成技术，它结合了信息检索和自然语言生成的优势",
            "score": 0.8
        },
        {
            "doc_id": "2",
            "content": "BM25是一种经典的信息检索算法，基于TF-IDF的改进版本",
            "score": 0.6
        },
        {
            "doc_id": "3",
            "content": "向量检索通过语义嵌入实现文档的相似度计算和匹配",
            "score": 0.7
        }
    ]

    # 初始化重排序系统
    rerank_system = ReRankingSystem()

    query = "RAG检索技术原理"

    # 比较不同方法
    results = rerank_system.compare_methods(query, documents, top_k=3)

    print(f"查询: {query}\n")

    for method, method_results in results.items():
        print(f"{method.upper()} 重排序结果:")
        for result in method_results:
            print(f"  排名{result.final_rank}: [分数: {result.rerank_score:.3f}] {result.content[:50]}...")
        print()

    # 添加详细分析
    print("\n" + "="*60)
    rerank_system.analyze_ranking_decisions(query, documents)

查询: RAG检索技术原理

CROSS_ENCODER 重排序结果:
  排名1: [分数: 1.000] RAG是检索增强生成技术，它结合了信息检索和自然语言生成的优势...
  排名2: [分数: 0.772] BM25是一种经典的信息检索算法，基于TF-IDF的改进版本...
  排名3: [分数: 0.722] 向量检索通过语义嵌入实现文档的相似度计算和匹配...

COLBERT 重排序结果:
  排名1: [分数: 0.985] RAG是检索增强生成技术，它结合了信息检索和自然语言生成的优势...
  排名2: [分数: 0.700] 向量检索通过语义嵌入实现文档的相似度计算和匹配...
  排名3: [分数: 0.687] BM25是一种经典的信息检索算法，基于TF-IDF的改进版本...

LEARNING_TO_RANK 重排序结果:
  排名1: [分数: 0.727] RAG是检索增强生成技术，它结合了信息检索和自然语言生成的优势...
  排名2: [分数: 0.451] BM25是一种经典的信息检索算法，基于TF-IDF的改进版本...
  排名3: [分数: 0.435] 向量检索通过语义嵌入实现文档的相似度计算和匹配...


=== 重排序决策分析 ===
查询: RAG检索技术原理

文档特征分析:
查询分词: ['理', 'rag', '术', '检', '索', '原', '技']
文档1分词: ['强', '语', '自', '成', '息', '然', '索', '优', '生', '言']...
文档1: RAG是检索增强生成技术，它结合了信息检索和自然语言生成的优...
  - 词汇重叠: 5/7 = 0.71
  - 重叠词汇: ['rag', '术', '检', '索', '技']
  - 包含'RAG': True
  - 包含'检索': True
  - 包含'技术': True

查询分词: ['理', 'rag', '术', '检', '索', '原', '技']
文档2分词: ['一', '算', '于', '息', 'bm', '索', '种', '25', '基', '是']...
文档2: BM25是一种经典的信息检索算法，基于TF-IDF的改进版本...
  - 词汇重叠: 2/7 = 0.29
