<a href="https://colab.research.google.com/github/wangyiyang/RAG-Cookbook-Code/blob/main/ch04/quality_controller.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install numpy transformers torch sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [6]:
"""
生成质量控制器
实现多维度质量评估、自我修正机制和迭代优化生成
"""

import re
import time
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from abc import ABC, abstractmethod


@dataclass
class QualityMetrics:
    """Scores for the individual quality dimensions of one generated answer.

    Every dimension defaults to 0.0.
    """
    factual_accuracy: float = 0.0
    relevance: float = 0.0
    completeness: float = 0.0
    consistency: float = 0.0
    safety: float = 0.0
    fluency: float = 0.0

    def overall_score(self, weights: Optional[Dict[str, float]] = None) -> float:
        """Return the weighted sum of the metric fields.

        Args:
            weights: Mapping of metric name to weight. Names that are not
                attributes of this object are ignored. When omitted, a
                built-in weighting is used in which fluency carries no weight.

        Returns:
            The sum of ``value * weight`` over the recognised metric names.
        """
        default_weights = {
            'factual_accuracy': 0.3,
            'relevance': 0.25,
            'completeness': 0.2,
            'consistency': 0.15,
            'safety': 0.1,
            'fluency': 0.0,  # optional dimension
        }
        active_weights = default_weights if weights is None else weights

        return sum(
            getattr(self, name) * active_weights[name]
            for name in active_weights
            if hasattr(self, name)
        )


@dataclass
class GenerationResult:
    """Outcome of one quality-controlled generation run."""
    # Final answer text handed back to the caller.
    content: str
    # Quality scores recorded for the best-scoring candidate answer.
    quality_metrics: QualityMetrics
    # Elapsed time of the run in seconds (difference of two time.time() readings).
    generation_time: float
    # Number of generation iterations that were executed.
    iteration_count: int
    # Whether the returned content is considered safe.
    is_safe: bool = True


class BaseLLM(ABC):
    """Abstract interface that every language-model backend must implement."""

    @abstractmethod
    def generate(self, prompt: str, **kwargs) -> str:
        """Produce a text completion for ``prompt``.

        Subclasses must override this method. Extra keyword arguments are
        passed through for backend-specific generation options.
        """


class MockLLM(BaseLLM):
    """Canned-response stand-in for a real LLM (demonstration only)."""

    def __init__(self):
        # Two fixed replies: one for RAG-related prompts, one for everything else.
        self.response_templates = {
            'rag': "RAG（检索增强生成）是一种结合信息检索和文本生成的AI技术。它通过检索相关文档来增强生成质量，特别适用于需要知识密集型的任务。",
            'default': "基于提供的信息，我来回答您的问题。这是一个示例回答，实际应用中会使用真实的大语言模型。"
        }

    def generate(self, prompt: str, **kwargs) -> str:
        """Return the canned reply selected by a case-insensitive keyword match.

        Prompts containing "rag" get the RAG reply; all others get the default
        reply. Keyword arguments are accepted but ignored.
        """
        template_key = 'rag' if 'rag' in prompt.lower() else 'default'
        return self.response_templates[template_key]


class AdvancedFactChecker:
    """Heuristic factual-accuracy checker based on red-flag patterns."""

    def __init__(self):
        # Small built-in knowledge base; stored on the instance but not
        # consulted by check_accuracy.
        rag_facts = {
            'definition': 'RAG是检索增强生成技术',
            'components': ['检索器', '生成器'],
            'purpose': '提升生成质量'
        }
        self.known_facts = {'rag': rag_facts}

    def check_accuracy(self, answer: str, context: str) -> float:
        """Score the factual accuracy of ``answer`` in the range [0, 1].

        Starts from a base score of 0.8, deducts 0.2 for every red-flag
        pattern found in the lower-cased answer, and adds 0.1 when the answer
        shares at least one whitespace-delimited token with the context.
        """
        lowered_answer = answer.lower()

        red_flag_patterns = (
            'rag.*是.*翻译',  # wrong definition
            '100%.*准确',     # overly absolute wording
            '永远不会.*错误'   # unrealistic claim
        )

        score = 0.8
        for red_flag in red_flag_patterns:
            if re.search(red_flag, lowered_answer) is not None:
                score -= 0.2

        # Reward answers that reuse at least one token from the context.
        shared_tokens = set(context.lower().split()).intersection(lowered_answer.split())
        if shared_tokens:
            score += 0.1

        return max(0.0, min(1.0, score))


class RelevanceScorer:
    """Heuristic scorer for how well an answer addresses the question in a prompt."""

    def score_relevance(self, answer: str, prompt: str) -> float:
        """Score the relevance of ``answer`` to the question inside ``prompt``.

        The score combines a doubled Jaccard overlap of tokens, a bonus for
        short queries that cover a fixed set of core tokens, a bonus when the
        answer matches the question type, and a capped bonus for
        definition-style questions. The result is capped at 1.0. Intermediate
        values are printed as DEBUG lines.
        """
        def to_token_set(text):
            # One token per English word, digit run, or single CJK character.
            return set(re.findall(r'[a-zA-Z]+|[\u4e00-\u9fa5]|[0-9]+', text.lower()))

        query_part = self._extract_query_from_prompt(prompt)
        query_words = to_token_set(query_part)
        answer_words = to_token_set(answer)

        print(f"DEBUG - 提取的查询: '{query_part}'")
        print(f"DEBUG - 查询词汇: {query_words}")
        print(f"DEBUG - 回答词汇: {answer_words}")

        intersection = query_words & answer_words
        union = query_words | answer_words

        print(f"DEBUG - 交集: {intersection}")
        print(f"DEBUG - 并集大小: {len(union)}")

        # Nothing to compare when both token sets are empty.
        if len(union) == 0:
            return 0.0

        jaccard_score = len(intersection) / len(union)
        print(f"DEBUG - Jaccard分数: {jaccard_score}")

        # Short queries (six tokens or fewer) earn a bonus proportional to how
        # many of the core concept tokens the overlap contains.
        query_length_bonus = 0.0
        if len(query_words) <= 6:
            core_words = {'rag', '技', '术'}
            core_coverage = len(core_words & intersection) / len(core_words) if core_words else 0
            query_length_bonus = core_coverage * 0.3
            print(f"DEBUG - 核心词汇覆盖率: {core_coverage}, 加分: {query_length_bonus}")

        relevance_score = jaccard_score * 2.0 + query_length_bonus

        directly_addresses = self._directly_addresses_query(answer, query_part)
        print(f"DEBUG - 直接回答问题: {directly_addresses}")
        if directly_addresses:
            relevance_score += 0.3

        # Definition-style questions: 0.1 per definition marker, capped at 0.2.
        is_definition_question = '什么是' in query_part or 'what is' in query_part.lower()
        if is_definition_question:
            definition_bonus = 0.0
            for marker in ('是一种', '是指', '定义', '概念', '技术', '方法'):
                if marker in answer:
                    definition_bonus += 0.1
            capped_bonus = min(definition_bonus, 0.2)
            relevance_score += capped_bonus
            print(f"DEBUG - 定义类问题加分: {capped_bonus}")

        print(f"DEBUG - 最终相关性分数: {relevance_score}")
        return min(1.0, relevance_score)

    def _extract_query_from_prompt(self, prompt: str) -> str:
        """Pull the user's question out of a prompt.

        Tries, in order: labelled patterns ("问题:", "query:", "用户问题:"), the
        first line containing a question mark, the first line containing a
        question word, and finally the last line of the prompt.
        """
        print(f"DEBUG - 原始prompt: '{prompt}'")

        labelled_patterns = (
            r'问题[：:]\s*([^：\n]+?)(?=\n|请|$)',
            r'query[：:]\s*([^：\n]+?)(?=\n|$)',
            r'用户问题[：:]\s*([^：\n]+?)(?=\n|$)'
        )
        search_flags = re.IGNORECASE | re.DOTALL

        for labelled_pattern in labelled_patterns:
            found = re.search(labelled_pattern, prompt, search_flags)
            if found is None:
                continue

            extracted = found.group(1).strip()
            print(f"DEBUG - 正则匹配到: '{extracted}'")

            # An overly long capture probably includes more than the question;
            # look for a piece that reads like one.
            if len(extracted) > 100:
                for piece in extracted.split('\n'):
                    looks_like_question = any(
                        marker in piece for marker in ('什么', '如何', '为什么', '？', '?')
                    )
                    if piece.strip() and looks_like_question:
                        print(f"DEBUG - 从长文本中找到问题: '{piece.strip()}'")
                        return piece.strip()
            return extracted

        # Strips a leading "label:" prefix from a line.
        label_prefix = r'^[^：:]*[：:]\s*'
        lines = prompt.split('\n')

        # Fallback 1: first line that contains a question mark.
        for line in lines:
            if '？' in line or '?' in line:
                cleaned_line = re.sub(label_prefix, '', line).strip()
                print(f"DEBUG - 从问号行提取: '{cleaned_line}'")
                return cleaned_line

        # Fallback 2: first line that contains a question word.
        question_words = ['什么', '如何', '为什么', '哪里', '何时', '谁', '怎样']
        for line in lines:
            if any(word in line for word in question_words):
                cleaned_line = re.sub(label_prefix, '', line).strip()
                print(f"DEBUG - 从疑问词行提取: '{cleaned_line}'")
                return cleaned_line

        # Last resort: the final line of the prompt.
        result = lines[-1] if lines else prompt
        print(f"DEBUG - 兜底提取: '{result}'")
        return result

    def _directly_addresses_query(self, answer: str, query: str) -> bool:
        """Return True when the answer contains a cue word matching the question type.

        Each question type found in ``query`` is checked against its list of
        answer cue words; one hit for any type is enough.
        """
        cues_by_question_type = {
            '什么是': ['是', '定义', '概念'],
            '如何': ['方法', '步骤', '实现'],
            '为什么': ['原因', '因为', '由于'],
            '哪里': ['位置', '地方', '在'],
            '何时': ['时间', '时候', '当']
        }

        return any(
            question_type in query and any(cue in answer for cue in cues)
            for question_type, cues in cues_by_question_type.items()
        )


class SafetyChecker:
    """Keyword-based content safety checker."""

    def __init__(self):
        # Regex alternations for harmful content; each matching group costs 0.3.
        self.harmful_patterns = [
            r'(歧视|偏见|仇恨)',
            r'(暴力|攻击|伤害)',
            r'(隐私|个人信息|敏感数据)',
            r'(虚假|谣言|误导)'
        ]

        # Sensitive topics; each one mentioned costs 0.1.
        self.sensitive_topics = [
            '政治', '宗教', '种族', '性别'
        ]

    def is_safe(self, content: str) -> bool:
        """Return True when the safety score is strictly above 0.7."""
        return self.safety_score(content) > 0.7

    def safety_score(self, content: str) -> float:
        """Compute a safety score in [0, 1] for ``content``.

        Starts at 1.0, subtracts 0.3 per matching harmful pattern and 0.1 per
        sensitive topic (both checked on the lower-cased text), then adds 0.1
        per disclaimer phrase found in the original text.
        """
        lowered = content.lower()
        disclaimers = ['仅供参考', '建议咨询专业人士', '可能存在误差']

        # Collect the adjustments in the order they are applied.
        adjustments = []
        adjustments.extend(
            -0.3 for pattern in self.harmful_patterns if re.search(pattern, lowered)
        )
        adjustments.extend(
            -0.1 for topic in self.sensitive_topics if topic in lowered
        )
        adjustments.extend(
            0.1 for disclaimer in disclaimers if disclaimer in content
        )

        score = 1.0
        for delta in adjustments:
            score += delta

        return max(0.0, min(1.0, score))


class ErrorDetector:
    """Rule-based detector for factual, logical and terminology problems in an answer."""

    def detect_errors(self, answer: str, context: str) -> List[Dict[str, str]]:
        """Run every check and return the collected errors.

        Args:
            answer: Generated answer text to inspect.
            context: Reference context the answer should agree with.

        Returns:
            A list of ``{'type': ..., 'description': ...}`` dicts, ordered as
            factual, logic, then consistency errors; empty when nothing is found.
        """
        errors = []
        errors.extend(self._detect_factual_errors(answer, context))
        errors.extend(self._detect_logic_errors(answer))
        errors.extend(self._detect_consistency_errors(answer))
        return errors

    def _detect_factual_errors(self, answer: str, context: str) -> List[Dict[str, str]]:
        """Report a factual error when the answer conflicts with the context."""
        errors = []

        if self._contradicts_context(answer, context):
            errors.append({
                'type': '事实错误',
                'description': '回答与提供的上下文信息存在冲突'
            })

        return errors

    def _detect_logic_errors(self, answer: str) -> List[Dict[str, str]]:
        """Report a logic error when the answer contradicts itself."""
        errors = []

        if self._contains_contradiction(answer):
            errors.append({
                'type': '逻辑错误',
                'description': '回答内容存在自相矛盾'
            })

        return errors

    def _detect_consistency_errors(self, answer: str) -> List[Dict[str, str]]:
        """Report a consistency error when one concept is named in several ways."""
        errors = []

        if self._inconsistent_terminology(answer):
            errors.append({
                'type': '一致性错误',
                'description': '术语使用不一致'
            })

        return errors

    def _contradicts_context(self, answer: str, context: str) -> bool:
        """Placeholder conflict check; always returns False.

        A real implementation would need proper NLP techniques.
        """
        return False

    def _contains_contradiction(self, answer: str) -> bool:
        """Return True when the answer matches a simple "is ... but is not" pattern."""
        contradiction_patterns = [
            r'(.+?)是(.+?)，但.*不是',
            r'(.+?)可以(.+?)，然而.*不能'
        ]

        return any(re.search(pattern, answer) for pattern in contradiction_patterns)

    def _inconsistent_terminology(self, answer: str) -> bool:
        """Return True when a single concept is referred to by more than one term.

        Variants are matched case-insensitively. A variant that only occurs as
        part of a longer variant (for example '检索增强' inside '检索增强生成')
        is not counted separately, and Latin abbreviations must stand alone
        rather than appear inside another alphanumeric word.
        """
        term_variations = {
            'rag': ['rag', '检索增强生成', '检索增强'],
            'llm': ['llm', '大语言模型', '大模型']
        }

        lowered = answer.lower()
        return any(
            self._count_distinct_terms(lowered, variations) > 1
            for variations in term_variations.values()
        )

    @staticmethod
    def _count_distinct_terms(text: str, variations: List[str]) -> int:
        """Count how many of ``variations`` occur independently in lower-cased ``text``."""
        remaining = text
        count = 0

        # Longest first, blanking out each match, so a shorter variant is only
        # counted when it appears on its own somewhere else in the text.
        for term in sorted(variations, key=len, reverse=True):
            if term.isascii():
                # Latin abbreviations must not be embedded in a longer word.
                pattern = r'(?<![a-z0-9])' + re.escape(term) + r'(?![a-z0-9])'
            else:
                pattern = re.escape(term)

            if re.search(pattern, remaining):
                count += 1
                remaining = re.sub(pattern, ' ', remaining)

        return count


class SelfCorrectionMechanism:
    """Generate an answer, then repeatedly ask the LLM to repair detected errors."""

    def __init__(self, llm: BaseLLM):
        self.error_detector = ErrorDetector()
        self.llm = llm
        # Correction guidance keyed by the error type reported by the detector.
        self.correction_prompts = {
            '事实错误': "请检查并修正回答中与上下文信息冲突的部分。",
            '逻辑错误': "请检查并修正回答中存在逻辑矛盾的部分。",
            '一致性错误': "请确保术语使用的一致性，避免同一概念使用不同表述。"
        }

    def generate_with_self_correction(
        self,
        query: str,
        context: str,
        max_corrections: int = 2
    ) -> str:
        """Generate an answer and refine it while errors are detected.

        Args:
            query: The user's question.
            context: Reference context the answer must be based on.
            max_corrections: Upper bound on the number of correction rounds.

        Returns:
            The first answer with no detected errors, or the answer produced
            by the last allowed correction round.
        """
        initial_prompt = f"""基于以下上下文回答问题：

上下文：{context}

问题：{query}

请准确、完整地回答问题。"""

        answer = self.llm.generate(initial_prompt)

        for _ in range(max_corrections):
            errors = self.error_detector.detect_errors(answer, context)
            if not errors:
                # Clean answer: no further correction rounds are needed.
                return answer

            # Ask the model to rewrite the answer, given the detected problems.
            answer = self.llm.generate(
                self.build_correction_prompt(query, context, answer, errors)
            )

        return answer

    def build_correction_prompt(
        self,
        query: str,
        context: str,
        original_answer: str,
        errors: List[Dict[str, str]]
    ) -> str:
        """Build the prompt asking the LLM to fix the listed errors.

        Each error contributes a bullet with its type, its description, and
        the guidance registered for that type (a generic instruction is used
        for unknown types).
        """
        def describe(error):
            error_type = error['type']
            description = error['description']
            guidance = self.correction_prompts.get(
                error_type,
                "请修正检测到的问题。"
            )
            return f"- {error_type}: {description}\n  修正指导: {guidance}"

        error_descriptions = [describe(error) for error in errors]

        return f"""原始问题：{query}

参考上下文：{context}

原始回答：{original_answer}

检测到的问题：
{chr(10).join(error_descriptions)}

请根据上述问题修正回答，确保：
1. 纠正所有检测到的错误
2. 保持回答的完整性和流畅性
3. 严格基于提供的上下文信息
4. 保持客观和准确

修正后的回答："""


class GenerationQualityController:
    """Generate answers under quality control.

    Candidates are generated iteratively, scored on several quality
    dimensions, and the best-scoring one is returned after a final safety
    check.
    """

    def __init__(self, llm: "BaseLLM"):
        """Wire up the LLM and the individual checkers.

        Args:
            llm: Backend used for plain and self-corrected generation.
        """
        self.llm = llm
        self.fact_checker = AdvancedFactChecker()
        self.safety_checker = SafetyChecker()
        self.relevance_scorer = RelevanceScorer()
        self.self_correction = SelfCorrectionMechanism(llm)

        # Minimum per-metric scores a candidate must reach for early exit.
        self.quality_thresholds = {
            'factual_accuracy': 0.7,
            'relevance': 0.8,
            'completeness': 0.6,
            'consistency': 0.7,
            'safety': 0.9
        }

    def generate_with_quality_control(
        self,
        prompt: str,
        max_iterations: int = 3,
        use_self_correction: bool = True
    ) -> "GenerationResult":
        """Generate candidates and return the best one.

        Args:
            prompt: Full prompt sent to the LLM.
            max_iterations: Maximum number of candidates to generate; must be
                at least 1.
            use_self_correction: When True, the last iteration uses the
                self-correction pipeline instead of plain generation.

        Returns:
            A GenerationResult holding the best-scoring candidate, or the safe
            fallback text if that candidate fails the safety check.

        Raises:
            ValueError: If ``max_iterations`` is smaller than 1.
        """
        # Without at least one iteration there is no candidate to return.
        if max_iterations < 1:
            raise ValueError("max_iterations must be at least 1")

        start_time = time.time()
        best_answer = None
        best_metrics = None
        # None means no candidate has been recorded yet, so the first candidate
        # is always kept, even when its score is 0.
        best_score = None
        iterations_run = 0

        for iteration in range(max_iterations):
            iterations_run = iteration + 1

            # 1. Generate a candidate answer.
            if use_self_correction and iteration == max_iterations - 1:
                # The final iteration goes through self-correction.
                candidate_answer = self._generate_with_correction(prompt)
            else:
                # Plain generation, with temperature rising on each iteration.
                candidate_answer = self.llm.generate(
                    prompt,
                    temperature=0.3 + iteration * 0.1
                )

            # 2. Score the candidate.
            quality_metrics = self.evaluate_answer_quality(
                candidate_answer, prompt
            )
            current_score = quality_metrics.overall_score()

            # 3. Keep the best candidate seen so far.
            if best_score is None or current_score > best_score:
                best_answer = candidate_answer
                best_metrics = quality_metrics
                best_score = current_score

            # 4. Stop early once a candidate meets every threshold.
            if self._meets_quality_thresholds(quality_metrics):
                break

        # 5. Final safety gate; the reported metrics still describe the
        # best-scoring candidate even when its text is replaced.
        is_safe = self.safety_checker.is_safe(best_answer)
        if not is_safe:
            best_answer = self.generate_safe_fallback_response()
            is_safe = True

        generation_time = time.time() - start_time

        return GenerationResult(
            content=best_answer,
            quality_metrics=best_metrics,
            generation_time=generation_time,
            iteration_count=iterations_run,
            is_safe=is_safe
        )

    def evaluate_answer_quality(
        self,
        answer: str,
        prompt: str
    ) -> "QualityMetrics":
        """Score ``answer`` on every quality dimension.

        The full prompt is passed to the fact checker as the context.
        """
        return QualityMetrics(
            factual_accuracy=self.fact_checker.check_accuracy(answer, prompt),
            relevance=self.relevance_scorer.score_relevance(answer, prompt),
            completeness=self.assess_completeness(answer, prompt),
            consistency=self.check_internal_consistency(answer),
            safety=self.safety_checker.safety_score(answer),
            fluency=self.assess_fluency(answer)
        )

    def assess_completeness(self, answer: str, prompt: str) -> float:
        """Estimate answer completeness in [0, 1] from length and content cues.

        ``prompt`` is accepted for interface symmetry but is not used.
        """
        # Length-based baseline: 150 characters or more earns the full base score.
        base_score = min(len(answer) / 150, 1.0)

        # Short but non-trivial answers (50+ characters) get a floor of 0.5.
        if len(answer) >= 50:
            base_score = max(base_score, 0.5)

        # Bonus for structured writing.
        structure_indicators = ['首先', '其次', '最后', '总之', '1.', '2.', '3.']
        if any(indicator in answer for indicator in structure_indicators):
            base_score += 0.2

        # Bonus for concrete examples.
        example_indicators = ['例如', '比如', '举例', '具体来说']
        if any(indicator in answer for indicator in example_indicators):
            base_score += 0.1

        # Bonus for definitional content, which matters for "what is" questions.
        definition_indicators = ['是一种', '是指', '定义为', '概念']
        if any(indicator in answer for indicator in definition_indicators):
            base_score += 0.2

        print(f"DEBUG - 完整性评分: 长度{len(answer)}, 基础分{base_score}")
        return min(1.0, base_score)

    def check_internal_consistency(self, answer: str) -> float:
        """Score internal consistency, starting from 1.0 and deducting penalties."""
        consistency_score = 1.0

        # A contrastive connective hints at a possible contradiction.
        if '但是' in answer or '然而' in answer:
            consistency_score -= 0.1

        # The same concept named in different ways.
        if self._has_inconsistent_terms(answer):
            consistency_score -= 0.2

        return max(0.0, consistency_score)

    def assess_fluency(self, answer: str) -> float:
        """Score language fluency in [0, 1].

        Empty or whitespace-only answers score 0.0. Otherwise the base score
        of 0.8 gains 0.1 for a reasonable average sentence length and 0.1 for
        sufficient variety among whitespace-separated words.
        """
        if not answer.strip():
            # Nothing to assess; this also avoids dividing by zero below.
            return 0.0

        fluency_score = 0.8

        # Ignore empty fragments, such as the one after a trailing '。', so
        # they do not drag the average sentence length down.
        sentences = [part for part in answer.split('。') if part.strip()]
        if sentences:
            avg_length = sum(len(part) for part in sentences) / len(sentences)
            if 20 <= avg_length <= 80:
                fluency_score += 0.1

        # Lexical diversity over whitespace-separated words.
        words = answer.split()
        if words and len(set(words)) / len(words) > 0.7:
            fluency_score += 0.1

        return min(1.0, fluency_score)

    def _generate_with_correction(self, prompt: str) -> str:
        """Generate through the self-correction pipeline.

        The query and context are first extracted from ``prompt``.
        """
        query, context = self._extract_query_context(prompt)
        return self.self_correction.generate_with_self_correction(
            query, context
        )

    def _extract_query_context(self, prompt: str) -> Tuple[str, str]:
        """Extract ``(query, context)`` from labelled lines of a prompt.

        Only lines that start with a label ("上下文"/"context" for the context,
        "问题"/"用户问题"/"question" for the query) followed by a full-width
        or ASCII colon are recognised. Lines that merely mention those words,
        such as a trailing instruction, are ignored. Everything after the
        first colon is kept, so values may contain colons themselves. If a
        label occurs several times, the last non-empty value wins. Missing
        parts are returned as empty strings.
        """
        labelled_line = re.compile(
            r'^\s*(上下文|context|用户问题|问题|question)\s*[：:]\s*(.*)$',
            re.IGNORECASE
        )
        context_labels = ('上下文', 'context')

        context = ""
        query = ""
        for line in prompt.split('\n'):
            found = labelled_line.match(line)
            if found is None:
                continue

            value = found.group(2).strip()
            if not value:
                # A bare label carries no content.
                continue

            if found.group(1).lower() in context_labels:
                context = value
            else:
                query = value

        return query, context

    def _meets_quality_thresholds(self, metrics: "QualityMetrics") -> bool:
        """Return True when every configured threshold is met by ``metrics``."""
        return all(
            getattr(metrics, metric) >= threshold
            for metric, threshold in self.quality_thresholds.items()
            if hasattr(metrics, metric)
        )

    def _has_inconsistent_terms(self, answer: str) -> bool:
        """Return True when one concept is referred to by more than one term.

        Matching is case-insensitive. A term that only occurs inside a longer
        variant (for example '机器学习' inside '机器学习算法') is not counted
        separately, and Latin abbreviations must stand alone rather than
        appear inside another alphanumeric word.
        """
        concepts = {
            'ai': ['人工智能', 'ai', '机器智能'],
            'ml': ['机器学习', 'ml', '机器学习算法']
        }

        answer_lower = answer.lower()
        return any(
            self._count_distinct_terms(answer_lower, terms) > 1
            for terms in concepts.values()
        )

    @staticmethod
    def _count_distinct_terms(text: str, terms) -> int:
        """Count how many of ``terms`` occur independently in lower-cased ``text``."""
        remaining = text
        count = 0

        # Longest first, blanking out each match, so a shorter variant is only
        # counted when it appears on its own somewhere else in the text.
        for term in sorted(terms, key=len, reverse=True):
            if term.isascii():
                # Latin abbreviations must not be embedded in a longer word.
                pattern = r'(?<![a-z0-9])' + re.escape(term) + r'(?![a-z0-9])'
            else:
                pattern = re.escape(term)

            if re.search(pattern, remaining):
                count += 1
                remaining = re.sub(pattern, ' ', remaining)

        return count

    def generate_safe_fallback_response(self) -> str:
        """Return the fixed fallback reply used when an answer fails the safety check."""
        return ("抱歉，我无法提供准确的回答。建议您咨询相关专业人士或"
                "查阅权威资料获取更可靠的信息。")


# Usage example
if __name__ == "__main__":
    # Build the controller around the mock LLM.
    llm = MockLLM()
    quality_controller = GenerationQualityController(llm)

    # Demo prompt: an instruction line, a context, a question, and a closing instruction.
    test_prompt = """基于以下上下文回答问题：

上下文：RAG（检索增强生成）是一种结合信息检索和语言生成的AI技术。它通过检索相关文档来提升生成质量，特别适用于知识密集型任务。

问题：什么是RAG技术？

请准确、完整地回答问题。"""

    # Generate with at most two iterations and evaluate the result.
    result = quality_controller.generate_with_quality_control(
        test_prompt, max_iterations=2
    )
    metrics = result.quality_metrics
    safety_verdict = '通过' if result.is_safe else '未通过'

    # Summary of the run.
    print("生成结果:")
    print("=" * 50)
    print(f"内容: {result.content}")
    print(f"总体质量分数: {metrics.overall_score():.3f}")
    print(f"生成时间: {result.generation_time:.3f}s")
    print(f"迭代次数: {result.iteration_count}")
    print(f"安全性: {safety_verdict}")

    # Per-dimension breakdown.
    print("\n详细质量指标:")
    print(f"事实准确性: {metrics.factual_accuracy:.3f}")
    print(f"相关性: {metrics.relevance:.3f}")
    print(f"完整性: {metrics.completeness:.3f}")
    print(f"一致性: {metrics.consistency:.3f}")
    print(f"安全性: {metrics.safety:.3f}")
    print(f"流畅性: {metrics.fluency:.3f}")

DEBUG - 原始prompt: '基于以下上下文回答问题：

上下文：RAG（检索增强生成）是一种结合信息检索和语言生成的AI技术。它通过检索相关文档来提升生成质量，特别适用于知识密集型任务。

问题：什么是RAG技术？

请准确、完整地回答问题。'
DEBUG - 正则匹配到: '什么是RAG技术？'
DEBUG - 提取的查询: '什么是RAG技术？'
DEBUG - 查询词汇: {'技', '术', '么', 'rag', '是', '什'}
DEBUG - 回答词汇: {'和', '技', '集', '检', '种', '关', '结', '的', '息', '术', '是', '成', '它', '需', '来', '务', 'rag', '生', 'ai', '特', '通', '信', '文', '于', '质', '知', '相', '型', '档', '用', '索', '本', '增', '识', '过', '别', '密', '要', '适', '强', '任', '量', '一', '合'}
DEBUG - 交集: {'rag', '技', '术', '是'}
DEBUG - 并集大小: 46
DEBUG - Jaccard分数: 0.08695652173913043
DEBUG - 核心词汇覆盖率: 1.0, 加分: 0.3
DEBUG - 直接回答问题: True
DEBUG - 定义类问题加分: 0.2
DEBUG - 最终相关性分数: 0.9739130434782608
DEBUG - 完整性评分: 长度64, 基础分0.7
生成结果:
内容: RAG（检索增强生成）是一种结合信息检索和文本生成的AI技术。它通过检索相关文档来增强生成质量，特别适用于需要知识密集型的任务。
总体质量分数: 0.873
生成时间: 0.000s
迭代次数: 1
安全性: 通过

详细质量指标:
事实准确性: 0.800
相关性: 0.974
完整性: 0.700
一致性: 1.000
安全性: 1.000
流畅性: 1.000
