# 索引优化
    1.摘要索引
        针对概括性查询问题，摘要索引可以解决
    2.父子索引
        检索更精准（靠小块）
        回答内容完整（大段 Parent）
    3.假设性问题检索
        即使用户提问与原文不直接匹配，也能通过“生成相关问题 → 再做向量检索”找到语义更深层、更准确的相关内容
    4.元数据过滤
        向量检索+元数据过滤，检索准确度更高

In [None]:
import os
import uuid

from langchain_classic.retrievers import MultiVectorRetriever
from langchain_community.chat_models.tongyi import ChatTongyi
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.dashscope import DashScopeEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.stores import InMemoryByteStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from numpy import dot
from numpy.linalg import norm

# Module-level models for this cell: a Tongyi (Qwen) chat model and a DashScope
# embedding model. `embeddings_model` is read as a global by test_no_summary()
# and test_summary() below.
llm = ChatTongyi(model="qwen-max")
embeddings_model = DashScopeEmbeddings(model="text-embedding-v1")


def test_no_summary():
    """Baseline without a summary index: compare the query with the raw documents.

    Embeds two full-text documents and one query using the module-level
    ``embeddings_model`` and prints the cosine similarity between the query
    and each document. Returns nothing.
    """
    raw_texts = [
        "DeepSeek，全称杭州深度求索人工智能基础技术研究有限公司，是一家创新型科技公司 ，成立于2023年7月17日，使用数据蒸馏技术 ，得到更为精炼、有用的数据 。由知名私募巨头幻方量化孕育而生，专注于开发先进的大语言模型（LLM）和相关技术。",
        "DeepSeek R1是推理模型，遵循 MIT License，通过设置 model='deepseek-reasoner' 即可调用。DeepSeek V3是通用大模型，目前版本号 DeepSeek-V3-0324。通过指定 model='deepseek-chat' 即可调用 DeepSeek V3。",
    ]
    question = "DeepSeek R1模型的开发公司叫什么？"

    # Turn the documents and the query into vectors.
    text_vectors = embeddings_model.embed_documents(raw_texts)
    question_vector = embeddings_model.embed_query(question)

    # Cosine similarity: dot(A, B) / (|A| * |B|).
    scores = []
    for text_vector in text_vectors:
        cosine = dot(question_vector, text_vector) / (norm(question_vector) * norm(text_vector))
        scores.append(cosine)

    print("==========================不使用摘要索引======================================")
    # Author's note: document 1 is the expected match, but document 2 is what gets retrieved.
    for position, score in enumerate(scores, start=1):
        print(f"第{position}个相似度：", score)


def test_summary():
    """Summary-index variant: compare the query with short document summaries.

    Embeds two hand-written summaries and one query using the module-level
    ``embeddings_model`` and prints the cosine similarity between the query
    and each summary. Returns nothing.
    """
    # Stand-ins for summaries that are assumed to have been generated already.
    summary_texts = ["DeepSeek公司介绍", "DeepSeek模型调用说明"]
    question = "DeepSeek R1模型的开发公司叫什么？"

    summary_vectors = embeddings_model.embed_documents(summary_texts)
    question_vector = embeddings_model.embed_query(question)

    # Summary-based retrieval: score the question against each summary vector.
    scores = []
    for summary_vector in summary_vectors:
        cosine = dot(question_vector, summary_vector) / (norm(question_vector) * norm(summary_vector))
        scores.append(cosine)

    print("==========================使用摘要索引相似度======================================")
    for position, score in enumerate(scores, start=1):
        print(f"第{position}个相似度：", score)


def test_summary_index():
    """Build a summary index over a local text file and run one retrieval.

    Steps: load and split the file, summarize every chunk with the chat model,
    store the summaries in a Chroma collection and the original chunks in an
    in-memory byte store under matching ids, then query through a
    ``MultiVectorRetriever`` and print the result. Returns nothing.
    """
    # 1. Source file location.
    resource_dir = "../../data/base/resources"
    source_path = os.path.join(resource_dir, "deepseek百度百科.txt")

    # 2. Models local to this function (separate instances from the module-level ones).
    chat_model = ChatTongyi(model="qwen-max")
    embedder = DashScopeEmbeddings(model="text-embedding-v1")

    # 3. Load the file and split it into chunks.
    loaded_docs = TextLoader(source_path, encoding='utf-8').load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
    chunks = splitter.split_documents(loaded_docs)
    for number, chunk in enumerate(chunks, start=1):
        print(f"块 {number} :   {chunk.page_content[:50]!r}...")

    # 4. Summarize each chunk: chunk text -> prompt -> chat model -> plain string.
    summary_prompt = ChatPromptTemplate.from_template("总结下面的文档:\n\n{chunk}")
    summarize = (
            {"chunk": lambda doc: doc.page_content}
            | summary_prompt
            | chat_model
            | StrOutputParser()
    )

    # Batch over all chunks with at most 5 concurrent calls.
    summaries = summarize.batch(chunks, {"max_concurrency": 5})
    for number, summary in enumerate(summaries, start=1):
        print(f"块 {number} :   {summary[:50]!r}...")

    # 5. Storage: Chroma keeps the summary vectors, the byte store keeps the original chunks.
    summary_vectors = Chroma(collection_name="summaries", embedding_function=embedder)
    chunk_store = InMemoryByteStore()

    # 6. Wire both stores into a multi-vector retriever keyed by "doc_id".
    id_key = "doc_id"
    retriever = MultiVectorRetriever(
        vectorstore=summary_vectors,
        byte_store=chunk_store,
        id_key=id_key,
        search_kwargs={"k": 1},
    )

    # One unique id per chunk; each summary carries its chunk's id in metadata.
    doc_ids = [str(uuid.uuid4()) for _ in chunks]
    summary_docs = [
        Document(page_content=text, metadata={id_key: chunk_id})
        for chunk_id, text in zip(doc_ids, summaries)
    ]

    # Summaries go to the vector store, original chunks to the doc store under the same ids.
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(doc_ids, chunks)))

    # 7. Retrieve and print each hit under a numbered heading, separated by a dashed rule.
    hits = retriever.invoke('deepseek的企业动态')
    rule = "\n" + "-" * 100 + "\n"
    sections = [f"Document {number}:\n\n" + hit.page_content for number, hit in enumerate(hits, start=1)]
    print(rule.join(sections))


# Run the three summary-index demos in order: raw-text baseline, hand-written
# summaries, then the full summary index built from the local file.
test_no_summary()
test_summary()
test_summary_index()


In [None]:
# 2.父子索引
import os

from langchain_chroma import Chroma
from langchain_classic.retrievers import ParentDocumentRetriever
from langchain_community.chat_models.tongyi import ChatTongyi
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.dashscope import DashScopeEmbeddings
from langchain_core.stores import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Pretty-print retrieved documents
def pretty_print_docs(docs):
    """Print every document's ``page_content`` under a numbered heading.

    Consecutive documents are separated by a line of 100 dashes. An empty
    ``docs`` prints a single blank line.
    """
    rule = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(rule.join(sections))


# 1. Path of the source text file
RESOURCE_DIR = "../../data/base/resources"
TXT_DOCUMENT_PATH = os.path.join(RESOURCE_DIR, "deepseek百度百科.txt")

# 2. Initialize the chat model and the embedding model
llm = ChatTongyi(model="qwen-max")
embeddings_model = DashScopeEmbeddings(model="text-embedding-v1")

# 3. Load the local file
loader = TextLoader(TXT_DOCUMENT_PATH, encoding='utf-8')
docs = loader.load()
# Parent splitter: the larger chunks (chunk_size=256, chunk_overlap=32)
parent_splitter = RecursiveCharacterTextSplitter(chunk_overlap=32, chunk_size=256)
# Child splitter: the smaller chunks (chunk_size=64, chunk_overlap=16)
child_splitter = RecursiveCharacterTextSplitter(chunk_overlap=16, chunk_size=64)

# 4. Storage setup
# Vector store that holds the small (child) chunks
vectorstore = Chroma(
    collection_name="split_parents", embedding_function=embeddings_model
)
# In-memory store that holds the large (parent) chunks
store = InMemoryStore()

# ================================5. Create the retriever================================
# Parent-document retriever wired to both stores and both splitters; k=1 for the vector search
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
    search_kwargs={"k": 1}
)

# 6. Index the loaded documents through the retriever
retriever.add_documents(docs)

# 7. Child-index retrieval: query the vector store directly and print the small chunks it returns
sub_docs = vectorstore.similarity_search("介绍下DeepSeek和市场占用情况")
print("-" * 20 + "子索引检索" + "-" * 20)
for sub_doc in sub_docs:
    print(sub_doc.page_content)

# 8. Parent-index retrieval: same query through the retriever
#    (presumably returns the parent chunks of the matched children — confirm against the retriever docs)
print("-" * 20 + "父索引检索" + "-" * 20)
retrieved_docs = retriever.invoke("介绍下DeepSeek和市场占用情况")
for retrieved_doc in retrieved_docs:
    print(retrieved_doc.page_content)


In [None]:
# 3.假设性问题检索
import os
import uuid
from typing import List

from langchain_chroma import Chroma
from langchain_classic.retrievers import MultiVectorRetriever
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.dashscope import DashScopeEmbeddings
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.stores import InMemoryByteStore
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from numpy import dot
from numpy.linalg import norm
from pydantic.v1 import BaseModel, Field


# Pretty-print retrieved documents
def pretty_print_docs(docs):
    """Print every document's ``page_content`` under a numbered heading.

    Consecutive documents are separated by a line of 100 dashes. An empty
    ``docs`` prints a single blank line.
    """
    rule = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(rule.join(sections))


# 1. Path of the source text file
RESOURCE_DIR = "../../data/base/resources"
TXT_DOCUMENT_PATH = os.path.join(RESOURCE_DIR, "deepseek百度百科.txt")

# 2. Models (author's note: model capability matters a lot for this technique).
#    The chat model is DeepSeek reached through the OpenAI-compatible client;
#    the API key is read from the DEEPSEEK_API_KEY environment variable.
llm = ChatOpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"),
                 base_url="https://api.deepseek.com/v1",
                 model="deepseek-chat")
embeddings_model = DashScopeEmbeddings(model="text-embedding-v1")

# 3. Load the file
loader = TextLoader(TXT_DOCUMENT_PATH, encoding='utf-8')
docs = loader.load()

# Split into chunks; `docs` is rebound to the list of chunks from here on.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
docs = text_splitter.split_documents(docs)


# 4. Schema for the structured LLM output: a list of hypothetical questions.
# NOTE(review): the class docstring (it means "Generate hypothetical questions")
# and the Field description are deliberately left untouched. They are presumably
# forwarded to the LLM as part of the structured-output schema (this class is passed
# to llm.with_structured_output further down), so rewording them could change results.
class HypotheticalQuestions(BaseModel):
    """生成假设性问题"""
    questions: List[str] = Field(..., description="List of questions")


# Prompt asking the model for 3 hypothetical questions that the given document could answer.
prompt = ChatPromptTemplate.from_template(
    """生成一个包含3个假设问题的列表，以下文档可用于回答这些问题:

    {doc}
    """
)

# Chain: Document -> its page_content -> prompt -> structured LLM output -> list of question strings.
chain = (
        {"doc": lambda x: x.page_content}
        | prompt
        | llm.with_structured_output(HypotheticalQuestions) # Author's note: tool mode demands strict JSON, so Chinese + long text tends to blow up
        | (lambda x: x.questions)
)

# 5. Build the hypothetical-question index
# Generate hypothetical questions for every chunk in one batch (at most 5 concurrent calls)
hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})
print(hypothetical_questions)

# Chroma vector store that holds the vectors of the generated questions
vectorstore = Chroma(
    collection_name="hypo-questions", embedding_function=embeddings_model
)

# In-memory byte store that holds the original chunks
store = InMemoryByteStore()
# Metadata key that links a question to its source chunk
id_key = "doc_id"
# Multi-vector retriever combining the two stores; k=1 for the vector search
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
    search_kwargs={"k": 1}
)

# One unique id per original chunk
doc_ids = [str(uuid.uuid4()) for _ in docs]
# Wrap every generated question in a Document that carries its source chunk's id in metadata
question_docs = []
for i, question_list in enumerate(hypothetical_questions):
    question_docs.extend([Document(page_content=s, metadata={id_key: doc_ids[i]}) for s in question_list])

retriever.vectorstore.add_documents(question_docs)  # store the question documents in the vector store
retriever.docstore.mset(list(zip(doc_ids, docs)))  # store the original chunks in the doc store under the same ids

# 6. Retrieval
# Searching the vector store directly returns the question documents themselves,
# so the first print shows a generated question.
sub_docs = retriever.vectorstore.similarity_search("deepseek受到哪些攻击？")
print("问题：deepseek受到哪些攻击？")
print("similarity_search检索结果：")
print(sub_docs[0].page_content)

# Same query through the retriever
# (presumably resolves the matched question's doc_id to the original chunk — confirm against the retriever docs).
retrieved_docs = retriever.invoke("deepseek受到哪些攻击？")
print("问题：deepseek受到哪些攻击？")
print("invoke检索结果：")
print(retrieved_docs[0].page_content)

# 7. Demo case: three plain-string documents about team collaboration.
#    Note that `docs` is rebound here from the file chunks to these strings.
docs = [
    """团队协作中的常见障碍：
        沟通不畅：团队成员之间缺乏有效的沟通，导致信息传递不准确或不及时。
        目标不明确：团队成员对共同目标的理解不一致，导致工作方向不一致。
        缺乏信任：团队成员之间缺乏信任，导致合作效率低下。
        资源分配不均：团队资源分配不合理，导致部分成员工作量过大，部分成员闲置。""",

    """提高团队协作效率的策略：
        明确目标与分工：确保每个团队成员都清楚团队的共同目标和自己的具体任务。
        建立有效的沟通机制：定期召开团队会议，使用协作工具（如Slack、Microsoft Teams）保持实时沟通。
        培养团队信任：通过团队建设活动和透明的沟通机制，增强团队成员之间的信任。
        合理分配资源：根据团队成员的技能和工作量，合理分配任务和资源。""",

    """远程团队协作的最佳实践：
        使用协作工具：利用Zoom、Google Workspace等工具进行远程会议和文档协作。
        定期检查进度：通过定期的进度报告和检查，确保远程团队的工作进展顺利。
        建立明确的沟通规范：制定远程团队的沟通规则，确保信息传递的准确性和及时性。"""
]

# Embed the documents and the query, then compute the cosine similarity
# dot(A, B) / (|A| * |B|) between the query and each document.
query = "团队沟通不畅怎么办"
doc_embeddings = embeddings_model.embed_documents(docs)
query_embedding = embeddings_model.embed_query(query)
similarities = [dot(query_embedding, doc_embedding) / (norm(query_embedding) * norm(doc_embedding))
                for doc_embedding in doc_embeddings]

# 8. Without hypothetical questions: print the query-to-document similarities
print("==========================未使用假设性问题索引==========================")
for i, similarity in enumerate(similarities):
    print(f"第{i + 1}个相似度：", similarity)

# 9. With hypothetical questions
# The inputs are plain strings here, so the chain passes each one through as-is
# instead of reading .page_content.
chain = (
        {"doc": lambda x: x}
        | prompt
        | llm.with_structured_output(HypotheticalQuestions)
        | (lambda x: x.questions)
)

# One list of generated questions per document (at most 5 concurrent calls)
hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})
for q in hypothetical_questions:
    print(q)
    print("=" * 150)

# For each document, embed its generated questions and print the list of
# cosine similarities between the query and those questions.
print("==========================使用假设性问题索引==========================")
for hypothetical_question in hypothetical_questions:
    hypothetical_question_embeddings = embeddings_model.embed_documents(
        [question for question in hypothetical_question])
    similarities = [
        dot(query_embedding, hypothetical_embedding) / (norm(query_embedding) * norm(hypothetical_embedding)) for
        hypothetical_embedding in hypothetical_question_embeddings]
    print(similarities)


In [None]:
# 4.元数据
import os

from langchain_chroma import Chroma
from langchain_classic.retrievers import SelfQueryRetriever
from langchain_community.chat_models.tongyi import ChatTongyi
from langchain_community.embeddings.dashscope import DashScopeEmbeddings
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI


# Pretty-print retrieved documents
def pretty_print_docs(docs):
    """Print every document's ``page_content`` under a numbered heading.

    Consecutive documents are separated by a line of 100 dashes. An empty
    ``docs`` prints a single blank line.
    """
    rule = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(rule.join(sections))


# 1. Path of the source text file (not referenced again in this cell)
RESOURCE_DIR = "../../data/base/resources"
TXT_DOCUMENT_PATH = os.path.join(RESOURCE_DIR, "deepseek百度百科.txt")

# 2. Models (author's note: model capability matters a lot for this technique).
#    The chat model is DeepSeek reached through the OpenAI-compatible client;
#    the API key is read from the DEEPSEEK_API_KEY environment variable.
llm = ChatOpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"),
                 base_url="https://api.deepseek.com/v1",
                 model="deepseek-chat")
embeddings_model = DashScopeEmbeddings(model="text-embedding-v1")

# 3. Sample data: product names, each with brand (品牌), price (价格) and rating (评分) metadata
docs = [
    Document(
        page_content="小米智能手环6",
        metadata={"品牌": "小米", "价格": 249, "评分": 4.6}
    ),
    Document(
        page_content="华为FreeBuds Pro无线耳机",
        metadata={"品牌": "华为", "价格": 999, "评分": 4.8}
    ),
    Document(
        page_content="小米移动电源3",
        metadata={"品牌": "小米", "价格": 99, "评分": 4.4}
    ),
    Document(
        page_content="华为Mate 40 Pro智能手机",
        metadata={"品牌": "华为", "价格": 6999, "评分": 5.0}
    ),
    Document(
        page_content="小米AirDots Pro蓝牙耳机",
        metadata={"品牌": "小米", "价格": 299, "评分": 4.5}
    ),
    Document(
        page_content="华为智能手表GT 2",
        metadata={"品牌": "华为", "价格": 1288, "评分": 4.7}
    ),
    Document(
        page_content="小米小爱音箱Play",
        metadata={"品牌": "小米", "价格": 169, "评分": 4.3}
    )
]

# 4. Build the metadata index
# Describe each metadata field (name, type, description) for the self-query retriever.
metadata_field_info = [
    {"name": "品牌", "type": "string", "description": "产品的品牌名称"},
    {"name": "价格", "type": "integer", "description": "产品的价格"},
    {"name": "评分", "type": "float", "description": "产品的用户评分"},
]

# Description of the document contents (guides the LLM's understanding of the documents)
document_content_description = "电子产品的信息"
# Embed the sample documents into a Chroma collection named "self-query"
vectorstore = Chroma.from_documents(docs, embeddings_model, collection_name="self-query")

# 5. Create the self-query retriever (the core component)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
)

# 6. Retrieve; the query asks for Huawei products priced above 5000
pretty_print_docs(retriever.invoke("华为价格5000以上的商品"))
