In [19]:
import os

import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the source page (the Project Gutenberg HTML edition of "Journey to the
# West" is used as the example); swap in a different URL to index another page.
source_url = "https://www.gutenberg.org/cache/epub/23962/pg23962-images.html"
loader = WebBaseLoader(source_url)
docs = loader.load()

# Keep the body text of the first loaded document.
content = docs[0].page_content



In [20]:
# Sanity check: show the first 100 characters of the first loaded document.
preview = docs[0].page_content[:100]
print(preview)


The Project Gutenberg eBook of 西遊記, by Cheng'en Wu


















The Project Gutenberg eBook of


In [35]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200,
# add_start_index=True) so they can be embedded and retrieved individually.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
all_splits = text_splitter.split_documents(docs)

In [None]:
!pip install sentence-transformers

In [28]:
!pip install hf_xet

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting hf_xet
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a1/de/00b2e2568a39c01b0e013db3300f4d5841f2e597d7b0518923c7881bd166/hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: hf_xet
Successfully installed hf_xet-1.0.3
[0m

In [38]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Lightweight multilingual embedding model that supports Chinese and English.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# Embed every chunk and store it in the "xiyouji" Chroma collection.
# persist_directory is where the data is saved locally; remove it if not necessary.
# NOTE(review): re-running this cell against the same persist_directory may add
# the same chunks again - confirm, and clear the directory if duplicates appear.
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=embedding_model,
    collection_name="xiyouji",
    persist_directory="./chroma_langchain_db",
)

In [33]:
# Wrap the vector store in a retriever that runs a similarity search with k=6,
# used to fetch the documents most relevant to a query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

In [47]:
# Configure the chat model through ChatOpenAI, pointed at DeepSeek's API endpoint.
#
# The API key is read from the DEEPSEEK_API_KEY environment variable instead of
# being hardcoded: a key committed in a notebook is exposed to everyone who can
# read the file. The key that was previously embedded here should be treated as
# compromised and revoked.
deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY")
if not deepseek_api_key:
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set; export it in the environment before running this cell."
    )

llm = ChatOpenAI(
    model="deepseek-chat",  # model name specified by DeepSeek
    base_url="https://api.deepseek.com",  # DeepSeek API endpoint
    api_key=deepseek_api_key,
    temperature=0.5,
    max_tokens=4000,
)

In [41]:
# Pull the RAG prompt template from the hub by its identifier.
rag_prompt_id = "rlm/rag-prompt"
prompt = hub.pull(rag_prompt_id)



In [21]:
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by a blank line
        (``"\\n\\n"``). An empty iterable yields an empty string.
    """
    separator = "\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

In [48]:
# Build the RAG pipeline: the incoming question is passed through unchanged as
# "question", while "context" is produced by piping the question through the
# retriever and then format_docs. Both feed the prompt, then the LLM, and the
# model output is parsed to a plain string.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [49]:
# Stream the answer, printing each piece as soon as it arrives.
question = "西游记有几个人"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

《西游记》主要人物有四个：唐僧（唐三藏）、孙悟空、猪八戒（悟能）和沙僧（悟净）。

In [50]:
# Stream the answer, printing each piece as soon as it arrives.
question = "西游记有多少个妖怪"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

根据提供的上下文，无法确定《西游记》中妖怪的具体数量。文中提到了多个妖怪（如青毛狮子怪、黄牙老象、大鹏雕等），但并未统计总数。建议查阅完整原著或相关研究资料以获取准确数字。

In [51]:
# Stream the answer, printing each piece as soon as it arrives.
question = "西游记的最后一回讲的是什么"
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)

西游记的最后一回讲的是唐僧师徒历经九九八十一难后到达西天，取得真经并返回东土大唐的故事。他们最终修成正果，被封为佛、菩萨等果位，完成了取经的使命。