### 1. 连接大模型

In [1]:
from utils import get_ernie_models
from utils import get_qwen_models

In [2]:
# get_qwen_models() returns three objects, unpacked here in order.
# `chat` is the chat model used in the RAG chain below and `embed` is the
# embedding model handed to the vector store.
# NOTE(review): `llm` is presumably a plain completion model; it is not used
# anywhere in the visible cells - confirm against utils.py.
llm, chat, embed = get_qwen_models()

### 2. 基本RAG系统的构成

### 2.1 引入必要的库和包

In [4]:
# 解析 Web 页面的库（用面向对象的方式来封装 HTML 页面）
import bs4
# hub 生态中心
from langchain import hub
# 引入 Chroma 向量库
from langchain_chroma import Chroma
# 在线加载网页
from langchain_community.document_loaders import WebBaseLoader
# 输出解析器
from langchain_core.output_parsers import StrOutputParser
# 可执行的占位符
from langchain_core.runnables import RunnablePassthrough
# 文档切分器
from langchain_text_splitters import RecursiveCharacterTextSplitter

USER_AGENT environment variable not set, consider setting it to identify your requests.


### 2.2 加载页面

In [5]:
# Configure a web loader for a single blog post.
# BeautifulSoup is told (via parse_only) to keep only the elements whose CSS
# class is "post-content", "post-title" or "post-header".
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header"),
        ),
    },
)

In [6]:
# Load the data: run the loader configured above and keep the resulting
# documents in `docs` (they are split into chunks in the next section).
docs = loader.load()

### 2.3 文本分割

In [35]:
# Recursive, character-level text splitter.
#   CHUNK_SIZE:    suggested (target) size of each chunk
#   CHUNK_OVERLAP: amount of text shared between neighbouring chunks, so that
#                  content cut at a chunk boundary still appears intact in
#                  one of the two chunks
# Both values are named constants so they can be tuned in one place.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
# Split the loaded documents into chunks for embedding.
splits = text_splitter.split_documents(docs)

### 2.4 向量化并入库

In [40]:
# Number of chunks produced by the splitter.
len(splits)

142

In [41]:
# The underlying storage is SQLite; when no storage location is specified, a
# temporary in-memory database is created.
# Only the first 6 chunks are embedded and indexed here; the remaining chunks
# are added in batches by the cells that follow.
# NOTE(review): the batch size of 6 is presumably chosen to stay within a
# per-request limit of the embedding service - confirm.
vectorstore = Chroma.from_documents(documents=splits[:6], embedding=embed)

In [45]:
# Everything after the first 6 chunks (those were already indexed when the
# vector store was created).
my_splits = splits[6:]

In [46]:
# Number of chunks that still have to be added to the vector store.
len(my_splits)

136

In [51]:
# Add the remaining chunks to the vector store in fixed-size batches.
# The batch boundaries are derived from len(my_splits) instead of a
# hard-coded iteration count, so every chunk is added exactly once and no
# empty batch is ever sent, whatever the number of chunks turns out to be.
# NOTE(review): the batch size of 6 presumably mirrors a per-request limit of
# the embedding service - confirm before raising it.
BATCH_SIZE = 6
for start in range(0, len(my_splits), BATCH_SIZE):
    my_docs = my_splits[start:start + BATCH_SIZE]
    vectorstore.add_documents(documents=my_docs)

In [52]:
# Display the vector store object (sanity check that it exists).
vectorstore

<langchain_chroma.vectorstores.Chroma at 0x1c7b5e35310>

### 2.5 RAG系统搭建

In [56]:
# Wrap the vector store's search operations in a basic retriever, created
# with default settings, so it can be used as a step in the chain below.
retriever = vectorstore.as_retriever()

In [59]:
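# Disabled: pulling the prompt from the hub requires a LangSmith API key
# (see the error below), so a hand-written RAG prompt is defined in the next cell instead.
# prompt = hub.pull("rlm/rag-prompt")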

LangSmithUserError: API key must be provided when using hosted LangSmith API

In [78]:
from langchain_core.prompts import ChatPromptTemplate

# Classic RAG prompt (the "Augmented" step of RAG).
# A single human message with two placeholders: {question} receives the
# user's question and {context} receives the retrieved text.
prompt = ChatPromptTemplate.from_messages([
  ("human", """You are an assistant for question-answering tasks. Use the following pieces 
  of retrieved context to answer the question. 
  If you don't know the answer, just say that you don't know. 
  Use three sentences maximum and keep the answer concise.
  Question: {question} 
  Context: {context} 
  Answer:""")
])

In [79]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` of every document, in order, joined by a blank
        line ("\\n\\n"). An empty iterable yields an empty string.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

In [80]:
# RAG chain, composed left to right with the `|` operator:
#   1. build the prompt inputs: "context" is the retriever's result passed
#      through format_docs; "question" is supplied by RunnablePassthrough()
#      (presumably the caller's input forwarded unchanged)
#   2. fill the prompt template with those inputs
#   3. send the filled prompt to the chat model
#   4. run the model output through StrOutputParser (the invoke cells below
#      show a plain string as the result)
rag_chain = (
    {"context": retriever | format_docs, 
     "question": RunnablePassthrough()}
    | prompt
    | chat
    | StrOutputParser()
)

In [81]:
# Ask the RAG chain a question about the indexed blog post's subject matter.
rag_chain.invoke(input="What is Task Decomposition?")

'Task Decomposition is a method of breaking down complex tasks into simpler, manageable subgoals. This can be achieved through various approaches, including prompting a language model, using task-specific instructions, or incorporating human input. It enhances problem-solving by allowing step-by-step processing, facilitating efficient task management, and enabling reflection and refinement for improved outcomes.'

In [82]:
# Ask the RAG chain a question unrelated to the indexed blog post, to see how
# it behaves when the retrieved context is unlikely to contain the answer.
rag_chain.invoke(input="中国和美国哪个人口更多？")

'中国的人口比美国多。'

In [77]:
# Send the same question straight to the chat model (no retrieval, no RAG
# prompt) for comparison with the chain's answer above.
chat.invoke(input="中国和美国哪个人口更多？")

AIMessage(content='中国的人口比美国多。根据2022年的数据，中国的人口约为14.1亿，而美国的人口约为3.3亿。', response_metadata={'model_name': 'qwen-max', 'finish_reason': 'stop', 'request_id': 'fba6f5c2-193f-9a66-8088-b2635222a65d', 'token_usage': {'input_tokens': 16, 'output_tokens': 35, 'total_tokens': 51}}, id='run-bc56b2a0-f300-4b5c-95e7-d8309f3b7c5f-0')