In [1]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Page to ingest, and a parse filter that keeps only elements whose CSS class is
# post-content, post-title or post-header.
post_urls = ("https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",)
content_filter = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)

loader = WebBaseLoader(
    web_paths=post_urls,
    bs_kwargs={"parse_only": content_filter},
)
documents = loader.load()

# Split the loaded page into overlapping chunks (chunk_size=2000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)

# Display the second chunk as a sanity check.
docs[1]

USER_AGENT environment variable not set, consider setting it to identify your requests.


Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/'}, page_content="Prompt Engineering, also known as In-Context Prompting, refers to methods for how to communicate with LLM to steer its behavior for desired outcomes without updating the model weights. It is an empirical science and the effect of prompt engineering methods can vary a lot among models, thus requiring heavy experimentation and heuristics.\nThis post only focuses on prompt engineering for autoregressive language models, so nothing with Cloze tests, image generation or multimodality models. At its core, the goal of prompt engineering is about alignment and model steerability. Check my previous post on controllable text generation.\n[My personal spicy take] In my opinion, some prompt engineering papers are not worthy 8 pages long, since those tricks can be explained in one or a few sentences and the rest is all about benchmarking. An easy-to-use and shared benchmark infrastructur

In [2]:
# Report how many chunks the splitter produced, then the sizes of the first five.
doc_count = len(docs)
print(f"总文档数量: {doc_count}")
leading_lengths = [len(chunk.page_content) for chunk in docs[:5]]
print(f"前5个文档的长度: {leading_lengths}")

# Preview the first 200 characters of the first chunk.
first_preview = docs[0].page_content[:200]
print(f"\n第一个文档内容前200字符:\n{first_preview}...")


总文档数量: 20
前5个文档的长度: [103, 1964, 193, 1520, 1782]

第一个文档内容前200字符:
Prompt Engineering
    
Date: March 15, 2023  |  Estimated Reading Time: 21 min  |  Author: Lilian Weng...


In [None]:
# 导入必要的库
import sys
sys.path.append("..")
from llm_utils import qwen_embeddings,llm
from langchain_milvus import Milvus



# Index only the first MAX_INDEXED_DOCS chunks. The slice gets its own name so
# that `docs` keeps the full split list and cells that inspect `docs` still show
# the same thing after this cell has run.
MAX_INDEXED_DOCS = 10
docs_to_index = docs[:MAX_INDEXED_DOCS]

# Address of the Milvus server the vector store connects to.
MILVUS_URI = "http://localhost:19530"

# Embed the selected chunks with `qwen_embeddings` and store them in the
# "langchain_example" collection; `vectorstore` is the handle used for search.
vectorstore = Milvus.from_documents(
    documents=docs_to_index,
    embedding=qwen_embeddings,
    connection_args={"uri": MILVUS_URI},
    collection_name="langchain_example",
    # False keeps an existing collection rather than dropping it, so these chunks
    # are added next to whatever is already stored there.
    # NOTE(review): the saved similarity-search output in this notebook returns a
    # chunk from a page this notebook never loads, so the collection evidently
    # already held data. Re-running this cell presumably inserts the same chunks
    # again; switch to drop_old=True for a clean, reproducible rebuild once
    # losing the existing data is confirmed to be acceptable.
    drop_old=False,
)

2025-08-26 16:21:26,902 [DEBUG][_create_connection]: Created new connection using: async-http://localhost:19530 (async_milvus_client.py:599)


In [4]:
# Sample question; it is also the input passed to the RAG chain later in the notebook.
query = "What is self-reflection of an AI Agent?"

# Retrieval check: show only the single closest chunk for the question.
vectorstore.similarity_search(query=query, k=1)

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'pk': 460375432065191400}, page_content='Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\n\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equip agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the 

In [5]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Retriever view of the vector store; the RAG chain uses it to fetch context.
retriever = vectorstore.as_retriever()

# Prompt text sent to the LLM. {context} and {question} are the two placeholders
# filled in at run time.
PROMPT_TEMPLATE = """
Human: You are an AI assistant, and provides answers to questions by using fact based and statistical information when possible.
Use the following pieces of information to provide a concise answer to the question enclosed in <question> tags.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
<context>
{context}
</context>

<question>
{question}
</question>

The response should be specific and use statistics or numbers when possible.

A:"""

# Wrap the raw text in a PromptTemplate that declares both placeholders.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=PROMPT_TEMPLATE,
)


def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values, in input order, separated by a blank line.
        An empty input yields an empty string.
    """
    passages = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(passages)

# Assemble the RAG pipeline. The incoming question feeds two prompt inputs:
# "context" (retriever results passed through format_docs) and "question"
# (the question itself, passed through unchanged).
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# Debugging aid, left disabled: rag_chain.get_graph().print_ascii()

# Run the chain on the sample question and display the parsed answer.
res = rag_chain.invoke(query)
res

'Self-reflection in an AI agent, as described in the Reflexion framework, involves the agent assessing its past actions to improve future decision-making. It utilizes a heuristic function to identify inefficient trajectories or hallucinations, where the agent may encounter consecutive identical actions leading to the same outcome. The self-reflection process includes showing two-shot examples of (failed trajectory, ideal reflection) to the language model, allowing it to extract insights. Agents can store up to three reflections in their working memory to guide future actions. In experiments, hallucination is reported as a more frequent failure than inefficient planning in decision-making tasks like AlfWorld.'