In [None]:
%pip install -qU langchain langchain-community faiss-cpu sentence-transformers

In [None]:
import os, warnings, logging


# Switch LangChain tracing off and discard any related settings that were
# inherited from the surrounding environment.
os.environ["LANGCHAIN_TRACING_V2"] = "false"
for stale_key in ("LANGCHAIN_API_KEY", "LANGCHAIN_ENDPOINT", "LANGCHAIN_PROJECT"):
    os.environ.pop(stale_key, None)

# Optional: reduce log noise.
for noisy_category in (UserWarning, DeprecationWarning):
    warnings.filterwarnings("ignore", category=noisy_category)
for noisy_logger in ("langchain", "langsmith"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)

In [None]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Raise instead of calling exit(1): an exception stops this cell (and a
# "Run All") with a clear message.
if not os.getenv("GITHUB_TOKEN"):
    raise RuntimeError("Error: GITHUB_TOKEN is not set in .env file")

# Re-export the settings read by this notebook. Unset values are skipped
# because os.environ only accepts strings; assigning None raises TypeError.
for setting_name in ("GITHUB_TOKEN", "GITHUB_MODEL", "OPENAI_BASE_URL"):
    setting_value = os.getenv(setting_name)
    if setting_value is not None:
        os.environ[setting_name] = setting_value

# Optional: reduce unnecessary logs by keeping tracing disabled.
# (The previous value "true" re-enabled tracing, contradicting this intent.)
os.environ["LANGCHAIN_TRACING_V2"] = "false"

In [None]:
from langchain_community.document_loaders import TextLoader

# Read the source text from disk with LangChain's plain-text loader.
loader = TextLoader("data/kongyiji.txt", encoding="utf-8")
docs = loader.load()

print(f"加载了 {len(docs)} 个 Document")

# Peek at the first document: its first 200 characters, then its metadata.
first_doc = docs[0]
print(first_doc.page_content[:200])
print(first_doc.metadata)


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Separator candidates, listed from coarsest to finest: paragraph break,
# line break, Chinese full stop, Chinese comma, space, empty string.
chunk_separators = ["\n\n", "\n", "。", "，", " ", ""]

# Chunking settings: size 600 with an overlap of 80.
splitter = RecursiveCharacterTextSplitter(
    separators=chunk_separators,
    chunk_overlap=80,
    chunk_size=600,
)

split_docs = splitter.split_documents(docs)

# Report how many chunks were produced and show the first one in full.
print(f"切分后共 {len(split_docs)} 段")
first_chunk = split_docs[0]
print(first_chunk.page_content)


In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Embed the chunks and index them in a FAISS vector store.
# NOTE(review): all-MiniLM-L6-v2 is, as far as I know, trained mainly on
# English data, while the query used in this notebook is Chinese; confirm the
# retrieval quality is acceptable or evaluate a multilingual model.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
emb = HuggingFaceEmbeddings(model_name=embedding_model_name)
vectorstore = FAISS.from_documents(split_docs, emb)

# The retriever is configured with k=4 in its search arguments.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))

print("向量库构建完成 ✅")


In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI
import os

# Chat model reached through the GitHub Models inference endpoint,
# authenticated with the GITHUB_TOKEN environment variable.
llm = ChatOpenAI(
    model="gpt-4o-mini",  # can be replaced with another GitHub Models model name
    temperature=0,
    base_url="https://models.github.ai/inference",  # required when using GitHub Models
    api_key=os.environ["GITHUB_TOKEN"],
)

# Wrap the base retriever in a multi-query retriever driven by the LLM;
# include_original=True is passed so the original query is used as well.
mq = MultiQueryRetriever.from_llm(retriever=retriever, llm=llm, include_original=True)

query = "根据文本，谈一下孔乙己的性格特点"
# Use invoke(), the current retriever entry point; get_relevant_documents()
# is deprecated.
results = mq.invoke(query)

# Show a 100-character preview of every retrieved chunk.
print(f"检索到 {len(results)} 个文档片段：")
for i, d in enumerate(results, 1):
    print(f"[{i}] {d.page_content[:100]}...")
