In [None]:
%pip install -qU langchain langchain-openai langchain-community faiss-cpu tiktoken
%pip install -qU sentence-transformers

In [None]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with an exception instead of exit(): inside a notebook, exit() asks the
# kernel to shut down rather than reporting a readable error in the cell output.
if not os.getenv("GITHUB_TOKEN"):
    raise RuntimeError("Error: GITHUB_TOKEN is not set in .env file")

# Optional settings: keep whatever the environment / .env already provides and only
# fill in a fallback when the variable is missing. Assigning os.getenv(...) directly
# would raise TypeError for an unset variable, because os.environ rejects None.
os.environ.setdefault("GITHUB_MODEL", "gpt-4o-mini")
os.environ.setdefault("OPENAI_BASE_URL", "https://models.github.ai/inference")

# Optional: reduce unnecessary log noise by keeping LangChain tracing switched off
# (a value of "true" would turn tracing on, which is the opposite of the intent).
os.environ["LANGCHAIN_TRACING_V2"] = "false"

In [None]:
from pathlib import Path
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Path to the source .txt corpus (example value) — change it to your real path.
TXT_PATH = Path("data/kongyiji.txt")

# Read the whole file as UTF-8; bytes that cannot be decoded are dropped
# (errors="ignore") instead of raising.
text = TXT_PATH.read_text(encoding="utf-8", errors="ignore")

# Splitter configuration: tune chunk_size / chunk_overlap as needed.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,    # characters per chunk
    chunk_overlap=40,  # overlap between neighbouring chunks, helps continuity across boundaries
    separators=["\n\n", "\n", "。", "，", " ", ""],  # works well for mixed Chinese/English text
)

chunks = splitter.split_text(text)

# Wrap every chunk in a Document that records its source file and position.
docs = [
    Document(page_content=chunk, metadata={"source": str(TXT_PATH), "chunk_id": i})
    for i, chunk in enumerate(chunks)
]

# Stop here with a clear message when nothing was produced (empty file, or every
# byte was dropped while decoding): an empty corpus leaves nothing to index later.
if not docs:
    raise ValueError(
        f"No text chunks were produced from {TXT_PATH}; check that the file is non-empty and UTF-8 encoded."
    )

# Summary plus a short preview of the first two chunks.
print(f"原文长度: {len(text)} 字符，切分得到 {len(docs)} 个文档块。示例：\n")
for d in docs[:2]:
    print(d.metadata, d.page_content[:120], "...\n")

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever

# 1) Free local embeddings; build the FAISS vector store from the chunked documents.
# NOTE(review): all-MiniLM-L6-v2 is understood to be trained mainly on English text;
# a multilingual embedding model may retrieve better on this Chinese corpus — confirm.
emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, emb)
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})  # 4 hits per query

# 2) Use GitHub Models as the LLM through the OpenAI-compatible client.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    base_url="https://models.github.ai/inference",  # key point: the new GitHub Models endpoint
    api_key=os.environ["GITHUB_TOKEN"],  # your GitHub personal access token
)

# 3) MultiQueryRetriever: the LLM rewrites the question into several variants, each
#    variant is sent to the base retriever, and include_original=True also runs the
#    original question.
mq = MultiQueryRetriever.from_llm(retriever=retriever, llm=llm, include_original=True)
query = "根据语料，谈下孔乙己的性格特点？"

# invoke() is the Runnable entry point for retrievers; it replaces the deprecated
# get_relevant_documents().
results = mq.invoke(query)
print(f"召回去重：{len(results)}")

for r in results:
    print(r, "...\n")