In [None]:
# Install the notebook's dependencies into the running kernel's environment
# (-q: quiet output, -U: upgrade each package to the newest available release).
# NOTE(review): no versions are pinned, so a later re-run may pull different
# releases than the ones this notebook was written against — consider pinning.
%pip install -qU langchain langchain-community faiss-cpu sentence-transformers langchain-litellm litellm pypdf


In [None]:
import os, warnings, logging
# Suppress every Python warning for the rest of the session (keeps cell output clean).
warnings.filterwarnings("ignore")

# Presumably switches off LangSmith tracing: force the V2 flag to "false" and
# drop any related settings inherited from the shell.
os.environ["LANGCHAIN_TRACING_V2"] = "false"
for tracing_var in ("LANGCHAIN_API_KEY", "LANGCHAIN_ENDPOINT", "LANGCHAIN_PROJECT"):
    os.environ.pop(tracing_var, None)

# Only let ERROR-level (and above) records through from these two libraries.
for noisy_logger in ("langchain", "langsmith"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)

# Choose the LLM backend (pick one of the two):
USE_OLLAMA = False           # local LLM (requires Ollama installed locally with the model pulled)
USE_GITHUB_MODELS = True     # GitHub Models (free quota; requires GITHUB_TOKEN)

# GitHub Models authenticates with a PAT (scopes: models or models:read).
# The token must come from the environment. A fake placeholder is deliberately
# NOT written into GITHUB_TOKEN any more: it hid the missing credential until
# the first request failed with an opaque auth error, and it invited pasting a
# real secret into the notebook.
if USE_GITHUB_MODELS and not os.environ.get("GITHUB_TOKEN"):
    import sys

    # stderr keeps the notice visually distinct from normal cell output.
    print(
        "未检测到 GITHUB_TOKEN：请在启动 Jupyter 之前通过环境变量设置 GitHub PAT"
        "（scopes: models 或 models:read），不要把令牌直接写进 notebook。",
        file=sys.stderr,
    )


In [None]:
from pathlib import Path
from langchain_community.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Root folder of the private corpus; txt, md and pdf files may be mixed inside it.
DATA_DIR = Path("data") / "private_corpus"

def load_corpus(data_dir: Path):
    """Load every ``.txt``, ``.md`` and ``.pdf`` file found under ``data_dir``.

    Files are discovered recursively. Text files (txt, then md) are read as
    UTF-8 through ``TextLoader``; PDFs go through ``PyPDFLoader``. Within each
    file type the paths are sorted, so the order of the returned documents —
    and therefore the chunk ids assigned downstream — is reproducible across
    runs instead of depending on filesystem enumeration order.

    Args:
        data_dir: Root directory of the corpus (a ``Path`` or a path string).

    Returns:
        A flat list with the documents produced by the loaders, grouped as
        txt → md → pdf.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist or is not a directory.
            Previously this case silently produced an empty corpus that only
            failed later with an unrelated-looking error.
    """
    root = Path(data_dir)
    if not root.is_dir():
        raise FileNotFoundError(f"私域数据目录不存在：{root}（请先创建该目录并放入 txt/md/pdf 文件）")

    docs = []
    # Plain-text formats share one loader configuration.
    for pattern in ("*.txt", "*.md"):
        for path in sorted(root.rglob(pattern)):
            # rglob matches names only; skip directories that happen to match.
            if path.is_file():
                docs.extend(TextLoader(str(path), encoding="utf-8").load())
    # PDFs (optional).
    for path in sorted(root.rglob("*.pdf")):
        if path.is_file():
            docs.extend(PyPDFLoader(str(path)).load())
    return docs

raw_docs = load_corpus(DATA_DIR)
print(f"加载原始文档数：{len(raw_docs)}")

# Split into chunks of at most 300 characters with a 60-character overlap.
# The separator list is tried in order: paragraph break, line break, then
# Chinese full stop / semicolon / comma, then space, and finally a plain
# character-level split as the last resort.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=60,
    separators=["\n\n", "\n", "。", "；", "，", " ", ""],
)
docs = splitter.split_documents(raw_docs)

# Tag every chunk with its position so answers can cite it.
for i, d in enumerate(docs):
    d.metadata["chunk_id"] = i

print(f"切分后文档块数：{len(docs)}")

# Fail here with an actionable message instead of a bare IndexError on docs[0]
# (and an equally unhelpful failure when the index is built from nothing).
if not docs:
    raise ValueError(
        f"未从 {DATA_DIR} 加载到任何可用文本：请确认该目录下存在非空的 txt/md/pdf 文件。"
    )

# Preview the first chunk: its metadata and the first 120 characters.
print(docs[0].metadata, docs[0].page_content[:120], "...")


In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding model. The previous choice, all-MiniLM-L6-v2, is trained on English
# text, while this notebook's prompts, splitter separators and sample question
# are Chinese. A multilingual sibling of the same family is used instead so
# that Chinese queries and chunks are embedded meaningfully.
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

emb = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
vectorstore = FAISS.from_documents(docs, emb)

# Optional: persist the index to disk.
# NOTE(review): an index saved with a different embedding model must be rebuilt,
# not reloaded — this cell always rebuilds it from `docs`.
INDEX_DIR = "faiss_index_private"
vectorstore.save_local(INDEX_DIR)

# Retrieve the 6 nearest chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})
print("FAISS 索引构建完成 ✅")


In [None]:
from langchain.schema import BaseMessage
from typing import List

# Build the chat model for the backend selected by the USE_* flags.
#
# GitHub Models is checked first on purpose: when both flags were True the
# GitHub Models client used to overwrite the Ollama one, so the resulting `llm`
# is unchanged in that case. When neither flag is set we now stop here with a
# clear message instead of leaving `llm` undefined (NameError in a later cell).

# B) GitHub Models (OpenAI-compatible; routed through LiteLLM so the /inference path is used)
if USE_GITHUB_MODELS:
    from langchain_litellm import ChatLiteLLM
    import os

    github_token = os.environ.get("GITHUB_TOKEN", "")
    # Reject a missing token and the "ghp_xxx..." template value up front;
    # otherwise the problem only shows up as an auth failure on the first call.
    if not github_token or github_token.startswith("ghp_xxx"):
        raise RuntimeError(
            "未检测到有效的 GITHUB_TOKEN：请先通过环境变量设置 GitHub PAT"
            "（scopes: models 或 models:read）。"
        )

    llm = ChatLiteLLM(
        model="openai/gpt-4o-mini",                         # can be swapped for another model from the GitHub Models catalog
        api_base="https://models.github.ai/inference",      # important: the /inference path
        api_key=github_token,
        temperature=0,
    )
    print("LLM: GitHub Models / gpt-4o-mini")

# A) Ollama (requires a local Ollama install with the model already pulled)
elif USE_OLLAMA:
    from langchain_community.chat_models import ChatOllama

    llm = ChatOllama(model="llama3", temperature=0)   # can be swapped for qwen2.5, mistral, etc.
    print("LLM: Ollama/llama3")

else:
    raise ValueError("未选择 LLM：请将 USE_OLLAMA 或 USE_GITHUB_MODELS 其中之一设为 True。")


In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser

# Multi-query retriever: `llm` is handed to MultiQueryRetriever together with
# the base `retriever`; include_original=True makes the user's original
# question take part in retrieval as well.
mq = MultiQueryRetriever.from_llm(llm=llm, retriever=retriever, include_original=True)

# System prompt (in Chinese, matching the notebook's sample question). It tells
# the model to: (1) answer only from the supplied context and reply "我不确定"
# when the context is insufficient, (2) keep answers short and bulleted, and
# (3) end with a citation list in the form [source: <file name>#<chunk_id>].
# Fixed a typo: the traditional-script "知識" is now "知识", consistent with
# the simplified Chinese used everywhere else in the prompt.
SYSTEM_PROMPT = (
    "你是一个严谨的企业内知识助手。严格遵守：\n"
    "1) **仅根据提供的上下文**回答；若无法从上下文中确定答案，必须回答“我不确定”。\n"
    "2) 回答尽量简洁、分点描述。\n"
    "3) 在答案末尾给出引用列表，格式：[source: 文件名#chunk_id]，可包含多个。\n"
)

# Human turn: the user's question, then the retrieved context, then a cue to answer.
_HUMAN_TEMPLATE = "用户问题：{question}\n\n上下文：\n{context}\n\n请作答："

# Two-message chat prompt: fixed system rules + the templated human turn.
QA_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_PROMPT),
        ("human", _HUMAN_TEMPLATE),
    ]
)

def format_docs(docs):
    """Render retrieved chunks as the context text the model reads.

    Each chunk becomes a header line ``[<chunk_id>] (<source>)`` followed by
    the chunk text; chunks are separated by a blank line. A missing source is
    shown as ``unknown``.
    """
    rendered = []
    for chunk in docs:
        meta = chunk.metadata
        header = f"[{meta.get('chunk_id')}] ({meta.get('source', 'unknown')})"
        rendered.append(f"{header}\n{chunk.page_content}")
    return "\n\n".join(rendered)

def citations(docs):
    """Build the de-duplicated citation tags for a list of retrieved chunks.

    Each chunk yields ``[source: <file name>#<chunk_id>]``, where the file name
    is the last component of the chunk's ``source`` metadata (``unknown`` when
    absent).

    Tags are ordered by file name and then by chunk id. Integer chunk ids are
    compared numerically, so ``#2`` comes before ``#10``; the previous plain
    string sort put ``#10`` first. Non-integer ids (e.g. a missing id, rendered
    as ``None``) sort after the integer ones for the same file.

    Returns:
        A list of unique citation strings.
    """
    sort_keys = {}
    for d in docs:
        name = Path(d.metadata.get("source", "unknown")).name
        cid = d.metadata.get("chunk_id")
        if isinstance(cid, int):
            key = (name, 0, cid, "")
        else:
            key = (name, 1, 0, str(cid))
        # Keyed by the rendered tag, so de-duplication matches what is displayed.
        sort_keys.setdefault(f"[source: {name}#{cid}]", key)
    return sorted(sort_keys, key=sort_keys.get)

# RAG chain (can be replaced with a chat-history-aware variant if needed).
def rag_answer(question: str) -> str:
    """Answer ``question`` from the private corpus and make sure sources are cited.

    Steps: retrieve chunks with the multi-query retriever, render them as
    context, run prompt → LLM → string parser, then append any citation tag the
    model did not already include in its answer.

    Args:
        question: The user's question.

    Returns:
        The model's answer, followed by a "参考来源：" line listing the missing
        citation tags when there are any. If retrieval returns nothing, a fixed
        "not sure" message is returned and the LLM is not called.
    """
    # Multi-query recall (the retriever de-duplicates across generated queries).
    top_docs = mq.invoke(question)
    if not top_docs:
        return "我不确定。未在知识库中检索到相关内容。"

    ctx = format_docs(top_docs)
    answer = (QA_PROMPT | llm | StrOutputParser()).invoke({"question": question, "context": ctx})

    # Append only the tags that are absent from the answer. The earlier check
    # compared the whole space-joined tag string, which practically never
    # appears verbatim, so tags the model had already written were duplicated.
    missing = [tag for tag in citations(top_docs) if tag not in answer]
    if missing:
        cites = " ".join(missing)
        answer = f"{answer}\n\n参考来源：{cites}"
    return answer

# Readiness message: reached only if every statement above in this cell ran without raising.
print("RAG Bot 就绪 ✅")


In [None]:
# Sample question for an end-to-end run of the pipeline; the answer is printed as plain text.
q = "根据语料，三星电子晋升评估中, 绩效占比多少?并给出证据。"
answer = rag_answer(q)
print(answer)
