In [None]:
pip install langchain_huggingface

In [None]:
pip install tabulate

In [57]:
from model import RagEmbedding, RagLLM, QwenLLM
from doc_parse import chunk, read_and_process_excel, logger

In [18]:
import pandas as pd
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb
import shutil

In [30]:
# Directory where the Chroma store is persisted, and the collection name used inside it.
PERSIST_DIRECTORY = "./chroma_db/zhidu_db"
COLLECTION_NAME = "zhidu_db"

In [31]:
# Input documents to index: PDFs are parsed with `chunk`, Excel files with `read_and_process_excel`.
pdf_files = ["./data/zhidu_employee.pdf", "./data/zhidu_travel.pdf"]
excel_files = ["./data/zhidu_detail.xlsx"]

In [32]:
# Splitter for long text chunks: pieces of up to 128 characters with 30
# characters of overlap. Separators are tried in order, from paragraph breaks
# down to ASCII and full-width punctuation.
r_spliter = RecursiveCharacterTextSplitter(
    chunk_size=128,
    chunk_overlap=30,
    separators=[
        "\n\n",
        "\n",
        ".",
        "\uff0e",  # full-width full stop
        "\u3002",  # ideographic full stop
        ",",
        "\uff0c",  # full-width comma
        "\u3001",  # ideographic (enumeration) comma
    ],
)

In [None]:
# Collect text chunks from every PDF. Long chunks that are not tables are split
# further with `r_spliter`; tables and short chunks are kept whole.
doc_data = []
for pdf_file_name in pdf_files:
    res = chunk(pdf_file_name, callback=logger)
    for data in res:
        content = data["content_with_weight"]
        if '<table>' not in content and len(content) > 200:
            # extend() grows the list in place instead of copying it on every split.
            doc_data.extend(r_spliter.split_text(content))
        else:
            doc_data.append(content)

In [None]:
# Print every chunk next to its length so oversized or odd chunks stand out.
for text in doc_data:
    print(len(text), "=" * 10, text)

In [35]:
# Turn each Excel file into one markdown-table chunk and add it to doc_data.
for excel_file_name in excel_files:
    data = read_and_process_excel(excel_file_name)
    # Entry 7 supplies the column labels; everything after it is used as the rows.
    df = pd.DataFrame(columns=data[7], data=data[8:])
    # Drop the columns at positions 11..16 before rendering.
    data_excel = df.drop(columns=df.columns[11:17])
    markdown_table = data_excel.to_markdown(index=False)
    # All spaces are stripped from the rendered table before it is stored.
    doc_data.append(markdown_table.replace(' ', ""))

In [36]:
from langchain_core.documents import Document

# Wrap every text chunk in a LangChain Document. The comprehension variable is
# deliberately not named `chunk`: a module-level loop variable with that name
# would overwrite the `chunk` function imported from doc_parse and break a
# re-run of the PDF parsing cell.
documents = [
    Document(page_content=chunk_text, metadata={"source": "test"})
    for chunk_text in doc_data
]

In [None]:
# Embedding model (BAAI/bge-m3 on CPU) handed to Chroma for indexing and retrieval.
embedding_cls = RagEmbedding(model_name="BAAI/bge-m3", device="cpu")

In [None]:
# Embed all documents and store them in the named Chroma collection.
# NOTE(review): re-running this cell presumably adds the same documents to the
# persisted collection again — confirm, and clear PERSIST_DIRECTORY first if so.
embedding_db = Chroma.from_documents(
    documents=documents,
    embedding=embedding_cls,
    persist_directory=PERSIST_DIRECTORY,  # store in a local directory
    collection_name=COLLECTION_NAME
)

In [11]:
# Sample question (about lateness rules) used to sanity-check retrieval.
query = "迟到有什么规定？"

In [12]:
# Retrieve the 2 chunks most similar to the sample question.
related_docs = embedding_db.similarity_search(query, k=2)

In [None]:
# Display the retrieved documents.
related_docs

RAG问答流程

In [19]:
# LLM client that is called with the filled-in prompt to generate answers.
llm = RagLLM()

In [20]:
# Prompt for the employee-assistant role. The {question} and {context}
# placeholders are filled with str.replace (not str.format) by the pipeline functions.
prompt_template = """
    你是企业员工助手，熟悉公司考勤和报销标准等规章制度，需要根据提供的上下文信息context来回答员工的提问。\
    请直接回答问题，如果上下文信息context没有和问题相关的信息，请直接回答[不知道,请咨询HR] \
    问题：{question} 
    "{context}"
    回答：
"""

定义RAG问答流程

In [27]:
def run_rag_pipline(querys, k=3):
    """Answer each question with retrieval-augmented generation, streaming the output.

    For every question the `k` most similar chunks are fetched from
    `embedding_db`, inserted into `prompt_template` together with the question,
    and the LLM response is printed piece by piece as it streams in.

    Args:
        querys: Iterable of question strings. A single string is also accepted
            and treated as one question.
        k: Number of chunks to retrieve per question.

    Returns:
        None. Everything is written to stdout.
    """
    if isinstance(querys, str):
        # Iterating a bare string would issue one query per character.
        querys = [querys]

    for query in querys:
        related_docs = embedding_db.similarity_search(query, k=k)
        context = "\n".join(
            f"上下文{i+1}: {doc.page_content} \n"
            for i, doc in enumerate(related_docs)
        )

        print()
        print("#"*100)
        print(f"query: {query}")
        print(f"context: {context}")
        # The template's placeholder is {question}; replacing any other name
        # leaves it unfilled and the model never sees the actual question.
        llm_prompt = prompt_template.replace("{question}", query).replace("{context}", context)
        response = llm(llm_prompt, stream=True)
        print("response: ")

        # Local name differs from `chunk` to avoid confusion with the doc_parse helper.
        for piece in response:
            print(piece.choices[0].text, end="", flush=True)

In [None]:
# Try the streaming pipeline on a question about overtime pay.
run_rag_pipline(["加班有加班费吗？"])

In [None]:
# Try the streaming pipeline on a question about accident insurance for business trips.
run_rag_pipline(["出差可以买意外险吗？需要自己购买吗？"])

Ragas 评估框架

In [39]:
# Re-create the answer-generation LLM client for the evaluation section
# (same constructor call as in the RAG Q&A section above).
llm = RagLLM()

In [40]:
# Prompt used when generating answers for evaluation. The {question} and
# {context} placeholders are filled with str.replace in
# run_rag_pipline_without_stream.
prompt_template = """
    你是企业员工助手，熟悉公司考勤和报销标准等规章制度，需要根据提供的上下文信息context来回答员工的提问。\
    请直接回答问题，如果上下文信息context没有和问题相关的信息，请直接回答[不知道,请咨询HR] \
    问题：{question} 
    "{context}"
    回答：
"""

In [23]:
def run_rag_pipline_without_stream(query, k=3):
    """Run one question through the RAG pipeline and return the complete answer.

    Args:
        query: The question text.
        k: Number of chunks to retrieve from `embedding_db`.

    Returns:
        A `(response, context_list)` tuple: whatever `llm` returns for a
        non-streaming call, and the list of labelled context strings that were
        joined into the prompt.
    """
    hits = embedding_db.similarity_search(query, k=k)

    # Label each retrieved chunk with its 1-based position.
    context_list = []
    for i, doc in enumerate(hits):
        context_list.append(f"上下文{i+1}: {doc.page_content} \n")

    joined_context = "\n".join(context_list)

    # Fill the question first, then the context, mirroring the replace order
    # the template has always been filled in.
    filled_prompt = prompt_template.replace("{question}", query)
    filled_prompt = filled_prompt.replace("{context}", joined_context)

    return llm(filled_prompt, stream=False), context_list

构建评估数据集
- 问题
- 标准答案
- 上下文信息
- 生成的答案

In [41]:
# Evaluation questions and their reference answers; the two lists are
# index-aligned (entry i of ground_truths answers entry i of questions).
questions = [
    "伙食补助费标准是什么?",
    "出差可以买意外保险吗？需要自己购买吗",
]
ground_truths = [
    "伙食补助费标准: 西藏、青海、新疆 120元/人、天 其他省份 100元/人、天",
    "出差可以购买交通意外保险，由单位统一购买，不再重复购买",
]

In [42]:
# Accumulators for the generated answers and the context strings retrieved per question.
answers = []
contexts = []

In [43]:
# Generate an answer for every evaluation question. The result lists are reset
# here so that re-running this cell does not append duplicates and leave
# `answers` / `contexts` longer than `questions`.
answers = []
contexts = []
for query in questions:
    response, context_list = run_rag_pipline_without_stream(query, k=3)
    answers.append(response)
    contexts.append(context_list)

In [44]:
from datasets import Dataset

In [54]:
# Evaluation records keyed by one consistent set of Ragas column names
# (user_input / response / retrieved_contexts / reference) instead of a mix of
# current and legacy names.
data = {
    "user_input": questions,
    "response": answers,
    "retrieved_contexts": contexts,
    "reference": ground_truths
}

In [55]:
# Build a Hugging Face Dataset from the column dict.
dataset = Dataset.from_dict(data)

In [None]:
pip install ragas

In [47]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision
)

In [48]:
from ragas import evaluate
from ragas import RunConfig

In [None]:
# Judge LLM and embedding model that are passed to Ragas `evaluate` below.
eval_llm = QwenLLM()
embedding_cls = RagEmbedding(model_name="BAAI/bge-m3", device="cpu")

In [50]:
# Run settings for the evaluation: timeout of 1200 (presumably seconds — confirm
# against the Ragas RunConfig docs) with the log_tenacity flag switched on.
config = RunConfig(timeout=1200, log_tenacity = True)

In [58]:
# Score the dataset on the four imported Ragas metrics using the judge LLM and
# embedding model. raise_exceptions=True is passed so that failures are raised
# rather than suppressed (per the flag name — confirm against the Ragas docs).
result = evaluate(
    dataset = dataset,
    llm = eval_llm,
    embeddings = embedding_cls,
    metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
    run_config=config,
    raise_exceptions=True
)

# Convert the evaluation result to a DataFrame and print it.
df = result.to_pandas()
print("评估结果：")
print(df)

Evaluating: 100%|██████████| 8/8 [04:54<00:00, 36.81s/it]


评估结果：
           user_input                                 retrieved_contexts  \
0         伙食补助费标准是什么?  [上下文1: <table><caption>伙食补助费参考以下标准：</caption>\...   
1  出差可以买意外保险吗？需要自己购买吗  [上下文1: 差旅费用标准\n差旅费开支范围包括工作人员临时到常驻地以外地区公务出差所发生的...   

                                            response  \
0      伙食补助费的标准如下：在西藏、青海、新疆地区，伙食补助费为120元/人、天；在其他省...   
1  根据上下文信息，乘坐飞机、火车、轮船等交通工具的，每人次可以购买交通意外保险一份。由公司统一...   

                                  reference  faithfulness  answer_relevancy  \
0  伙食补助费标准: 西藏、青海、新疆 120元/人、天 其他省份 100元/人、天           1.0          0.661572   
1               出差可以购买交通意外保险，由单位统一购买，不再重复购买           1.0          0.000000   

   context_recall  context_precision  
0             1.0                1.0  
1             1.0                1.0  
