# 加载环境

In [1]:
from langchain.llms.base import LLM
from typing import Any, List, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# 首先导入所需第三方库
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from tqdm import tqdm
import os
# 加载嵌入模型
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
# 建立索引
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate


# 配置LangSmith

In [7]:
import getpass
import os
from uuid import uuid4

# Random 8-hex-character suffix, so each run of this cell sets a distinct project name.
unique_id = uuid4().hex[0:8]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"Tracing Walkthrough - {unique_id}"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# SECURITY: the API key must never be embedded in the notebook. It is taken from the
# LANGCHAIN_API_KEY environment variable when that is already set; otherwise the user
# is prompted for it without echoing. Any key that was previously committed in this
# file should be treated as leaked and revoked.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangSmith API key: ")

from langsmith import Client

# The client is constructed without arguments, after the environment variables above are set.
client = Client()

# 自定义大模型 Internlm-chat-7b-finetuned

In [2]:
class InternLM_LLM(LLM):
    """LangChain ``LLM`` wrapper around a locally stored InternLM chat model.

    The tokenizer and model are loaded once in ``__init__``; ``_call`` forwards
    each prompt to the model's ``chat`` method together with a fixed system
    prompt and returns the generated text.
    """
    # Both attributes are filled in by __init__.
    tokenizer : AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, model_path :str):
        """Load the tokenizer and the model from ``model_path``.

        Args:
            model_path: Path handed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        super().__init__()
        print("正在从本地加载模型...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        # Cast the weights to bfloat16 and move the model to the GPU.
        self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(torch.bfloat16).cuda()
        # Switch to evaluation mode.
        self.model = self.model.eval()
        print("完成本地模型的加载")

    def _call(self, prompt : str, stop: Optional[List[str]] = None,
                run_manager: Optional[CallbackManagerForLLMRun] = None,
                **kwargs: Any) -> str:
        """Generate a reply for ``prompt`` with the wrapped model.

        Args:
            prompt: Text passed to the model's ``chat`` method.
            stop: Optional stop sequences. The reply is truncated at the
                earliest occurrence of any of them (the sequence itself is
                not included). Empty strings in the list are ignored.
            run_manager: Accepted for interface compatibility; not used.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The model's reply, truncated at the first stop sequence if any.
        """
        system_prompt = """You are an AI assistant whose name is InternLM (书生·浦语).
        - InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
        - InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
        """

        # The system prompt is supplied as the first history turn, paired with an empty reply.
        messages = [(system_prompt, '')]
        response, _ = self.model.chat(self.tokenizer, prompt , history=messages)

        # Honour the caller's stop sequences instead of silently ignoring them.
        if stop:
            cut_points = [
                position
                for position in (response.find(token) for token in stop if token)
                if position != -1
            ]
            if cut_points:
                response = response[:min(cut_points)]
        return response

    @property
    def _llm_type(self) -> str:
        """Identifier string reported for this LLM implementation."""
        return "InternLM2"

# Load the LLM
# Alternative checkpoint path, kept for reference:
#llm = InternLM_LLM("/root/share/model_repos/internlm2-chat-7b")
llm = InternLM_LLM("/root/project/bisai2/sft-guanfang-self/merged_7b_e10")

## （可选）离线构建检索库

### 定义加载知识库文件的工具函数

In [None]:
# 获取文件路径函数
def get_files(dir_path):
    """Collect the paths of all ``.md``, ``.txt`` and ``.csv`` files under ``dir_path``.

    The directory tree is traversed with ``os.walk``, so sub-directories are
    included. Each result is the walked directory joined with the file name.

    Args:
        dir_path: Root folder to search.

    Returns:
        A list of matching file paths, in the order ``os.walk`` produced them.
    """
    wanted_suffixes = (".md", ".txt", ".csv")
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(dir_path)
        for name in names
        # str.endswith accepts a tuple and matches any of the suffixes.
        if name.endswith(wanted_suffixes)
    ]

# 加载文件函数
def get_text(dir_path):
    """Load every supported file under ``dir_path`` into a list of documents.

    File paths come from ``get_files``; the text after the last ``.`` in each
    path selects the loader (``md``, ``txt`` or ``csv``). Paths with any other
    suffix are skipped.

    Args:
        dir_path: Root folder containing the knowledge-base files.

    Returns:
        A flat list with everything the loaders' ``load()`` calls returned.
    """
    # Text files are read as UTF-8 explicitly: the knowledge-base files produced in
    # this notebook are written with encoding='utf-8', and relying on the platform
    # default encoding breaks on systems where that default is not UTF-8.
    loader_factories = {
        'md': UnstructuredMarkdownLoader,
        'txt': lambda path: TextLoader(path, encoding='utf-8'),
        'csv': CSVLoader,
    }
    docs = []
    for one_file in tqdm(get_files(dir_path)):
        make_loader = loader_factories.get(one_file.split('.')[-1])
        if make_loader is None:
            # Unsupported file type: skip it.
            continue
        docs.extend(make_loader(one_file).load())
    return docs

## 准备知识库文件

In [None]:
# Knowledge base built from the official evaluation spreadsheet.
# NOTE(review): an earlier note on this cell said to prepend a timestamp to each
# question; the code below does not do that.
import os
from tqdm import tqdm
import pandas as pd
import json

# dtype=str keeps cell contents as text; empty cells still come back as NaN.
df = pd.read_excel('../data/gong_gao_ce_ping.xls', dtype=str)
train_data = []  # not used in this cell
output_dir = './raw_txt_knowledge'
# Create the target folder up front; without it every write below fails.
os.makedirs(output_dir, exist_ok=True)
for index, row in tqdm(df.iterrows(), total=len(df)):
    try:
        question = row['评测问题']
        answer = row['答案']
        # pd.isna catches empty spreadsheet cells (NaN). Formatting the value into an
        # f-string first would turn NaN into the text "nan" and defeat the check.
        if pd.isna(question) or question == "":
            print(row)
            continue
        if pd.isna(answer) or answer == "":
            print(row)
            continue

        conversation = f"问：{question} \n答：{answer}"

        # One text file per spreadsheet row, named after the row index.
        with open(f'{output_dir}/gong_gao_ce_ping_{index}.txt', 'w', encoding='utf-8') as file:
            file.write(conversation)

    except Exception as e:
        # Best-effort conversion: report the offending row and carry on.
        print(row)
        print(e)
        continue

print(f"Conversion complete. Output written to {output_dir}")

In [None]:
# Knowledge base built from LLM-generated question/answer pairs.
import json
import os

# NOTE: relies on `output_dir` having been defined by the earlier knowledge-base cell.
os.makedirs(output_dir, exist_ok=True)

# The whole file is parsed as a single JSON document.
# NOTE(review): the .jsonl extension suggests JSON Lines, but the parsing here expects
# one JSON array of records — confirm against the actual data file.
with open("../data/rag_qa_extend.jsonl", 'r', encoding='utf-8') as file:
    json_str = file.read()
train_json = json.loads(json_str)

index = 0
for qa in train_json:
    # Only the first turn of each conversation is used.
    question = qa['conversation'][0]["input"]
    answer   = qa['conversation'][0]["output"]
    # Report and skip records with a missing question or answer. The record being
    # printed is `qa`; the previous `row` belonged to a different cell's loop.
    if question is None or question == "":
        print(qa)
        continue
    if answer is None or answer == "":
        print(qa)
        continue
    conversation = f"问：{question} \n答：{answer}"

    with open(f'{output_dir}/qa_extend_{index}.txt', 'w', encoding='utf-8') as file:
        file.write(conversation)
    # The counter only advances for records that were actually written.
    index +=1
    

In [None]:
# Knowledge base built from the scraped data (train.jsonl).
import json
import os

# NOTE: relies on `output_dir` having been defined by the earlier knowledge-base cell.
os.makedirs(output_dir, exist_ok=True)

# The whole file is parsed as a single JSON document.
# NOTE(review): the .jsonl extension suggests JSON Lines, but the parsing here expects
# one JSON array of records — confirm against the actual data file.
with open("../data/train.jsonl", 'r', encoding='utf-8') as file:
    json_str = file.read()
train_json = json.loads(json_str)

index = 0
for qa in train_json:
    # Only the first turn of each conversation is used.
    question = qa['conversation'][0]["input"]
    answer   = qa['conversation'][0]["output"]
    # Report and skip records with a missing question or answer.
    if question is None or question == "":
        print(qa)
        continue
    if answer is None or answer == "":
        print(qa)
        continue
    conversation = f"问：{question} \n答：{answer}"

    with open(f'{output_dir}/train_{index}.txt', 'w', encoding='utf-8') as file:
        file.write(conversation)
    # The counter only advances for records that were actually written.
    index +=1

## 创建检索库(如果没有)

In [None]:
# Directories that hold the knowledge-base files
tar_dir = [
    #"/root/project/TinyRAG/data/2024-01-23_10-07-23",
    #"/root/project/bisai2/data/vector_db_txt",
    "./raw_txt_knowledge",
]

# Load every supported file from each directory
docs = []
for dir_path in tar_dir:
    docs.extend(get_text(dir_path))


# Split the loaded documents into chunks (chunk_size=500, chunk_overlap=150)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=150)
documents = text_splitter.split_documents(docs)

# Load the embedding model from a local path
embeddings = HuggingFaceEmbeddings(model_name="/root/model/jinaai/jina-embeddings-v2-base-zh")
#embeddings = HuggingFaceEmbeddings(model_name="/root/data/model/sentence-transformer")


# Build a FAISS index from the chunks and their embeddings
vectordb = FAISS.from_documents(documents, embeddings)
# Persist the index to disk
vectordb.save_local('./vectordb')

## 创建检索库(如果有)

In [None]:
# Load the previously saved index from ./vectordb, using the same embedding model path
# that was used when the index was built.
embeddings = HuggingFaceEmbeddings(model_name="/root/model/jinaai/jina-embeddings-v2-base-zh")
# NOTE: allow_dangerous_deserialization=True is an explicit opt-in; only load index
# files that were created locally and are trusted.
vectordb = FAISS.load_local('./vectordb', embeddings,allow_dangerous_deserialization=True)

In [None]:
# Build a chain that takes a question plus the retrieved documents and generates an answer.
# The template has two placeholders: {context} for the documents and {input} for the question.
prompt = ChatPromptTemplate.from_template("""使用以上下文来回答用户的问题。总是使用中文回答。
<context>
{context}
</context>
如果给定的上下文无法让你做出回答，请对回答的内容以“*”开头，然后根据你的理解作答。
有用的回答: {input}""")

document_chain = create_stuff_documents_chain(llm, prompt)


# For a given question, the retriever selects documents from the vector store and passes them in

retriever = vectordb.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

# Run the chain and print the answer
response = retrieval_chain.invoke({"input": "上汽集团有多少车辆搭载了德威系列的发动机"})
print(response["answer"])

In [8]:
# Build a chain that takes a question plus the retrieved documents and generates an answer.
# This variant of the template additionally tells the model to say so when it does not know.
prompt = ChatPromptTemplate.from_template("""使用以上下文来回答用户的问题。如果你不知道答案，就说你不知道。总是使用中文回答。
<context>
{context}
</context>
如果给定的上下文无法让你做出回答，请对回答的内容以“*”开头，然后根据你的理解作答。
有用的回答: {input}""")

document_chain = create_stuff_documents_chain(llm, prompt)


# For a given question, the retriever selects documents from the vector store and passes them in

retriever = vectordb.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

# Run the chain and print the answer
response = retrieval_chain.invoke({"input": "上汽集团的董事长是谁"})
print(response["answer"])

对不起，我无法回答您的问题，因为我无法找到相关信息。如果您有其他问题，我会很乐意为您提供帮助。 


In [None]:
#  提取问题

In [4]:
# Call the model directly with a plain greeting
llm.invoke("你好")

# Chat-style prompt: a fixed system message followed by the user's input
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are world class technical documentation writer."),
    ("user", "{input}")
])
# Pipe the prompt into the model; the chain is only built here, not invoked
chain = prompt | llm 

'您好！我是一个名叫书生·浦语的AI助手，很高兴为您服务。我致力于通过自然语言处理和深度学习技术，帮助解答您的问题，提供信息和建议。如果您有任何疑问或需要帮助，请随时告诉我。 '

# 离线构建检索库

In [7]:
# 获取文件路径函数
def get_files(dir_path):
    """Collect the paths of all ``.md``, ``.txt`` and ``.csv`` files under ``dir_path``.

    The directory tree is traversed with ``os.walk``, so sub-directories are
    included. Each result is the walked directory joined with the file name.

    Args:
        dir_path: Root folder to search.

    Returns:
        A list of matching file paths, in the order ``os.walk`` produced them.
    """
    wanted_suffixes = (".md", ".txt", ".csv")
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(dir_path)
        for name in names
        # str.endswith accepts a tuple and matches any of the suffixes.
        if name.endswith(wanted_suffixes)
    ]

# 加载文件函数
def get_text(dir_path):
    """Load every supported file under ``dir_path`` into a list of documents.

    File paths come from ``get_files``; the text after the last ``.`` in each
    path selects the loader (``md``, ``txt`` or ``csv``). Paths with any other
    suffix are skipped.

    Args:
        dir_path: Root folder containing the knowledge-base files.

    Returns:
        A flat list with everything the loaders' ``load()`` calls returned.
    """
    # Text files are read as UTF-8 explicitly: the knowledge-base files produced in
    # this notebook are written with encoding='utf-8', and relying on the platform
    # default encoding breaks on systems where that default is not UTF-8.
    loader_factories = {
        'md': UnstructuredMarkdownLoader,
        'txt': lambda path: TextLoader(path, encoding='utf-8'),
        'csv': CSVLoader,
    }
    docs = []
    for one_file in tqdm(get_files(dir_path)):
        make_loader = loader_factories.get(one_file.split('.')[-1])
        if make_loader is None:
            # Unsupported file type: skip it.
            continue
        docs.extend(make_loader(one_file).load())
    return docs

In [42]:


# Target folders
tar_dir = [
    #"/root/project/TinyRAG/data/2024-01-23_10-07-23",
    "/root/project/bisai2/data/vector_db_txt",
]

# Load every supported file from each folder
docs = []
for dir_path in tar_dir:
    docs.extend(get_text(dir_path))


# Split the loaded documents into chunks (chunk_size=500, chunk_overlap=150)
# TODO: find a way to inspect over-long chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=150)
documents = text_splitter.split_documents(docs)




# Load the embedding model from a local path
embeddings = HuggingFaceEmbeddings(model_name="/root/model/jinaai/jina-embeddings-v2-base-zh")
#embeddings = HuggingFaceEmbeddings(model_name="/root/data/model/sentence-transformer")




# Build a FAISS index from the chunks and their embeddings
vectordb = FAISS.from_documents(documents, embeddings)
# Persist the index to disk
vectordb.save_local('./vectordb')

# The string literal below holds a disabled Chroma-based alternative. It is never
# executed; as the last expression in the cell it is only echoed as the cell's output.
# NOTE(review): it refers to `split_docs`, which is not defined in this cell.
'''
# 构建向量数据库
# 定义持久化路径
persist_directory = 'data_base/vector_db/chroma'
# 加载数据库
vectordb = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings,
    persist_directory=persist_directory  # 允许我们将persist_directory目录保存到磁盘上
)
# 将加载的向量数据库持久化到磁盘上
vectordb.persist()
'''


100%|██████████| 4795/4795 [00:03<00:00, 1573.06it/s]
Some weights of BertModel were not initialized from the model checkpoint at /root/model/jinaai/jina-embeddings-v2-base-zh and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.10.intermediate.dense.weight', 'encoder.layer.10.output.LayerNorm.bias', 'encoder.layer.10.output.LayerNorm.weight', 'encoder.layer.10.output.dense.bias', 'encoder.layer.10.output.dense.weight', 'enco

"\n# 构建向量数据库\n# 定义持久化路径\npersist_directory = 'data_base/vector_db/chroma'\n# 加载数据库\nvectordb = Chroma.from_documents(\n    documents=split_docs,\n    embedding=embeddings,\n    persist_directory=persist_directory  # 允许我们将persist_directory目录保存到磁盘上\n)\n# 将加载的向量数据库持久化到磁盘上\nvectordb.persist\n"

In [None]:
# Load the previously saved index from ./vectordb, using the same embedding model path
# that was used when the index was built.
embeddings = HuggingFaceEmbeddings(model_name="/root/model/jinaai/jina-embeddings-v2-base-zh")
# NOTE: allow_dangerous_deserialization=True is an explicit opt-in; only load index
# files that were created locally and are trusted.
vectordb = FAISS.load_local('./vectordb',embeddings,allow_dangerous_deserialization=True)

In [23]:
# Build a chain that takes a question plus the retrieved documents and generates an answer.
# The template has two placeholders: {context} for the documents and {input} for the question.


prompt = ChatPromptTemplate.from_template("""使用以上下文来回答用户的问题。如果你不知道答案，就说你不知道。总是使用中文回答。
<context>
{context}
</context>
如果给定的上下文无法让你做出回答，请对回答的内容以“*”开头，然后根据你的理解作答。
有用的回答: {input}""")

document_chain = create_stuff_documents_chain(llm, prompt)


# For a given question, the retriever selects documents from the vector store and passes them in


retriever = vectordb.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

# Run the chain and print the answer
response = retrieval_chain.invoke({"input": "上汽集团的董事长是谁"})
print(response["answer"])

*关于上汽集团的董事长，我需要查询一下相关信息。根据最新的信息，上汽集团的董事长是陈虹。他在2018年5月25日出任该职务，并在之后一直担任该职位。陈虹在汽车行业中拥有丰富的经验和卓越的领导能力，他在上汽集团的任职期间推动了一系列改革和发展措施，进一步提升了公司的竞争力。 


In [34]:


# Load the source text from a web page
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://docs.smith.langchain.com/user_guide")

docs = loader.load()

# Load the embedding model from a local path
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="/root/model/jinaai/jina-embeddings-v2-base-zh")

#pip install faiss-cpu

# Build the index: split the page with the splitter's default settings, then embed
# the chunks into a FAISS store. The store is bound to `vector`, not `vectordb`.
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)
vector = FAISS.from_documents(documents, embeddings)

Some weights of BertModel were not initialized from the model checkpoint at /root/model/jinaai/jina-embeddings-v2-base-zh and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.10.intermediate.dense.weight', 'encoder.layer.10.output.LayerNorm.bias', 'encoder.layer.10.output.LayerNorm.weight', 'encoder.layer.10.output.dense.bias', 'encoder.layer.10.output.dense.weight', 'encoder.layer.11.intermediate.dense.bias', 'encoder.layer.

In [20]:


# Build a chain that takes a question plus the retrieved documents and generates an answer
from langchain.chains.combine_documents import create_stuff_documents_chain

# The template has two placeholders: {context} for the documents and {input} for the question.
prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}""")

document_chain = create_stuff_documents_chain(llm, prompt)


# For a given question, the retriever selects documents from the vector store and passes them in
from langchain.chains import create_retrieval_chain

# Uses `vector`, the store built from the web page, rather than `vectordb`
retriever = vector.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

# Run the chain and print the answer
response = retrieval_chain.invoke({"input": "how can langsmith help with testing?"})
print(response["answer"])

Some weights of BertModel were not initialized from the model checkpoint at /root/model/jinaai/jina-embeddings-v2-base-zh and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.10.intermediate.dense.weight', 'encoder.layer.10.output.LayerNorm.bias', 'encoder.layer.10.output.LayerNorm.weight', 'encoder.layer.10.output.dense.bias', 'encoder.layer.10.output.dense.weight', 'encoder.layer.11.intermediate.dense.bias', 'encoder.layer.

LangSmith can help with testing in several ways:

1. **Beta Testing**: LangSmith allows developers to collect more data on how their LLM applications are performing in real-world scenarios. It helps in understanding the types of inputs the app is performing well or poorly on and how it's breaking down in those cases.

2. **Collecting Feedback**: LangSmith enables users to attach feedback scores to logged traces, which can be hooked up to a feedback button in the app. This helps in gathering human feedback on the responses produced by the application and highlighting edge cases causing problematic responses.

3. **Annotating Traces**: LangSmith supports sending runs to annotation queues, allowing annotators to closely inspect interesting traces and annotate them with respect to different criteria. This helps in catching regressions across important evaluation criteria.

4. **Adding Runs to a Dataset**: As the application progresses through the beta testing phase, LangSmith enables users

In [21]:
import getpass
import os
from uuid import uuid4

# Random 8-hex-character suffix, so each run of this cell sets a distinct project name.
unique_id = uuid4().hex[0:8]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"Tracing Walkthrough - {unique_id}"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# SECURITY: the API key must never be embedded in the notebook. It is taken from the
# LANGCHAIN_API_KEY environment variable when that is already set; otherwise the user
# is prompted for it without echoing. Any key that was previously committed in this
# file should be treated as leaked and revoked.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangSmith API key: ")

from langsmith import Client

# The client is constructed without arguments, after the environment variables above are set.
client = Client()


In [None]:
from langchain_core.output_parsers import StrOutputParser

# Create a string output parser; it is only instantiated here, not yet attached to a chain.
output_parser = StrOutputParser()

In [18]:
#from langchain_community.embeddings import OllamaEmbeddings
#from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

In [16]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

In [31]:
from langchain_text_splitters import RecursiveJsonSplitter
# NOTE(review): `json_data` is not defined in the visible part of this notebook —
# presumably loaded by an earlier cell; confirm before running.
splitter = RecursiveJsonSplitter(max_chunk_size=300)
# Split the JSON data into chunks
json_chunks = splitter.split_json(json_data=json_data)
# The splitter can also output documents
docs = splitter.create_documents(texts=[json_data])

# or a list of strings
texts = splitter.split_text(json_data=json_data)

# Show the first two chunks
print(texts[0])
print(texts[1])


{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "paths": {"/api/v1/sessions/{session_id}": {"get": {"tags": ["tracer-sessions"], "summary": "Read Tracer Session", "description": "Get a specific session."}}}}
{"paths": {"/api/v1/sessions/{session_id}": {"get": {"operationId": "read_tracer_session_api_v1_sessions__session_id__get", "security": [{"API Key": []}, {"Tenant ID": []}, {"Bearer Auth": []}]}}}}
