In [46]:
from huggingface_hub import snapshot_download

In [48]:
# Download the THUDM/chatglm-6b repository snapshot from the Hugging Face Hub
# into ./chatglm-6b/ (symlinks into the cache are disabled via
# local_dir_use_symlinks=False).
snapshot_download(
    repo_id="THUDM/chatglm-6b",
    local_dir="./chatglm-6b/",
    local_dir_use_symlinks=False,
)

Fetching 21 files:   0%|          | 0/21 [00:00<?, ?it/s]


In [3]:
from typing import Optional, List, Dict, Mapping, Any

from transformers import AutoModel, AutoTokenizer
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
import torch

In [31]:
# Device type and index used when releasing GPU memory in torch_gc.
DEVICE = "cuda"
DEVICE_ID = "2"
# Full torch device string ("cuda:2" here); falls back to the bare device
# type when DEVICE_ID is empty.
CUDA_DEVICE = DEVICE if not DEVICE_ID else f"{DEVICE}:{DEVICE_ID}"

def torch_gc():
    """Release cached CUDA memory under the ``CUDA_DEVICE`` context.

    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return
    with torch.cuda.device(CUDA_DEVICE):
        # Empty the CUDA cache, then collect CUDA IPC memory.
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

# Custom LLM wrapper
class ChatLLM(LLM):
    """LangChain ``LLM`` adapter that delegates generation to ``model.chat``.

    ``load_llm`` must be called before the first prompt: it populates
    ``tokenizer`` and ``model``, which are ``None`` until then.
    """

    # Forwarded to ``model.chat`` as ``max_length``.
    max_token: int = 10000
    # Sampling parameters forwarded to ``model.chat``.
    temperature: float = 0.1
    top_p: float = 0.9
    # Conversation history passed to ``model.chat`` and extended after each call.
    history: list = []
    # NOTE(review): machine-specific absolute path; override per environment.
    model_name_or_path: str = "/home/kmzn01/wangjing/ChatGLM2-6B/chatglm2-6b"
    tokenizer: object = None
    model: object = None

    # No custom ``__init__``: a zero-argument override only forwarded to the
    # base class and prevented keyword arguments from reaching it.

    @property
    def _llm_type(self) -> str:
        """Identifier reported to LangChain for this LLM implementation."""
        return "ChatGLM"

    def _call(self,
              prompt: str,
              stop: Optional[List[str]] = None,
              run_manager: Optional[Any] = None,
              **kwargs: Any) -> str:
        """Generate a reply for ``prompt`` and append it to ``history``.

        Args:
            prompt: Text sent to the model.
            stop: Optional stop sequences; when given, the response is
                truncated with ``enforce_stop_tokens``.
            run_manager: Accepted for compatibility with LangChain versions
                that pass a callback manager; not used here.
            **kwargs: Accepted for compatibility; ignored.

        Returns:
            The (possibly truncated) model response.

        Raises:
            RuntimeError: If ``load_llm`` has not been called yet.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError(
                "ChatLLM.load_llm() must be called before generating text"
            )

        response, _ = self.model.chat(
            self.tokenizer,
            prompt,
            history=self.history,
            max_length=self.max_token,
            temperature=self.temperature,
            top_p=self.top_p,
        )
        torch_gc()
        if stop is not None:
            response = enforce_stop_tokens(response, stop)
        # Only the response is recorded; the prompt slot is stored as None.
        # NOTE(review): presumably deliberate so long retrieval prompts are not
        # replayed in later turns - confirm.
        self.history = self.history + [[None, response]]

        return response

    def load_llm(self):
        """Load the tokenizer and the half-precision model onto the GPU.

        The Hugging Face access token is read from the ``HF_TOKEN`` or
        ``HUGGING_FACE_HUB_TOKEN`` environment variable instead of being
        embedded in the notebook. Any token that was previously committed in
        source should be revoked.
        """
        import os  # local import so this cell does not depend on a later import cell

        hf_token = (
            os.environ.get("HF_TOKEN")
            or os.environ.get("HUGGING_FACE_HUB_TOKEN")
            or None
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name_or_path,
            trust_remote_code=True,
            use_auth_token=hf_token,
        )
        # NOTE(review): `.cuda()` does not use CUDA_DEVICE, which torch_gc
        # targets - confirm which GPU is intended.
        model = AutoModel.from_pretrained(
            self.model_name_or_path,
            trust_remote_code=True,
        ).half().cuda()
        self.model = model.eval()

In [32]:
# Instantiate the ChatLLM wrapper and load its tokenizer and model.
llm = ChatLLM()
llm.load_llm()

Loading checkpoint shards: 100%|██████████| 8/8 [00:29<00:00,  3.64s/it]


In [33]:
# Smoke test: send a short greeting ("你好" = "hello") through the wrapper.
llm("你好")

The dtype of attention mask (torch.int64) is not bool


'你好👋！我是人工智能助手 ChatGLM-6B，很高兴见到你，欢迎问我任何问题。'

In [1]:
import os
import sentence_transformers
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader

# os.environ['HTTP_PROXY']= 'http://192.168.1.203:7890'
# os.environ['HTTPS_PROXY']= 'http://192.168.1.203:7890'

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Sanity-check HuggingFaceEmbeddings with its default model on two sentences.
embeddings = HuggingFaceEmbeddings()
text1 = "This is a test1 document."
text2 = "This is a test2 document."
# Embed one string as a query ...
query_result = embeddings.embed_query(text1)
# ... and both strings as a batch of documents.
doc_result = embeddings.embed_documents([text1, text2])

In [30]:
# Embed each test sentence separately as a query.
query_result1 = embeddings.embed_query(text1)
query_result2 = embeddings.embed_query(text2)

In [34]:
import numpy as np

In [37]:
# Number of embedding vectors returned by embed_documents.
len(doc_result)

2

社区实践表明，对中文支持较好的 embedding 模型是 Hugging Face 上的 ganymedenil/text2vec-large-chinese：
|model|size|
|:---|----|
|ganymedenil/text2vec-large-chinese|1.3G|
|shibing624/text2vec-base-chinese|409M|

In [41]:
# Switch to the text2vec-large-chinese model, loaded from a path relative to
# the notebook's working directory.
embeddings = HuggingFaceEmbeddings(model_name="./ganymedenil/text2vec-large-chinese/ganymedenil_text2vec-large-chinese")

No sentence-transformers model found with name ./ganymedenil/text2vec-large-chinese/ganymedenil_text2vec-large-chinese. Creating a new one with MEAN pooling.


## embedding测试

In [7]:
# Read the test corpus and keep only its non-empty lines.
with open('embedding_test.txt', 'r', encoding='utf-8') as f:
    embedding_test = [line for line in f.read().splitlines() if line != '']

len(embedding_test)

17

In [8]:
def split_sentences(text, delimiter="。"):
    """Split ``text`` into sentences, keeping ``delimiter`` on each sentence.

    Args:
        text: The paragraph to split.
        delimiter: Sentence terminator to split on.

    Returns:
        A list of non-empty sentences. Fragments that were followed by
        ``delimiter`` keep it; a trailing fragment that was not followed by
        ``delimiter`` (for example one ending in "？") is returned unchanged
        rather than having a delimiter appended.
    """
    *terminated, tail = text.split(delimiter)
    sentences = [fragment + delimiter for fragment in terminated if fragment]
    if tail:
        sentences.append(tail)
    return sentences


# Flatten every corpus line into individual sentences.
embedding_tests = [
    sentence
    for text in embedding_test
    for sentence in split_sentences(text)
]
len(embedding_tests)

49

In [42]:
# Embed every split sentence with the currently selected embeddings object.
doc_result = embeddings.embed_documents(embedding_tests)

In [43]:
# Display the raw embedding vectors.
# NOTE(review): this prints every float of every vector; inspecting
# len(doc_result) / len(doc_result[0]) would keep the output readable.
doc_result

[[-1.1543313264846802,
  0.6527954339981079,
  0.18347673118114471,
  0.8445881009101868,
  0.2623565196990967,
  0.8159534335136414,
  -0.775972306728363,
  -0.0460771806538105,
  -0.3848603367805481,
  -0.5543479323387146,
  0.7588033080101013,
  0.18419846892356873,
  -1.0125643014907837,
  -0.946342945098877,
  0.9776009321212769,
  -0.9400618076324463,
  -1.1985714435577393,
  -1.0844160318374634,
  1.1240267753601074,
  -0.4950336515903473,
  0.18190474808216095,
  0.3486224114894867,
  1.3946490287780762,
  -0.15425796806812286,
  0.33831343054771423,
  0.2885476350784302,
  -0.53839111328125,
  0.8338764905929565,
  0.8799440264701843,
  0.3114974796772003,
  1.2836753129959106,
  0.369482159614563,
  1.3546781539916992,
  0.5877166986465454,
  -0.05515991523861885,
  -1.1473480463027954,
  -0.29976531863212585,
  -0.48180779814720154,
  -0.45336630940437317,
  -0.384003609418869,
  0.0665195882320404,
  -0.4951612949371338,
  0.5252581238746643,
  -0.9137829542160034,
  -0.575

In [50]:
# Replicated corpora for the timing benchmark: 10x, 1000x and 100000x copies
# of the split-sentence list.
embedding_test1 = embedding_tests*10
embedding_test2 = embedding_tests*1000
embedding_test3 = embedding_tests*100000

In [42]:
import time

In [None]:
# Time embed_documents on each corpus size. perf_counter is used because it is
# a monotonic, high-resolution clock intended for measuring intervals, unlike
# the wall-clock time.time().
for test in [embedding_test1, embedding_test2, embedding_test3]:
    start = time.perf_counter()
    embeddings.embed_documents(test)
    elapsed = time.perf_counter() - start
    print(f"{len(test)}句话，耗时{elapsed}s")

490句话，耗时1.6526176929473877s
49000句话，耗时152.55237460136414s


In [65]:
import pandas as pd

In [66]:
# Save the split sentences as a single-column ('data') CSV without the index.
pd.DataFrame({'data':embedding_tests}).to_csv('embedding_test.csv', index=False)

In [56]:
# Prompt for knowledge-grounded answering. The template instructs the model to
# answer concisely and professionally from the supplied context only, to say
# that it cannot answer when the context is insufficient, not to fabricate
# content, and to reply in Chinese.
prompt_template = """
基于以下已知信息，请简洁并专业地回答用户的问题。
如果无法从中得到答案，请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息"。不允许在答案中添加编造成分。另外，答案请使用中文。
已知内容:
{context}
问题:
{question}
"""
# {context} and {question} are the two placeholders declared as input variables.
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [57]:
# Show the PromptTemplate repr to verify the template and its input variables.
prompt

PromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, template='\n基于以下已知信息，请简洁并专业地回答用户的问题。\n如果无法从中得到答案，请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息"。不允许在答案中添加编造成分。另外，答案请使用中文。\n已知内容:\n{context}\n问题:\n{question}\n', template_format='f-string', validate_template=True)

In [5]:
# Load the Markdown knowledge file with UnstructuredFileLoader.
# mode="elements" is passed to the loader (presumably one Document per parsed
# element - confirm against the loader documentation).
filepath = "./embedding_test.md"
loader = UnstructuredFileLoader(filepath, mode="elements")
docs = loader.load()
docs

[nltk_data] Downloading package punkt to /home/kmzn01/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kmzn01/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[Document(page_content='直新闻：乌克兰总统泽连斯基亲口向来访的加拿大总理特鲁多承认，乌军正针对俄军发起反攻行动。\n那么结合这几天的战局来看，乌军这一轮反攻的规模有多大，目标是什么，效果又如何呢？。\n特约评论员 吴蔚：“传说中”的大反攻终于上演。\n此前我曾作出判断，就以目前的战场态势而言，俄乌双方其实都在积极准备，等待对方发起进攻，因为防守方的优势往往更大一些。\n那么为何是乌军率先发起了反攻呢？我想，政治的因素要远高于军事因素。\n去年底至今年初，美西方加码“投喂”乌克兰一批先进的地面装备，尤其是上百辆主战坦克与数百辆步战车。\n小半年过去了，接装部队基本形成战斗力，那就得在战场上把牌打出去呀，不然这些西方金主们会觉得基辅当局“不思进取”。\n从俄乌战线的宏观上看，目前乌军大致处于一个“北守南攻”的态势。\n乌军此次所谓的“大反攻”主要发生在南部战线扎波罗热至顿涅茨克州，西起卡缅斯克，东至弗勒达尔，战场宽度大约150公里左右。\n自5月底乌军开始火力准备，6月初地面兵力开始形成进攻矛头。\n研判一场攻势作战，我们首先要判明三个要素：战役决心、主攻方向、实施方案。\n先说战役决心。\n根据开源情报显示，这是一场集团军级的攻势。\n乌军投入了两个军将近20个旅级单位的兵力，其中不乏装备德制“豹2”主战坦克以及美制“布雷德利”战车的精锐机械化步兵，他们过去小半年一直在波兰境内接受北约教官的训练。\n他们的参战也意味着乌军确实下了血本，展现出较高的战役决心。\n再说主攻方向。\n以目前进攻矛头的指向来看，乌军应该是计划针对俄军占据的沃斯里夫卡、托克马克、波罗伊三个主要城镇发起进攻，依然是旨在抢夺交通要道进而向纵深发展的典型“通道作战”方式。\n若第一阶段进攻目标成功实现，乌军下一阶段便可以向梅利托波尔、别尔江斯克两座城市发展。\n如此一来，乌军将有可能一举实现将俄军战线“一刀两断”的重大战役成果，这意味着驻守在赫尔松与克里米亚半岛上的俄军将陷入补给困难的境地。\n最后再看实施方案。\n在5月底的火力准备阶段，乌军投入了大量北约制式的精确制导武器，对扎波罗热纵深的俄军指挥、防空节点进行打击，这其实已经在很大程度上暴露了进攻意图。\n在6月初的地面进攻阶段，乌军多个机械化步兵旅以营级战术群、连级战斗分队的方式对俄军阵地发起进攻。\n这一阶段进攻的目

In [22]:
# Build a FAISS vector store from the loaded documents and save it locally
# under 'faiss_index'.
vector_store = FAISS.from_documents(docs, embeddings)
vector_store.save_local('faiss_index')

In [17]:
import nltk
# https://zhuanlan.zhihu.com/p/433423216

In [18]:
# Fetch the NLTK 'averaged_perceptron_tagger' resource (requires network access).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 99] Cannot assign requested address>


False

In [29]:
# Redefines the knowledge-grounded prompt: same instructions as the earlier
# template in this notebook, with blank lines separating the context and
# question sections.
prompt_template = """
基于以下已知信息，请简洁并专业地回答用户的问题。
如果无法从中得到答案，请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息"。不允许在答案中添加编造成分。另外，答案请使用中文。

已知内容:
{context}

问题:
{question}
"""
# {context} and {question} are the two placeholders declared as input variables.
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [28]:
# Reload the FAISS index saved under 'faiss_index', passing the current
# embeddings object.
vector_store = FAISS.load_local('faiss_index', embeddings)

In [34]:
# Retrieval QA chain: the retriever is asked for top_k documents from the
# vector store, and the custom prompt is handed to the chain together with
# the ChatGLM wrapper.
top_k = 5
knowledge_chain = RetrievalQA.from_llm(
    llm=llm,
    retriever=vector_store.as_retriever(
        search_kwargs={"k": top_k}
    ),
    prompt=prompt
)

In [35]:
# Override the per-document prompt so each retrieved document is rendered as
# its bare page_content.
knowledge_chain.combine_documents_chain.document_prompt = PromptTemplate(
    input_variables=["page_content"], template="{page_content}"
)

In [40]:
# Ask a question about the loaded article ("Which weapons has Ukraine
# deployed so far, and how are they deployed?") and show the answer text.
query = "乌克兰目前派出了哪些武器，如何部署？"
# Request that the chain also returns the source documents.
knowledge_chain.return_source_documents = True
result = knowledge_chain({"query": query})
result['result']

'根据已知内容，乌克兰在针对俄军发起反攻行动时，已经派出了多个机械化步兵旅，其中包括装备德制“豹2”主战坦克以及美制“布雷德利”战车的精锐机械化步兵。这些部队在过去小半年里接受北约教官的训练，展现出了较高的战役决心。\n\n在部署方面，乌克兰的机械化步兵旅以营级战术群、连级战斗分队的方式对俄军阵地发起进攻。这一阶段进攻的目标主要是探摸俄军防线火力点与薄弱点，探明俄军雷场布控情况，并以远程火力打击的方式摧毁俄军向前沿阵地运动的增援力量。\n\n然而，乌军在攻势展开情况和装甲力量未能有效展开等问题上面临困难，因此有两个悬念值得关注。'

In [22]:
# Preview the cache folder path built from MODEL_CACHE_PATH and the model name.
# NOTE(review): MODEL_CACHE_PATH is assigned in the next cell of this notebook,
# so this cell fails with NameError on a fresh top-to-bottom run.
os.path.join(MODEL_CACHE_PATH,embeddings.model_name)

'./ganymedenil/text2vec-large-chinese'

In [21]:
# Replace the embeddings wrapper's client with a SentenceTransformer created
# from the hub model name, using MODEL_CACHE_PATH/<embeddings.model_name> as
# its cache folder.
MODEL_CACHE_PATH = "./"
embeddings.client = sentence_transformers.SentenceTransformer(
    "ganymedenil/text2vec-large-chinese",
    cache_folder=os.path.join(MODEL_CACHE_PATH,embeddings.model_name)
)

No sentence-transformers model found with name ganymedenil/text2vec-large-chinese. Creating a new one with MEAN pooling.


In [None]:
class KnowledgeBasedChatLLM:
    """Answer questions over a local knowledge base with a ChatLLM.

    Typical use: ``init_model_config()`` to load the embeddings and the LLM,
    ``init_knowledge_vector_store(path)`` to index a file, then
    ``get_knowledge_based_answer(query)`` for each question.
    """

    # Folder the FAISS index is saved to and loaded from.
    FAISS_INDEX_PATH = 'faiss_index'

    # Prompt sent to the LLM. Kept at class level (not inside the method) so
    # that method indentation does not leak into the prompt text.
    PROMPT_TEMPLATE = (
        "\n"
        "基于以下已知信息，请简洁并专业地回答用户的问题。\n"
        '如果无法从中得到答案，请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息"。'
        "不允许在答案中添加编造成分。另外，答案请使用中文。\n"
        "\n"
        "已知内容:\n"
        "{context}\n"
        "\n"
        "问题:\n"
        "{question}\n"
    )

    llm: object = None
    embeddings: object = None
    # In-memory copy of the FAISS index; populated when it is first built or loaded.
    vector_store: object = None

    def init_model_config(
        self,
        embedding_model: str = "./ganymedenil/text2vec-large-chinese/ganymedenil_text2vec-large-chinese"
    ):
        """Create the embeddings object and load the ChatLLM.

        Args:
            embedding_model: Model name or path given to HuggingFaceEmbeddings.
        """
        # Stored as `embeddings` (plural): that is the attribute every other
        # method of this class reads.
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        self.llm = ChatLLM()
        self.llm.load_llm()

    def init_knowledge_vector_store(self, filepath):
        """Index ``filepath`` into FAISS, save the index and return it.

        Raises:
            RuntimeError: If ``init_model_config`` has not been called.
        """
        if self.embeddings is None:
            raise RuntimeError(
                "init_model_config() must be called before building the vector store"
            )
        docs = self.load_file(filepath)

        vector_store = FAISS.from_documents(docs, self.embeddings)
        vector_store.save_local(self.FAISS_INDEX_PATH)
        # Keep the fresh index in memory so queries do not re-read it from disk.
        self.vector_store = vector_store
        return vector_store

    # Backward-compatible alias for the original, misspelled method name.
    init_kownledge_vector_store = init_knowledge_vector_store

    def get_knowledge_based_answer(
        self,
        query,
        history_len: int = 5,
        top_k: int = 5,
        history: Optional[list] = None,
    ):
        """Answer ``query`` using documents retrieved from the FAISS index.

        Args:
            query: The user's question.
            history_len: Number of most recent history entries handed to the
                LLM; ``0`` or less clears the history.
            top_k: Number of documents requested from the retriever.
            history: Conversation history to use. Defaults to the history
                currently held by the LLM.

        Returns:
            The mapping returned by the RetrievalQA chain (source documents
            are requested via ``return_source_documents``).

        Raises:
            RuntimeError: If ``init_model_config`` has not been called.
        """
        if self.llm is None or self.embeddings is None:
            raise RuntimeError(
                "init_model_config() must be called before asking questions"
            )

        self.history_len = history_len
        self.top_k = top_k
        prompt = PromptTemplate(
            template=self.PROMPT_TEMPLATE,
            input_variables=["context", "question"],
        )

        if history is None:
            history = self.llm.history
        # The explicit > 0 guard matters: history[-0:] would keep everything.
        self.llm.history = history[-self.history_len:] if self.history_len > 0 else []

        vector_store = self.vector_store
        if vector_store is None:
            # Nothing cached yet: fall back to the index saved on disk.
            vector_store = FAISS.load_local(self.FAISS_INDEX_PATH, self.embeddings)
            self.vector_store = vector_store

        knowledge_chain = RetrievalQA.from_llm(
            llm=self.llm,
            retriever=vector_store.as_retriever(
                search_kwargs={"k": self.top_k}
            ),
            prompt=prompt
        )

        # Render each retrieved document as its bare page_content.
        knowledge_chain.combine_documents_chain.document_prompt = PromptTemplate(
            input_variables=["page_content"], template="{page_content}"
        )

        knowledge_chain.return_source_documents = True
        result = knowledge_chain({"query": query})
        return result

    def load_file(self, filepath):
        """Load a local knowledge-base file; currently only .md is supported."""
        loader = UnstructuredFileLoader(filepath, mode="elements")
        docs = loader.load()
        return docs