In [1]:
# 预先准备
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Device to run on: prefer the first CUDA GPU, but fall back to the CPU when
# CUDA is unavailable so that `model.to(device)` does not fail on GPU-less hosts.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Resolving absolute paths.
# In a plain script one would use:
#   curPath = os.path.dirname(os.path.abspath(__file__))
# `__file__` is not available inside Jupyter, so the current working directory
# is used instead (assumed to be the directory that contains this notebook).
curPath = os.getcwd()
def getAbsPath (relativePath):
  """Return the normalized absolute path of `relativePath` relative to `curPath`.

  Args:
    relativePath: path expressed relative to the notebook's working directory.

  Returns:
    The absolute path as a string, with `..` segments and redundant
    separators collapsed.
  """
  absolute = os.path.abspath(os.path.join(curPath, relativePath))
  return os.path.normpath(absolute)

# Both the vocabulary and the base weights live in the same local directory
# (cloned beforehand with git). Loading from a local path instead of a hub name
# skips the remote check that `from_pretrained` would otherwise perform.
base_model_dir = getAbsPath('../models/gpt2-chitchat-learn/')

# Load the vocabulary
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=base_model_dir)

# Load the base causal-LM
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=base_model_dir)

# Wrap the base model with the LoRA adapter stored in the given checkpoint
model = PeftModel.from_pretrained(model, getAbsPath('../models/gpt2-chitchat-lora/checkpoint-20'))

# Move to the target device and switch to inference mode; `model.eval()` is
# kept as the last expression so the cell still displays the model structure.
model = model.to(device)
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(13317, 768)
        (wpe): Embedding(300, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-9): 10 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Linear(
                in_features=768, out_features=2304, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
          

In [5]:
# Sampling strategy applied to the model output (top-k / top-p sampling)
temperature = 1     # sampling temperature; 1 keeps the original distribution unchanged
topk = 10           # top-k: sample only among the 10 most likely tokens
topp = 0.8          # top-p: cumulative-probability cutoff (a value close to 0 would keep only the single most likely token)
repetition_penalty = 1.0    # repetition penalty; 1.0 keeps the original probabilities (no penalty)

# Sentence and memory strategy
max_len = 25            # maximum number of tokens generated for a single response
max_history_len = 3     # maximum number of history entries used as context (TODO: try a summarizing model for memory compression in later customization)

# Core generation logic
def generate_response (input_ids):
  """Sample one reply for the given prompt token ids and return it as text.

  The whole reply is produced by a single `model.generate` call instead of one
  call per token, so the prompt is not re-encoded from scratch at every step.

  Args:
    input_ids: flat list of token ids forming the prompt
      ([CLS] followed by the history, each entry terminated by [SEP]).

  Returns:
    The generated tokens joined into one string, without the terminating
    [SEP]. At most `max_len` tokens are generated.
  """
  # Convert to a tensor and add the batch dimension -> shape (1, prompt_len)
  prompt = torch.LongTensor([input_ids]).to(device)
  prompt_len = prompt.shape[1]

  output = model.generate(
    input_ids=prompt,
    # Every prompt position is a real token; passing the mask explicitly keeps
    # generate() from inferring one from pad_token_id.
    attention_mask=torch.ones_like(prompt),
    do_sample=True,           # the sampling options below only take effect when this is set

    temperature=temperature,
    top_k=topk,
    top_p=topp,
    repetition_penalty=repetition_penalty,

    bos_token_id=tokenizer.cls_token_id,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.sep_token_id,   # [SEP] marks the end of a response

    max_new_tokens=max_len
  )

  # Keep only the newly generated ids (the prompt itself contains [SEP] tokens)
  response = output[0, prompt_len:].tolist()

  # Drop the terminating [SEP] and anything after it
  if tokenizer.sep_token_id in response:
    response = response[:response.index(tokenizer.sep_token_id)]

  return ''.join(tokenizer.convert_ids_to_tokens(response))

# Generation entry point
def generate(history):
  """Build the prompt from the dialogue history and print the model's reply.

  The prompt is [CLS] followed by every history entry, each one encoded
  without special tokens and terminated by [SEP].

  Args:
    history: list of utterance strings, oldest first.
  """
  sep_id = tokenizer.sep_token_id
  prompt_ids = [tokenizer.cls_token_id]   # every input starts with [CLS]

  for utterance in history:
    prompt_ids += tokenizer.encode(utterance, add_special_tokens=False)
    prompt_ids.append(sep_id)

  print(generate_response(prompt_ids))

# Mock dialogue history, oldest entry first
history = [
  '你是谁',
  '我是猫雷',
  '好的猫雷，可以唱一首“威风堂堂”吗？',
  '小拳拳'
]

# Add the current text as the newest entry, then keep only the most recent
# `max_history_len` entries (memory-length control).
# NOTE: a value of 0 would keep the whole list, because `[-0:]` is `[0:]`.
history = [*history, '喵露露要反击了'][-max_history_len:]

generate(history)

反击就是对自己的猫嗨，我不敢跟它讲话，毕竟我怕猫，
