In [2]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"当前使用的是cuda/cpu?: {device}")

import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

当前使用的是cuda/cpu?: cuda


In [3]:
from peft import PeftModel
from transformers import AutoTokenizer,AutoModelForCausalLM

def load_math_teacher_model():

    #下面是加载模型，和train里面一样
    model_name = "Qwen/Qwen1.5-7B"
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir='./myModels'
    )
    if tokenizer.pad_token is None: #pad_token是用来填充较短的序列的
        tokenizer.pad_token = tokenizer.eos_token #结束token
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        load_in_4bit=False,
        trust_remote_code=True,
        cache_dir='./myModels',
    )

    #加载LoRA适配器
    LoRA_adapter='./qwen1.5-7b-math-teacher'
    model=PeftModel.from_pretrained(model,LoRA_adapter)
    return model,tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def math_teacher_chat(model,tokenizer,initial_question,max_turns=10): #max_turns是最大对话轮数
    print(f'学生：{initial_question}')
    history=f'<|im_start|>user\n{initial_question}<|im_end|>\n'

    for turn in range(max_turns):
        prompt=history+'<|im_start|>assistant\n'
        inputs=tokenizer(prompt,return_tensors='pt').to(device)
        with torch.no_grad():
            outputs=model.generate(
                **inputs, #**是把字典解包为函数所需要传递的关键字参数
                max_length=500,
                temperature=0.7, #temperature为0会总选择概率最高的token，temperature变大会有随机性
                do_sample=True, #采用采样而不是贪婪搜索，do_sample=False的时候会总选择概率最高的token，会忽略temperature
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1, #重复惩罚，避免出现重复的token,为1.0的时候无惩罚，值越大惩罚越大
            )

        response=tokenizer.decode(outputs[0],skip_special_tokens=True) #skip_special_tokens=True会跳过[PAD]等特殊token，得到干净文本
        if 'assistant' in response:
            response=response.split('assistant')[-1].strip() #提取回答，去掉历史对话，.strip()用来去除首尾空白字符

            #TODO：后面会不会生成user内容？
        
        print(f'老师：{response}')
        history=f'<|im_start|>assistant\n{response}<|im_end|>\n'

        question=input('学生：')
        if question=='q':
            break
        history=f'<|im_start|>user\n{question}<|im_end|>\n'

In [5]:
if __name__ == "__main__":
    model,tokenizer=load_math_teacher_model()


    initial_question=input('请提出你的数学问题，输入q退出')
    if initial_question.strip() and initial_question!='q':
        math_teacher_chat(model,tokenizer,initial_question)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.03s/it]
Both `max_new_tokens` (=2048) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


学生：一个长方形长是宽的两倍，周长是36，面积是多少？
老师：完全正确！这道题你理解得很好，做得也很棒！


KeyboardInterrupt: Interrupted by user