In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [1]:
import os
import json
import ast
import random
import time
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Config():
    num_few_shot = 3
    model_path = 'Qwen/Qwen1.5-14B-Chat'
    excel_path = '/data/yinxiaoln/datasets/dota2eval/DOTA2.xlsx'
    dota2_eval_path = '/data/yinxiaoln/datasets/dota2eval/DOTA2_eval.json'
    dota2_train_sft_path = '/data/yinxiaoln/datasets/dota2eval/DOTA2_train_sft.json'

In [None]:
df = pd.read_excel(Config.excel_path, sheet_name=0)
exams = []
for _, row in df.iterrows():
    question = row['题目'].strip().replace('\n', '').replace('\t', '')
    A = row['选项A']
    A = str(A).strip().replace('\n', '').replace('\t', '')
    B = row['选项B']
    B = str(B).strip().replace('\n', '').replace('\t', '')
    C = row['选项C']
    if not pd.isna(C):
        C = str(C).strip().replace('\n', '').replace('\t', '')
    else:
        C = None
    D = row['选项D']
    if not pd.isna(D):
        D = str(D).strip().replace('\n', '').replace('\t', '')
    else:
        D = None

    answer = row['答案']
    answer = str(answer).strip().replace('\n', '').replace('\t', '')
    if answer == 'A':
        answer = A
    elif answer == 'B':
        answer = B
    elif answer == 'C':
        answer = C
    elif answer == 'D':
        answer = D
    else:
        print(row)
        assert False
    eval = {}
    eval['question'] = question
    choices = [A, B, C, D]
    choices = list(filter(lambda x: x is not None, choices))
    random.shuffle(choices)
    answer = choices.index(answer)
    answer = 'ABCDEF'[answer]
    eval['choices'] = choices
    eval['answer'] = answer
    exams.append(eval)

with open(Config.dota2_eval_path, 'w', encoding='utf-8') as f:
    json.dump(exams, f, ensure_ascii=False, indent=4)

In [None]:
df = pd.read_excel(Config.excel_path, sheet_name=1)
df = df.replace(np.nan, None)
sfts = []
for _, row in df.iterrows():
    instruction = row['instruction'].strip()
    input = row['input']
    if input is not None:
        input = input.strip()
    output = row['output']
    system = row['system']
    if system is not None:
        system = system.strip()
    history = row['history']
    if history is not None:
        history = history.strip()
        history = history.split('\n')
        history = [ast.literal_eval(str_list) for str_list in history]
    sft = {}
    sft['instruction'] = instruction
    if input is not None:
        sft['input'] = input
    sft['output'] = output
    if system is not None:
        sft['system'] = system
    if history is not None:
        sft['history'] = history
    sfts.append(sft)

with open(Config.dota2_train_sft_path, 'w', encoding='utf-8') as f:
    json.dump(sfts, f, ensure_ascii=False, indent=4)

推理貌似要用单卡，多卡会报错

In [3]:
device = 'cpu'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
model_path = Config.model_path
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map='cpu',
    torch_dtype='auto',
    trust_remote_code=True,
    mirror='tuna',
    resume_download=True
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, mirror='tuna')

Loading checkpoint shards: 100%|██████████| 8/8 [00:00<00:00, 10.27it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


环境变量要在代码开始执行前设置好，否则import torch过程中多个环境变量会冲突，
比如CUDA_VISIBLE_DEVICES

In [None]:

device = 'cpu'
model_path = Config.model_path
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, mirror='tuna')
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map='cpu',
        torch_dtype='auto',
        trust_remote_code=True,
        mirror='tuna',
        resume_download=True
    )
checkpoint = os.path.join(model_path, 'model.safetensors.index.json')
model = load_checkpoint_and_dispatch(
    model, checkpoint='/home/yinxiaoln/.cache/huggingface/hub/models--Qwen--Qwen1.5-14B-Chat/snapshots/79e31c0b5875db412a7db189514b523ec8440928/model.safetensors.index.json', device_map='auto')

In [5]:
def format_example(exam, include_answer=True, cot=False, add_prompt=''):
    chat = []
    CHOICES = 'ABCDEFGHIJK'
    example = add_prompt + exam['question']
    for i, choice in enumerate(exam['choices']):
        example += f'\n{CHOICES[i]}. {choice}'

    example += '\n答案：\n\n'
    chat.append({"role": "user", "content": example})
    if include_answer:
        if cot:
            ans = "让我们一步一步思考，\n" + \
                exam["explanation"] + f"\n所以答案是{exam['answer']}。"
        else:
            ans = exam["answer"]
        chat.append({"role": "assistant", "content": f'\n{ans}'})
    return chat


def generate_few_shot_prompt(few_shot=[], cot=False):
    chat = [
        {"role": "system", "content": "以下是关于游戏Dota2知识考试的单项选择题，请选出其中正确的答案。\n\n"}
    ]
    for exam in few_shot:
        chat.extend(format_example(exam, cot=cot))
    return chat


def judge(answer: str, outputs: str):
    outputs = ''.join(ch for ch in outputs if ch.isupper())
    #print(f'ans={answer}, output={outputs}')
    if outputs != answer:
        print(f'ans={answer}, output={outputs}, {False}')
    return outputs == answer

In [8]:

def dota2_eval(file, model, tokenizer):
    with open(file, 'r') as f:
        exams = json.load(f)
    few_shot = exams[0: 3]
    chat = generate_few_shot_prompt(few_shot, cot=False)
    right = 0
    for i in range(3, len(exams)):
        chat_one = format_example(exams[i], include_answer=False, cot=False)
        chat_tmp = chat
        chat_tmp.extend(chat_one)
        formatted_chat = tokenizer.apply_chat_template(
            chat_tmp, tokenize=False, add_generation_prompt=True)
        #print(formatted_chat)

        model_inputs = tokenizer([formatted_chat], return_tensors="pt").to(device)
        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=128
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        #print(f'{i}:{response}')
        if judge(exams[i]['answer'], response):
            right += 1
        else:
            print(f'exam={exams[i]}, response={response}')
    print("acc=", right / (len(exams) - 3))

dota2_eval(Config.dota2_eval_path, model, tokenizer)

ans=B, output=C, False
exam={'question': '他们称我为祈求者，然而我真正的名字是', 'choices': ['kalel', 'carl', 'kael', 'caler'], 'answer': 'B'}, response=C
ans=B, output=D, False
exam={'question': '图纸（血棘）的价格是多少', 'choices': ['1100', '100', '5375', '2150'], 'answer': 'B'}, response=D
ans=D, output=B, False
exam={'question': '风行者在DOTA2中名字与称谓都发生了改变，她的新名字莱瑞蕾，相比于旧名字奥蕾莉亚，除了调换字母顺序外还变动了几个字母', 'choices': ['2', '1', '3', '4'], 'answer': 'D'}, response=B
ans=B, output=D, False
exam={'question': '在迎霜节活动中，骷髅王转化为了冥魂大帝，他的名字是', 'choices': ['塔瑞斯', '奥斯塔里昂', '克林克兹', '里奥瑞克王'], 'answer': 'B'}, response=D
ans=A, output=D, False
exam={'question': '他只想听她再唱一次歌 这把匕首是她送的礼物 他一直带在身边 谁敢不让她唱 那人就别想活！ 这段话源自哪位英雄的饰品', 'choices': ['斯拉克', '幻影刺客', '力丸', '赏金猎人'], 'answer': 'A'}, response=D
ans=B, output=C, False
exam={'question': '露娜的头像中，隐藏着哪个技能的图标', 'choices': ['月光', '月蚀', '月刃', '月之祝福'], 'answer': 'B'}, response=C
ans=C, output=B, False
exam={'question': '末日使者和影魔曾经联手进攻一位英雄，这位英雄是', 'choices': ['光之守卫', '祈求者', '暗影恶魔', '军团指挥官'], 'answer': 'C'}, 

In [None]:
!env | grep CUDA

|模型|正确率|
|-|-|
|Qwen1.5-14B-Chat（Baseline）|0.3739|
|Qwen1.5-14B-Chat（Baseline-half）|0.33|

In [None]:
!CUDA_VISIBLE_DEVICES=0 python src/export_model.py \
    --model_name_or_path Qwen/Qwen1.5-14B-Chat \
    --adapter_name_or_path /data/yinxiaoln/code/Qwen1.5/examples/sft/output_qwen \
    --template default \
    --finetuning_type lora \
    --export_dir /tmp/lora \
    --export_size 2 \
    --export_legacy_format False