In [1]:
pip install torch transformers datasets nltk rouge jieba

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
[33mDEPRECATION: pytorch-lightning 1.7.7 has a non-standard dependency specifier torch>=1.9.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import ujson
def transform_conversation_data(raw_data):
    """Flatten one multi-turn conversation record into an instruction/output pair.

    The instruction is the first turn's ``system`` text, followed by every
    turn's ``input`` and, for all turns except the last, its ``output``. The
    last turn's ``output`` becomes the expected response.

    Args:
        raw_data: Mapping with a ``"conversation"`` list of turn dicts. Each
            turn is read for ``"input"`` and ``"output"``; the first turn is
            also read for ``"system"``.

    Returns:
        dict | None: ``{"instruction": str, "output": ...}``, or ``None`` when
        the record is malformed (missing/empty conversation, missing keys, or
        non-string fields). Callers are expected to skip ``None`` results.
    """
    try:
        conversation = raw_data.get("conversation", [])

        # Indexing [0] raises IndexError for an empty conversation, which is
        # reported as None below.
        parts = [conversation[0]["system"] + "\n\n对话："]

        last_index = len(conversation) - 1
        for i, dialog in enumerate(conversation):
            parts.append("\n来访者：" + dialog["input"])
            # The final turn's output is the target, so it stays out of the prompt.
            if i < last_index:
                parts.append("\n医生：" + dialog["output"])

        response = conversation[-1]["output"]

        # Leave the prompt open on the doctor's turn for the model to complete.
        parts.append("\n医生：")

        return {"instruction": "".join(parts), "output": response}
    except (AttributeError, IndexError, KeyError, TypeError):
        # Malformed record: signal "skip" explicitly instead of silently
        # swallowing every possible exception.
        return None


# Read the raw conversation records.
with open('./train_dir/data.json', 'r', encoding='utf-8') as f1:
    data = ujson.load(f1)

# Write one JSON object per line; falsy results from
# transform_conversation_data are skipped.
with open('./train_dir/converted.json', 'w', encoding='utf-8') as f:
    converted_records = (transform_conversation_data(item) for item in data)
    f.writelines(
        ujson.dumps(record, ensure_ascii=False) + '\n'
        for record in converted_records
        if record
    )
    print('********')

FileNotFoundError: [Errno 2] No such file or directory: './train_dir/data.json'

In [7]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
import numpy as np
import jieba

In [8]:
def compute_metrics(eval_pred):
    """Score generated texts against references with ROUGE and BLEU-1..4.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of equally ordered
            sequences of strings.

    Returns:
        dict: One entry per ROUGE variant returned by ``Rouge.get_scores``
        (its ``'f'`` score times 100), plus ``"bleu"``, a dict with
        ``bleu_1`` .. ``bleu_4`` holding the per-pair sentence BLEU averaged
        over all pairs and multiplied by 100.
    """
    predictions, labels = eval_pred

    def _segment(text):
        # Word level: drop spaces, cut with jieba, re-join with single spaces.
        return " ".join(jieba.cut(text.replace(" ", "")))

    # Character-level alternative:
    # decoded_preds = [" ".join((pred.replace(" ", ""))) for pred in predictions]
    # decoded_labels = [" ".join((label.replace(" ", ""))) for label in labels]

    decoded_preds = [_segment(pred) for pred in predictions]
    decoded_labels = [_segment(label) for label in labels]

    # One weight tuple per n-gram order; sentence_bleu is given all four at once.
    ngram_weights = [
        (1., 0., 0., 0.),
        (1. / 2., 1. / 2.),
        (1. / 3., 1. / 3., 1. / 3.),
        (1. / 4., 1. / 4., 1. / 4., 1. / 4.),
    ]

    bleu_totals = np.zeros(4)
    for reference_text, hypothesis_text in zip(decoded_labels, decoded_preds):
        pair_scores = sentence_bleu(
            references=[reference_text.split(' ')],
            hypothesis=hypothesis_text.split(' '),
            weights=ngram_weights,
            smoothing_function=SmoothingFunction().method1,
        )
        bleu_totals += np.array(pair_scores)
    bleu_means = bleu_totals / len(decoded_labels)

    rouge_scores = Rouge().get_scores(decoded_preds, decoded_labels, avg=True)
    result = {}
    for key, value in rouge_scores.items():
        result[key] = value['f'] * 100
    result["bleu"] = {
        'bleu_1': bleu_means[0] * 100,
        'bleu_2': bleu_means[1] * 100,
        'bleu_3': bleu_means[2] * 100,
        'bleu_4': bleu_means[3] * 100,
    }
    return result

In [10]:
## Load the model
def load_model(model_name_or_path, load_in_4bit=False, adapter_name_or_path=None):
    """Load a causal language model, optionally 4-bit quantized and with an adapter.

    Args:
        model_name_or_path: Model identifier or local directory passed to
            ``AutoModelForCausalLM.from_pretrained``.
        load_in_4bit: When true, build a ``BitsAndBytesConfig`` (NF4, double
            quantization, fp16 compute) and load the model with it.
        adapter_name_or_path: Optional adapter location; when given, the base
            model is wrapped with ``PeftModel.from_pretrained``.

    Returns:
        The loaded model (wrapped by the adapter when one is given).
    """
    quantization_config = None
    if load_in_4bit:
        # Imported locally: the notebook's visible import cells do not bring
        # BitsAndBytesConfig into scope.
        from transformers import BitsAndBytesConfig

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        )

    # Load the base model. Quantization is requested only through
    # `quantization_config`: transformers raises a ValueError when the
    # `load_in_4bit` kwarg is passed together with a quantization config.
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map='auto',
        quantization_config=quantization_config,
    )

    # Load the adapter
    if adapter_name_or_path is not None:
        # NOTE(review): PeftModel is not imported in the visible cells;
        # presumably `from peft import PeftModel` is required - confirm.
        model = PeftModel.from_pretrained(model, adapter_name_or_path)

    return model

## Load the tokenizer
def load_tokenizer(model_name_or_path):
    """Load the tokenizer for ``model_name_or_path`` and make sure it can pad.

    Args:
        model_name_or_path: Identifier or directory passed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The tokenizer; if it has no pad token, the EOS token is assigned as
        the pad token.
    """
    loaded = AutoTokenizer.from_pretrained(
        model_name_or_path,
        use_fast=False,
        trust_remote_code=True,
    )

    if loaded.pad_token is not None:
        return loaded

    # No dedicated pad token: reuse EOS for padding.
    loaded.pad_token = loaded.eos_token
    return loaded

## Build the prompt
def build_prompt(tokenizer, template, query, history, system=None):
    """Encode the system text plus the whole dialogue into a batch of input ids.

    Args:
        tokenizer: Object providing ``encode(text, add_special_tokens=...)``
            and an ``eos_token`` attribute.
        template: Object providing ``template_name``, ``system_format``,
            ``user_format``, ``assistant_format`` and ``system``.
        query: The new user message. It is appended to ``history`` in place.
        history: List of ``{"role": ..., "message": ...}`` dicts (mutated).
        system: Optional system text; ``template.system`` is used when None.

    Returns:
        torch.Tensor: ``torch.long`` tensor built from ``[input_ids]``, i.e.
        a single row holding every encoded token.
    """
    template_name = template.template_name  # read from the template; unused below
    system_format, user_format, assistant_format = (
        template.system_format,
        template.user_format,
        template.assistant_format,
    )
    if system is None:
        system = template.system

    # The new query becomes the latest user turn (the caller's list is updated).
    history.append({"role": 'user', 'message': query})

    # System text, only when both a format and a system message exist.
    input_ids = []
    if system_format is not None and system is not None:
        input_ids.extend(
            tokenizer.encode(system_format.format(content=system), add_special_tokens=False)
        )

    # Dialogue turns: anything that is not a 'user' turn uses the assistant format.
    for turn in history:
        speaker, text = turn['role'], turn['message']
        turn_format = user_format if speaker == 'user' else assistant_format
        rendered = turn_format.format(content=text, stop_token=tokenizer.eos_token)
        input_ids.extend(tokenizer.encode(rendered, add_special_tokens=False))

    return torch.tensor([input_ids], dtype=torch.long)

In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer,DataCollatorWithPadding
from qwen_generation_utils import  decode_tokens
import torch
import datasets
from modelscope import snapshot_download


model_name_or_path = snapshot_download('LLM-Research/Meta-Llama-3-8B-Instruct',
                                           cache_dir='LLM-Research/Meta-Llama-3-8B-Instruct')#'./model'
# tokenizer = AutoTokenizer.from_pretrained(model_dir, device_map="auto", padding_side='left',trust_remote_code=True)
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# # Set `torch_dtype=torch.float16` to load model in float16, otherwise it will be loaded as float32 and might cause OOM Error.
# model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto",pad_token_id=tokenizer.eos_token_id, trust_remote_code=True, torch_dtype=torch.float16)
# # (Optional) If on low resource devices, you can load model in 4-bit or 8-bit to further save GPU memory via bitsandbytes.
#   # InternLM 7B in 4bit will cost nearly 8GB GPU memory.
#   # pip install -U bitsandbytes
#   # 8-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_8bit=True)
#   # 4-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_4bit=True)
# model = model.eval()
print_user = True # whether to show the input prompt box; set to True when running in a notebook

template_name = 'llama3'
adapter_name_or_path = None

# template = template_dict[template_name]

# 4-bit inference saves a lot of GPU memory, but quality may drop
load_in_4bit = False

# Generation hyperparameters; tune them for better results
max_new_tokens = 500 # maximum length of the text generated for each reply
top_p = 0.9
temperature = 0.6 # larger is more creative, smaller is more conservative
repetition_penalty = 1.1 # larger values discourage repeated output


model = load_model(
        model_name_or_path,
        load_in_4bit=load_in_4bit,
        adapter_name_or_path=adapter_name_or_path
    ).eval()
tokenizer = load_tokenizer(model_name_or_path if adapter_name_or_path is None else adapter_name_or_path)

# `template` only exists when the template_dict lookup above is enabled (it is
# commented out here), so look it up defensively instead of raising NameError
# and fall back to the tokenizer's EOS token as the stop word.
_template = globals().get('template')
if _template is not None and _template.stop_word is None:
    _template.stop_word = tokenizer.eos_token
_stop_word_text = tokenizer.eos_token if _template is None else _template.stop_word

# Encode without special tokens: with add_special_tokens=True a tokenizer that
# prepends BOS would put the BOS id at index 0, and that id would then be
# picked as the "stop token".
_stop_token_ids = tokenizer.encode(_stop_word_text, add_special_tokens=False)
if len(_stop_token_ids) != 1:
    import warnings
    warnings.warn(
        f"stop word {_stop_word_text!r} encodes to {len(_stop_token_ids)} tokens; "
        "using the first one as stop_token_id"
    )
stop_token_id = _stop_token_ids[0]


# # convert data
# import ujson
# def transform_conversation_data(raw_data):
#     try:
#         instruction = '<|im_start|>system\n'+raw_data.get("conversation", "")[0]['system'] + "<|im_end|>\n"

#         conversation = raw_data.get("conversation", [])
#         for i, dialog in enumerate(conversation):
#             instruction += "<|im_start|>user\n来访者：" + dialog["input"]+ "<|im_end|>\n"

#             if i < len(conversation) - 1:
#                 instruction += "<|im_start|>assistant\n医生：" + dialog["output"]+"<|im_end|>\n"

#         response = conversation[-1]["output"] if conversation else ""

#         instruction +="<|im_start|>assistant\n医生："

#         return {"instruction": instruction, "output": response}
    
#     except Exception as e:
#         pass


# with open(f'./data_dir/data.json', 'r', encoding='utf-8') as f1:
#     data = ujson.load(f1)
# with open(f'./data_dir/converted.json', 'w', encoding='utf-8') as f:
#     for j, item in enumerate(data):
#         temp=transform_conversation_data(item)
#         if temp:
#             transformed_data =ujson.dumps(temp, ensure_ascii=False)
#             f.write(transformed_data+'\n')

#set test params


#set test params
test_num=1596 # number of test examples
batch_size=12


# Prepare the evaluation data.
dataset = datasets.load_dataset('json', data_files='converted.json', split=f"train[:{test_num}]")
# The split expression above already limits the dataset to `test_num` rows,
# so the reference answers are simply its 'output' column.
references = list(dataset['output'])

hypotheses = []
def preprocess(data):
    """Tokenize one batch of examples for generation.

    Args:
        data: Batch dict with 'instruction' and 'output' lists of strings
            (as supplied by ``dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the instructions (truncated to 512 tokens),
        extended with 'labels' (token ids of the outputs, truncated/padded to
        128) and 'length' (character length of each instruction).
    """
    length = [len(text) for text in data['instruction']]
    model_inputs = tokenizer(data['instruction'], max_length=512, truncation=True)
    # Pad labels to a fixed length. padding=True would pad only to the longest
    # output of the current map batch, so rows coming from different map
    # batches could differ in length and then fail to stack into one tensor
    # when the DataLoader collates them.
    labels = tokenizer(data['output'], padding='max_length', max_length=128, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    model_inputs['length'] = length
    return model_inputs
preprocessed_dataset = dataset.map(preprocess, batched=True, remove_columns=['instruction', 'output'])


# Decoder-only generation needs left padding so that the generated tokens
# directly follow each prompt; the collator pads using the tokenizer's
# padding side at collate time.
tokenizer.padding_side = "left"
collator = DataCollatorWithPadding(tokenizer=tokenizer)
from torch.utils.data import DataLoader

dataloader = DataLoader(preprocessed_dataset, batch_size=batch_size, collate_fn=collator)

# Generate responses
stop_word = "<|im_end|>"

# Look the stop ids up in this tokenizer's vocabulary instead of hard-coding a
# numeric id (the previous literal 92542 was not derived from this tokenizer).
# NOTE(review): "<|eot_id|>" is presumably the end-of-turn token of the Llama-3
# chat format - confirm against the model card.
_vocab = tokenizer.get_vocab()
eos_token_ids = [] if tokenizer.eos_token_id is None else [tokenizer.eos_token_id]
for _stop_token in (stop_word, "<|eot_id|>"):
    if _stop_token in _vocab and _vocab[_stop_token] not in eos_token_ids:
        eos_token_ids.append(_vocab[_stop_token])

for batch in dataloader:
    batch_input_ids = batch['input_ids'].to(model.device)
    # Without the mask the model would attend to the padding tokens.
    attention_mask = batch['attention_mask'].to(model.device)
    batch_out_ids = model.generate(
        batch_input_ids,
        attention_mask=attention_mask,
        return_dict_in_generate=False,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.1,
        eos_token_id=eos_token_ids or None,
        pad_token_id=tokenizer.pad_token_id,
    )

    # With left padding every prompt occupies the first `prompt_len` columns,
    # so everything after that is newly generated text. Decoding the returned
    # rows also handles a final batch smaller than `batch_size`.
    prompt_len = batch_input_ids.shape[1]
    batch_response = tokenizer.batch_decode(
        batch_out_ids[:, prompt_len:], skip_special_tokens=True
    )
    for response in batch_response:
        response = response.replace("医生：", "")
        # Keep only the text before the stop word; splitting on whitespace
        # would cut a response at its first space or newline.
        if stop_word in response:
            response = response.split(stop_word, 1)[0]
        hypotheses.append(response)


# Load metric
# Prefer the external `metric` module when it is importable; otherwise keep
# whatever `compute_metrics` is already defined in the notebook namespace.
try:
    from metric import compute_metrics
except ImportError:
    # `metric` is not available in this environment.
    pass

print(compute_metrics((hypotheses,references)))

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
