In [7]:
import logging
import os
import sys
import json

import numpy as np
from datasets import load_dataset
import jieba 
from rouge_chinese import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import torch

import transformers
from transformers import (
    AutoConfig,
    AutoModel,
    LlamaConfig,
    LlamaTokenizer,
    LlamaForCausalLM,
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    Seq2SeqTrainingArguments,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    set_seed,
)

# Expose CUDA devices 0-3 to this process and set WANDB_MODE to 'disabled'.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0,1,2,3',
    "WANDB_MODE": 'disabled',
})

import sys

# Append the current and the parent directory to the module search path.
sys.path.extend(["./", ".."])


from transformers import(
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
)

from accelerate import Accelerator
from peft import LoraConfig, get_peft_model, TaskType

# Build an Accelerate handle; its `device` is where later cells move the
# tokenized input tensors before calling `model.generate`.
accelerator = Accelerator()
device = accelerator.device

In [3]:
# Location of the base checkpoint, relative to the notebook's working directory.
model_name_or_path = '../model/chinese-llama-alpaca-plus-lora-7b'

# Configuration and tokenizer are both read from that checkpoint directory.
config = LlamaConfig.from_pretrained(model_name_or_path)
tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path)

# Load the causal-LM weights, reusing the configuration object created above.
model = LlamaForCausalLM.from_pretrained(model_name_or_path, config=config)

Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.27s/it]


In [4]:
from peft import PeftConfig, LoraConfig, PeftModelForCausalLM, get_peft_model
# Directory of the saved LoRA adapter (presumably the CHIP-CDEE fine-tuning run,
# judging by the path — confirm).
checkpoint_name = '../checkpoint/CHIP-CDEE-1e-4/checkpoint-300/adapter_model'

# Kept so the adapter hyper-parameters stay inspectable from later cells.
peft_config = LoraConfig.from_pretrained(checkpoint_name)

# Wrap the base model exactly ONCE. `from_pretrained` reads the adapter config
# from `checkpoint_name`, injects the adapter layers and loads the saved
# weights. The previous version called `get_peft_model(model, peft_config)`
# first and then wrapped that result again, so the checkpoint was loaded onto
# an already-wrapped model whose parameter names carry an extra prefix; the
# trained adapter weights can then fail to match without any error.
model = PeftModelForCausalLM.from_pretrained(model, checkpoint_name).half().cuda()

In [None]:
# tokenizer.eos_token_id
# tokenizer.bos_token

# Candidate generation settings. This dict is NOT passed to `generate` below.
# NOTE(review): `max_length` and `max_new_tokens` both bound the output length;
# keep only one of them before wiring this dict into `generate`.
generation_config = dict(
    max_length=1024,
    # top_p=0.9,
    # do_sample=True,
    max_new_tokens=20,
    # repetition_penalty=1.0,
)

# Hand-written clinical-finding event extraction prompt used as a smoke test.
# It is bound to `prompt` (not `str`) so the builtin `str` is not shadowed for
# the rest of the kernel session, and it is a plain string because it contains
# no placeholders.
# NOTE(review): unlike `generate_prompt`, there is no newline after "### 指令:"
# here — confirm which layout the adapter was trained on.
prompt = "### 指令:从下列输入中进行临床发现事件抽取任务。输出临床发现事件的主体词，以及发生状态，描述词和解剖部位这三种属性，其中描述词和解剖部位可能有多个值\n### 输入:\n临床发现事件抽取：入院前1+月，患者无明显诱因再次出现咳嗽、咳痰，为白色泡沫痰，量多，不易咳出，且偶感头晕，感进食梗阻、胸部胀痛，伴背部牵涉痛，食欲明显降低，无发热、畏寒，无头痛，无恶心、呕吐，无胸痛、心前区压榨感，无腹胀、腹痛、腹泻等不适，遂于我院胃肠外科门诊就诊，行上消化道钡餐检查（2018-3-19，我院）示：1.食管中下段外压性改变，考虑左肺门肿块所致，请结合ct检查。\n说明：临床发现事件的主体词包含发生状态，描述词和解剖部位这三种属性，其中描述词和解剖部位可能有多个值\n答：\n### 输出:\n"

# Tokenize once, as a batch of one, and move every tensor to the target device.
inputs = tokenizer(prompt, return_tensors='pt')
inputs = {k: v.to(device) for k, v in inputs.items()}

# Generation options are passed as keyword arguments instead of being mixed
# into the dict of input tensors.
output = model.generate(**inputs, max_new_tokens=200)

# Special tokens are kept so BOS/EOS placement is visible in the printout.
tokens = tokenizer.decode(output[0], skip_special_tokens=False)
print(tokens)

In [None]:
from sklearn.metrics import accuracy_score
def accuracy(predictions, references, normalize=True, sample_weight=None):
    """Score `predictions` against `references` using `accuracy_score`.

    Args:
        predictions: predicted labels, forwarded as the second positional
            argument of `accuracy_score`.
        references: ground-truth labels, forwarded as the first positional
            argument of `accuracy_score`.
        normalize: forwarded unchanged to `accuracy_score`.
        sample_weight: forwarded unchanged to `accuracy_score`.

    Returns:
        A dict with the single key "accuracy" whose value is the score
        converted to a Python float.
    """
    score = accuracy_score(
        references,
        predictions,
        normalize=normalize,
        sample_weight=sample_weight,
    )
    return {"accuracy": float(score)}
def compute_metrics(eval_preds):
    """Compute token-level accuracy for a pair of (predictions, labels).

    Args:
        eval_preds: a two-element sequence `(preds, labels)`. Both elements are
            indexed as 2-D arrays. They are presumably token ids of identical
            shape, with the argmax over the vocabulary already applied
            upstream — TODO confirm against the trainer in use.

    Returns:
        The dict produced by `accuracy` for the flattened, aligned arrays.
    """
    predicted_ids, label_ids = eval_preds
    # The prediction in column t is compared with the label in column t + 1, so
    # the first label column and the last prediction column are dropped before
    # both arrays are flattened.
    shifted_labels = label_ids[:, 1:].reshape(-1)
    shifted_preds = predicted_ids[:, :-1].reshape(-1)
    return accuracy(predictions=shifted_preds, references=shifted_labels)

In [10]:

# Maps each task/dataset identifier (looked up via a record's 'task_dataset'
# field) to the instruction text that is placed in the prompt's instruction
# section. Every key appears exactly once: the previous literal listed
# "CHIP-MDCFNPC" twice with identical text, and the second entry silently
# overwrote the first.
TASK_TO_INSTRUCTION = {
    "CHIP-CDEE" : "从下列输入中进行临床发现事件抽取任务。输出临床发现事件的主体词，以及发生状态，描述词和解剖部位这三种属性，其中描述词和解剖部位可能有多个值",
    "CHIP-CDN" : "诊断实体的语义标准化, 从给定的实体选项中选择与原诊断描述匹配的诊断标准词。从实体选项候选输出结果" ,
    "CHIP-CTC" : "根据输入的句子，确定该句子描述的临床试验筛选标准所属的类型。从类型选项候选输出结果",
    # NOTE(review): the removed duplicate was tagged "reconstruct" — presumably
    # this instruction was meant to be rewritten; confirm with the author.
    "CHIP-MDCFNPC" : "阴阳性判断的任务，在对话中，给出了一系列临床发现实体，然后根据每个实体判断其阴性或阳性。实体包括症状、疾病或假设可能发生的疾病，以及其他医学检查结果。根据对话内容，需要判断每个实体是已有症状疾病、未患有症状疾病，或者回答不明确或无实际意义。",
    "CHIP-STS": "判断输入中的两句话的意思是否相同。如果两句话意思相同输出\"是的\",意思不相同输出\"不是\"",
    "CMeEE-V2" : "抽取出输入中的医学相关命令实体，并根据提供的选项选择特定类型的实体列表。",
    "CMeIE" :  "从给定的文本中找出特定类型的关系，并找出关系的头实体和尾实体。对每个特定关系三元组输出格式，具有**关系的头尾实体对如下：头实体为**，尾实体为**。如果没有找到实体对。输出\"没有指定类型的三元组\". " ,
    "IMCS-V2-DAC" : "判断输入中给定的问诊句子或陈述句的意图类型。根据所提供的选项，选择输出与句子意图相匹配的答案。",
    "IMCS-V2-MRG" : "根据下输入中给定的问诊对话生成诊疗报告。输出报告需要包括主诉，现病史，辅助检查，既往史，诊断，建议的内容",
    "IMCS-V2-NER" : "根据给定的输入文本，输出对应的实体类型和实体名称。如果没有找到实体对。输出\"上述句子没有指定类型实体\"",
    "IMCS-V2-SR" : "根据给定的对话历史和当前对话，输出每个对话中涉及的症状以及这些症状的阴阳性判断。如果患有该症状输出\"阳性\",没有患有该症状输出\"阴性\",无法根据上下文确定病人是否患有该症状输出\"无法确定\"",
    # Open question from the author: why "relevant" / "not relevant"? ok or not
    # ok? Could these labels be replaced with something else?
    "KUAKE-IR" : "判断输入中的医疗搜索和回答内容是否相关。如果内容相关输出\"相关\",内容不相关输出\"不相关\"",
    "KUAKE-QIC" : "根据输入中的搜索内容句子，判断搜索的意图类型, 从类型选项候选输出结果",
    "KUAKE-QQR" : "判断输入两个句子之间的语义包含关系。是\"完全一致\"，\"后者是前者的语义子集\"，\"后者是前者的语义父集\"，\"语义无直接关联\"的哪一种",
    "KUAKE-QTR" : "判断输入两个句子之间的语义相似程度。是\"完全不匹配或者没有参考价值\"，\"很少匹配有一些参考价值\"，\"部分匹配\"，\"完全匹配\"中的哪一种" ,
    "MedDG" : "根据输入中给定的问诊对话历史生成医生的下一句回复"
}

In [28]:

# Locations of the three toy splits, all inside one directory.
data_path = "../datasets/toy_examples/"
train_file = os.path.join(data_path, 'train.json')
validation_file = os.path.join(data_path, 'dev.json')
test_file = os.path.join(data_path, 'test.json')

# Register each available split. `extension` ends up being the suffix of the
# last registered file and is later handed to `load_dataset`.
candidate_files = {
    "train": train_file,
    "validation": validation_file,
    "test": test_file,
}
data_files = {}
for split_name, split_path in candidate_files.items():
    if split_path is None:
        continue
    data_files[split_name] = split_path
    extension = split_path.split(".")[-1]

# Read all registered splits; the loader is selected by the file extension.
raw_datasets = load_dataset(extension, data_files=data_files)

# Maximum number of tokens kept per prompt when tokenizing.
max_input_length = 1024

# Record fields holding the model input and the reference answer.
prompt_column, response_column = 'input', 'target'

# Columns of the validation split; these are dropped from every split once the
# records have been tokenized.
column_names = raw_datasets["validation"].column_names


def generate_prompt(instruction, data):
    """Render the three-section prompt for one record.

    Args:
        instruction: text placed in the instruction section.
        data: mapping for one record; the value under `prompt_column` is placed
            in the input section.

    Returns:
        The prompt string, ending with an empty output section for the model
        to complete.
    """
    template = "### 指令:\n{}\n### 输入:\n{}\n### 输出:\n"
    return template.format(instruction, data[prompt_column])
    
def tokenize(prompt, add_eos_token=True):
    """Tokenize one prompt and attach `labels` for causal-LM style consumption.

    Args:
        prompt: the prompt text to encode.
        add_eos_token: when True (the default), a sequence that ends with the
            tokenizer's EOS id and is shorter than `max_input_length` has its
            first and last token removed.

    Returns:
        The tokenizer output (plain Python lists, no padding, truncated to
        `max_input_length`) with an extra "labels" entry that is an independent
        copy of "input_ids". "attention_mask" always has the same length as
        "input_ids".
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_input_length,
        padding=False,
        return_tensors=None,
    )
    input_ids = result["input_ids"]
    if (
        input_ids  # guard: an empty encoding has no last token to inspect
        and input_ids[-1] == tokenizer.eos_token_id
        and len(input_ids) < max_input_length
        and add_eos_token
    ):
        # Trim the mask with the SAME slice as the ids. The previous code set
        # the mask to a second slice of the already-trimmed ids, so the mask
        # held token ids and was two elements shorter than `input_ids`.
        # NOTE(review): `[1:-1]` drops the leading token (BOS, for tokenizers
        # that prepend one) as well as the trailing EOS — confirm that removing
        # the first token is intended.
        result["input_ids"] = input_ids[1:-1]
        result["attention_mask"] = result["attention_mask"][1:-1]

    result["labels"] = result["input_ids"].copy()

    return result

def preprocess_function(data_point):
    """Turn one raw record into tokenized model inputs.

    Args:
        data_point: mapping for one record. Its 'task_dataset' value selects
            the instruction from `TASK_TO_INSTRUCTION`; the whole record is
            passed on to `generate_prompt`.

    Returns:
        Whatever `tokenize` returns for the rendered prompt.
    """
    task_name = data_point['task_dataset']
    rendered_prompt = generate_prompt(TASK_TO_INSTRUCTION[task_name], data_point)
    return tokenize(rendered_prompt)



# Tokenize record by record (unbatched) with four worker processes, dropping
# the original columns and reusing cached results when available.
lm_dataset = raw_datasets.map(
    preprocess_function,
    num_proc=4,
    remove_columns=column_names,
    load_from_cache_file=True,
)

# Expose only the model inputs, as torch tensors.
lm_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

# The test split is the one meant for prediction.
predict_dataset = lm_dataset["test"]

Found cached dataset json (/home/lzw/.cache/huggingface/datasets/json/default-ded47da5b3131547/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
100%|██████████| 3/3 [00:00<00:00, 324.07it/s]
Loading cached processed dataset at /home/lzw/.cache/huggingface/datasets/json/default-ded47da5b3131547/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-23df3ec0e0a9015b_*_of_00004.arrow
Loading cached processed dataset at /home/lzw/.cache/huggingface/datasets/json/default-ded47da5b3131547/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-0f5f3520f5638776_*_of_00004.arrow
Loading cached processed dataset at /home/lzw/.cache/huggingface/datasets/json/default-ded47da5b3131547/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-675bf6b834d10048_*_of_00004.arrow
Loading cached processed dataset at /home/lzw/.cache/huggingface/datasets/json/default-ded47da5b3131547/0.0.0/0f7e3662623656454fcd2b650f34e88

In [35]:
# Split used for the manual generation checks in the following cells.
# NOTE(review): the variable is named `val_dataset` but is bound to the "train"
# split — confirm this is intentional for the toy example.
val_dataset = lm_dataset["train"]

In [44]:
# Decode one tokenized sample. This is not the cell's last expression, so the
# decoded text is not displayed.
tokenizer.decode(val_dataset[3]['input_ids'])

toy_example = val_dataset[0]

# Give every tensor a leading batch axis and move it to the target device.
inputs = dict(
    (name, tensor.unsqueeze(0).to(device))
    for name, tensor in toy_example.items()
)
inputs.update(max_new_tokens=20)
print(inputs)

# Generate a continuation of at most 20 new tokens for the single example.
output = model.generate(**inputs)

{'input_ids': tensor([[    1,   835, 29871, 40412, 29901,    13, 32237, 34485, 30210, 40138,
         30214, 32932, 31751, 40138, 33749, 30210, 35260, 34964, 43880, 32404,
         38593, 30210, 33812, 30267, 31594, 33812, 38631, 31974, 31333, 35808,
         32350,    13,  2277, 29937, 29871, 34485, 29901,    13, 29941, 30409,
         31593, 31461, 42670, 42545, 31180, 30746, 32227, 31888, 30419, 29950,
          5194, 29928, 41329, 30409, 43906, 30386, 32111, 30748, 31608,    13,
         32561, 35260, 34964, 43880, 32404, 33812, 30882,    13, 38631, 30383,
         33282, 30946, 30952, 30214, 32174, 30214, 32326, 30682, 31436, 30952,
         30214, 34459, 30214, 38601, 32293, 30214, 32418, 31391, 33618, 30214,
         33405, 30214, 32914, 35694, 30214, 42438, 33767, 30214, 37154, 37020,
         30214, 37929, 30214, 33824, 36215, 35188, 30214, 34321, 32146, 30214,
         32670, 30214, 34484, 32263, 30214, 45006, 33171, 30214, 32500, 35188,
         30214, 32108, 32263, 30214, 3

In [45]:
# Show the first generated sequence (prompt plus continuation) as text, with
# special tokens removed.
tokenizer.decode(output[0], skip_special_tokens=True)

'### 指令:\n根据输入的句子，确定该句子描述的临床试验筛选标准所属的类型。从类型选项候选输出结果\n### 输入:\n3）汉密尔顿抑郁量表17项（HAMD-17）评分≥18分；\n是什么临床试验筛选标准类型？\n选项：依存性，能力，数据可及性，饮食，口腔相关，治疗或手术，疾病，风险评估，吸烟状况，肿瘤进展，护理，残疾群体，献血，设备，居住情况，病例来源，健康群体，教育情况，怀孕相关，性别，过敏耐受，研究者决定，种族，器官组织状态，诊断，年龄，症状(患者感受)，疾病分期，受体状态，睡眠，特殊病人特征，参与其它试验\n答：\n### 输出:\n选项：依存性，能力，数据可及性，饮食，口腔相关，治疗或'

In [None]:
import transformers
# Settings for the toy run: batches of four per device for training and
# evaluation, evaluation after every epoch, at most two checkpoints kept.
args = TrainingArguments(
    output_dir='toy_dir',
    run_name='toy_run',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    do_eval=True,
    evaluation_strategy='epoch',
    save_total_limit=2,
)

# Use the end-of-sequence token as the padding token (both the token and its
# id are set explicitly).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Collator that pads each batch to a multiple of 8 and returns torch tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt",
)



In [None]:
# Display the `local_rank` value stored on the training arguments. The lines
# below are disabled experiments with the world size and a manual device move.
args.local_rank
# args.world_size
# device = torch.device('cuda:4')
# model = model.to(device)

In [None]:
class Data_args:
    """Small settings holder that is handed to the trainer as `data_args`.

    Attributes set on every instance:
        val_max_target_length: fixed at 200.
        eval_beams: fixed at 1.
    """

    def __init__(self):
        self.val_max_target_length, self.eval_beams = 200, 1
# Create the settings object and set the `predict_with_generate` flag on the
# training arguments.
# NOTE(review): how this flag is consumed depends on the project-local Trainer
# — confirm.
data_args = Data_args()
args.predict_with_generate = True

In [None]:
# Same three assignments as in the earlier cells (generation flag and EOS used
# as the padding token). Re-running them does not change the resulting state.
args.predict_with_generate = True
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id


In [None]:
from trainer import Trainer
# `preprocess_logits_for_metrics` is referenced below but is not defined or
# imported anywhere else in this notebook, so on a fresh kernel this cell used
# to fail with NameError. Define a fallback only when no definition exists, so
# that an existing one is never overridden.
if "preprocess_logits_for_metrics" not in globals():
    def preprocess_logits_for_metrics(logits, labels):
        """Reduce logits to predicted token ids before metrics are computed.

        Args:
            logits: the model's logits, or a tuple whose first element is the
                logits tensor.
            labels: unused; accepted because the hook is called with both
                arguments.

        Returns:
            The argmax over the last dimension of the logits, which is the
            form `compute_metrics` expects for its predictions.
        """
        if isinstance(logits, tuple):
            logits = logits[0]
        return logits.argmax(dim=-1)

# NOTE(review): `Trainer` is the project-local class imported from `trainer`
# (it accepts `data_args`); its handling of these hooks is assumed to follow
# the Hugging Face Trainer — confirm.
trainer = Trainer(
    model,
    args=args,
    data_args=data_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=lm_dataset['train'],
    eval_dataset=lm_dataset['validation'],
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

In [None]:
# Run evaluation on the validation split and keep the returned result.
predict_result = trainer.evaluate(lm_dataset['validation'])

In [None]:
from transformers import Seq2SeqTrainer
from peft import LoraModel