In [5]:
import os
import torch
from datasets import load_dataset
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration

# Expose only GPU index 3 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = '3'

# Use CUDA when a device is visible, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Checkpoint identifier shared by the tokenizer, config and model loaders.
pretrained_model = "IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese"

# Sentinel token strings "<extra_id_0>" ... "<extra_id_99>".
special_tokens = [f"<extra_id_{idx}>" for idx in range(100)]

In [2]:
# Load the checkpoint's configuration and tokenizer (default tokenizer options).
config = T5Config.from_pretrained(pretrained_model)
tokenizer = T5Tokenizer.from_pretrained(pretrained_model)

# Instantiate the seq2seq model from the same checkpoint and move it to the
# selected device.
model = T5ForConditionalGeneration.from_pretrained(pretrained_model, config=config)
model = model.to(device)

# Keep the embedding matrix size equal to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Switch to evaluation mode; as the cell's last expression this also displays
# the module tree.
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32596, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32596, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [30]:
# One semantic-matching example: task prefix, the two sentences, the answer
# options, then the answer cue.
text = '语义匹配：：“有糖尿病，高血压的人，胆结石怎么治疗？”，：“请问怎么治疗高血压”，以上两句话的内容是否相似\n选项：是的，不是\n答：'

# Request truncation explicitly so `max_length` is enforced without relying on
# the tokenizer's implicit fallback.
inputs = tokenizer(text, max_length=512, truncation=True, return_tensors='pt')
print(inputs)

# NOTE: despite its name, `logits` holds generated token ids, not scores.
# Tensors are moved with `.to(device)` so the cell also runs on a CPU-only
# host, matching where the model was placed.
logits = model.generate(
    input_ids=inputs['input_ids'].to(device),
    attention_mask=inputs['attention_mask'].to(device),
    max_length=4,
)

# Drop the first generated position before decoding (for encoder-decoder
# generation this is the decoder start token).
logits = logits[:, 1:]
predict_label = [tokenizer.decode(ids, skip_special_tokens=True) for ids in logits]
print(predict_label)

{'input_ids': tensor([[  259,  4478,  5520, 11077,  3484,   267,  3067,   794, 22044,   261,
          1053,  4243,  5268,  5822,   261, 10655,  4806,  1930,  2248, 15502,
          1509,   261,  3067, 23999,  2248, 15502,  1053,  4243,  5268,   657,
          3520,  2830, 26921, 15705,  9007,  2031,  9066,   259, 27543,   267,
           766,   408,   261,  6061,   259,  6930,   267,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['不是']


In [4]:
from datasets import load_dataset
# Dataset directory for the selected task.
task_name = 'CHIP-STS'
data_path = f"../data/{task_name}"

# One JSON file per split.
train_file = os.path.join(data_path, 'train.json')
validation_file = os.path.join(data_path, 'dev.json')
test_file = os.path.join(data_path, 'test.json')

# Register every configured split; `extension` ends up as the suffix of the
# last registered file and selects the `load_dataset` builder.
data_files = {}
for split_name, split_path in (
    ("train", train_file),
    ("validation", validation_file),
    ("test", test_file),
):
    if split_path is not None:
        data_files[split_name] = split_path
        extension = split_path.split(".")[-1]

raw_datasets = load_dataset(extension, data_files=data_files)

# Token budgets for the prompt (source) and the answer (target).
# These must be plain ints: a trailing comma after 512 would create the tuple
# (512,) and break the length arithmetic in `proprocess_tokenize`.
max_source_length = 512
max_target_length = 128

# Names of the prompt and answer columns in each dataset record.
prompt_column = 'input'
response_column = 'target'
# When True, padding positions in the labels are replaced by -100.
ignore_pad_token_for_loss = True

def proprocess_tokenize(examples):
    """Tokenize a batch of prompt/answer records for T5 seq2seq training.

    For every record whose prompt and answer are both non-empty, the task-type
    prefix from ``TASK_TO_TASK_TYPE`` is prepended to the prompt. The prompt
    becomes the encoder input and the answer becomes the decoder labels; the
    two are kept as separate sequences, as an encoder-decoder model expects.
    (The previous decoder-only layout concatenated prompt and answer and
    looked up ``tokenizer.bos_token_id``, which T5's tokenizer does not
    define, so the lookup failed.)

    Relies on the module-level names ``tokenizer``, ``max_source_length``,
    ``max_target_length``, ``prompt_column``, ``response_column``,
    ``ignore_pad_token_for_loss`` and ``TASK_TO_TASK_TYPE``.

    Args:
        examples: Mapping of column name to a list of values (one per record).
            Must contain ``prompt_column``, ``response_column`` and
            ``'task_dataset'``.

    Returns:
        dict with three parallel lists, one entry per kept record:
        ``"input_ids"`` (length ``max_source_length``), ``"attention_mask"``
        (1 for real tokens, 0 for padding) and ``"labels"`` (length
        ``max_target_length``; padding is -100 when
        ``ignore_pad_token_for_loss`` is true, else the pad token id).

    Raises:
        KeyError: If a record's task name is missing from ``TASK_TO_TASK_TYPE``.
    """
    model_inputs = {
        "input_ids": [],
        "attention_mask": [],
        "labels": [],
    }

    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id
    # -100 is the label value excluded from the loss.
    label_pad_id = -100 if ignore_pad_token_for_loss else pad_id

    records = zip(
        examples[prompt_column],
        examples[response_column],
        examples['task_dataset'],
    )
    for query, answer, task in records:
        # Skip records with a missing/empty prompt or answer.
        if not (query and answer):
            continue

        prompt = TASK_TO_TASK_TYPE[task] + query

        # Reserve one position in each sequence for the end-of-sequence token.
        a_ids = tokenizer.encode(text=prompt, add_special_tokens=False)[: max_source_length - 1]
        b_ids = tokenizer.encode(text=answer, add_special_tokens=False)[: max_target_length - 1]

        input_ids = a_ids + [eos_id]
        labels = b_ids + [eos_id]

        # Pad both sequences to their fixed lengths.
        source_pad = max_source_length - len(input_ids)
        target_pad = max_target_length - len(labels)
        attention_mask = [1] * len(input_ids) + [0] * source_pad
        input_ids = input_ids + [pad_id] * source_pad
        labels = labels + [label_pad_id] * target_pad

        model_inputs["input_ids"].append(input_ids)
        model_inputs["attention_mask"].append(attention_mask)
        model_inputs["labels"].append(labels)

    return model_inputs

In [3]:
# Maps a dataset/task name to the task-type prefix that is prepended to each
# prompt during preprocessing.
TASK_TO_TASK_TYPE = {"CHIP-STS": '语义匹配'}