In [1]:
# uv 环境下可运行如下命令下载模型：
# uv run modelscope download --model langboat/mengzi-t5-base --local_dir models/mengzi-t5-base

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
EPOCHS = 10  # 模型学习训练集的轮次
BATCH_SIZE = 4  # 模型并行学习的样本数量
INPUT_HEADER = "text_a"  # 模型输入的表头名
OUTPUT_HEADER = "sentiment"  # 模型输出的表头名
PRETRAINED_MODEL_NAME_OR_PATH = "../models/mengzi-t5-base"  # 预训练模型路径
CSV_PATH = "../datasets/label_studio文本分类数据集.csv"
OUPUT_CHECKPOINT_PATH = (
    "../models/text2text_generation_checkpoint"  # 训练过程中产生的模型文件
)
OUTPUT_MODEL_PATH = "../models/text2text_generation_model"  # 微调后的文本分类模型路径
VAL_SIZE = 0.1  # 验证集的占比
MAX_LENGTH = 256  # 输入文本的token长度，lert模型最大只能是512
MAX_NEW_TOKENS = 8  # 模型输出文本的最大长度，最大可设置为512
ROUGE_PATH = "../common/rouge.py"  # 评估脚本本地路径

In [3]:
# 从本地加载数据集
dataset = load_dataset("csv", data_files=CSV_PATH)
dataset = dataset.select_columns([INPUT_HEADER, OUTPUT_HEADER])
dataset = dataset["train"].train_test_split(test_size=VAL_SIZE)

tokenizer = AutoTokenizer.from_pretrained(
    PRETRAINED_MODEL_NAME_OR_PATH,
    max_length=MAX_LENGTH,
    truncation=True,
    padding="max_length",
)


def preprocess_function(examples):
    model_inputs = tokenizer(examples[INPUT_HEADER])

    labels = tokenizer(text_target=examples[OUTPUT_HEADER])

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_billsum = dataset.map(preprocess_function, batched=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 89/89 [00:00<00:00, 8016.95 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 1667.52 examples/s]


In [4]:
rouge = evaluate.load(ROUGE_PATH)


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
    ]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [5]:
model = AutoModelForSeq2SeqLM.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)

training_args = Seq2SeqTrainingArguments(
    output_dir=OUPUT_CHECKPOINT_PATH,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=EPOCHS,
    predict_with_generate=True,
    generation_max_length=MAX_NEW_TOKENS,
    load_best_model_at_end=True,
    fp16=True,
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, model=PRETRAINED_MODEL_NAME_OR_PATH
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
model.half()
model.save_pretrained(OUTPUT_MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_MODEL_PATH)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.175334,0.0,0.0,0.0,0.0,3.0
2,No log,0.151619,0.0,0.0,0.0,0.0,3.0
3,No log,0.402646,0.0,0.0,0.0,0.0,3.0
4,No log,0.056491,0.0,0.0,0.0,0.0,3.0
5,No log,0.16169,0.0,0.0,0.0,0.0,3.0
6,No log,0.049959,0.0,0.0,0.0,0.0,3.0
7,No log,0.038287,0.0,0.0,0.0,0.0,3.0
8,No log,0.072897,0.0,0.0,0.0,0.0,3.0
9,No log,0.079297,0.0,0.0,0.0,0.0,3.0
10,No log,0.050328,0.0,0.0,0.0,0.0,3.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


('../models/text2text_generation_model\\tokenizer_config.json',
 '../models/text2text_generation_model\\special_tokens_map.json',
 '../models/text2text_generation_model\\spiece.model',
 '../models/text2text_generation_model\\added_tokens.json',
 '../models/text2text_generation_model\\tokenizer.json')

In [6]:
import torch
from transformers import pipeline
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    OUTPUT_MODEL_PATH,
    truncation=True,
    max_length=MAX_LENGTH,
    padding="max_length",
    use_fast=True,
)

text2text = pipeline(
    "text2text-generation",
    tokenizer=tokenizer,
    model=OUTPUT_MODEL_PATH,
    torch_dtype=torch.float16
)

print(text2text("奥创设定太差。"))

Device set to use cpu


[{'generated_text': '差评'}]
