In [4]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import evaluate

In [8]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [7]:
# Load the cnn_dailymail dataset (configuration "3.0.0").
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

# Preprocessing function
def preprocess_data(examples, prefix="summarize: "):
    """Rename the raw columns and prepend the T5 task prefix to each article.

    Works for a single example (string values) and for a batch (list values),
    so it can be passed to ``dataset.map`` with or without ``batched=True``.

    Args:
        examples: Mapping with an ``"article"`` and a ``"highlights"`` entry,
            each either a string or a list of strings.
        prefix: Text prepended to every article. T5 checkpoints are
            pre-trained with task prefixes; ``"summarize: "`` selects the
            summarization task. Pass ``""`` to disable it.

    Returns:
        A dict with ``"input_text"`` (the prefixed article(s)) and
        ``"target_text"`` (the highlights, unchanged).
    """
    articles = examples["article"]
    if isinstance(articles, str):
        # Un-batched call: one article.
        inputs = prefix + articles
    else:
        # Batched call: one entry per article.
        inputs = [prefix + article for article in articles]
    return {"input_text": inputs, "target_text": examples["highlights"]}

# Apply the preprocessing to every split and drop the original columns.
# batched=True hands the function whole batches (dicts of lists) instead of
# one example at a time, which avoids a Python call per example.
dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=["article", "highlights"],
)


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [9]:
# Load the T5-small tokenizer and model.
model_name = "t5-small"  # the smaller t5-small checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Load the pretrained weights, then move the model to the selected device.
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
# Tokenization function
def tokenize_data(examples, max_input_length=256, max_target_length=150):
    """Tokenize inputs and targets and attach loss-ready labels.

    Both texts are padded/truncated to a fixed length. Padding positions in
    the labels are replaced with -100 so that the loss skips them; otherwise
    the model would also be trained to predict the padding token.

    Args:
        examples: Mapping with ``"input_text"`` and ``"target_text"``, each a
            string or a list of strings.
        max_input_length: Fixed token length for the encoder inputs.
        max_target_length: Fixed token length for the labels.

    Returns:
        The tokenizer output for the inputs, with an added ``"labels"`` entry
        holding the target token ids (padding replaced by -100).
    """
    model_inputs = tokenizer(
        examples["input_text"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,  # deliberately short input length
    )
    targets = tokenizer(
        examples["target_text"],
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
    )

    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        # -100 is the label value the loss ignores.
        return [tok if tok != pad_id else -100 for tok in token_ids]

    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], int):
        # Un-batched call: a single flat list of token ids.
        model_inputs["labels"] = mask_padding(label_ids)
    else:
        model_inputs["labels"] = [mask_padding(ids) for ids in label_ids]
    return model_inputs

# Tokenize every split, one batch at a time.
dataset = dataset.map(function=tokenize_data, batched=True)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
# Load the ROUGE metric used for scoring summaries.
rouge = evaluate.load(path="rouge")

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration, collected in one place before building the arguments object.
training_kwargs = {
    "output_dir": "./results",        # where outputs are written
    "evaluation_strategy": "epoch",   # evaluate once per epoch
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
}
training_args = TrainingArguments(**training_kwargs)

# Build the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    # Per-epoch evaluation uses the validation split so the test split stays
    # held out for the final, one-off evaluation.
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
)

# Start training.
trainer.train()




  trainer = Trainer(


  0%|          | 0/107670 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
