# 基于T5的文本摘要

## Step1 导入相关包

In [None]:
!pip install datasets transformers rouge-score nltk rouge_chinese

In [7]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

InvalidVersion: Invalid version: '0.10.1,<0.11'

## Step2 加载数据集

In [5]:
# Load the summarization corpus stored on disk under ./nlpcc_2017/.
ds = Dataset.load_from_disk("./nlpcc_2017/")
# Show the dataset summary (column names and number of rows).
ds

Dataset({
    features: ['title', 'content'],
    num_rows: 5000
})

In [6]:
# Hold out 200 examples as the evaluation split; the fixed seed keeps the split reproducible.
# NOTE: `ds` is rebound from the full dataset to the train/test dictionary here,
# so re-running this cell requires re-running the loading cell first.
ds = ds.train_test_split(test_size=200, seed=42)
ds

Loading cached split indices for dataset at /mnt/workspace/tra-code/02-NLP Tasks/15-text_summarization/nlpcc_2017/cache-f88b03791a18aede.arrow and /mnt/workspace/tra-code/02-NLP Tasks/15-text_summarization/nlpcc_2017/cache-6b0d5568bb085c36.arrow


DatasetDict({
    train: Dataset({
        features: ['title', 'content'],
        num_rows: 4800
    })
    test: Dataset({
        features: ['title', 'content'],
        num_rows: 200
    })
})

In [7]:
# Inspect one raw training example: 'content' is the article, 'title' is used as its summary.
ds["train"][0]

{'title': '郴州市发布雷电橙色预警:过去2小时北湖区、苏仙区、郴州市区、桂阳县、宜章县、嘉禾县、资兴市、桂东县、汝城县已经受...',
 'content': '发布日期:2015-03-3007:55:33郴州市气象台3月30日7时52分发布雷电橙色预警信号:过去2小时北湖区、苏仙区、郴州市区、桂阳县、宜章县、嘉禾县、资兴市、桂东县、汝城县已经受雷电活动影响,并将持续,出现雷电灾害事故的可能性比较大,请注意防范。图例标准防御指南2小时内发生雷电活动的可能性很大,或者已经受雷电活动影响,且可能持续,出现雷电灾害事故的可能性比较大。1、政府及相关部门按照职责落实防雷应急措施;2、人员应当留在室内,并关好门窗;3、户外人员应当躲入有防雷设施的建筑物或者汽车内;4、切断危险电源,不要在树下、电杆下、塔吊下避雨;5、在空旷场地不要打伞,不要把农具、羽毛球拍、高尔夫球杆等扛在肩上。'}

## Step3 数据处理

In [8]:
from modelscope import snapshot_download
# Fetch the langboat/mengzi-t5-base checkpoint through ModelScope; the call returns
# the local directory that holds the model files.
model_dir = snapshot_download("langboat/mengzi-t5-base")

2024-07-23 18:20:13,115 - modelscope - INFO - PyTorch version 2.0.1+cpu Found.
2024-07-23 18:20:13,117 - modelscope - INFO - TensorFlow version 2.13.0 Found.
2024-07-23 18:20:13,118 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer
2024-07-23 18:20:13,141 - modelscope - INFO - Loading done! Current index file version is 1.9.5, with md5 79827826d04c54fc06982662c5095533 and a total number of 945 components indexed


In [9]:
# Show where the checkpoint is stored locally.
model_dir

'/mnt/workspace/.cache/modelscope/langboat/mengzi-t5-base'

In [10]:
# Load the tokenizer that ships with the downloaded checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [11]:
def process_func(exmaples):
    """Tokenize one batch of articles and titles for seq2seq training.

    Args:
        exmaples: A batch mapping with a "content" list (source articles) and a
            "title" list (target summaries).

    Returns:
        The tokenizer output for the prefixed articles (truncated to 384 tokens)
        with an extra "labels" entry holding the title token ids (truncated to
        64 tokens).
    """
    # Task prefix prepended to every article before tokenization.
    task_prefix = "摘要生成: \n"
    prefixed_articles = [task_prefix + article for article in exmaples["content"]]

    model_inputs = tokenizer(prefixed_articles, max_length=384, truncation=True)
    # `text_target` makes the tokenizer encode the titles as decoder targets.
    target_encoding = tokenizer(text_target=exmaples["title"], max_length=64, truncation=True)

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [12]:
# Apply process_func to both splits in batches; the tokenized fields are added
# alongside the original 'title' and 'content' columns.
tokenized_ds = ds.map(process_func, batched=True)
tokenized_ds

                                                                 

DatasetDict({
    train: Dataset({
        features: ['title', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4800
    })
    test: Dataset({
        features: ['title', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

In [13]:
# Decode the first training input back to text to verify the prefix and truncation.
tokenizer.decode(tokenized_ds["train"][0]["input_ids"])

'摘要生成: 发布日期:2015-03-3007:55:33郴州市气象台3月30日7时52分发布雷电橙色预警信号:过去2小时北湖区、苏仙区、郴州市区、桂阳县、宜章县、嘉禾县、资兴市、桂东县、汝城县已经受雷电活动影响,并将持续,出现雷电灾害事故的可能性比较大,请注意防范。图例标准防御指南2小时内发生雷电活动的可能性很大,或者已经受雷电活动影响,且可能持续,出现雷电灾害事故的可能性比较大。1、政府及相关部门按照职责落实防雷应急措施;2、人员应当留在室内,并关好门窗;3、户外人员应当躲入有防雷设施的建筑物或者汽车内;4、切断危险电源,不要在树下、电杆下、塔吊下避雨;5、在空旷场地不要打伞,不要把农具、羽毛球拍、高尔夫球杆等扛在肩上。</s>'

In [14]:
# Decode the first training label sequence back to text to verify the target encoding.
tokenizer.decode(tokenized_ds["train"][0]["labels"])

'郴州市发布雷电橙色预警:过去2小时北湖区、苏仙区、郴州市区、桂阳县、宜章县、嘉禾县、资兴市、桂东县、汝城县已经受...</s>'

In [15]:
# Print the raw label token ids of the first training example.
print(tokenized_ds["train"][0]["labels"])

[7, 27508, 5045, 1299, 1098, 580, 21296, 7701, 13, 888, 99, 1429, 512, 1080, 159, 6, 1193, 1707, 159, 6, 27508, 5045, 159, 6, 4449, 12817, 6, 2471, 761, 308, 6, 2351, 10099, 308, 6, 2476, 1345, 157, 6, 4449, 301, 308, 6, 7787, 17442, 147, 425, 1542, 1]


## Step4 创建模型

In [16]:
# Load the pretrained seq2seq model weights from the downloaded checkpoint directory.
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

  return self.fget.__get__(instance, owner)()


## Step5 创建评估函数

In [None]:
# Optional install line, intentionally left commented out (rouge_chinese is already
# part of the Step1 install command).
# NOTE(review): confirm the transformers==4.4.2 pin is still wanted before enabling this.
# !pip install rouge_chinese transformers==4.4.2

In [18]:
import numpy as np
from rouge_chinese import Rouge
from nltk.translate.bleu_score import sentence_bleu
import numpy as np

# Shared ROUGE scorer used by compute_metric below.
rouge = Rouge()

def compute_metric(evalPred):
    """Score generated summaries with character-level ROUGE and BLEU.

    Both predictions and labels are decoded to text and split into individual
    non-whitespace characters, so every character counts as one token.

    Args:
        evalPred: A ``(predictions, labels)`` pair of token-id batches. If
            ``predictions`` is a tuple, its first element is used. Positions
            equal to ``-100`` are replaced by the tokenizer's pad id before
            decoding, in both predictions and labels.

    Returns:
        dict: ``rouge-1``, ``rouge-2`` and ``rouge-l`` (mean F1) plus ``bleu-1``
        .. ``bleu-4`` (mean sentence BLEU), each averaged over the batch. A
        pair whose prediction or reference decodes to nothing contributes 0 to
        the ROUGE averages instead of raising.
    """
    # Cumulative n-gram weights for BLEU-1 .. BLEU-4, keyed by the reported metric name.
    bleu_weights = {
        "bleu-1": (1, 0, 0, 0),
        "bleu-2": (0.5, 0.5, 0, 0),
        "bleu-3": (1/3, 1/3, 1/3, 0),
        "bleu-4": (0.25, 0.25, 0.25, 0.25),
    }

    def decode_to_chars(token_ids):
        """Decode a batch of id sequences into per-example lists of characters."""
        token_ids = np.asarray(token_ids)
        # -100 is not a real vocabulary id; map it to the pad id so decoding
        # succeeds and the position is dropped together with the special tokens.
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        texts = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        # Whitespace characters are dropped so they never become (empty) tokens.
        return [[ch for ch in text if not ch.isspace()] for text in texts]

    predictions, labels = evalPred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pred_chars = decode_to_chars(predictions)
    label_chars = decode_to_chars(labels)
    total_pairs = len(pred_chars)

    # ROUGE: the scorer rejects empty text, so only non-empty pairs are passed
    # to it. Rescaling the average by the share of scored pairs is equivalent
    # to counting every skipped pair as a score of 0.
    rouge_f = {"rouge-1": 0.0, "rouge-2": 0.0, "rouge-l": 0.0}
    scorable = [
        (" ".join(pred), " ".join(label))
        for pred, label in zip(pred_chars, label_chars)
        if pred and label
    ]
    if scorable:
        hyps, refs = zip(*scorable)
        averaged = rouge.get_scores(list(hyps), list(refs), avg=True)
        scale = len(scorable) / total_pairs
        for name in rouge_f:
            rouge_f[name] = averaged[name]["f"] * scale

    # BLEU: one row of four scores per example, averaged column-wise.
    bleu_rows = [
        [sentence_bleu([label], pred, weights=weights) for weights in bleu_weights.values()]
        for pred, label in zip(pred_chars, label_chars)
    ]
    if bleu_rows:
        bleu_means = np.mean(np.array(bleu_rows), axis=0)
    else:
        bleu_means = np.zeros(len(bleu_weights))

    return {
        "rouge-1": rouge_f["rouge-1"],
        "rouge-2": rouge_f["rouge-2"],
        "rouge-l": rouge_f["rouge-l"],
        "bleu-1": bleu_means[0],
        "bleu-2": bleu_means[1],
        "bleu-3": bleu_means[2],
        "bleu-4": bleu_means[3],
    }

In [None]:
# Sanity check: score two hand-written prediction/reference pairs with ROUGE.
# Every sentence is turned into space-separated characters, mirroring compute_metric.
rouge = Rouge()
decode_preds = list(map(" ".join, ["这是一份行动指南，确保军队始终服从党的命令", "确保军队始终服从党的命令"]))
decode_labels = list(map(" ".join, ["这是一份行动指南，确保军队永远听从党的指挥", "确保军队永远听从党的指挥"]))
print(decode_preds, decode_labels)
# avg=True returns one averaged score dict for the whole batch.
scores = rouge.get_scores(decode_preds, decode_labels, avg=True)
scores

In [None]:
from nltk.translate.bleu_score import sentence_bleu
import numpy as np


# Define a function that computes all four BLEU scores.
def calculate_bleu_scores(candidate, references):
    """Return [BLEU-1, BLEU-2, BLEU-3, BLEU-4] for one candidate/reference pair.

    Args:
        candidate: Candidate sentence as space-separated tokens.
        references: Reference sentence as space-separated tokens.

    Returns:
        list: Four sentence-level BLEU scores, ordered from BLEU-1 to BLEU-4.
    """
    candidate_tokens = candidate.split(" ")
    # sentence_bleu expects a list of references, each one a token list.
    reference_tokens = [references.split(" ")]
    weights_list = [
        (1, 0, 0, 0),             # BLEU-1
        (0.5, 0.5, 0, 0),         # BLEU-2
        (1/3, 1/3, 1/3, 0),       # BLEU-3
        (0.25, 0.25, 0.25, 0.25)  # BLEU-4
    ]
    return [
        sentence_bleu(reference_tokens, candidate_tokens, weights=weights)
        for weights in weights_list
    ]


# Use a list comprehension to score every candidate sentence in the batch,
# then average each BLEU order over the batch.
bleu_scores_batch = np.mean(np.array([calculate_bleu_scores(cand, refs) for cand, refs in zip(decode_preds, decode_labels)]), axis=0)
bleu_dict = {f'bleu-{i+1}': score for i, score in enumerate(bleu_scores_batch)}
bleu_dict

## Step6 配置训练参数

In [30]:
import matplotlib.pyplot as plt
from transformers import Seq2SeqTrainingArguments, TrainerCallback
from transformers.trainer_utils import TrainOutput

class MetricsLoggerCallback(TrainerCallback):
    """Trainer callback that keeps training losses and evaluation metrics in memory.

    Values are only recorded when ``state.is_local_process_zero`` is true.
    """

    def __init__(self):
        # One metrics object per evaluation run, in the order the evaluations happened.
        self.metrics = []
        # Training-loss values taken from log events, in logging order.
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the training loss of a log event, if the event carries one.

        Args:
            args, state, control: Standard callback arguments; only
                ``state.is_local_process_zero`` is read here.
            logs: The logged values, or None.
        """
        if not state.is_local_process_zero:
            return
        # Not every log payload contains a 'loss' entry (evaluation results, for
        # example, are logged under other keys), and `logs` itself may be None.
        if logs and 'loss' in logs:
            self.losses.append(logs['loss'])

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Store the metrics produced by one evaluation run."""
        if state.is_local_process_zero:
            self.metrics.append(metrics)

# Create the MetricsLoggerCallback instance that is handed to the trainer.
metrics_logger_callback = MetricsLoggerCallback()

args = Seq2SeqTrainingArguments(
    output_dir='./model_output',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to='none',  # Avoid extra reporting integrations; only the custom callback logs.
    predict_with_generate=True,  # Evaluate on generated text so ROUGE/BLEU can be computed.
    # Let evaluation generate up to the label length used in process_func (64 tokens);
    # otherwise the model's default generation length caps the summaries being scored.
    generation_max_length=64,
)

In [None]:
# Exploration aid: print the documentation of Seq2SeqTrainingArguments (not needed for training).
help(Seq2SeqTrainingArguments)

## Step7 创建训练器

In [31]:
# Assemble the trainer: model and arguments, batching, data splits, then
# evaluation metrics and the in-memory logging callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    # Collator built from the tokenizer to batch the variable-length examples.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metric,
    callbacks=[metrics_logger_callback],
)

TypeError: __init__() got an unexpected keyword argument 'use_seedable_sampler'

## Step8 模型训练

In [None]:
# Run fine-tuning; evaluation and checkpointing happen once per epoch per the training arguments.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


## Step9 模型推理

In [None]:
from transformers import pipeline

In [None]:
# Build a text2text-generation pipeline around the fine-tuned model.
# Use the first GPU when CUDA is available; otherwise fall back to CPU (-1)
# instead of failing on a machine without a GPU.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

In [None]:
# Summarize the last test article; do_sample=True means the output can differ between runs.
# NOTE(review): the prefix here is "摘要生成:\n", while process_func trains with
# "摘要生成: \n" (extra space) — confirm both tokenize identically.
pipe("摘要生成:\n" + ds["test"][-1]["content"], max_length=64, do_sample=True)

In [None]:
# Show the reference title of the same test article for comparison with the generated summary.
ds["test"][-1]["title"]