## 安装依赖

如果当前环境已经有相关依赖了，则不用执行

In [None]:
!pip install transformers[torch] datasets==3.6.0 evaluate

## 加载数据

In [None]:
from datasets import load_dataset
import evaluate

# Experiment configuration: GLUE task, base checkpoint and per-device batch size.
task = "cola"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16
# "mnli-mm" is loaded under the "mnli" name; every other task loads directly
# by its own name.
actual_task = task if task != "mnli-mm" else "mnli"
# The data is cached automatically after it has been loaded.
dataset = load_dataset("glue", actual_task)
# Evaluation metric for the same task.
metric = evaluate.load("glue", actual_task)

In [None]:
# Inspect what the `dataset` object contains (the same works for `metric`).
dataset

## 数据集可视化

为了能够进一步理解数据长什么样子，下面的函数将从数据集里随机选择几个例子进行展示。

In [None]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_element(dataset, num_examples=10):
  """Display a random selection of rows from ``dataset`` as an HTML table.

  Columns whose feature type is ``datasets.ClassLabel`` are shown with their
  class names instead of integer ids.

  Args:
    dataset: A dataset split supporting ``len()``, indexing by a list of row
      indices, and a ``features`` mapping of column name to feature type.
    num_examples: Number of distinct rows to show; must not exceed
      ``len(dataset)``.

  Raises:
    AssertionError: If ``num_examples`` is larger than the dataset.
  """
  assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
  # Draw distinct row indices in a single call instead of retrying on duplicates.
  picks = random.sample(range(len(dataset)), num_examples)

  df = pd.DataFrame(dataset[picks])
  for column, typ in dataset.features.items():
    if isinstance(typ, datasets.ClassLabel):
      # Ids outside the valid range (e.g. -1 for unlabelled rows, as in GLUE
      # test splits) are left untouched; plain `names[i]` would silently turn
      # -1 into the last class name.
      df[column] = df[column].map(
        lambda i, names=typ.names: names[i] if 0 <= i < len(names) else i
      )
  display(HTML(df.to_html()))

# Preview random rows of the training split (num_examples defaults to 10).
show_random_element(dataset["train"])

## 数据预处理

1. 获取分词器

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

`use_fast=True`（默认值）表示优先加载 fast tokenizer，即 transformers.PreTrainedTokenizerFast 类型；如果模型没有对应的 fast 版本，则会自动回退到普通的 Python tokenizer；

因为我们在预处理的时候需要用到fast tokenizer的一些特殊特性（比如多线程快速tokenizer，offset mapping等）。

如果对应的模型没有fast tokenizer，去掉这个选项即可。

几乎所有模型对应的tokenizer都有对应的fast tokenizer;可以到模型API文档中查看：https://huggingface.co/docs/transformers/v4.53.3/en/model_doc/bert#transformers.BertTokenizer

2. 根据具体的任务，处理数据集的样本

In [None]:
# Each task stores its input text under different column names.
# Maps task name -> (first text column, second text column); the second entry
# is None for single-sentence tasks.
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2")
}

3. 对数据格式进行检查

In [None]:
# Look up which column(s) hold the input text for the selected task.
sentence1_key, sentence2_key = task_to_keys[task]
# Print the first training example: two lines for sentence-pair tasks,
# a single line for single-sentence tasks.
if sentence2_key is not None:
    print(f"Sentence 1: {dataset['train'][0][sentence1_key]}")
    print(f"Sentence 2: {dataset['train'][0][sentence2_key]}")
else:
    print(f"Sentence: {dataset['train'][0][sentence1_key]}")

封装成预处理函数

In [None]:
def preprocess_function(examples):
    """Tokenize the text column(s) of ``examples`` for the selected task.

    Reads the module-level ``sentence1_key`` / ``sentence2_key`` to decide
    whether to tokenize a single text or a text pair, and returns whatever the
    module-level ``tokenizer`` produces.
    """
    text_columns = (examples[sentence1_key],)
    if sentence2_key is not None:
        # Sentence-pair task: pass the second text as the pair argument.
        text_columns += (examples[sentence2_key],)
    # truncation=True cuts inputs that are too long (beyond 512 tokens here).
    return tokenizer(*text_columns, truncation=True)

4. 使用预处理函数对整个数据集进行映射（map）处理

In [None]:
# batched=True passes a batch of examples to preprocess_function per call;
# num_proc=4 spreads the work over 4 worker processes (not threads).
encoded_dataset = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
)

## 加载预训练模型

In [None]:
from transformers import AutoModelForSequenceClassification

# Number of outputs of the classification head depends on the task:
# 3 for the MNLI variants, 1 for STS-B, 2 for every other task.
if task.startswith("mnli"):
    num_labels = 3
elif task == "stsb":
    num_labels = 1
else:
    num_labels = 2

# Load the pretrained checkpoint with a head of that size.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

## 获取评估指标

In [None]:
# Headline metric per task: "pearson" for STS-B, "matthews_correlation" for
# CoLA, "accuracy" for everything else.
metric_name = {"stsb": "pearson", "cola": "matthews_correlation"}.get(task, "accuracy")

## 创建训练参数配置类

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    "test-glue",                            # output directory for checkpoints (first positional argument)
    eval_strategy="epoch",                  # evaluate at the end of every epoch
    save_strategy="epoch",                  # save a checkpoint at the end of every epoch
    learning_rate=2e-5,                     # initial learning rate for the optimizer
    per_device_train_batch_size=batch_size, # batch size per device during training
    per_device_eval_batch_size=batch_size,  # batch size per device during evaluation
    num_train_epochs=5,                     # total number of training epochs
    weight_decay=0.01,                      # weight decay
    load_best_model_at_end=True,            # reload the best checkpoint when training finishes
    metric_for_best_model=metric_name,      # metric used to compare checkpoints
    report_to="none"                        # do not report logs to any external integration
)

## 构建评估指标函数

In [None]:
import numpy as np

# The evaluation metric differs from task to task.
def compute_metrics(eval_pred):
    """Turn raw model outputs into the metric result for the current task.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; ``predictions`` is
            indexed as a 2-D array.

    Returns:
        The result of ``metric.compute`` for the processed predictions.
    """
    raw_outputs, references = eval_pred
    # STS-B keeps the first output column as the predicted value; every other
    # task takes the arg-max over axis 1 as the predicted class.
    processed = raw_outputs[:, 0] if task == "stsb" else np.argmax(raw_outputs, axis=1)
    return metric.compute(predictions=processed, references=references)

## 创建训练器

In [None]:
from transformers import Trainer

# Pick the evaluation split: MNLI has separate matched / mismatched validation
# sets, every other task uses the split named "validation".
validation_key = "validation_mismatched" if task == "mnli-mm" else "validation_matched" if task == "mnli" else "validation"
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    # `processing_class` supersedes the deprecated `tokenizer` argument
    # (deprecated since transformers 4.46, slated for removal in 5.0).
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

## 开始训练

In [None]:
# Fine-tune the model.
trainer.train()

## 模型评估

In [None]:
# Evaluate the model held by the trainer on its evaluation dataset.
trainer.evaluate()