In [2]:
import transformers, datasets, evaluate

import torch, torch.utils, torch.utils.data
from torch.optim.adamw import AdamW

import numpy, scipy, sklearn

import tqdm.auto
import pprint

In [2]:
# Minimal end-to-end demo: load a pretrained BERT classifier, build one tiny
# batch, and take a single optimizer step on it.

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on machines without CUDA (the later cells in this
# notebook select the device the same way).
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

checkpoint = "bert-base-uncased"
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)

sequences = [
    "I like apple",
    "Apple is a fruit"
]

# padding=True pads both sentences to a common length so they can be stacked
# into one tensor; the whole encoding is then moved to the selected device.
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt").to(device)
# Create the labels directly on the same device as the inputs.
batch["labels"] = torch.tensor([1, 1], device=device)

# One forward pass, one backward pass, one parameter update.
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


# §3.1 datasets库

In [3]:
# Download (or load from cache) the MRPC task of the GLUE benchmark, then show
# the split layout, the feature schema of the test split and one raw example.
raw_datasets = datasets.load_dataset("glue", "mrpc")
pprint.pprint(dict(
    raw_datasets=raw_datasets,
    raw_datasets_feature=raw_datasets["test"].features,
    raw_one_data_demo=raw_datasets["test"][0],
))

{'raw_datasets': DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
}),
 'raw_datasets_feature': {'idx': Value(dtype='int32', id=None),
                          'label': ClassLabel(names=['not_equivalent',
                                                     'equivalent'],
                                              id=None),
                          'sentence1': Value(dtype='string', id=None),
                          'sentence2': Value(dtype='string', id=None)},
 'raw_one_data_demo': {'idx': 0,
                       'label': 1,
                       'sentence1': "PCCW 's chief operating officer , Mike "
                                    'Butcher , and Alex Arena , the chief '

In [4]:
# Approach 1: tokenize the whole training split in a single tokenizer call.
# Every sentence pair is padded to the longest sequence and the complete
# result is held in memory as PyTorch tensors.

tokenized_dataset = tokenizer(
    text=raw_datasets["train"]["sentence1"],
    text_pair=raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
    return_tensors="pt",
)
pprint.pprint({
    "tokenized_dataset": tokenized_dataset,
    # Shape of each returned tensor, keyed by field name.
    "tokenized_dataset_shape": {
        field: tensor.shape for field, tensor in tokenized_dataset.items()
    },
})

{'tokenized_dataset': {'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]),
                       'input_ids': tensor([[  101,  2572,  3217,  ...,     0,     0,     0],
        [  101,  9805,  3540,  ...,     0,     0,     0],
        [  101,  2027,  2018,  ...,     0,     0,     0],
        ...,
        [  101,  1000,  2057,  ...,     0,     0,     0],
        [  101,  1996, 26828,  ...,     0,     0,     0],
        [  101,  1996,  2382,  ...,     0,     0,     0]]),
                       'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])},
 'tokenized_dataset_shape': {'attention_mask': torch.Size([3668, 103]),
        

In [5]:
# Approach 2: tokenize with Dataset.map(batched=True), which hands the
# tokenizer whole batches of examples at a time and is therefore fast.
# No padding is requested here, so each example keeps its own token length.

tokenized_dataset = raw_datasets.map(
    lambda examples: tokenizer(examples["sentence1"], examples["sentence2"], truncation=True),
    batched=True,
)
pprint.pprint({
    "tokenized_dataset": tokenized_dataset,
    # (num_rows, num_columns) of every split.
    "tokenized_dataset_shape": {
        split: split_dataset.shape for split, split_dataset in tokenized_dataset.items()
    },
    # Token count per test example, showing that the lengths differ.
    "句子转化成长度不同的token": torch.tensor(
        [len(row["input_ids"]) for row in tokenized_dataset["test"]]
    ),
})

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

{'tokenized_dataset': DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
}),
 'tokenized_dataset_shape': {'test': (1725, 7),
                             'train': (3668, 7),
                             'validation': (408, 7)},
 '句子转化成长度不同的token': tensor([49, 72, 60,  ..., 35, 74, 81])}


In [6]:
# Pad variable-length examples to a common length with a data collator.

data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)

# Slicing a split with [:] returns a plain dict that maps every column name
# to the list of that column's values.
samples = tokenized_dataset["train"][:]

# Keep only the label and input_ids columns before handing the samples to
# the collator; the raw string columns and idx are dropped here.
samples = {
    column: values
    for column, values in samples.items()
    if column in ("label", "input_ids")
}

# The collator pads input_ids to the longest example and returns tensors.
batch = data_collator(samples)

pprint.pprint({
    "type(samples)": type(samples),
    "samples_content": {column: type(values) for column, values in samples.items()},
    "type(batch)": type(batch),
    "batch_shape": {field: tensor.shape for field, tensor in batch.items()},
})

{'batch_shape': {'attention_mask': torch.Size([3668, 103]),
                 'input_ids': torch.Size([3668, 103]),
                 'labels': torch.Size([3668])},
 'samples_content': {'input_ids': <class 'list'>, 'label': <class 'list'>},
 'type(batch)': <class 'transformers.tokenization_utils_base.BatchEncoding'>,
 'type(samples)': <class 'dict'>}


# §3.2 Trainer API

In [7]:
# Rebuild the data pipeline from scratch for the Trainer API section.
checkpoint = "bert-base-uncased"
raw_datasets = datasets.load_dataset("glue", "mrpc")
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the two sentences of ``example`` as one pair, truncating if needed."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# batched=True passes batches of examples to tokenize_function in one call.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Padding is deferred to the collator, which pads batch by batch.
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
# Training configuration; "test-trainer" is the directory where the Trainer
# saves the model and checkpoints.
training_args = transformers.TrainingArguments(output_dir="test-trainer")

# Sequence-classification model with a two-class head.
model = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Both datasets passed below still carry every column produced so far:
#   ['sentence1', 'sentence2', 'label', 'idx',
#    'input_ids', 'token_type_ids', 'attention_mask']
# with 3668 rows in "train" and 408 rows in "validation".
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Start fine-tuning.
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1377 [00:00<?, ?it/s]

{'loss': 0.5363, 'grad_norm': 1.1965887546539307, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}
{'loss': 0.2833, 'grad_norm': 0.08787903189659119, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}
{'train_runtime': 1052.9603, 'train_samples_per_second': 10.451, 'train_steps_per_second': 1.308, 'train_loss': 0.34164859491928645, 'epoch': 3.0}


TrainOutput(global_step=1377, training_loss=0.34164859491928645, metrics={'train_runtime': 1052.9603, 'train_samples_per_second': 10.451, 'train_steps_per_second': 1.308, 'total_flos': 405114969714960.0, 'train_loss': 0.34164859491928645, 'epoch': 3.0})

In [9]:
# Run the fine-tuned model over the validation split. The result is a
# PredictionOutput whose `.predictions` (logits) and `.label_ids` fields are
# inspected in the following cells.
predictions = trainer.predict(tokenized_datasets["validation"])
# Input passed to predict -- tokenized_datasets["validation"]: Dataset({
#     features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
#     num_rows: 408
# })

  0%|          | 0/51 [00:00<?, ?it/s]

In [10]:
# Show the raw prediction output together with the shapes of its logits
# array and its reference-label array.
pprint.pprint({
    "predictions": predictions,
    "predictions.predictions.shape": predictions.predictions.shape,
    "predictions.label_ids.shape": predictions.label_ids.shape
})

{'predictions': PredictionOutput(predictions=array([[-3.015744  ,  3.6877723 ],
       [ 2.5303028 , -3.4010143 ],
       [ 1.1536517 , -0.8440161 ],
       [-2.9627337 ,  3.666817  ],
       [ 2.534016  , -3.3643885 ],
       [-2.8641746 ,  3.587876  ],
       [-2.7236996 ,  3.5345864 ],
       [-2.983623  ,  3.6432285 ],
       [-2.4337814 ,  3.3980978 ],
       [-3.0381052 ,  3.6667738 ],
       [-2.9769688 ,  3.6420445 ],
       [ 2.5517476 , -3.466066  ],
       [ 2.352597  , -3.2868183 ],
       [-2.7957087 ,  3.5809133 ],
       [-3.0151188 ,  3.6849408 ],
       [-1.1385697 ,  2.0387857 ],
       [-3.0073507 ,  3.6618736 ],
       [ 2.2607965 , -2.9414856 ],
       [-3.006884  ,  3.653608  ],
       [ 2.2340896 , -2.6657424 ],
       [ 2.487991  , -3.3157384 ],
       [-1.5030047 ,  2.4938443 ],
       [ 1.172013  , -1.5260674 ],
       [-2.9847126 ,  3.6555583 ],
       [-2.947463  ,  3.6641955 ],
       [-1.3008718 ,  2.2326744 ],
       [-2.488938  ,  3.366145  ],
       [-3

In [11]:
# Turn the per-class logits into predicted class indices by taking the
# position of the largest logit in each row.
preds = numpy.argmax(a=predictions.predictions, axis=-1)
pprint.pprint(preds)

array([1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,

In [12]:
# Evaluate model quality with the GLUE/MRPC metric, comparing the predicted
# class indices against the reference labels returned by trainer.predict.

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8602941176470589, 'f1': 0.9038785834738617}

In [42]:
# Plug a metric function into the Trainer: rebuild data, arguments and model.

checkpoint = "bert-base-uncased"
raw_datasets = datasets.load_dataset("glue", "mrpc")
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Encode the sentence1/sentence2 pair of ``example`` with truncation enabled."""
    return tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
    )


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# eval_strategy="epoch" asks the Trainer to evaluate at the end of each epoch.
training_args = transformers.TrainingArguments(output_dir="test-trainer", eval_strategy="epoch")

model = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# Place the model on the GPU when available, otherwise on the CPU.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)

# Scratch placeholder kept from the original notebook.
temp = 1

def compute_metrics(eval_preds):
    """Compute the GLUE/MRPC metrics for one evaluation pass.

    Args:
        eval_preds: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the argmax of ``logits`` compared
        against ``labels``.
    """
    # The metric is loaded and used under the same local name, so the function
    # no longer depends on a global ``metric`` left behind by an earlier cell
    # (the previous spelling ``metirc`` left the loaded metric unused).
    metric: evaluate.module.EvaluationModule = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    # Debug aid: show exactly what the Trainer passes in.
    pprint.pprint({
        "eval_preds": eval_preds
    })
    # Predicted class = index of the largest logit along the last axis.
    predictions = numpy.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Same Trainer as before, now with compute_metrics so that every evaluation
# pass also reports the task metrics.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Training is left disabled in this cell; uncomment to run it.
# trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# §3.3 完整的微调步骤

In [3]:
# 1. Load the raw string dataset `raw_datasets`.
# 2. Create the tokenizer and its DataCollatorWithPadding.
# 3. Tokenize the raw dataset in batches with raw_datasets.map(...) to obtain `tokenized_datasets`.
# 4. Create the BERT binary-classification model.

raw_datasets: datasets.DatasetDict = datasets.load_dataset("glue", "mrpc")
# raw_datasets: DatasetDict({
#     train: Dataset({
#         features: ['sentence1', 'sentence2', 'label', 'idx'],
#         num_rows: 3668
#     })
#     validation: Dataset({
#         features: ['sentence1', 'sentence2', 'label', 'idx'],
#         num_rows: 408
#     })
#     test: Dataset({
#         features: ['sentence1', 'sentence2', 'label', 'idx'],
#         num_rows: 1725
#     })
# })
# Indexing one split with an integer, e.g. raw_datasets["test"][index], returns a dict:
#     {
#         'sentence1': str,
#         'sentence2': str,
#         'label': int,
#         'idx': int
#     }

# Name of the pretrained checkpoint shared by the tokenizer and the model.
checkpoint: str = "bert-base-uncased"

tokenizer: transformers.BertTokenizerFast = transformers.AutoTokenizer.from_pretrained(checkpoint)
# tokenizer: BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
# 	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
# 	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
# 	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
# 	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
# 	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
# }
# tokenizer(
#     text: str | list[str],
#     text_pair: str | list[str]
# ) -> dict[
#     {
#         'input_ids': list[int] | list[list[int]],
#         'token_type_ids': list[int] | list[list[int]],
#         'attention_mask': list[int] | list[list[int]]
#     }
# ]
# When both `text` and `text_pair` are passed to tokenizer.__call__(), they are
# treated as the two parts of a single sequence.
# For example, the input_ids produced by tokenizer("hello", "hello") decode to
# '[CLS] hello [SEP] hello [SEP]'.

# Collator that pads each batch with the tokenizer above.
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)
# data_collator: DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
# 	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
# 	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
# 	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
# 	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
# 	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
# }, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

def tokenize_function(example: dict) -> dict:
    """Tokenize the "sentence1"/"sentence2" fields of ``example`` as one pair.

    Truncation is enabled and no padding is requested, so padding is left to
    the data collator.
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
# tokenize_function(
#     example: dict, i.e. one item (or one batch of items) of a raw_datasets split
# ) -> dict[{
#     'input_ids': list[int] | list[list[int]],
#     'token_type_ids': list[int] | list[list[int]], # 0 for the first segment, 1 for the second
#     'attention_mask': list[int] | list[list[int]] # all 1s, because nothing is padded here
# }]

# Apply tokenize_function to every split; batched=True passes batches of
# examples to the function in a single call.
tokenized_datasets: datasets.DatasetDict = raw_datasets.map(tokenize_function, batched=True)
# tokenized_datasets: DatasetDict({
#     train: Dataset({
#         features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
#         num_rows: 3668
#     })
#     validation: Dataset({
#         features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
#         num_rows: 408
#     })
#     test: Dataset({
#         features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
#         num_rows: 1725
#     })
# })
# Indexing with a split name, tokenized_datasets.__getitem__(str), returns: Dataset({
#     features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
#     num_rows: int
# })

# Sequence-classification model with a two-class head on top of the checkpoint.
model: transformers.BertForSequenceClassification = transformers.AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels = 2
)
# Move the model to the GPU when available, otherwise keep it on the CPU.
# As the last expression of the cell, this call also displays the model structure.
model.to(torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
# BertForSequenceClassification(
#   (bert): BertModel(
#     (embeddings): BertEmbeddings(
#       (word_embeddings): Embedding(30522, 768, padding_idx=0)
#       (position_embeddings): Embedding(512, 768)
#       (token_type_embeddings): Embedding(2, 768)
#       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#       (dropout): Dropout(p=0.1, inplace=False)
#     )
#     (encoder): BertEncoder(
#       (layer): ModuleList(
#         (0-11): 12 x BertLayer(
#           (attention): BertAttention(
#             (self): BertSdpaSelfAttention(
#               (query): Linear(in_features=768, out_features=768, bias=True)
#               (key): Linear(in_features=768, out_features=768, bias=True)
#               (value): Linear(in_features=768, out_features=768, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): BertSelfOutput(
#               (dense): Linear(in_features=768, out_features=768, bias=True)
#               (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#           )
#           (intermediate): BertIntermediate(
#             (dense): Linear(in_features=768, out_features=3072, bias=True)
#             (intermediate_act_fn): GELUActivation()
#           )
#           (output): BertOutput(
#             (dense): Linear(in_features=3072, out_features=768, bias=True)
#             (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
#             (dropout): Dropout(p=0.1, inplace=False)
#           )
#         )
#       )
#     )
#     (pooler): BertPooler(
#       (dense): Linear(in_features=768, out_features=768, bias=True)
#       (activation): Tanh()
#     )
#   )
#   (dropout): Dropout(p=0.1, inplace=False)
#   (classifier): Linear(in_features=768, out_features=2, bias=True)
# )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [4]:
# 5. Drop the columns that are not needed from tokenized_datasets and rename
#    "label" to "labels".
tokenized_datasets: datasets.DatasetDict = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
pprint.pprint(tokenized_datasets)


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [5]:
# 6. Have tokenized_datasets return torch.Tensor values instead of Python lists.
tokenized_datasets.set_format(type="torch")
pprint.pprint(tokenized_datasets["train"].column_names)

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [7]:
# 7. Create the torch.utils.data.DataLoader objects.
# 8. Create the torch.optim.adamw.AdamW optimizer.
# 9. Create the transformers.get_scheduler schedule so that the learning rate
#    decays linearly from 5e-5 towards 0.

train_dataloader = torch.utils.data.DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator
)
eval_dataloader = torch.utils.data.DataLoader(
    tokenized_datasets["validation"],
    batch_size=16,
    collate_fn=data_collator
)
# Preview the tensor shapes of the first three training batches. zip() stops
# as soon as range(3) is exhausted, so only three batches are collated instead
# of iterating over the entire training set just to slice off its head.
pprint.pprint(
    [{k: v.shape for k, v in batch.items()} for _, batch in zip(range(3), train_dataloader)]
)

optimizer = AdamW(
    model.parameters(),
    lr=5e-5,
)

num_epoches = 3
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
num_training_steps = num_epoches * len(train_dataloader)
lr_scheduler = transformers.get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
pprint.pprint({
    "num_training_steps": num_training_steps
})

[{'attention_mask': torch.Size([16, 82]),
  'input_ids': torch.Size([16, 82]),
  'labels': torch.Size([16]),
  'token_type_ids': torch.Size([16, 82])},
 {'attention_mask': torch.Size([16, 100]),
  'input_ids': torch.Size([16, 100]),
  'labels': torch.Size([16]),
  'token_type_ids': torch.Size([16, 100])},
 {'attention_mask': torch.Size([16, 82]),
  'input_ids': torch.Size([16, 82]),
  'labels': torch.Size([16]),
  'token_type_ids': torch.Size([16, 82])}]
{'num_training_steps': 690}


In [8]:
# 10. Run the training loop.

device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
progress_bar = tqdm.auto.tqdm(range(num_training_steps))

# model.train() only switches the model into training mode; unlike
# Trainer.train() it does not start training by itself.
model.train()

for epoch in range(num_epoches):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        # Update the weights, advance the learning-rate schedule, then clear
        # the gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/690 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [9]:
# 11. Evaluate the model on the validation set.

metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    # Accumulate this batch; the final scores are computed once at the end.
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8553921568627451, 'f1': 0.8991452991452992}

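使用 `Accelerator` 接管设备放置：不再手动创建 `torch.device` 并调用 `.to(device)`，而是交给 `accelerator.prepare()` 处理，并用 `accelerator.backward(loss)` 代替 `loss.backward()`。改动如下：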
```diff
+ from accelerate import Accelerator
  from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler

+ accelerator = Accelerator()

  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
  optimizer = AdamW(model.parameters(), lr=3e-5)

- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
- model.to(device)

+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
+     train_dataloader, eval_dataloader, model, optimizer
+ )

  num_epochs = 3
  num_training_steps = num_epochs * len(train_dataloader)
  lr_scheduler = get_scheduler(
      "linear",
      optimizer=optimizer,
      num_warmup_steps=0,
      num_training_steps=num_training_steps
  )

  progress_bar = tqdm(range(num_training_steps))

  model.train()
  for epoch in range(num_epochs):
      for batch in train_dataloader:
-         batch = {k: v.to(device) for k, v in batch.items()}
          outputs = model(**batch)
          loss = outputs.loss
-         loss.backward()
+         accelerator.backward(loss)

          optimizer.step()
          lr_scheduler.step()
          optimizer.zero_grad()
          progress_bar.update(1)
```