In [3]:
from datasets import load_dataset
import warnings
warnings.filterwarnings("ignore")
raw_datasets = load_dataset("glue","mrpc")
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [4]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [5]:
from transformers import AutoTokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
inputs = tokenizer("Hello, this one sentence!", "And this sentence goes with it.")
inputs
# tokentype指定那个是第一句，那个是第二句

{'input_ids': [101, 7592, 1010, 2023, 2028, 6251, 999, 102, 1998, 2023, 6251, 3632, 2007, 2009, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'hello',
 ',',
 'this',
 'one',
 'sentence',
 '!',
 '[SEP]',
 'and',
 'this',
 'sentence',
 'goes',
 'with',
 'it',
 '.',
 '[SEP]']

In [8]:
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

In [9]:
# 官方推荐速度快
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets['train'][0]


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map: 100%|██████████| 408/408 [00:00<00:00, 9951.83 examples/s]


{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0,
 'input_ids': [101,
  2572,
  3217,
  5831,
  5496,
  2010,
  2567,
  1010,
  3183,
  2002,
  2170,
  1000,
  1996,
  7409,
  1000,
  1010,
  1997,
  9969,
  4487,
  23809,
  3436,
  2010,
  3350,
  1012,
  102,
  7727,
  2000,
  2032,
  2004,
  2069,
  1000,
  1996,
  7409,
  1000,
  1010,
  2572,
  3217,
  5831,
  5496,
  2010,
  2567,
  1997,
  9969,
  4487,
  23809,
  3436,
  2010,
  3350,
  1012,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
 

In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)

In [11]:
samples = tokenized_datasets["train"][:8]
# 不需要这些列
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]


[50, 59, 47, 67, 59, 50, 62, 32]

In [12]:
# 默认最大的把所有数据做成batch
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

In [13]:
from transformers import TrainingArguments
training_args = TrainingArguments("test_trainer")
training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1

In [14]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [16]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
 36%|███▋      | 500/1377 [01:07<01:53,  7.70it/s]

{'loss': 0.5115, 'grad_norm': 1.6499571800231934, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


 73%|███████▎  | 1000/1377 [02:13<00:48,  7.80it/s]

{'loss': 0.3083, 'grad_norm': 1.771242380142212, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


100%|██████████| 1377/1377 [03:05<00:00,  7.44it/s]

{'train_runtime': 185.1403, 'train_samples_per_second': 59.436, 'train_steps_per_second': 7.438, 'train_loss': 0.34425201637639286, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.34425201637639286, metrics={'train_runtime': 185.1403, 'train_samples_per_second': 59.436, 'train_steps_per_second': 7.438, 'total_flos': 405114969714960.0, 'train_loss': 0.34425201637639286, 'epoch': 3.0})

In [17]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

100%|██████████| 51/51 [00:01<00:00, 33.29it/s]

(408, 2) (408,)





In [20]:
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)
preds

array([1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,

In [21]:
from datasets import load_metric
metric = load_metric("glue", "mrpc")
# 评估
metric.compute(predictions=preds, references=predictions.label_ids)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.8700980392156863, 'f1': 0.9106239460370995}

In [22]:
def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [23]:
training_args = TrainingArguments("test_trainer", num_train_epochs=3, evaluation_strategy="epoch")   
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
trainer.train()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
                                                  
 33%|███▎      | 459/1377 [01:03<01:48,  8.49it/s]

{'eval_loss': 0.3848593533039093, 'eval_accuracy': 0.8455882352941176, 'eval_f1': 0.8872987477638641, 'eval_runtime': 3.653, 'eval_samples_per_second': 111.688, 'eval_steps_per_second': 13.961, 'epoch': 1.0}


 36%|███▋      | 500/1377 [01:08<01:55,  7.62it/s]

{'loss': 0.5312, 'grad_norm': 2.8599722385406494, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
                                                  
 67%|██████▋   | 918/1377 [02:10<01:00,  7.55it/s]

{'eval_loss': 0.5505664348602295, 'eval_accuracy': 0.8186274509803921, 'eval_f1': 0.8794788273615635, 'eval_runtime': 4.7805, 'eval_samples_per_second': 85.346, 'eval_steps_per_second': 10.668, 'epoch': 2.0}


 73%|███████▎  | 1000/1377 [02:21<00:48,  7.75it/s]

{'loss': 0.3123, 'grad_norm': 0.11195047199726105, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
                                                   
100%|██████████| 1377/1377 [03:16<00:00,  7.01it/s]

{'eval_loss': 0.7309941053390503, 'eval_accuracy': 0.8382352941176471, 'eval_f1': 0.8903654485049833, 'eval_runtime': 3.4954, 'eval_samples_per_second': 116.724, 'eval_steps_per_second': 14.59, 'epoch': 3.0}
{'train_runtime': 196.5103, 'train_samples_per_second': 55.997, 'train_steps_per_second': 7.007, 'train_loss': 0.3598871168747447, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.3598871168747447, metrics={'train_runtime': 196.5103, 'train_samples_per_second': 55.997, 'train_steps_per_second': 7.007, 'total_flos': 405114969714960.0, 'train_loss': 0.3598871168747447, 'epoch': 3.0})