# One iteration on a single batch (sentiment classification)

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
# Adding a "labels" entry to the batch makes the model output carry a `.loss`.
batch["labels"] = torch.tensor([1, 1])

# `from_pretrained` hands the model back in evaluation mode (dropout disabled),
# so switch to training mode before taking an optimization step.
model.train()

# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated and no
# longer importable in recent transformers releases.
# NOTE(review): torch's default hyper-parameters (e.g. weight decay) may differ
# slightly from the old transformers implementation; pass them explicitly if
# exact parity matters.
optimizer = torch.optim.AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# MRPC dataset (from the GLUE benchmark), paraphrase detection


In [4]:
from datasets import load_dataset, DatasetDict

# Load GLUE/MRPC, then keep only the first five rows of every split
# (presumably to keep the rest of the walkthrough fast).
full_datasets = load_dataset("glue", "mrpc")
raw_datasets = DatasetDict(
    {
        split_name: split.select(range(5))
        for split_name, split in full_datasets.items()
    }
)
print(raw_datasets)

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /Users/yenson/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /Users/yenson/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 5
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 5
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 5
    })
})


In [5]:
# Print the first training row, a blank separator line, then the split's schema.
train_split = raw_datasets["train"]
example = train_split[0]
print(example, end="\n\n")
print(train_split.features)

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}

{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)}


# Tokenization

In [6]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the `sentence1` / `sentence2` columns together as sentence pairs.

    Passed to `map(..., batched=True)` below, so `example` is a batch of rows
    rather than a single row. Sequences are truncated; no padding is requested
    here.

    Returns:
        The tokenizer output for the batch (its keys become new dataset columns).
    """
    return tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
    )


# The raw text columns and the row index are dropped once they have been tokenized.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
)

print(tokenized_datasets)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5
    })
})


In [7]:
from transformers import DataCollatorWithPadding

# Collator bound to the notebook's tokenizer; it is applied to a batch of
# tokenized samples further down and also handed to the Trainer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
# Slice up to 8 rows from the training split (it only holds 5 after subsampling).
samples = tokenized_datasets["train"][:8]
print(samples.keys())
token_counts = list(map(len, samples["input_ids"]))
print(token_counts)   # lengths before

dict_keys(['label', 'input_ids', 'token_type_ids', 'attention_mask'])
[50, 59, 47, 67, 59]


In [9]:
# Run the samples through the collator and show the shape of the resulting tensor.
batch = data_collator(samples)
padded_shape = batch["input_ids"].shape
print(padded_shape)                 # lengths after

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([5, 67])


# Fine-tuning

In [10]:
from transformers import AutoModelForSequenceClassification

# Size the classification head from the dataset's label feature instead of a literal.
label_feature = raw_datasets["train"].features["label"]
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=label_feature.num_classes,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BERT has not been pretrained on classifying pairs of sentences, so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been added instead. The warnings indicate that some weights were not used (the ones corresponding to the dropped pretraining head) and that some others were randomly initialized (the ones for the new head). The message concludes by encouraging you to train the model, which is exactly what we are going to do now.

In [11]:
from transformers import TrainingArguments

# Only the output directory is specified; all other settings keep their defaults.
training_args = TrainingArguments(output_dir="test-trainer")

In [12]:
from transformers import Trainer

# Wire the model, arguments, data splits, collator and tokenizer into one Trainer.
train_split = tokenized_datasets["train"]
validation_split = tokenized_datasets["validation"]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=validation_split,
    tokenizer=tokenizer,
)

In [13]:
# Launch fine-tuning with the Trainer built in the previous cell; the returned
# TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 5
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3


  0%|          | 0/3 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 2.0796, 'train_samples_per_second': 7.213, 'train_steps_per_second': 1.443, 'train_loss': 0.9916510581970215, 'epoch': 3.0}


TrainOutput(global_step=3, training_loss=0.9916510581970215, metrics={'train_runtime': 2.0796, 'train_samples_per_second': 7.213, 'train_steps_per_second': 1.443, 'train_loss': 0.9916510581970215, 'epoch': 3.0})

# Adding evaluation metrics to Trainer

In [14]:
# Run the trained model over the validation split and report the array shapes
# of the raw predictions and of the reference labels.
predictions = trainer.predict(tokenized_datasets["validation"])
logits_shape = predictions.predictions.shape
labels_shape = predictions.label_ids.shape
print(logits_shape, labels_shape)

***** Running Prediction *****
  Num examples = 5
  Batch size = 8


  0%|          | 0/1 [00:00<?, ?it/s]

(5, 2) (5,)


In [16]:
# demo

import evaluate
import numpy as np

metric = evaluate.load("glue", "mrpc")

# Pick the highest-scoring class per row, then score against the reference labels.
preds = np.argmax(predictions.predictions, axis=-1)
scores = metric.compute(predictions=preds, references=predictions.label_ids)
print(scores)

{'accuracy': 0.6, 'f1': 0.0}


In [17]:
def compute_metrics(eval_preds):
    """Score a set of predictions with the GLUE/MRPC metric.

    Args:
        eval_preds: object exposing `.predictions` (per-class scores) and
            `.label_ids` (reference labels).

    Returns:
        Whatever `metric.compute` returns for the arg-max class predictions.
    """
    class_ids = np.argmax(eval_preds.predictions, axis=-1)
    references = eval_preds.label_ids
    mrpc_metric = evaluate.load("glue", "mrpc")
    return mrpc_metric.compute(predictions=class_ids, references=references)

In [18]:
# Same output directory as before, but now evaluate at the end of every epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version before changing it.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")

# Start again from the pretrained checkpoint so this run does not continue from
# the model trained earlier. The label count is read from the dataset (as in the
# first fine-tuning cell) rather than hard-coded.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=raw_datasets["train"].features["label"].num_classes,
)

# `compute_metrics` is what makes each evaluation report accuracy / F1 in
# addition to the loss.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file config.json from cache at /Users/yenson/.cache/huggingface/hub/models--bert-base-uncased/snapshots/bdb420bf56ef3f72ee07cd75ab6df1b765b6012a/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position

  0%|          | 0/3 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 5
  Batch size = 8


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6839362382888794, 'eval_accuracy': 0.6, 'eval_f1': 0.0, 'eval_runtime': 0.615, 'eval_samples_per_second': 8.13, 'eval_steps_per_second': 1.626, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 5
  Batch size = 8


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6947287321090698, 'eval_accuracy': 0.4, 'eval_f1': 0.4, 'eval_runtime': 0.6502, 'eval_samples_per_second': 7.69, 'eval_steps_per_second': 1.538, 'epoch': 2.0}


***** Running Evaluation *****
  Num examples = 5
  Batch size = 8


  0%|          | 0/1 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.7007545232772827, 'eval_accuracy': 0.4, 'eval_f1': 0.5714285714285715, 'eval_runtime': 0.5985, 'eval_samples_per_second': 8.354, 'eval_steps_per_second': 1.671, 'epoch': 3.0}
{'train_runtime': 3.4206, 'train_samples_per_second': 4.385, 'train_steps_per_second': 0.877, 'train_loss': 0.667027473449707, 'epoch': 3.0}


TrainOutput(global_step=3, training_loss=0.667027473449707, metrics={'train_runtime': 3.4206, 'train_samples_per_second': 4.385, 'train_steps_per_second': 0.877, 'train_loss': 0.667027473449707, 'epoch': 3.0})