### PyTorch-based training loop
Fine-tuning the model with a hand-written PyTorch loop instead of the Hugging Face `Trainer` class, which gives finer control over the training epochs.

In [23]:
! pip install -U --quiet accelerate
! pip install -U --quiet transformers
! pip install -U --quiet evaluate
! pip install -U --quiet datasets

In [24]:
import torch
from tabulate import tabulate
# NOTE: `AdamW` is deliberately not imported from `transformers`. That class is
# deprecated and newer releases no longer export it, so the import can fail
# after the `-U` upgrade above. The `torch.optim` import below is the one this
# notebook actually used (it came later and rebound the name).
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer
import numpy as np
import evaluate
from datasets import load_dataset

#AdamW optimizer - https://arxiv.org/abs/1711.05101
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [25]:
# Pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Sequence-classification model with two output labels, placed on `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
).to(device)

# MRPC configuration of the GLUE benchmark; displayed as the cell output.
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [26]:
raw_train_dataset = raw_datasets["train"]

# Preview training rows 10-14 as a plain-text table of sentence pairs and labels.
sample_rows = raw_train_dataset[10:15]
table_rows = zip(sample_rows["sentence1"], sample_rows["sentence2"], sample_rows["label"])
print(tabulate(table_rows, headers=["Sentence 1", "Sentence 2", "Label"]))

Sentence 1                                                                                                                               Sentence 2                                                                                                                                                        Label
---------------------------------------------------------------------------------------------------------------------------------------  --------------------------------------------------------------------------------------------------------------------------------------------------------------  -------
Legislation making it harder for consumers to erase their debts in bankruptcy court won overwhelming House approval in March .           Legislation making it harder for consumers to erase their debts in bankruptcy court won speedy , House approval in March and was endorsed by the White House .        0
The Nasdaq composite index increased 10.73 , or 0.7 percent , to 1,514.77 .          

In [27]:
def tokenize_sentences(row):
    """Tokenize the `sentence1`/`sentence2` fields of `row` as a pair.

    `row` is indexed by the keys "sentence1" and "sentence2"; the values are
    handed to the notebook-level `tokenizer` with truncation enabled, and the
    tokenizer's result is returned unchanged.
    """
    first, second = row["sentence1"], row["sentence2"]
    return tokenizer(first, second, truncation=True)

# Collator handed to the DataLoaders as `collate_fn`; it is built from the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split, passing examples to `tokenize_sentences` in batches.
tokenized_dataset = raw_datasets.map(function=tokenize_sentences, batched=True)


Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

When a dataset is used with a particular machine-learning framework, its format has to be adjusted first, as described in the Hugging Face documentation linked below.

https://huggingface.co/docs/datasets/use_dataset#tokenize-text

In [28]:
# Keep only the columns the model consumes and expose them as torch tensors.
# Both steps are guarded so the cell can be re-run: without the guards a second
# run fails because the columns were already removed/renamed by the first one.
train_columns = tokenized_dataset["train"].column_names

stale_columns = [name for name in ("sentence1", "sentence2", "idx") if name in train_columns]
if stale_columns:
    tokenized_dataset = tokenized_dataset.remove_columns(stale_columns)

# The column is renamed from "label" to "labels" so it matches the key the
# batches carry when they are unpacked into the model call.
if "label" in train_columns:
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

tokenized_dataset.set_format("torch")
tokenized_dataset["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

### Load the dataset into dataloaders.

https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader

In [29]:
from torch.utils.data import DataLoader

# Batches of 8 examples; only the training split is shuffled.
train_data = DataLoader(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)
val_data = DataLoader(
    tokenized_dataset["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

# Smoke test: pull one training batch, show its tensor shapes, and run it
# through the model to confirm a loss and logits come back.
batch = next(iter(train_data))
print({name: tensor.shape for name, tensor in batch.items()})
outputs = model(**batch.to(device))
print(outputs.loss, outputs.logits.shape)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 69]), 'token_type_ids': torch.Size([8, 69]), 'attention_mask': torch.Size([8, 69])}
tensor(0.2817, device='cuda:0', grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [30]:
# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 5
# The training loop steps the scheduler once per batch, so the schedule length
# is the number of batches per epoch times the number of epochs.
num_training_steps = num_epochs * len(train_data)

# "linear" schedule with no warm-up steps, spanning every training step.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Report how the total step count is derived (typo "sptes" fixed in the message).
print("(num_epochs)", num_epochs, "x", len(train_data), "(len(train_data))", "=", num_training_steps, "(Number of training steps)")

(num_epochs) 5 x 459 (len(train_data)) = 2295 (Number of training sptes)


In [31]:
model.train()

# One tick per optimisation step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_data:
        # Forward pass on `device`; the batch carries "labels", and the model
        # output exposes the resulting loss.
        loss = model(**batch.to(device)).loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

  0%|          | 0/2295 [00:00<?, ?it/s]

In [32]:
# Metric object for the GLUE/MRPC configuration.
metric = evaluate.load("glue", "mrpc")

model.eval()
# No gradients are needed for evaluation. Disabling autograd avoids building a
# computation graph for every validation batch, which saves memory and time.
with torch.no_grad():
    for batch in val_data:
        output = model(**batch.to(device))
        logits = output.logits
        # Predicted class = index of the largest logit along the last dimension.
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Aggregate over all accumulated batches; displayed as the cell output.
metric.compute()


{'accuracy': 0.8676470588235294, 'f1': 0.9075342465753423}