## Fine-tuning a Pretrained Model


In [None]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
# Attach one label per sequence so the forward pass also returns a loss.
batch["labels"] = torch.tensor([1, 1])

# from_pretrained() hands the model back in evaluation mode; switch to training
# mode before the optimisation step, as the full training loop further down does.
model.train()
optimizer = torch.optim.AdamW(model.parameters())
# Single optimisation step: forward pass -> loss -> gradients -> parameter update.
loss = model(**batch).loss
loss.backward()
optimizer.step()


### Loading datasets
A sample from the Microsoft Research Paraphrase Corpus (MRPC) dataset, which is part of the GLUE benchmark.

In [None]:
from datasets import load_dataset

# GLUE is a benchmark group of datasets; "mrpc" selects the paraphrase corpus.
raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

The tokenizer can handle sentence pairs. Some models, like BERT, add a tensor to indicate which sentence a token belongs to.

In [None]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Pull both halves of training example 15 and tokenize them together as a pair.
sentence1, sentence2 = (
    raw_datasets["train"][15][field] for field in ("sentence1", "sentence2")
)
tokenized_sentences = tokenizer(sentence1, sentence2)

As each element gets batched in pre-processing, it will need to be padded to the maximum length within the batch. We can do this via `dynamic padding`.

In [None]:
from transformers import DataCollatorWithPadding

def tokenize_function(example):
    """Tokenize the sentence1/sentence2 pair(s) in `example`, truncating long inputs.

    No padding is requested here; padding is applied per batch by the data
    collator created below.
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

samples = tokenized_datasets["train"][:8]
# keep only the columns the collator should pad (drop the index and raw text)
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
# the length of each token vector for each sample
# (printed explicitly: a bare expression in the middle of a cell is not displayed)
print([len(x) for x in samples["input_ids"]])

batch = data_collator(samples)
# all token vectors in the batch have been padded to the same size
{k: v.shape for k, v in batch.items()}

### Practicing dynamic padding on my own

In [None]:
from datasets import load_dataset
from transformers import DataCollatorWithPadding, AutoTokenizer, Trainer

# Practice using the Stanford Sentiment Treebank (GLUE "sst2"), which predicts
# the sentiment of movie reviews
raw_datasets = load_dataset("glue", "sst2")
# printed explicitly: a bare expression in the middle of a cell is not displayed
print(raw_datasets)

# define the tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the "sentence" field of `example`, truncating long inputs."""
    return tokenizer(example["sentence"], truncation=True)

# apply the tokenizer; batched=True hands tokenize_function a batch of examples
# per call instead of one example at a time
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# subset to first 10 training samples from sst-2
samples = tokenized_datasets["train"][:10]
# drop the raw text (strings cannot be padded into tensors) and the index column,
# the same way the MRPC example filters its samples before collating
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence"]}
# use data_collator for dynamic padding
batch = data_collator(samples)

# loop through the keys and values in the batch and return the shape of the
# values for each key
{k: v.shape for k, v in batch.items()}

### Finetuning with the Trainer API

#### Training
First, let's re-run some code from the MRPC example.

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, Trainer
import torch

# Check that MPS is available and pick the device accordingly.
# `mps_device` is bound on every path so later cells can always reference it.
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
    # Fall back to the CPU instead of leaving `mps_device` undefined.
    mps_device = torch.device("cpu")

else:
    mps_device = torch.device("mps")

# GLUE is a benchmark group of datasets; "mrpc" selects the paraphrase corpus.
raw_datasets = load_dataset(path="glue", name="mrpc")
#raw_datasets_small_train = copy.copy(raw_datasets)
#small_train = raw_datasets['train'][0:500]
#raw_datasets_small_train['train'] = small_train

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the sentence1/sentence2 pair(s) in `example` with truncation."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# Tokenize every split, then build the collator that pads each batch.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Now we can train the model

In [None]:
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer

training_args = TrainingArguments("test-trainer")
# Do not assign `training_args.device`: it is a read-only property, so the
# assignment raises. Device placement is left to the Trainer.
# NOTE(review): depending on the installed transformers version, MPS may need to
# be requested through a TrainingArguments option — confirm for this environment.

# No explicit `.to(...)` here: calling it without a target device moves nothing.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Bundle the model, training arguments, data splits and padding collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [None]:
# WARNING: running locally on my CPU took ~30 min.
# Fine-tune the model on the MRPC training split configured in the Trainer above.
trainer.train()

In [None]:
# Sanity check: report whether the MPS backend can be used on this machine.
torch.backends.mps.is_available()

#### Evaluation

In [None]:
from datasets import load_metric
import numpy as np

# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour
# of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("glue", "mrpc")

# Run the fine-tuned model over the validation split and take, for every example,
# the class with the highest score along the last axis.
predictions = trainer.predict(tokenized_datasets["validation"])
preds = np.argmax(predictions.predictions, axis=-1)

# Compare the predicted classes with the reference labels.
metric.compute(predictions=preds, references=predictions.label_ids)



In [None]:

def compute_metrics(eval_preds):
    """Score evaluation output with the GLUE/MRPC metric.

    `eval_preds` unpacks into (logits, labels). Each prediction is the index of
    the largest logit along the last axis; the result is whatever the metric's
    `compute` returns.
    """
    glue_metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predicted_classes = np.argmax(logits, axis=-1)
    return glue_metric.compute(predictions=predicted_classes, references=labels)


# Evaluate at the end of every epoch, starting again from a fresh model.
training_args = TrainingArguments(output_dir="test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Same Trainer setup as before, plus the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Re-run training; it will now report metrics after each epoch.

In [None]:
# Train again; this Trainer was built with evaluation_strategy="epoch" and a
# compute_metrics callback.
trainer.train()

### Practicing fine-tuning on my own
First, re-running the setup code from my SST-2 practice above, this time on the Rotten Tomatoes dataset.

In [None]:
from datasets import load_dataset
from transformers import DataCollatorWithPadding, AutoTokenizer

# Practice using the Rotten Tomatoes movie-review dataset (loaded below); the
# steps otherwise follow the earlier SST-2 practice cell
raw_datasets = load_dataset("rotten_tomatoes")
# printed explicitly: a bare expression in the middle of a cell is not displayed
print(raw_datasets)

# define the tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the "text" field of `example`, truncating long inputs."""
    return tokenizer(example["text"], truncation=True)

# apply the tokenizer; batched=True hands tokenize_function a batch of examples
# per call instead of one example at a time
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Now to add the training section

In [None]:
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer

# Default training configuration; "test-trainer" is the output directory.
training_args = TrainingArguments(output_dir="test-trainer")

# Fresh classifier with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Bundle the model, training arguments, data splits and padding collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [None]:
#WARNING: Running this code locally on CPU takes ~1 hr.
# Fine-tune on the training split passed to the Trainer above.
trainer.train()

And now to evaluate performance (separate from Trainer in this example)

In [None]:
import numpy as np

from datasets import load_metric

# Predict with the trainer that was just fine-tuned. Without this call,
# `predictions` would still hold the output of the earlier MRPC evaluation cell.
predictions = trainer.predict(tokenized_datasets["validation"])
# For every example, pick the class with the highest score along the last axis.
preds = np.argmax(predictions.predictions, axis=-1)

metric = load_metric("glue", "sst2")
metric.compute(predictions=preds, references=predictions.label_ids)

## Full Training Pipeline
Repeat some processing code from before

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset(path="glue", name="mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the sentence1/sentence2 pair(s) in `example` with truncation."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# Tokenize every split, then build the collator that pads each batch.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Remove/change col names

In [None]:
# Drop the raw-text and index columns, then rename "label" to "labels".
tokenized_datasets = tokenized_datasets.remove_columns(
    ["sentence1", "sentence2", "idx"]
).rename_column("label", "labels")
# Switch the output format to "torch".
tokenized_datasets.set_format("torch")
# Show which columns are left on the training split.
tokenized_datasets["train"].column_names

Define the data loaders

In [None]:
from torch.utils.data import DataLoader

# Training batches are shuffled; both loaders pad each batch via the collator.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

Instantiate the model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a classification head sized for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Test with our preview batch

In [None]:
# Draw the preview batch from the training dataloader rather than relying on
# whatever `batch` an earlier cell happened to leave behind.
batch = next(iter(train_dataloader))
outputs = model(**batch)
# The batch carries labels, so the output includes a loss next to the logits.
print(outputs.loss, outputs.logits.shape)

Add optimizer

In [None]:
# Use PyTorch's AdamW, the same optimizer class as the first example in this
# notebook (torch.optim.AdamW); the copy shipped with transformers is deprecated.
# NOTE(review): default hyperparameters other than `lr` (e.g. weight decay, eps)
# may differ between the two implementations — confirm if exact parity matters.
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

Add learning rate scheduler

In [None]:
from transformers import get_scheduler

num_epochs = 3
# One scheduler step per batch, for every epoch.
num_training_steps = len(train_dataloader) * num_epochs

# Linear schedule with no warmup phase.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

Use GPU if accessible

In [None]:
import torch

# Prefer CUDA when PyTorch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

Run the training loop with a progress bar

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimisation step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before iterating.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor in the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Back-propagate to populate the parameter gradients.
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)