## Fine-tuning a Pretrained Model


In [None]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Same as before: load the tokenizer and a sequence-classification model from
# the same checkpoint, then tokenize two sentences into one padded batch.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new: attach one label per sequence so the forward pass also
# returns a loss.
batch["labels"] = torch.tensor([1, 1])

# from_pretrained() returns the model in evaluation mode (dropout disabled);
# switch to training mode before taking an optimization step.
model.train()

# One optimization step: forward pass, backpropagation, parameter update.
optimizer = torch.optim.AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()


### Loading datasets
A sample from the Microsoft Research Paraphrase Corpus (MRPC) dataset, which is part of the GLUE benchmark.

In [None]:
from datasets import load_dataset

# GLUE is a benchmark made up of several datasets; "mrpc" selects the
# Microsoft Research Paraphrase Corpus configuration.
raw_datasets = load_dataset(path="glue", name="mrpc")
# Last expression of the cell: the notebook displays the loaded object.
raw_datasets

The tokenizer can handle sentence pairs. Some models, like BERT, also return `token_type_ids`, which indicate which of the two sentences each token belongs to.

In [None]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Pull one training example and tokenize its two sentences together as a pair.
pair_example = raw_datasets["train"][15]
sentence1, sentence2 = pair_example["sentence1"], pair_example["sentence2"]
tokenized_sentences = tokenizer(sentence1, sentence2)

As elements are grouped into batches during pre-processing, each one needs to be padded to the maximum length within its batch. We can do this via *dynamic padding*.

In [None]:
from transformers import DataCollatorWithPadding

def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" fields of `example` as a pair.

    Truncation is enabled; no padding is applied here.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)

# Tokenize every split; the function receives batches of examples.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Collator that pads the items of a batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Take the first 8 training rows and keep everything except the index and
# raw-text columns.
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ["idx", "sentence1", "sentence2"]
}
# number of tokens in each (still unpadded) sample
[len(ids) for ids in samples["input_ids"]]

batch = data_collator(samples)
# after collation, inspect the shape of every entry in the batch
{name: tensor.shape for name, tensor in batch.items()}

### Practicing dynamic padding on my own

In [None]:
from datasets import load_dataset
from transformers import DataCollatorWithPadding

# Practice using the Stanford Sentiment Treebank (SST-2) configuration of
# GLUE, a sentiment dataset of movie reviews.
raw_datasets = load_dataset(path="glue", name="sst2")
raw_datasets

# Tokenizer matching the checkpoint used in the rest of the notebook.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the "sentence" field of `example` with truncation enabled."""
    text = example["sentence"]
    return tokenizer(text, truncation=True)

# Apply the tokenizer to every split. With batched=True, map() hands
# tokenize_function a batch of examples per call (so example["sentence"] is a
# list of strings) instead of a single example.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# subset to first 10 training samples from sst-2
samples = tokenized_datasets["train"][:10]

# Drop the columns that are not model inputs before collating: "sentence"
# holds raw strings, which the collator cannot convert to a tensor, and "idx"
# is only the row index.
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence"]}

# use data_collator for dynamic padding
batch = data_collator(samples)

# loop through the keys and values in the batch and return the shape of the
# values for each key
{k: v.shape for k, v in batch.items()}

### Finetuning with the Trainer API

#### Training

In [None]:
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer

# Classification model with a two-label head, loaded from `checkpoint`.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# "test-trainer" is the output directory for this run; all other training
# arguments keep their defaults.
training_args = TrainingArguments(output_dir="test-trainer")

In [None]:
# Bundle the model, arguments, datasets, collator and tokenizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

# Run the fine-tuning loop.
trainer.train()

#### Evaluation