# Fine-tuning `bert-base-uncased` on GLUE MRPC

## Load the dataset

In [1]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset; displaying the result
# lists the available splits with their features and row counts.
raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

## Preprocessing (tokenization + padding)

In [2]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Collator handed to the DataLoaders below as `collate_fn`; padding is left to
# it, which is why the tokenizer call itself only truncates.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize_function(examples):
    """Tokenize a batch of sentence pairs.

    Args:
        examples: Batch from the dataset, indexable by the "sentence1" and
            "sentence2" columns.

    Returns:
        The tokenizer output for the (sentence1, sentence2) pairs, with
        truncation enabled.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Tokenize every split, drop the columns that are no longer needed, and rename
# the target column from "label" to "labels".
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names


['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [3]:
from torch.utils.data import DataLoader

# Training batches are reshuffled; validation batches keep the dataset order.
# Both loaders collate with `data_collator`.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=data_collator,
)

In [4]:
# Grab the first training batch and show the shape of each collated tensor.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 65]),
 'token_type_ids': torch.Size([8, 65]),
 'attention_mask': torch.Size([8, 65])}

## Load the model

In [5]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model built from the same checkpoint as the
# tokenizer, configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Sanity check: one forward pass on the sample batch. Because the batch
# contains "labels", the output carries a loss alongside the logits.
outputs = model(**batch)
print(outputs["loss"], outputs["logits"].shape)

tensor(0.5601, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


## Load the metrics

In [7]:
import evaluate

# Metric for the "mrpc" configuration of "glue"; reused by every evaluation
# and training cell below.
metric = evaluate.load(path="glue", config_name="mrpc")

## Baseline results

In [8]:
import torch

In [9]:
# Baseline: score the not-yet-fine-tuned model on the validation split.
model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        logits = model(**batch)["logits"]
        # Predicted class = index of the largest logit.
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(references=batch["labels"], predictions=predictions)
metric.compute()

{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}

## Fine-tuning

In [10]:
from torch.utils.tensorboard import SummaryWriter

# Write TensorBoard events to a run-specific directory rather than the
# default "runs" location.
writer = SummaryWriter(log_dir='runs/mrpc-bert-base-uncased')

In [None]:
# Hyperparameters for the fine-tuning run.
LEARNING_RATE = 5e-5
DECAY = 0.01  # weight decay passed to the optimizer
NUM_EPOCHS = 5
WARMUP_STEPS = 3  # scheduler warm-up steps

# Pick the best available device instead of hard-coding "mps", so the notebook
# also runs on machines without an Apple-silicon backend. MPS keeps first
# priority to preserve the original behaviour where it is available.
# `getattr` guards against torch builds that do not expose `torch.backends.mps`.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)
device

### Optimizer

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters, using the learning rate and weight decay
# defined with the other hyperparameters.
optimizer = AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=DECAY,
)

### Scheduler

In [None]:
from transformers import get_scheduler

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (number of epochs) steps in total.
num_training_steps = len(train_dataloader) * NUM_EPOCHS
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
from tqdm.auto import tqdm

# Run validation every EVAL_EVERY epochs, and always after the last epoch.
# The previous hard-coded `epoch % 5 == 0` check, combined with NUM_EPOCHS = 5,
# evaluated only after the first epoch and never on the final model.
EVAL_EVERY = 1

progress_bar = tqdm(range(num_training_steps))

try:
    for epoch in range(NUM_EPOCHS):
        print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")

        # ---- Training pass ----
        model.train()
        running_train_loss = 0.0  # reset per epoch, next to where it is accumulated
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            running_train_loss += loss.item()
            preds = outputs.logits.argmax(dim=-1)
            metric.add_batch(predictions=preds, references=batch["labels"])

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Log training loss and metrics. `metric.compute()` is called here,
        # before the evaluation batches are added to the same metric object,
        # so training and evaluation scores are computed separately.
        avg_train_loss = running_train_loss / len(train_dataloader)
        avg_train_metric = metric.compute()
        writer.add_scalar('Loss/train', avg_train_loss, epoch)
        for k, v in avg_train_metric.items():
            writer.add_scalar(f'Metric/train/{k}', v, epoch)

        # ---- Evaluation pass ----
        is_last_epoch = epoch == NUM_EPOCHS - 1
        if epoch % EVAL_EVERY == 0 or is_last_epoch:
            model.eval()
            running_eval_loss = 0.0
            with torch.no_grad():
                for batch in eval_dataloader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(**batch)
                    loss = outputs.loss
                    running_eval_loss += loss.item()
                    preds = outputs.logits.argmax(dim=-1)
                    metric.add_batch(predictions=preds, references=batch["labels"])

            # Log evaluation loss and metrics
            avg_eval_loss = running_eval_loss / len(eval_dataloader)
            avg_eval_metric = metric.compute()
            writer.add_scalar('Loss/eval', avg_eval_loss, epoch)
            for k, v in avg_eval_metric.items():
                writer.add_scalar(f'Metric/eval/{k}', v, epoch)
finally:
    # Release the progress bar and close the writer even if training is
    # interrupted or raises part-way through.
    progress_bar.close()
    writer.close()

In [None]:
# Score the fine-tuned model on the validation split.
model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        # Move every tensor of the batch to the device the model lives on.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(references=batch["labels"], predictions=predictions)

metric.compute()

In [None]:
# Test the model on a few examples of the test set
from transformers import pipeline

NUM_TEST_EXAMPLES = 100

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
examples = raw_datasets["test"][:NUM_TEST_EXAMPLES]

# Pass each pair as text / text_pair so the pipeline's tokenizer receives two
# separate sentences, matching how `tokenize_function` encoded the training
# data. Joining them into one string with a literal " [SEP] " does not give
# the tokenizer a sentence pair.
pair_inputs = [
    {"text": sentence1, "text_pair": sentence2}
    for sentence1, sentence2 in zip(examples["sentence1"], examples["sentence2"])
]

# Run the pipeline once over all pairs (previously it was invoked three times
# per example). A list input yields one result dict per pair.
pair_predictions = classifier(pair_inputs)

num_correct = 0
for pair, label, prediction in zip(pair_inputs, examples["label"], pair_predictions):
    print(f"Sentence 1: {pair['text']}")
    print(f"Sentence 2: {pair['text_pair']}")
    print(f"Label: {label}")
    print("Prediction:", prediction)
    print()
    # Map the predicted label name back to its integer id through the model
    # config rather than searching the label string for '0' / '1'.
    if model.config.label2id[prediction["label"]] == label:
        num_correct += 1

# Divide by the number of examples actually scored, not a hard-coded 100.
num_examples = len(pair_inputs)
success_rate = num_correct / num_examples if num_examples else 0.0
print(f"Success rate: {success_rate * 100:.2f}%")

In [None]:
# Persist the fine-tuned pipeline to a local directory.
classifier.save_pretrained(save_directory="mrpc-torch-bert-base-uncased")