In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

#prepare dataset
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence2"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 408/408 [00:00<00:00, 7366.73 examples/s]


In [2]:
#remove "sentence1" and "sentence2" "idx" column
#rename "label" to "labels"
#set format to pytorch tensor instead of normal list

tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [3]:
#define dataloader
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

In [4]:
#check
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 83]),
 'token_type_ids': torch.Size([8, 83]),
 'attention_mask': torch.Size([8, 83])}

In [5]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.6918, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [7]:
#initiate optimizer AdamW
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [8]:
#initiate learning rate scheduler
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear", #lr decay from 5e-5 to 0 linearly
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(num_training_steps)

1377


In [9]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [11]:
#training loop
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|██████████| 1377/1377 [03:35<00:00,  6.10it/s]

In [12]:
#evaluation loop
import evaluate

metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.6862745098039216, 'f1': 0.7894736842105263}

In [21]:
#tracking loss with weights and biases wandb
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import wandb
import numpy as np

#initiate wandb for training
wandb.init(project="transformer-fine-tuning", name="bert-mrps-analysis")

#compute metric function for Trainer
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    return metric.compute(predictions=predictions,  references=labels)


training_args = TrainingArguments(
    output_dir="./learning_curve1_results",
    eval_strategy="steps",
    eval_steps=50,
    save_steps=100,
    logging_steps=10,  # Log metrics every 10 steps
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/accuracy,▁▄█▄
eval/f1,▁▅█▄
eval/loss,▇█▁▁
eval/runtime,▁█▁▁
eval/samples_per_second,█▁▇█
eval/steps_per_second,█▁▇█
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇██
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇██
train/grad_norm,▃▁▂█▅▃▂▄▄▇▃▄▃▃▂▂▃▂▂▃▅▂
train/learning_rate,██▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▂▂▂▁▁

0,1
eval/accuracy,0.64951
eval/f1,0.74047
eval/loss,0.97502
eval/runtime,4.0009
eval/samples_per_second,101.976
eval/steps_per_second,6.498
train/epoch,0.95652
train/global_step,220.0
train/grad_norm,6.24131
train/learning_rate,3e-05


Step,Training Loss,Validation Loss,Accuracy,F1
50,0.1967,1.700763,0.681373,0.775087
100,0.0652,1.960487,0.705882,0.808307
150,0.211,1.79689,0.654412,0.753065
200,0.1934,1.856415,0.698529,0.809302
250,0.1443,1.108708,0.666667,0.763889
300,0.1391,1.644576,0.664216,0.769748
350,0.1523,1.483007,0.681373,0.786885
400,0.1739,1.703765,0.713235,0.811594
450,0.2279,1.605157,0.664216,0.765009
500,0.041,1.679048,0.664216,0.7642


TrainOutput(global_step=690, training_loss=0.11609233098941436, metrics={'train_runtime': 680.3415, 'train_samples_per_second': 16.174, 'train_steps_per_second': 1.014, 'total_flos': 440104628967600.0, 'train_loss': 0.11609233098941436, 'epoch': 3.0})

In [14]:
from transformers import EarlyStoppingCallback #detect overfitting
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import wandb
import numpy as np
import evaluate

def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    return metric.compute(predictions=predictions,  references=labels)


training_args = TrainingArguments(
    output_dir="./learning_curve1_results",
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss", #set condition
    greater_is_better=False, # need for early stopping (for loss as lower loss is better)
    num_train_epochs=10,  # high as it will stop half way
)

# add early stopping to prevent overfitting
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)