# How Trainer works

In [28]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader

In [45]:
## Data Preprocessing ##
# GLUE/MRPC: sentence pairs plus a label column; the columns used below are
# 'sentence1', 'sentence2' and 'idx'.
data = load_dataset('glue', 'mrpc')

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def tokenize_pairs(examples):
    """Tokenize `sentence1`/`sentence2` together as one paired input.

    Over-long pairs are truncated. No padding is applied here: padding is left
    to the collator so every batch is padded only to its own longest sequence.
    """
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True)


# batched=True passes slices of examples to the tokenizer in one call instead
# of invoking it once per row, which is much faster with fast tokenizers.
tokenized_data = data.map(tokenize_pairs, batched=True)

data_collator = DataCollatorWithPadding(tokenizer)

### DataLoader ###
# Drop the raw-text and index columns so that only tensor-convertible fields
# reach the collator, then have the datasets return torch tensors.
tokenized_data = tokenized_data.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_data.set_format('torch')

train_loader = DataLoader(tokenized_data['train'], shuffle=True, batch_size=8, collate_fn=data_collator)

# Evaluation gains nothing from shuffling; a fixed order keeps runs comparable.
eval_loader = DataLoader(tokenized_data['validation'], shuffle=False, batch_size=8, collate_fn=data_collator)

# Grab a single batch to sanity-check the collated tensor shapes.
batch = next(iter(train_loader))
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 73]),
 'token_type_ids': torch.Size([8, 73]),
 'attention_mask': torch.Size([8, 73]),
 'labels': torch.Size([8])}

In [60]:
## Model ##
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
import torch
from transformers import get_scheduler

# Pretrained BERT encoder with a newly initialised classification head.
# num_labels=2 is spelled out to match the two-class logits this task needs.
classifier = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Smoke-test the forward pass on the sample batch.
# The batch is moved to the model's device first so this cell still works when
# `batch` was left on another device by a previously executed cell, and
# no_grad() avoids building an autograd graph that would never be used.
sample_batch = {k: v.to(classifier.device) for k, v in batch.items()}
with torch.no_grad():
    test_output = classifier(**sample_batch)

print('loss:\n', test_output.loss, '\nlogits:\n', test_output.logits.shape)
# probability of classes
# torch.nn.functional.softmax(test_output.logits, dim=-1)

# A deliberately low learning rate; bitsandbytes optimizers are an option when
# optimizer memory becomes a constraint.
optimizer = AdamW(classifier.parameters(), lr=1e-5, weight_decay=0.01)

# Linear decay of the learning rate from the optimizer's initial value (1e-5)
# down to 0, with no warmup.
# Total training steps = number of epochs * batches per epoch (len(train_loader)).
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)

lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


loss:
 tensor(1.0306, grad_fn=<NllLossBackward0>) 
logits:
 torch.Size([8, 2])
1377


In [68]:
## Training Loop ##
import torch
from tqdm.auto import tqdm

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

classifier.to(device)
classifier.train()

# NOTE(review): `optimizer` and `lr_scheduler` are created in an earlier cell
# and keep their state, so re-running only this cell resumes with an already
# decayed learning rate -- re-run the model cell first for a fresh training run.

# The context manager closes the progress bar even if training is interrupted.
with tqdm(total=num_training_steps) as progress:
    for epoch in range(num_epochs):
        # A dedicated loop variable keeps the CPU sample `batch` from the
        # preprocessing cell intact instead of rebinding it to a device batch.
        for train_batch in train_loader:
            train_batch = {k: v.to(device) for k, v in train_batch.items()}
            outputs = classifier(**train_batch)
            loss = outputs.loss
            loss.backward()  # backpropagation (computes gradients w.r.t. weights)

            optimizer.step()  # update model parameters (weights)
            lr_scheduler.step()  # adjust learning rate
            optimizer.zero_grad()  # reset all gradients
            progress.update(1)  # advance progress bar by 1

cuda


  0%|          | 0/1377 [00:00<?, ?it/s]