### L0: Dummy Classification example

In [None]:
import torch 
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Load the tokenizer and a sequence-classification model from the same checkpoint.
# No `num_labels` is passed here, so the head size comes from the checkpoint's config defaults.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Two toy sentences used only to exercise a single optimisation step.
sentences = ['I think I am in love', 'I think about dropping out of school to pursue my dream']

# padding='max_length' with max_length=512 pads both sequences to 512 tokens;
# return_tensors='pt' returns PyTorch tensors that can be passed straight to the model.
batch = tokenizer(sentences, padding='max_length', max_length=512, return_tensors='pt')
# Attach one target (class 1) per sentence so the forward pass also returns a loss.
batch['labels'] = torch.tensor([1, 1])

# AdamW with its default hyperparameters, followed by one forward/backward/update step.
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()


In [None]:
# Show the configuration object attached to the currently loaded model.
print(model.config)

### L1: Classification - Are the 2 sentences conveying similar meaning?

In [None]:
from datasets import load_dataset

In [None]:
# Load the "mrpc" configuration of the "glue" dataset; splits are accessed by name below (e.g. "train").
raw_datasets = load_dataset("glue", "mrpc")

In [None]:
# Inspect the column schema of the training split.
print(raw_datasets["train"].features)

In [None]:
# --- explicit
# Tokenize the first training pair by hand: passing two texts encodes them as one paired input.
sentence1, sentence2 = raw_datasets["train"][0]["sentence1"], raw_datasets["train"][0]["sentence2"]
ex_inputs = tokenizer(sentence1, sentence2, padding=True, truncation=True)
# Map the ids back to token strings to see how the pair was encoded.
tokens = tokenizer.convert_ids_to_tokens(ex_inputs["input_ids"])

In [None]:
# --- tokenize the whole dataset through Dataset.map

def tokenize_function(example):
    """Tokenize the sentence1/sentence2 columns as pairs, truncating and padding with 'max_length'."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, padding='max_length', truncation=True)

# batched=True hands the function a batch of examples per call instead of a single one.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

### L2: Testing out Dynamic Padding with data collator

In [None]:
from transformers import DataCollatorWithPadding

def tokenize_function(example):
    """Tokenize the sentence1/sentence2 columns as pairs with truncation only (no padding here)."""
    encoded = tokenizer(example["sentence1"], example["sentence2"], truncation=True)
    return encoded

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Padding is deferred to the collator, which receives the examples of one batch at a time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Slice off the first ten training examples (a mapping of column name -> values).
samples = tokenized_datasets["train"][:10]

# Drop the index and raw-text columns so that only
# input_ids, token_type_ids, attention_mask and label remain.
samples = {
    column: values
    for column, values in samples.items()
    if column not in ('idx', 'sentence1', 'sentence2')
}


In [None]:
# Columns that survived the filtering above.
print(samples.keys())
# Token count of each example before any padding is applied.
print([len(sample) for sample in samples['input_ids']])

In [None]:
# Run the collator on the ten samples to build a single batch from them.
batched = data_collator(samples)

### L3: Fine-tuning using TrainerAPI

Docs:
- [Training Arguments Docs](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments)
- [Fine-Tuning LLM Cookbook](https://huggingface.co/learn/cookbook/en/fine_tuning_code_llm_on_single_gpu)
- [Trainer Docs](https://huggingface.co/docs/transformers/en/main_classes/trainer)

In [None]:
from datasets import load_dataset
from transformers import TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

# Load the GLUE/MRPC splits plus the tokenizer/model pair used in this section.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# num_labels=2 requests a classification head with two output classes.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

def tokenize_function(example):
    """Tokenize the sentence1/sentence2 columns as pairs with truncation enabled (no padding)."""
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Padding is left to the collator instead of being done at tokenization time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Only the first positional argument ("test-trainer") is given; every other setting keeps its default.
training_args = TrainingArguments("test-trainer")

In [None]:
from transformers import Trainer

# Wire the model, arguments, train/validation splits, collator and tokenizer into a Trainer.
# No compute_metrics is passed here; metrics are computed by hand in the cells below.
trainer = Trainer(
    model, 
    training_args, 
    train_dataset=tokenized_datasets["train"], 
    eval_dataset=tokenized_datasets["validation"], 
    data_collator=data_collator, 
    processing_class=tokenizer, 
)

# Start fine-tuning.
trainer.train()

In [None]:
import numpy as np
import evaluate

# Run the fine-tuned model over the validation split; the result's `.predictions`
# and `.label_ids` attributes are read in the next cell.
predictions = trainer.predict(tokenized_datasets["validation"])


In [None]:
# The predicted class is the index of the largest score along the last axis.
preds = np.argmax(predictions.predictions, axis=-1)
# Load the metric registered for GLUE/MRPC and score predictions against the reference labels.
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

### L3.5: Cleaner TrainerAPI

In [None]:
from datasets import load_dataset
from transformers import TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

# Reload data, tokenizer and a fresh model so this section starts from the pretrained checkpoint.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# num_labels=2 requests a classification head with two output classes.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

def tokenize_function(example):
    """Tokenize the sentence1/sentence2 columns as pairs with truncation enabled (no padding)."""
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
    
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Padding is left to the collator instead of being done at tokenization time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_preds):
    """Score evaluation predictions with the GLUE/MRPC metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair, as handed over by ``Trainer``
            when this function is passed as ``compute_metrics``. ``logits``
            holds one score per class along its last axis.

    Returns:
        Whatever the metric's ``compute`` call returns for these predictions.
    """
    logits, labels = eval_preds
    metric = evaluate.load("glue", "mrpc")
    # The predicted class is the index of the largest logit.
    preds = np.argmax(logits, axis=-1)
    # Fixed: the original called `metrics.compute`, an undefined name, which
    # raised NameError as soon as the Trainer ran an evaluation.
    return metric.compute(predictions=preds, references=labels)

In [None]:
# Author's workaround: keep the model on the CPU.
model = model.to('cpu') # fix fp16 error => ValueError: fp16 mixed precision with MPS device requires a Pytorch >= 2.5.0

In [None]:
# Training configuration: evaluate at the end of every epoch, train for one epoch,
# and keep fp16 mixed precision switched off. The commented-out entries are
# optional knobs left for experimentation.
training_args = TrainingArguments(
    "test-trainer", 
    eval_strategy="epoch",  
    num_train_epochs=1, 
    learning_rate=5e-5,
    # optim='adamw_torch', 
    fp16=False, 
    # per_device_train_batch_size=4,
    # gradient_accumulation_steps=4,
    # lr_scheduler_type="cosine"
)
# Same wiring as the previous section, plus compute_metrics so each evaluation reports metrics.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

### L4: Fine-Tuning using modern PyTorch

Steps:
1. Initialize `DataLoader` => load data, tokenize, data collator
2. Initialize optimizer, scheduler
3. Write Training and Evaluation loop
4. Use Accelerator

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the GLUE/MRPC splits and the tokenizer for the DistilBERT checkpoint used in this section.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Load the matching model right away. Without this, the inspection cells that
# follow (`model.config`, `model.dummy_inputs`) would silently read the model
# left over from the previous section, which was built from a different checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
def tokenize_function(examples):
    """Tokenize the sentence1/sentence2 columns as pairs with truncation only (no padding)."""
    sentence_pair = (examples["sentence1"], examples["sentence2"])
    return tokenizer(*sentence_pair, truncation=True)

# batched=True hands the function a batch of examples per call instead of a single one.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
# Overview of the tokenized splits and their columns.
print(tokenized_datasets)
# NOTE(review): this prints the config of whatever `model` is bound at this point;
# make sure it was loaded from this section's `checkpoint` before trusting the output.
print(model.config)

#### Addendum A: how to get model inputs

Strategies:
- Look at tokenizer input
- Look at model class source
    - BERT: input_ids, attention_mask, token_type_ids (optional)
    - GPT-2: input_ids, attention_mask
    - T5: input_ids, attention_mask, decoder_input_ids (for generation)


In [None]:
# Strategy 1: tokenize any sentence and look at the keys the tokenizer produces.
dummy_inputs = tokenizer("some dummy sentence", return_tensors='pt')
print(dummy_inputs.keys())

In [None]:
# Strategy 2: ask the model object for its own example inputs and list their keys.
print(model.dummy_inputs.keys())

In [None]:
# print(model.forward.__doc__)

#### Prepare the datasets and dataloaders

In [None]:
# Prepare the datasets so that only model inputs remain: "attention_mask", "input_ids", "labels"
# (and "token_type_ids" if the tokenizer produced that column -- NOTE(review): confirm for this checkpoint).

# Drop the raw-text and index columns.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
# Rename the target column to "labels", the keyword the earlier cells pass to the model to obtain a loss.
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Switch the output format to "torch" (applied in place on the existing object).
tokenized_datasets.set_format("torch")

In [None]:
# Check which columns are left on the training split after the clean-up above.
tokenized_datasets["train"].column_names

In [None]:
from transformers import DataCollatorWithPadding
# Collator built from this section's tokenizer; used as `collate_fn` by the DataLoaders.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from torch.utils.data import DataLoader

# Training batches of 8 examples, reshuffled on every pass; the collator assembles each batch.
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
# Validation batches of 8 examples in dataset order (no shuffling).
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

In [None]:
# -- sanity check: verify that there are no issues
# Pull exactly one batch out of the training loader, then stop iterating.
for batch in train_dataloader:
    break
# Shape of every tensor in that batch, keyed by input name.
{k: v.shape for k, v in batch.items()}

In [None]:
# Fresh model from this section's checkpoint, with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# --- sanity check: try out with one batch
# Forward pass on the batch fetched above; it contains "labels", so the output also carries a loss.
outputs = model(**batch)

In [None]:
from transformers import get_scheduler

# Prefer the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Fixed: the model was never moved to `device`, although the training and
# evaluation loops move every batch there. On a CUDA machine that makes the
# first forward pass fail with a device-mismatch error.
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# One scheduler step per batch: the schedule spans every batch of every epoch.
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
# Linear schedule with no warm-up, sized to `num_training_steps`.
lr_scheduler = get_scheduler(
    "linear", 
    optimizer=optimizer, 
    num_warmup_steps=0, 
    num_training_steps=num_training_steps
)


In [None]:
# ---- BASIC TRAINING LOOP -----

from tqdm.auto import tqdm

# One progress-bar tick per processed batch.
progress_bar = tqdm(range(num_training_steps))

# NOTE(review): batches are moved to `device` below; the model must already live on the same device.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the training device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear gradients for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# --- TRAINING LOOP W/ Gradient clipping and gradient accumulation ---

# from torch.cuda.amp import autocast, GradScaler

accumulation_steps = 4
max_grad_norm = 1.0

# With accumulation the optimizer only steps once per `accumulation_steps`
# batches (plus once for a trailing partial group), so the LR schedule must be
# sized in optimizer updates rather than in batches. Ceil division counts the
# trailing partial group.
num_batches = len(train_dataloader)
updates_per_epoch = (num_batches + accumulation_steps - 1) // accumulation_steps
num_update_steps = num_epochs * updates_per_epoch

# Fixed: build a fresh optimizer and scheduler for this loop. Reusing the
# scheduler from the basic loop would start at the end of its linear decay
# (learning rate 0), turning every update here into a no-op.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_update_steps,
)

# The progress bar still ticks once per batch.
progress_bar = tqdm(range(num_epochs * num_batches))

model.train()
for epoch in range(num_epochs):
    # Start every epoch without leftover gradients.
    optimizer.zero_grad()
    for step, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        # Scale the loss so the accumulated gradient is an average over the group.
        loss = outputs.loss
        loss = loss / accumulation_steps
        loss.backward()

        # Fixed: also step on the final batch, so gradients of a trailing
        # partial group are applied instead of being silently dropped.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % accumulation_steps == 0 or is_last_batch:
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        progress_bar.update(1)

In [None]:
# --- EVALUATION LOOP ---
import evaluate

metric = evaluate.load("glue", "mrpc")

# Evaluation mode, and no gradient tracking for the whole pass over the validation set.
model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # Predicted class = index of the largest logit; feed it to the metric batch by batch.
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch['labels'])

# Aggregate everything added above into the final scores.
metric.compute()

### L5: PyTorch Training Loop with Accelerator

In [None]:
from accelerate import Accelerator

# Author's note: on an MPS device this setup raised
# "ValueError: fp16 mixed precision with MPS device requires a Pytorch >= 2.5.0".
accelerator = Accelerator()
# Fixed: Auto classes are factories and cannot be instantiated directly;
# the model has to be created through `from_pretrained`.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Hand the loaders, model and optimizer to the accelerator. The loop below does
# no manual `.to(device)` on batches and relies on the prepared objects instead.
train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

# Size the schedule from the prepared loader, one scheduler step per batch.
num_epochs = 1
num_training_steps = num_epochs * len(train_dl)

lr_scheduler = get_scheduler(
    "linear", 
    optimizer=optimizer, 
    num_warmup_steps=0, 
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dl:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator instead of `loss.backward()`.
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)