# Imports and Setup

In [1]:
import os 

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import numpy as np
import torch
from torch.utils.data import DataLoader 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from tqdm import tqdm

# Load and Tokenize IMDB Dataset

In [3]:
splits = ['train', 'test']
subset_size = 10_000

dataset = {
    split: load_dataset('imdb', split=split).shuffle(seed=101).select(range(subset_size))
    for split in splits
}

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
tokenized_dataset = {
    name: dset.map(
        lambda x: tokenizer(x['text'], padding=True, truncation=True),
        batched=True
    ) for name, dset in dataset.items()
}

for dset in tokenized_dataset.values():
    dset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

dataloader = {name: DataLoader(dset, batch_size=64) for name, dset in tokenized_dataset.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    'gpt2',
    num_labels=2,
    id2label={0: 'Negative', 1: 'Positive'}, 
    label2id={'Negative': 0, 'Positive': 1},
    pad_token_id=tokenizer.pad_token_id
)        
model.to('cuda')

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

# Define Evaluation Metrics

In [4]:
def get_metrics(predictions, labels):
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions)
    accuracy = accuracy_score(labels, predictions)
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def evaluate_batch(model, tokenized_inputs, labels):
    with torch.no_grad():
        outputs = model(**tokenized_inputs)
        predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        labels = labels.cpu().numpy()
    return predictions, labels

# Evaluate Pretrained GPT-2 Classifier

In [5]:
batch_labels, batch_predictions = [], []

with torch.no_grad():
    for batch in tqdm(dataloader['test']):
        batch = {k: v.to('cuda') for k, v in batch.items()}
        predictions, labels = evaluate_batch(model, {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}, batch['label'])
        batch_labels.append(labels)        
        batch_predictions.append(predictions)

get_metrics(np.concatenate(batch_predictions), np.concatenate(batch_labels))

100%|██████████| 157/157 [02:24<00:00,  1.08it/s]


{'accuracy': 0.4994,
 'f1': array([0.66577647, 0.00318598]),
 'precision': array([0.49969934, 0.36363636]),
 'recall': array([0.9972, 0.0016])}

The accuracy is almost 50%, and since the dataset is balanced, the model's predictions are random. 

## Some helper functions

The followings reuse some of the code from before and wraps them in functions in order to facilitate the experiments setups. 

In [None]:
from collections import defaultdict

def count_parameters(model):
    param_dict = defaultdict(int)
    param_sum = 0
    trainable_sum = 0
    for param in model.parameters():
        param_sum += param.numel()
        param_dict[str(param.dtype)] += param.numel()

        if param.requires_grad:
            trainable_sum += param.numel()
    return {
        'total parameters': f'{param_sum / 1e6:.2f} Million',
        'trainable parameters': trainable_sum,
        'parameter count per data type': dict(param_dict)

    }

In [1]:
import numpy as np
import torch
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

LORA_CONFIG = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["attn.c_attn", "attn.c_proj", "mlp.c_fc", "mlp.c_proj"],
    fan_in_fan_out=True,
)

BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

path_notebook = "/home/srn/Documents/code/gen_ai_course/p1_PEFT_GPT2/"


def get_tokenizer() -> AutoTokenizer:
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    return tokenizer


def get_tokenized_datasets(train_samples: int, validation_samples: int, test_samples: int, seed: int) -> dict:
    splits = ["train", "test"]
    dataset = {split: load_dataset("imdb", split=split) for split in splits}

    dataset["train"] = dataset["train"].shuffle(seed).select(range(train_samples))
    validation_test_dsets = dataset["test"].train_test_split(
        train_size=validation_samples, test_size=test_samples, shuffle=True, seed=seed
    )
    dataset["validation"] = validation_test_dsets["train"]
    dataset["test"] = validation_test_dsets["test"]

    print(dataset)

    tokenizer = get_tokenizer()

    tokenized_dataset = {
        name: dset.map(lambda x: tokenizer(x["text"], padding=True, truncation=True), batched=True)
        for name, dset in dataset.items()
    }

    for dset in tokenized_dataset.values():
        dset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    return tokenized_dataset


def compute_metrics(eval_prediction):
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}


def freeze_model(model) -> None:
    for param in model.parameters():
        param.requires_grad = False


def prepare_model(tokenizer, bits_and_bytes_config=None, lora_config=None, use_gradient_checkpointing=False):
    if bits_and_bytes_config:
        assert lora_config is not None

    model = AutoModelForSequenceClassification.from_pretrained(
        "gpt2",
        num_labels=2,
        id2label={0: "Negative", 1: "Positive"},
        label2id={"Negative": 0, "Positive": 1},
        pad_token_id=tokenizer.pad_token_id,
        quantization_config=bits_and_bytes_config,
        low_cpu_mem_usage=True,
    )

    if bits_and_bytes_config:
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=use_gradient_checkpointing)

    if lora_config:
        # lora config freezes the model parameters of the base
        return get_peft_model(model, lora_config)
    else:
        # need to manually freeze the base mode
        freeze_model(model.base_model)
        return model


def get_model_info(model):
    num_trainable_tensors = 0
    num_trainable_parameters = 0
    trainable_bytes = 0

    dtype_bytes = {torch.float32: 4, torch.float16: 2, torch.bfloat16: 2, torch.uint8: 1, torch.int8: 1}
    dtypes_trainable = []
    dtypes_non_trainable = []
    for param in model.parameters():
        if param.requires_grad:
            num_trainable_tensors += 1
            num_trainable_parameters += param.numel()
            trainable_bytes += param.numel() * dtype_bytes[param.dtype]
            dtypes_trainable.append(param.dtype)
        else:
            dtypes_non_trainable.append(param.dtype)
    return {
        "trainable parameters (1e6)": num_trainable_parameters / 1e6,
        "trainable parameters size (MB)": trainable_bytes / (1024 * 1024),
        "trainable tensors": num_trainable_tensors,
        "memory footprint (MB)": model.get_memory_footprint() / (1024 * 1024),
        "dtypes trainable": set(dtypes_trainable),
        "dtypes non-trainable": set(dtypes_non_trainable),
    }


def get_trainer(
    model, tokenized_dataset, tokenizer, n_epochs, max_steps, batch_size, lr, weight_decay, eval_steps, tensorboard_name
) -> Trainer:

    training_args = TrainingArguments(
        output_dir=path_notebook + "/evals/baseline" + tensorboard_name,
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=n_epochs,
        max_steps=max_steps,
        weight_decay=weight_decay,
        eval_strategy="steps",
        save_strategy="steps",
        eval_steps=eval_steps,
        load_best_model_at_end=True,
        report_to="tensorboard",
        push_to_hub=False,
    )
    trainer = Trainer(
        model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    return trainer

## Prepare Tokenizer, Tokenized Datasets and Models

Use a subset of dataset for training as before. Specifically:
- `10000` samples for training
- `1000` samples for validation and testing each

In [2]:
tokenizer = get_tokenizer()
tokenized_dataset = get_tokenized_datasets(train_samples=10000, validation_samples=1000, test_samples=1000, seed=101)

{'train': Dataset({
    features: ['text', 'label'],
    num_rows: 10000
}), 'test': Dataset({
    features: ['text', 'label'],
    num_rows: 1000
}), 'validation': Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})}


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

We use the following three models for training and evaluation

### Base Model with the classifier layer

In [100]:
model_f32 = prepare_model(tokenizer)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### LoRA

In [101]:
model_f32_lora = prepare_model(tokenizer, lora_config=LORA_CONFIG)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### LoRA with Quantization

In [108]:
model_nf4_lora_with_chkpoint = prepare_model(
    tokenizer, bits_and_bytes_config=BNB_CONFIG, lora_config=LORA_CONFIG, use_gradient_checkpointing=True
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Some Notes and Explanation of LoRA and Quantized Model Architectures

In the above, we prepared 3 different models for training:

- `model_f32` is the vanilla GPT2 with a classifier head. It contains only a handful of trainable parameters since we only have two categories and the output is `768` dimensional (about `1.5 K` parameters)
- `model_f32_lora` is the LoRA adjust models. This models has about `600_000` trainable parameters since we use a low rank approximation of dimension `4` 

```python
LORA_CONFIG = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["attn.c_attn", "attn.c_proj", "mlp.c_fc", "mlp.c_proj"],
    fan_in_fan_out=True,
)
```

which is then attached to all the frozen modules in both attention and mlp layers in all the block (GPT2 has 12 block in total). 

One can inspect one of these blocks more closely

```python

>>> block_lora_f32 = model_f32_lora.base_model.model.transformer.h[0].attn.c_attn
>>> block_lora_f32

lora.Linear(
  (base_layer): Conv1D(nf=2304, nx=768)
  (lora_dropout): ModuleDict(
    (default): Dropout(p=0.1, inplace=False)
  )
  (lora_A): ModuleDict(
    (default): Linear(in_features=768, out_features=4, bias=False)
  )
  (lora_B): ModuleDict(
    (default): Linear(in_features=4, out_features=2304, bias=False)
  )
  (lora_embedding_A): ParameterDict()
  (lora_embedding_B): ParameterDict()
  (lora_magnitude_vector): ModuleDict()
)
```

with `lora_A` and `lora_B` being the low rank approximation, in this case, of 4. These `lora_A` and `lora_B` matrices are the trainable parameters of the model.

- `model_nf4_with_chkpoint` uses quantization as well as LoRA, plus recomputing the intermediate activations to save memory (called checkpointing), thus enabling using larger batch sizes and/or larger models. This stores certain layer weights in 4-bit and it appears that the number of parameters are reduces, but the reduction stems from the fact that under the hood, weights are stored in 8-bit variables (so two 4-bit parameters can be packed inside a single 8-bit bit parameter):

```python
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

```
 Similar to LoRA model, one can peek into individual attention heads to see how the quantization is performed (not all the details but at least gain some insight into the workings of the bitsandbytes module):
```python
>>> block_nf4 = model_nf4_lora_with_chkpoint.base_model.model.transformer.h[0].attn.c_attn
>>> block_nf4

lora.Linear4bit(
  (base_layer): Linear4bit(in_features=768, out_features=2304, bias=True)
  (lora_dropout): ModuleDict(
    (default): Dropout(p=0.1, inplace=False)
  )
  (lora_A): ModuleDict(
    (default): Linear(in_features=768, out_features=4, bias=False)
  )
  (lora_B): ModuleDict(
    (default): Linear(in_features=4, out_features=2304, bias=False)
  )
  (lora_embedding_A): ParameterDict()
  (lora_embedding_B): ParameterDict()
  (lora_magnitude_vector): ModuleDict()
)
```

The LoRA, trainable parameters, remain 32-bit:

```python
>>> for param in block_nf4.lora_A.parameters():
...    print(param.shape, param.dtype)

torch.Size([4, 768]) torch.float32
```

whereas the non-trainable parameters have been quantized to 4bit values:

```python
>>> for param in block_nf4.base_layer.parameters():
...    print(param.shape, param.dtype)

torch.Size([884736, 1]) torch.uint8
torch.Size([2304]) torch.float32
```

interestingly, the bias term remains 32-bit where the matrix weights have been quantized and packed into a 1D tensor. I am not sure why this has been done like this (in terms of the 1D storation of the tensor). The number of parameters and the dtype are consistent with 4-bit quantization, since we originally had `[768, 2304]` float-32 parameters. That's `1769472` parameters, and in 4-bit, we can store 2 of such parameters inside a byte (uint8), thus halving the number of parameters in each quantized tensor, matching the `[884736, 1]` shape.

# Training and Evaluation

### Training Base Model (Only the Classifier Layer)

In [103]:
trainer_f32 = get_trainer(
    model_f32,
    tokenized_dataset,
    tokenizer,
    n_epochs=100,
    batch_size=25,
    lr=1e-3,
    weight_decay=1e-2,
    eval_steps=50,
    max_steps=400,
    tensorboard_name="f32",
)
trainer_f32.train()
trainer_f32.evaluate(eval_dataset=tokenized_dataset["test"])

  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.58306,0.661
100,No log,0.460398,0.797
150,No log,0.484903,0.749
200,No log,0.418062,0.811
250,No log,0.422771,0.801
300,No log,0.411517,0.811
350,No log,0.39784,0.82
400,No log,0.401065,0.815


{'eval_loss': 0.3832074701786041,
 'eval_accuracy': 0.836,
 'eval_runtime': 18.4467,
 'eval_samples_per_second': 54.21,
 'eval_steps_per_second': 2.168,
 'epoch': 1.0}

### Training LoRA Model

In [104]:
trainer_f32_lora = get_trainer(
    model_f32_lora,
    tokenized_dataset,
    tokenizer,
    n_epochs=100,
    batch_size=8,
    lr=1e-4,
    weight_decay=1e-2,
    eval_steps=50,
    max_steps=1250,
    tensorboard_name="f32_lora",
)
trainer_f32_lora.train()
trainer_f32_lora.evaluate(eval_dataset=tokenized_dataset["test"])

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.740409,0.503
100,No log,0.690592,0.578
150,No log,0.562058,0.715
200,No log,0.329285,0.871
250,No log,0.290022,0.896
300,No log,0.458923,0.862
350,No log,0.326087,0.886
400,No log,0.267724,0.903
450,No log,0.2795,0.898
500,0.516800,0.291053,0.903


{'eval_loss': 0.2912628650665283,
 'eval_accuracy': 0.903,
 'eval_runtime': 21.0558,
 'eval_samples_per_second': 47.493,
 'eval_steps_per_second': 5.937,
 'epoch': 1.0}

Save the trained model

In [110]:
model_f32_lora.save_pretrained('saved_f32_lora')

Load back the saved model

In [127]:
from peft import PeftModel, PeftConfig

peft_config = PeftConfig.from_pretrained('saved_f32_lora')

model = AutoModelForSequenceClassification.from_pretrained(peft_config.base_model_name_or_path, pad_token_id=tokenizer.pad_token_id)

model = PeftModel.from_pretrained(model, 'saved_f32_lora')

trainer = get_trainer(
    model,
    tokenized_dataset,
    tokenizer,
    n_epochs=1,
    batch_size=16,
    lr=1e-4,
    weight_decay=1e-2,
    eval_steps=50,
    max_steps=-1,
    tensorboard_name="f32_lora",
)

trainer.evaluate(eval_dataset=tokenized_dataset["test"])

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


{'eval_loss': 0.29126283526420593,
 'eval_model_preparation_time': 0.0031,
 'eval_accuracy': 0.903,
 'eval_runtime': 20.5994,
 'eval_samples_per_second': 48.545,
 'eval_steps_per_second': 3.058}

### Training Quantized-LoRA Model

In [109]:
trainer_nf4_lora_with_chkpoint = get_trainer(
    model_nf4_lora_with_chkpoint,
    tokenized_dataset,
    tokenizer,
    n_epochs=1,
    batch_size=16,
    lr=1e-4,
    weight_decay=1e-2,
    eval_steps=50,
    max_steps=-1,
    tensorboard_name="nf4_lora_with_chkpoint",
)
trainer_nf4_lora_with_chkpoint.train()
trainer_nf4_lora_with_chkpoint.evaluate(eval_dataset=tokenized_dataset["test"])

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.736271,0.546
100,No log,0.63177,0.638
150,No log,0.414766,0.809
200,No log,0.294711,0.889
250,No log,0.289729,0.894
300,No log,0.288955,0.901
350,No log,0.280963,0.894
400,No log,0.27041,0.907
450,No log,0.260655,0.915
500,0.440500,0.254403,0.909


  return fn(*args, **kwargs)


{'eval_loss': 0.24462735652923584,
 'eval_accuracy': 0.905,
 'eval_runtime': 24.7907,
 'eval_samples_per_second': 40.338,
 'eval_steps_per_second': 2.541,
 'epoch': 1.0}

Save the trained model

In [111]:
model_nf4_lora_with_chkpoint.save_pretrained('saved_nf4_lora')

Load back the saved model

In [126]:
from peft import PeftModel, PeftConfig

peft_config = PeftConfig.from_pretrained('saved_nf4_lora')

model = AutoModelForSequenceClassification.from_pretrained(peft_config.base_model_name_or_path, load_in_4bit=True, pad_token_id=tokenizer.pad_token_id)

model = PeftModel.from_pretrained(model, 'saved_nf4_lora')

trainer = get_trainer(
    model,
    tokenized_dataset,
    tokenizer,
    n_epochs=1,
    batch_size=16,
    lr=1e-4,
    weight_decay=1e-2,
    eval_steps=50,
    max_steps=-1,
    tensorboard_name="nf4_lora_with_chkpoint",
)

trainer.evaluate(eval_dataset=tokenized_dataset["test"])

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


{'eval_loss': 0.300048828125,
 'eval_model_preparation_time': 0.0008,
 'eval_accuracy': 0.878,
 'eval_runtime': 19.8745,
 'eval_samples_per_second': 50.316,
 'eval_steps_per_second': 3.17}

# Conclusions/Suggestions

- Both LoRA and Quantized models achieved higher accuracy than the base model on the test set, 90% and 88% respectively versus the accuracy of 83% for the base model. 
- Quantized model with checkpointing, `model_nf4_lora_with_chkpoint`, can handle bigger batch sizes or a bigger model. It is better to use an even larger model in this case and compare it to GPT2-LoRA. 
- Comparing the three presented variants is non-trivial. While all three are trained on the same number of epochs, a better and more involved approach would be to match them by compute budget (GPU hours and memory).