# Lightweight Fine-Tuning Project

* PEFT technique: LoRA
* Model: gpt2
* Evaluation approach: accuracy, f1-score
* Fine-tuning dataset: https://huggingface.co/datasets/zeroshot/twitter-financial-news-topic

In [1]:
MODEL_NAME = "openai-community/gpt2"
DATASET_NAME = "zeroshot/twitter-financial-news-topic"

ID2LABEL_DICT = {
    0: "Analyst Update",
    1: "Fed | Central Banks",
    2: "Company | Product News",
    3: "Treasuries | Corporate Debt",
    4: "Dividend",
    5: "Earnings",
    6: "Energy | Oil",
    7: "Financials",
    8: "Currencies",
    9: "General News | Opinion",
    10: "Gold | Metals | Materials",
    11: "IPO",
    12: "Legal | Regulation",
    13: "M&A | Investments",
    14: "Macro",
    15: "Markets",
    16: "Politics",
    17: "Personnel Change",
    18: "Stock Commentary",
    19: "Stock Movement",
}

LABEL2ID_DICT = {
   "Analyst Update": 0,
   "Fed | Central Banks": 1,
   "Company | Product News": 2,
   "Treasuries | Corporate Debt": 3,
   "Dividend": 4,
   "Earnings": 5,
   "Energy | Oil": 6,
   "Financials": 7,
   "Currencies": 8,
   "General News | Opinion": 9,
    "Gold | Metals | Materials": 1,
    "IPO": 1,
    "Legal | Regulation": 1,
    "M&A | Investments": 1,
    "Macro": 1,
    "Markets": 1,
    "Politics": 1,
    "Personnel Change": 1,
    "Stock Commentary": 1,
    "Stock Movement": 1,
}

SPLIT_TYPES = ["train", "validation"]

In [3]:
import torch
import pandas
import numpy as np
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig

from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support, classification_report

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# First, let's load the datasets from twitter financial-news-topic dataset


fin_dataset_validation = load_dataset(DATASET_NAME, split="validation")
fin_dataset_train = load_dataset(DATASET_NAME, split="train")

In [6]:
fin_dataset_validation

Dataset({
    features: ['text', 'label'],
    num_rows: 4117
})

In [7]:
fin_dataset_validation = fin_dataset_validation.rename_column("label", "labels")
fin_dataset_train = fin_dataset_train.rename_column("label", "labels")

#### Now, let's load gpt2 tokenizer's and tokenize our datasets

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" 

def apply_tokenization(data):
    return tokenizer(data["text"], padding=True, truncation=True, return_tensors="pt")

Because I need my tensors with the same size, I need to use padding in the tokenization

Because GPT-2 has a fixed context window of 1024 tokens, anything longer than that can't be processed. For that, I'm using truncation equals True at the tokenization.

In [9]:
tok_fin_dataset_validation = fin_dataset_validation.map(apply_tokenization, batched=True)
tok_fin_dataset_train = fin_dataset_train.map(apply_tokenization, batched=True)

Map:   0%|          | 0/16990 [00:00<?, ? examples/s]

In [10]:
tok_fin_dataset_validation.set_format("torch",
                                      columns=["input_ids", "attention_mask", "labels"])

tok_fin_dataset_train.set_format("torch",
                                      columns=["input_ids", "attention_mask", "labels"])

#### Now, let's load the gpt2 model

In [10]:
gpt2 = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels = len(ID2LABEL_DICT.keys()),
    id2label = ID2LABEL_DICT,
    label2id = LABEL2ID_DICT
) 

gpt2.config.pad_token_id = tokenizer.pad_token_id
gpt2.eval()

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=20, bias=False)
)

In [13]:
gpt2 = AutoModelForSequenceClassification.from_pretrained('./temp/gpt2', local_files_only = True)
gpt2.config.pad_token_id = tokenizer.pad_token_id
gpt2.to(device)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=20, bias=False)
)

In [14]:
collator = DataCollatorWithPadding(
    tokenizer, 
    pad_to_multiple_of=8
)

validation_loader = DataLoader(
    tok_fin_dataset_validation, 
    batch_size=16, 
    shuffle=False, 
    collate_fn=collator
)

In [15]:
def compute_metrics_forward(eval_pred):
    preds, labels = eval_pred
#     preds = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, preds)
    (precision_w, 
     recall_w, 
     f1_score_w, 
     _
    ) = precision_recall_fscore_support(labels, 
                                                  preds, average="weighted", 
                                                  zero_division=0)
    eval_obj = {"accuracy": accuracy, 
                "precision_weighted": precision_w, 
                "recall_weighted": recall_w, 
                "f1_weighted": f1_score_w}
    
    return eval_obj

def compute_metrics_training(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, preds)
    (precision_w, 
     recall_w, 
     f1_score_w, 
     _
    ) = precision_recall_fscore_support(labels, 
                                                  preds, average="weighted", 
                                                  zero_division=0)
    eval_obj = {"accuracy": accuracy, 
                "precision_weighted": precision_w, 
                "recall_weighted": recall_w, 
                "f1_weighted": f1_score_w}
    
    return eval_obj

In [16]:
def predict_and_evaluate(model, data_loader):
    predictions, labels = [], []

    with torch.inference_mode():
        for batch in data_loader:
            batch = {key: value.to(device) for key, value in batch.items()}
            logits = model(**batch).logits
            predictions.extend(logits.argmax(-1).cpu().tolist())
            labels.extend(batch["labels"].cpu().tolist())
    
    eval_pred = (predictions, labels)
    eval_result = compute_metrics_forward(eval_pred)
    
    return eval_result

#### Let's make the predictions for the validation set and evaluate them

In [65]:
gpt2_eval = predict_and_evaluate(gpt2, validation_loader)
pandas.DataFrame([gpt2_eval])

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted
0,0.007773,6.1e-05,0.007773,0.000121


In [74]:
gpt2.save_pretrained('./temp/gpt2')

## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

#### Saving the model

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [17]:
from peft import LoraConfig, get_peft_model

- r --> rank of the matrices used in LoRA
- lora_alpha --> Scaling factor, with higher values increasing the learning rate
- target_modules --> At the LoRA paper, it was empirically showed that fine tune query and value attention layers is an efficient performance booster
- lora_dropout --> It acts as a regularization technique to prevent overfitting
- bias --> bias parameters should be trained only for LoRA matrices
- modules_to_save --> We use this argument when the layers we want to train are not considered part of the transformer block we are adapting

In [15]:
lora_config = LoraConfig(
    r=128,
    lora_alpha=128,
    lora_dropout=0.05,
    bias="lora_only",
    modules_to_save=["score"],
)


lora_model = get_peft_model(gpt2, lora_config)



In [16]:
lora_model.print_trainable_parameters()

trainable params: 4,776,960 || all params: 129,189,120 || trainable%: 3.6976488422554468


In [18]:
lora_model.config.pad_token_id = tokenizer.pad_token_id
lora_model.to(device)

PeftModel(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Linear(
                in_features=768, out_features=2304, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=128, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=128, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()


In [19]:
peft_trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir = "./temp/peft",
        remove_unused_columns = False,
        learning_rate = 5e-4,
        per_device_train_batch_size = 4,
        per_device_eval_batch_size = 4,
        evaluation_strategy = 'epoch',
        save_strategy = "epoch",
        num_train_epochs = 2,
        weight_decay = 0.01,
        load_best_model_at_end = True,
        label_names = ["labels"],
    ),
    train_dataset = tok_fin_dataset_train,
    eval_dataset = tok_fin_dataset_validation,
    tokenizer = tokenizer,
    data_collator = collator,
    compute_metrics = compute_metrics_training
)

peft_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted
1,0.6766,0.601558,0.84309,0.849581,0.84309,0.842431
2,0.4228,0.451449,0.890697,0.89101,0.890697,0.8905


TrainOutput(global_step=8496, training_loss=0.6817873022650595, metrics={'train_runtime': 945.9419, 'train_samples_per_second': 35.922, 'train_steps_per_second': 8.982, 'total_flos': 2570814278516736.0, 'train_loss': 0.6817873022650595, 'epoch': 2.0})

In [20]:
peft_trainer.save_model('./temp/peft_model')

# Loading the PEFT Model

In [19]:
from peft import PeftConfig, AutoPeftModelForSequenceClassification

peft_dir = './temp/peft_model'

loaded_lora_config = PeftConfig.from_pretrained(peft_dir)

peft_loaded = AutoPeftModelForSequenceClassification.from_pretrained(
    peft_dir, 
    config = loaded_lora_config,
    num_labels = len(ID2LABEL_DICT.keys())
)


peft_loaded.eval()
peft_loaded.config.pad_token_id = tokenizer.pad_token_id
peft_loaded.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Linear(
                in_features=768, out_features=2304, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=128, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=128, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embed

We can also merge the LoRA params with the model itself, creating a new model

In [22]:
fn_model = peft_loaded.merge_and_unload()

In [23]:
fn_model.save_pretrained('./temp/fn_lora_model')

In [20]:
fn_model = AutoModelForSequenceClassification.from_pretrained(
    './temp/fn_lora_model',
    num_labels = len(ID2LABEL_DICT.keys())
)

fn_model.config.pad_token_id = tokenizer.pad_token_id
fn_model.to(device)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=20, bias=False)
)

# Let's train a QLora, with a quantized GPT2

In [26]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, # to quantize the model to 4-bit
    bnb_4bit_quant_type="nf4", # special 4-bit data type for initialized from a normal distribution
    bnb_4bit_use_double_quant=True, # to use a nested quantization scheme to quantize the already quantized weights
    bnb_4bit_compute_dtype=torch.bfloat16 # to use bfloat16 for faster computation
   
)

In [27]:
q_gpt2 = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels = len(ID2LABEL_DICT.keys()),
    id2label = ID2LABEL_DICT,
    label2id = LABEL2ID_DICT,
    quantization_config=quantization_config
)

q_gpt2.config.pad_token_id = tokenizer.pad_token_id
q_gpt2.eval()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, ele

In [42]:
q_lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="lora_only",
    modules_to_save=["score"],
    task_type="SEQ_CLS"
)

In [43]:
from peft import prepare_model_for_kbit_training

q_gpt2 = prepare_model_for_kbit_training(q_gpt2)
q_lora_model = get_peft_model(q_gpt2, q_lora_config)

In [46]:
q_lora_trainer = Trainer(
    model=q_lora_model,
    args=TrainingArguments(
        output_dir = "./temp/qlora",
        remove_unused_columns = False,
        learning_rate = 5e-4,
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
        evaluation_strategy = 'epoch',
        save_strategy = "epoch",
        num_train_epochs = 2,
        weight_decay = 0.01,
        load_best_model_at_end = True,
        label_names = ["labels"],
    ),
    train_dataset = tok_fin_dataset_train,
    eval_dataset = tok_fin_dataset_validation,
    tokenizer = tokenizer,
    data_collator = collator,
    compute_metrics = compute_metrics_training
)

q_lora_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted
1,0.0,,0.017731,0.000314,0.017731,0.000618
2,0.0,,0.017731,0.000314,0.017731,0.000618


TrainOutput(global_step=2124, training_loss=0.0, metrics={'train_runtime': 1129.0314, 'train_samples_per_second': 30.097, 'train_steps_per_second': 1.881, 'total_flos': 2567034464698368.0, 'train_loss': 0.0, 'epoch': 2.0})

In [47]:
q_lora_trainer.save_model('./temp/qlora')

In [21]:
from peft import PeftModel

q_peft_dir = './temp/qlora'
loaded_qlora_config = PeftConfig.from_pretrained(q_peft_dir)

qlora_loaded = AutoPeftModelForSequenceClassification.from_pretrained(
    q_peft_dir, 
    config = loaded_qlora_config,
    num_labels = len(ID2LABEL_DICT.keys()),
    load_in_4bit=True
)


qlora_loaded.eval()
qlora_loaded.config.pad_token_id = tokenizer.pad_token_id
qlora_loaded.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Linear4bit(
                in_features=768, out_features=2304, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_emb

# Now, let's evaluate the fine-tuned GPT2 vs the pre-trained GPT2

#### let's evaluate all the 4 different models: GPT2, GPT2+LoRA, GPT2MergedWithLoRA, GPT2QLoRA

In [22]:
name2model = {
    'GPT2': gpt2,
    'GPT2+LoRA': peft_loaded,
    'GPT2MergedLoRA': fn_model,
    'GPT2QLoRA': qlora_loaded
}

In [23]:
## Here, I'm running both GPT2 pre-trained model and fine tuned GPT2 with LoRA!!

eval_result_list = []
for model_name in name2model.keys():
    eval_result = {}
    eval_result["model"] = model_name
    
    results = predict_and_evaluate(name2model[model_name], validation_loader)
    eval_result = eval_result | results
    eval_result_list.append(eval_result)
    print(model_name)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


GPT2
GPT2+LoRA
GPT2MergedLoRA




GPT2QLoRA


In [24]:
evaluation_models_results = pandas.DataFrame(eval_result_list)
evaluation_models_results

Unnamed: 0,model,accuracy,precision_weighted,recall_weighted,f1_weighted
0,GPT2,0.007773,6.1e-05,0.007773,0.000121
1,GPT2+LoRA,0.890697,0.89101,0.890697,0.8905
2,GPT2MergedLoRA,0.890697,0.89101,0.890697,0.8905
3,GPT2QLoRA,0.017731,0.000314,0.017731,0.000618


# We clearly see that the fine-tuned GPT2 with LoRA has far better results!!