<a href="https://colab.research.google.com/github/xtchen64/llm-playground/blob/main/notebook/xtchen64_PEFT_Finetune_Bloom560m_tagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using PEFT & bitsandbytes to finetune a LoRa checkpoint




This notebook is adapted from the youtube video: https://www.youtube.com/watch?v=Us5ZFp16PaU&t=559s.

Changes:
1. We are using a smaller bloom model (560m instead of 7b)
2. Adjusted the training configs to run for more epochs, which will hopefully get us a better model for the quote tagging task.

TODO:
- Use another dataset (e.g. All Obama tweets: https://github.com/fivethirtyeight/data/blob/master/twitter-ratio/BarackObama.csv)

In [1]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
!nvidia-smi -L

GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-e93a9647-3f18-4740-847b-2f1e24fa21bc)


### Setup the model

In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",  # fine tune the base model
    # "xtchen64/bloom-560m-lora-tagger-34-epoch",  # train from my own checkpoint, doesn't work properly yet.
    load_in_8bit=True,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

### Freezing the original weights


In [6]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()  # enable computing gradients wrt inputs

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

### Setting up the LoRa Adapters

In [7]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [8]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, #attention heads
    lora_alpha=16, #alpha scaling
    # target_modules=["q_proj", "v_proj"], #if you know which layers to target
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 786432 || all params: 560001024 || trainable%: 0.14043402892063284


## Data

In [9]:
import transformers
import random

from datasets import load_dataset
data = load_dataset("Abirate/english_quotes")

In [10]:
type(data)

datasets.dataset_dict.DatasetDict

In [11]:
# split into train and validation
def split_dataset(dataset, train_ratio=0.7):
    """
    Splits the Hugging Face Dataset into training and validation sets.

    :param dataset: The Dataset instance to be split.
    :param train_ratio: The ratio of training data to total data (default is 0.7).
    :return: A dictionary with 'train' and 'validation' Dataset instances.
    """
    # Split the dataset into training and validation sets
    split_datasets = dataset.train_test_split(test_size=1 - train_ratio)

    return {"train": split_datasets["train"], "test": split_datasets["test"]}


split_data = split_dataset(data["train"], train_ratio=0.7)
train_data = split_data["train"]
test_data = split_data["test"]

In [12]:
def merge_columns(example):
    example["prediction"] = example["quote"] + " ->: " + str(example["tags"])
    return example

train_data = train_data.map(merge_columns)
test_data = train_data.map(merge_columns)

test_data["prediction"][:5]

Map:   0%|          | 0/1755 [00:00<?, ? examples/s]

Map:   0%|          | 0/1755 [00:00<?, ? examples/s]

["“Being a writer is a very peculiar sort of a job: it's always you versus a blank sheet of paper (or a blank screen) and quite often the blank piece of paper wins.” ->: ['writing']",
 "“There's plenty of sense in nonsense sometimes, if you wish to look for it.” ->: ['will-herondale']",
 "“It takes a great deal of bravery to stand up to our enemies, but just as much to stand up to our friends.” ->: ['albus-dumbledore', 'courage', 'friends']",
 "“In three words I can sum up everything I've learned about life: it goes on.” ->: ['life']",
 "“No matter how careful you are, there's going to be the sense you missed something, the collapsed feeling under your skin that you didn't experience it all. There's that fallen heart feeling that you rushed right through the moments where you should've been paying attention.Well, get used to that feeling. That's how your whole life will feel some day.This is all practice.” ->: ['life']"]

In [13]:
train_data[0]

{'quote': "“Being a writer is a very peculiar sort of a job: it's always you versus a blank sheet of paper (or a blank screen) and quite often the blank piece of paper wins.”",
 'author': 'Neil Gaiman',
 'tags': ['writing'],
 'prediction': "“Being a writer is a very peculiar sort of a job: it's always you versus a blank sheet of paper (or a blank screen) and quite often the blank piece of paper wins.” ->: ['writing']"}

In [14]:
train_data = train_data.map(lambda samples: tokenizer(samples['prediction']), batched=True)

Map:   0%|          | 0/1755 [00:00<?, ? examples/s]

In [15]:
test_data = test_data.map(lambda samples: tokenizer(samples['prediction']), batched=True)

Map:   0%|          | 0/1755 [00:00<?, ? examples/s]

In [16]:
train_data

Dataset({
    features: ['quote', 'author', 'tags', 'prediction', 'input_ids', 'attention_mask'],
    num_rows: 1755
})

### Training

In [17]:
from transformers import Trainer, TrainingArguments


# Training arguments with an evaluation strategy
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    warmup_steps=100,
    num_train_epochs=50,  # Set the number of training epochs
    learning_rate=2e-3,
    fp16=True,
    logging_steps=1,
    output_dir='outputs',
    evaluation_strategy="epoch",
)

# Initialize Trainer with training and evaluation datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,  # Add this line for evaluation dataset
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,2.9106,2.809967
2,2.7353,2.449263
4,2.3963,2.150911
6,2.2062,1.834783
8,1.8343,1.544835
10,1.6803,1.343802
12,1.3933,1.16531
14,1.395,1.000893
16,1.0258,0.876524
18,1.1378,0.769815




TrainOutput(global_step=1350, training_loss=1.0084371962922591, metrics={'train_runtime': 6856.1332, 'train_samples_per_second': 12.799, 'train_steps_per_second': 0.197, 'total_flos': 2.534005521211392e+16, 'train_loss': 1.0084371962922591, 'epoch': 49.09})

## Share adapters on the 🤗 Hub

In [18]:
model.push_to_hub("xtchen64/bloom-560m-lora-tagger-50-epoch",
                  use_auth_token=True,
                  commit_message="basic training",
                  private=True)



adapter_model.safetensors:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/xtchen64/bloom-560m-lora-tagger-50-epoch/commit/c2c9eaa3464eadfbc59e725323789ba057dc1317', commit_message='basic training', commit_description='', oid='c2c9eaa3464eadfbc59e725323789ba057dc1317', pr_url=None, pr_revision=None, pr_num=None)

## Load adapters from the Hub

In [19]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "xtchen64/bloom-560m-lora-tagger-50-epoch"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)
model.config.use_cache = True

Downloading (…)/adapter_config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)er_model.safetensors:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

## Inference

In [36]:
model.config.use_cache = True
batch = tokenizer("“To be or not to be, that is the question.”", return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))



 “To be or not to be, that is the question.” ->: ['being'], ['proposition'], 'question'], ['proposition'], '-suori'], ['truth'], ['suori'], 'will'], ['suori'], ['suori
