# Fine-tuning with QLoRA using Hugging Face libraries


In [None]:
%%capture
!pip install peft
!pip install -U flash-attn
!pip install bitsandbytes
!pip install trl

### Load base model

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

import torch

def supports_flash_attention(device_id):
    """Check if a GPU supports FlashAttention.

    Args:
        device_id: CUDA device index passed to
            ``torch.cuda.get_device_capability``.

    Returns:
        bool: True when the device's compute capability is SM 8.x or SM 9.0;
        False otherwise, including when CUDA is not available at all.
    """
    # Without CUDA, get_device_capability raises; report "unsupported" instead
    # so callers can fall back to the default attention implementation.
    if not torch.cuda.is_available():
        return False

    major, minor = torch.cuda.get_device_capability(device_id)

    # Ampere (SM 8.x) or newer (SM 9.0)
    return major == 8 or (major == 9 and minor == 0)

max_seq_length = 4000
# NOTE(review): confirm this does not exceed the base model's maximum context length.
hf_model = "facebook/opt-350m"

# Keyword arguments shared by both loading paths: 4-bit quantized weights.
load_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
if supports_flash_attention(0):
    # Only request FlashAttention 2 when GPU 0 reports a supported architecture.
    load_kwargs["attn_implementation"] = "flash_attention_2"

model = AutoModelForCausalLM.from_pretrained(hf_model, **load_kwargs)
tokenizer = AutoTokenizer.from_pretrained(hf_model)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


### Add lora to base model

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# More info about parameters: https://huggingface.co/docs/peft/v0.11.0/en/package_reference/lora#peft.LoraConfig
lora_config = LoraConfig(
    r=16, # rank of the LoRA matrices; according to the paper there is not much loss when set relatively low
    # Modules of the LLM that receive LoRA weights. Names that do not occur in the
    # loaded model are skipped, so the Llama-style names alone only adapted
    # q/k/v on OPT (the recorded 2,359,296 trainable parameters). OPT calls its
    # attention output projection "out_proj" and its MLP layers "fc1"/"fc2".
    target_modules = [
        "q_proj", "k_proj", "v_proj",                    # attention projections (both naming schemes)
        "out_proj", "fc1", "fc2",                        # OPT attention output + MLP
        "o_proj", "gate_proj", "up_proj", "down_proj",   # Llama-style attention output + MLP
    ],
                      #"embed_tokens", "lm_head",], # Add for continual pretraining (unsloth)
    lora_alpha = 16, # scales the weights of the adapters (more influence on base model), 16 was recommended on reddit
    use_rslora = True, # scales the adapter by lora_alpha/sqrt(r) instead of lora_alpha/r, huggingface says this works better
    task_type=TaskType.CAUSAL_LM, # task is predicting next tokens given previous tokens, unsloth does not set this
    lora_dropout = 0, # Default is 0.05 in the tutorial but unsloth says 0 is better
    #use_dora = True, # apparently better but introduces overhead so model would need to be merged for inference.
    inference_mode = False
)

# Wrap the base model with the adapter and report how many parameters will be trained.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 333,555,712 || trainable%: 0.7073


### Load dataset

In [None]:
from datasets import load_dataset
# Training split of the essay dataset, fetched from the Hugging Face Hub.
dataset = load_dataset("pookie3000/pg_essays_split_1000_t", split = "train")
# End-of-sequence marker as defined by the tokenizer.
EOS_TOKEN = tokenizer.eos_token

def formatting_func(example):
    """Return the example's "text" field with EOS_TOKEN appended.

    NOTE(review): the trainer cell in this notebook passes
    ``dataset_text_field`` rather than this function — confirm whether it
    is meant to be used.
    """
    text = example["text"]
    return text + EOS_TOKEN

Downloading data:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1020 [00:00<?, ? examples/s]

### Training

In [None]:
from trl import SFTConfig, SFTTrainer
import transformers

# Ask transformers whether the current GPU can run bfloat16, and report the result.
is_bfloat16_supported = transformers.utils.import_utils.is_torch_bf16_gpu_available()
print("bfloat16 supported" if is_bfloat16_supported else "bfloat not supported")

# https://huggingface.co/docs/trl/main/en/sft_trainer#trl.SFTTrainer
# https://huggingface.co/docs/transformers/v4.40.2/en/main_classes/trainer#transformers.TrainingArguments (many arguments are defined in default trainer)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    tokenizer=tokenizer,
    args = SFTConfig(
        num_train_epochs = 1,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # helpful to simulate large batches when memory can't fit one
        warmup_steps = 5, # small learning rate in the beginning leads to smoother training
        output_dir="outputs", # relative to the working directory; "/outputs" at the filesystem root needs root permissions
        max_seq_length = max_seq_length,
        dataset_text_field = "text", # dataset column that holds the training text
        fp16 = not is_bfloat16_supported, # trains on fp16 instead of fp32 which is more efficient
        bf16 = is_bfloat16_supported, # more efficient than fp16
        optim = "adamw_8bit", # adamw in general recommended and 8bit for lower memory consumption
        learning_rate = 5e-4,
        weight_decay = 0.01, # seems necessary to prevent overfitting
        logging_steps = 1, # log the loss after every step
        save_strategy = "no", # set to steps or epoch (save is done after each epoch)
        lr_scheduler_type = "linear",  # can set to other values but this seems the best
    )
)


bfloat16 supported


Map:   0%|          | 0/1020 [00:00<?, ? examples/s]



In [None]:
# Run training with the SFTTrainer built in the previous cell.
# NOTE(review): the output saved with this notebook is a CUDA out-of-memory
# error, so this cell did not complete in the recorded run.
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 

## Inference

In [None]:
from threading import Thread
import textwrap

from transformers import TextIteratorStreamer

# Maximum number of characters printed per line before a line break is inserted.
max_print_width = 100
text_streamer = TextIteratorStreamer(tokenizer)

inputs = tokenizer(
[
    "Once upon a time, in a galaxy, far far away,"
]*1, return_tensors = "pt").to("cuda")

generation_kwargs = dict(
    inputs,
    streamer = text_streamer,
    max_new_tokens = 256,
    use_cache = True,
)
# Generation runs in a background thread so the main thread can consume the
# streamer and print text as it is produced.
thread = Thread(target = model.generate, kwargs = generation_kwargs)
thread.start()

# Number of characters printed on the current output line.
length = 0
for j, new_text in enumerate(text_streamer):
    if j == 0:
        # Wrap the first chunk to the print width.
        wrapped_lines = textwrap.wrap(new_text, width = max_print_width)
        # textwrap.wrap returns [] for empty/whitespace-only text; guard the
        # [-1] lookup so an empty first chunk does not raise IndexError.
        length = len(wrapped_lines[-1]) if wrapped_lines else 0
        print("\n".join(wrapped_lines), end = "")
    else:
        length += len(new_text)
        if length >= max_print_width:
            length = 0
            print()
        print(new_text, end = "")

# Wait for the generation thread to finish before the cell returns.
thread.join()

</s>Once upon a time, in a galaxy, far faraway, there was a man who was a genius. He was a genius. 
He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. 
He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. 
He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. 
He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. 
He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. 
He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. 
He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. 
He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. He was a genius. 
He was

## Save model

In [None]:
import os

# Never hard-code access tokens in a notebook. The token that used to be written
# here has been exposed and should be revoked in the Hugging Face account settings.
# Read the token from the HF_TOKEN environment variable instead; when it is unset,
# None is passed and the library falls back to its stored credentials.
model.push_to_hub("pookie3000/opt-350m-lora", token = os.environ.get("HF_TOKEN"))



adapter_model.safetensors:   0%|          | 0.00/9.46M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pookie3000/opt-350m-lora/commit/11d64223d2ff4a49ba49bb6688f0d9bbe1ab080b', commit_message='Upload model', commit_description='', oid='11d64223d2ff4a49ba49bb6688f0d9bbe1ab080b', pr_url=None, pr_revision=None, pr_num=None)

### Model Merging

In [None]:
# Merge the adapter into the base model; see the PEFT docs below for merge_and_unload.
# https://huggingface.co/docs/peft/v0.7.1/en/package_reference/lora#peft.LoraModel.merge_and_unload
# https://huggingface.co/docs/peft/en/developer_guides/lora
# The result is kept as `merged_model` and pushed to the Hub in the next cell.
merged_model = model.merge_and_unload()



In [None]:
import os

# Read the Hub token from the environment instead of a placeholder string.
# When HF_TOKEN is unset, None is passed and the library falls back to its
# stored credentials.
hf_token = os.environ.get("HF_TOKEN")
# Pass `hf_token`; the previous `token = token` referenced an undefined name.
merged_model.push_to_hub("pookie3000/opt-350m-lora-merged", token = hf_token)

model.safetensors:   0%|          | 0.00/227M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pookie3000/opt-350m-lora-merged/commit/c1f0f4a4ad452a912f5800f4b0074d7606f89a9c', commit_message='Upload OPTForCausalLM', commit_description='', oid='c1f0f4a4ad452a912f5800f4b0074d7606f89a9c', pr_url=None, pr_revision=None, pr_num=None)

## Load model back

:( This does not work for loading a quantized model back in for further training; apparently the adapter isn't set as the active adapter.

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer


max_seq_length = 4000
lora_adapter_hf = "pookie3000/pg_lora_completion_run2"
use_for_further_training = True

# The adapter's PEFT config stores the name/path of the base model to load.
config = PeftConfig.from_pretrained(lora_adapter_hf)
base_model_id = config.base_model_name_or_path
model = AutoModelForCausalLM.from_pretrained(base_model_id)

# Attach the adapter weights to the base model; is_trainable controls whether
# the adapter is loaded for further training.
lora_model = PeftModel.from_pretrained(
    model,
    lora_adapter_hf,
    is_trainable=use_for_further_training,
)
tokenizer = AutoTokenizer.from_pretrained(lora_adapter_hf)


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


This works, but there is not enough memory to actually train the model.

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

lora_adapter_hf = "pookie3000/pg_lora_completion_run2"

max_seq_length = 4000
# The adapter's PEFT config stores the name/path of the base model to load.
config = PeftConfig.from_pretrained(lora_adapter_hf)
# Load the base model with 4-bit quantized weights.
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, quantization_config=BitsAndBytesConfig(load_in_4bit=True))
tokenizer = AutoTokenizer.from_pretrained(lora_adapter_hf)
# Attach the adapter directly on the transformers model.
model.load_adapter(lora_adapter_hf)
model.enable_input_require_grads()

# Only the last expression of a cell is displayed, so print the active adapters
# explicitly; the adapter config stays as the cell's displayed value.
print(model.active_adapters())
model.peft_config

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

KeyboardInterrupt: 