This notebook benchmarks different ways of loading/merging LoRA adapters fine-tuned with QLoRA. TL;DR: to merge an adapter properly, we must quantize and then dequantize the base model before merging the adapter into it.
More details in this article: [LoRA Adapters: When a Naive Merge Leads to Poor Performance](https://kaitchup.substack.com/p/lora-adapters-when-a-naive-merge)


In [None]:
# Install the libraries used by this notebook.
# `-U` upgrades each package to its latest release and no version is pinned.
# NOTE(review): consider pinning versions so that the results below stay reproducible.
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl


You will need your Hugging Face access token to get Llama 2 from the hub.

In [None]:
from huggingface_hub import notebook_login

# Display the Hugging Face login widget; the access token entered here is what
# grants access to the gated Llama 2 checkpoint used below.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Run the next cell only if you want to fine-tune a Llama 2 LoRA adapter on openassistant-guanaco, using Platypus hyperparameters. You can also get my fine-tuned adapter directly from the HF Hub (see the cell after it).

In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments
)

from trl import SFTTrainer

# Fine-tune a LoRA adapter on top of the 4-bit quantized base model (QLoRA),
# then point `adapter` at the resulting checkpoint.
model_name = "meta-llama/Llama-2-7b-hf"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True, use_fast=True)
#Reuse the EOS token as the padding token (no new token is added to the vocabulary)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# Both the 'train' and 'test' splits are used by the trainer below.
dataset = load_dataset("timdettmers/openassistant-guanaco")

# 4-bit NF4 quantization with double quantization; computations run in float16.
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)
# Load the base model quantized, with all of its modules placed on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
          model_name, quantization_config=bnb_config, device_map={"": 0}
)


# LoRA is applied to the MLP projections only (gate/down/up); the attention
# projections are not targeted.
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=16,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= ["gate_proj", "down_proj", "up_proj"]
)

# One epoch; evaluation, logging and checkpointing all happen every 100 steps.
# Effective batch size per device: 2 examples x 8 accumulation steps.
training_arguments = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="steps",
        do_eval=True,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        per_device_eval_batch_size=4,
        log_level="debug",
        save_steps=100,
        logging_steps=100,
        learning_rate=4e-4,
        eval_steps=100,
        fp16=True,
        num_train_epochs=1,
        warmup_steps=100,
        lr_scheduler_type="cosine",
)

# Supervised fine-tuning on the "text" field, with sequences limited to 512 tokens.
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=512,
        tokenizer=tokenizer,
        args=training_arguments,
)

trainer.train()

# Checkpoint directory (under `output_dir`) holding the adapter evaluated below.
# NOTE(review): 615 is presumably the last training step of this run; adjust the
# path if your run produces a different number of steps.
adapter = "./results/checkpoint-615/"

Run this cell only if you wish to use my adapter:

In [None]:
adapter = "kaitchup/Llama-2-7B-oasstguanaco-adapter"

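This cell loads the tokenizer and the test split, and defines a function to compute the perplexity of a model.
If you don't have enough VRAM, keep the model and its inputs on the CPU (i.e., do not move the inputs with `.to("cuda")`), but the evaluation may then take one hour.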

In [None]:

import gc

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments
)
from trl import SFTTrainer



# Base model and tokenizer shared by every evaluation cell below.
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True, use_fast=True)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# Only the 'test' split is kept: it is the evaluation set for all perplexity measurements.
dataset = load_dataset("timdettmers/openassistant-guanaco")['test']


def ppl_model(model, tokenizer, dataset, max_length=2048, stride=512):
    """Return the perplexity of `model` on the 'text' column of `dataset`.

    Each example is tokenized and scored individually. An example longer than
    `max_length` tokens is scored with a sliding window: windows of at most
    `max_length` tokens start every `stride` tokens, and in each window the
    tokens already scored by a previous window get the label -100 so that
    only the new tokens are passed as targets.

    The result is exp(mean of the losses returned for each window): every
    window has the same weight, whatever number of tokens it scores.

    Args:
        model: callable as `model(input_ids, labels=...)`, returning an object
            with a scalar tensor `loss`. Inputs are moved to `model.device`
            when the model exposes it, and to "cuda" otherwise.
        tokenizer: callable as `tokenizer(text, return_tensors="pt")`,
            returning an object with an `input_ids` tensor of shape (1, seq_len).
        dataset: object whose 'text' key gives the sequence of texts to score.
        max_length: maximum number of tokens per window.
        stride: number of tokens between the starts of two consecutive windows.

    Returns:
        A 0-dim tensor holding the perplexity.
    """
    # Inputs must live where the model lives; "cuda" is the historical default.
    device = getattr(model, "device", "cuda")
    # Read the column once instead of at every iteration of the loop.
    texts = dataset['text']

    nlls = []
    for text in tqdm(texts):
        encodings = tokenizer(text, return_tensors="pt")
        seq_len = encodings.input_ids.size(1)
        prev_end_loc = 0
        for begin_loc in range(0, seq_len, stride):
            end_loc = min(begin_loc + max_length, seq_len)
            # Number of tokens of this window not scored by a previous window.
            trg_len = end_loc - prev_end_loc
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
            target_ids = input_ids.clone()
            # Everything but the last `trg_len` tokens is context only.
            target_ids[:, :-trg_len] = -100
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
                neg_log_likelihood = outputs.loss
            nlls.append(neg_log_likelihood)
            prev_end_loc = end_loc
            # The window reached the end of the example: nothing left to score.
            if end_loc == seq_len:
                break
    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl

Naive merge:
- Load the base model
- Load and activate the adapter
- Merge the adapter
- Serialize


In [None]:
# Naive merge: the adapter is merged into the base model loaded WITHOUT
# quantization (no quantization_config is passed here).
model = AutoModelForCausalLM.from_pretrained(model_name)
# Attach the LoRA adapter on top of the base model.
model = PeftModel.from_pretrained(model, adapter)
# Merge the adapter into the base model.
model = model.merge_and_unload()
# Serialize the merged model with safetensors.
model.save_pretrained("./naive_merge/", safe_serialization=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading (…)/adapter_config.json:   0%|          | 0.00/468 [00:00<?, ?B/s]

Downloading adapter_model.bin:   0%|          | 0.00/92.9M [00:00<?, ?B/s]

Evaluate merged model perplexity:

In [None]:
# Reload the naively merged model on GPU 0, without quantization, and measure
# its perplexity on the test split.
model = AutoModelForCausalLM.from_pretrained("./naive_merge/", device_map={"": 0})
ppl = ppl_model(model, tokenizer, dataset)
print(ppl)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 518/518 [03:30<00:00,  2.46it/s]

tensor(3.8537, device='cuda:0')





I run this to clean the memory between each attempt: del model, torch.cuda.empty_cache(), gc.collect()

Load and quantize the merged model.

In [None]:
# Release the model of the previous experiment before loading a new one.
del model
torch.cuda.empty_cache()
gc.collect()
# Same 4-bit NF4 configuration as the one used for QLoRA fine-tuning.
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)
# Quantize the naively merged model at load time and measure its perplexity.
model = AutoModelForCausalLM.from_pretrained("./naive_merge/", quantization_config=bnb_config, device_map={"": 0})
ppl = ppl_model(model, tokenizer, dataset)
print(ppl)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 518/518 [00:56<00:00,  9.19it/s]

tensor(3.7520, device='cuda:0')





Load the adapter on top of the base model without quantization.

In [None]:
# Release the model of the previous experiment before loading a new one.
del model
torch.cuda.empty_cache()
gc.collect()
# Unquantized base model with the adapter loaded on top (not merged).
model = AutoModelForCausalLM.from_pretrained(model_name, device_map={"": 0})
model = PeftModel.from_pretrained(model, adapter)
ppl = ppl_model(model, tokenizer, dataset)
print(ppl)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 518/518 [03:35<00:00,  2.40it/s]

tensor(3.8537, device='cuda:0')





Load the adapter on top of the base model with quantization.

In [None]:
# Release the model of the previous experiment before loading a new one.
del model
torch.cuda.empty_cache()
gc.collect()
# Same 4-bit NF4 configuration as the one used for QLoRA fine-tuning.
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)
# Quantized base model with the adapter loaded on top (not merged): this is
# the configuration the adapter was fine-tuned with.
model = AutoModelForCausalLM.from_pretrained(
          model_name, quantization_config=bnb_config, device_map={"": 0}
)
model = PeftModel.from_pretrained(model, adapter)
ppl = ppl_model(model, tokenizer, dataset)
print(ppl)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 518/518 [01:07<00:00,  7.68it/s]

tensor(3.7410, device='cuda:0')





---
The following code is the one you have to use to properly merge the adapter.
---


In [None]:
import torch
import peft
import json
import shutil
from peft.utils import _get_submodules
import os
import bitsandbytes as bnb
from bitsandbytes.functional import dequantize_4bit
from peft import PeftModel
from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
import gc
import copy

def dequantize_model(model, to='./dequantized_model', dtype=torch.float16, device="cuda"):
    """Replace every 4-bit linear layer of `model` by a regular linear layer and save the result.

    The model is modified in place: each `bnb.nn.Linear4bit` module is swapped
    for a `torch.nn.Linear` holding the dequantized weights (and the original
    bias, if any). The model is then saved to `to`, and the saved `config.json`
    is rewritten without its "quantization_config" and "pretraining_tp" entries.

    Args:
        model: the model loaded in 4-bit with bitsandbytes.
        to: directory to save the dequantized model (created if missing).
        dtype: dtype of the dequantized weights.
        device: device to move the new linear layers to.

    Returns:
        The same `model` object, now dequantized.
    """
    os.makedirs(to, exist_ok=True)

    cls = bnb.nn.Linear4bit

    with torch.no_grad():
        # Snapshot the module list first: modules are replaced while walking the tree.
        for name, module in list(model.named_modules()):
            if not isinstance(module, cls):
                continue

            print(f"Dequantizing `{name}`...")
            # Work on a copy so that the quantized module's own state is left untouched.
            quant_state = _set_quant_state_dtype(copy.deepcopy(module.weight.quant_state), dtype)

            weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)

            has_bias = module.bias is not None
            new_module = torch.nn.Linear(module.in_features, module.out_features, bias=has_bias, dtype=dtype)
            new_module.weight = torch.nn.Parameter(weights)
            if has_bias:
                # Carry the bias over instead of silently dropping it.
                new_module.bias = torch.nn.Parameter(module.bias.data.to(dtype))
            new_module.to(device=device, dtype=dtype)

            parent, target, target_name = _get_submodules(model, name)
            setattr(parent, target_name, new_module)

        # a hack, setting this to avoid hf's saving error because hf
        # itself does not support saving a model that is registered to be loaded in 4bit.
        model.is_loaded_in_4bit = False

        print("Saving dequantized model...")
        model.save_pretrained(to)

        # The saved config must no longer describe a quantized model.
        # NOTE(review): "pretraining_tp" is dropped as in the original code; the reason is not visible here.
        config_path = os.path.join(to, 'config.json')
        with open(config_path, 'r') as config:
            config_data = json.load(config)
        config_data.pop("quantization_config", None)
        config_data.pop("pretraining_tp", None)
        with open(config_path, 'w') as config:
            config.write(json.dumps(config_data, indent=2))

        return model


def _set_quant_state_dtype(quant_state, dtype):
    """Set the output dtype of a bitsandbytes quantization state, whatever its layout."""
    if isinstance(quant_state, list):
        # Legacy layout: a plain list whose third entry is the dtype.
        quant_state[2] = dtype
    else:
        # Object layout: the dtype is an attribute.
        quant_state.dtype = dtype
    return quant_state

#To which precision do you want to dequantize?
dtype = torch.float16

# Same 4-bit NF4 configuration as the one used for QLoRA fine-tuning.
quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=dtype,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Release the model left over from a previous experiment, if any, so that two
# models never sit in GPU memory at the same time.
if 'model' in globals():
    del model
gc.collect()
torch.cuda.empty_cache()

try:
    print(f"Starting to load the model {model_name} into memory")

    # 1. Load the base model quantized.
    model = LlamaForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map={"": 0}
    )
    print(model)
    # 2. Dequantize it: the 4-bit linear layers become regular linear layers.
    model = dequantize_model(model, to='./dqz_model/',dtype=dtype)
    print(model)
    # 3. Load the adapter on top of the dequantized model.
    model = PeftModel.from_pretrained(model, adapter)
    print(model)
    # 4. Merge the adapter into the dequantized weights.
    model = model.merge_and_unload()
    print(model)

    print(f"Successfully loaded the model {model_name} into memory")
    model.save_pretrained("./drive/MyDrive/dqz_merge/", safe_serialization=True)
except Exception as e:
    print(f"An error occurred: {e}")

    # Delete the model object if it exists
    if 'model' in locals():
        del model

    # Clear the GPU cache
    torch.cuda.empty_cache()

    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")

    # Surface the failure: without this, the following cells would run without
    # a merged model and fail with an unrelated error.
    raise


Starting to load the model meta-llama/Llama-2-7b-hf into memory


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm

Downloading (…)/adapter_config.json:   0%|          | 0.00/468 [00:00<?, ?B/s]

Downloading adapter_model.bin:   0%|          | 0.00/92.9M [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (rotary_emb): LlamaRotaryEmbedding()
            )
            (mlp): LlamaMLP(
              (gate_proj): Linear(
                in_features=4096, out_features=11008, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_f

Load the merged model and evaluate it.

In [None]:
# Release the model of the previous experiment before loading a new one.
del model
torch.cuda.empty_cache()
gc.collect()
# Load the merged model from the directory it was saved to, without
# quantization, and measure its perplexity on the test split.
model = AutoModelForCausalLM.from_pretrained("./drive/MyDrive/dqz_merge/", device_map={"": 0})
ppl = ppl_model(model, tokenizer, dataset)
print(ppl)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 518/518 [03:30<00:00,  2.46it/s]

tensor(3.7411, device='cuda:0')





Load the merged model with quantization. Don't do this.

In [None]:
# Release the model of the previous experiment before loading a new one.
del model
torch.cuda.empty_cache()
gc.collect()
# Same 4-bit NF4 configuration as the one used for QLoRA fine-tuning.
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)
# Quantize the properly merged model again at load time and measure its perplexity.
model = AutoModelForCausalLM.from_pretrained("./drive/MyDrive/dqz_merge/",  quantization_config=bnb_config, device_map={"": 0})
ppl = ppl_model(model, tokenizer, dataset)
print(ppl)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 518/518 [00:56<00:00,  9.21it/s]

tensor(5.2509, device='cuda:0')



