<a href="https://colab.research.google.com/github/tuhinmallick/AI-for-Fashion/blob/main/Training%2C_Loading%2C_and_Merging_QDoRA%2C_QLoRA%2C_and_LoftQ_Adapters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is organized into two sections.

The first section shows how to fine-tune QLoRA, QDoRA, and LoftQ adapters for Mistral 7B using the Hugging Face PEFT and TRL libraries. The second section benchmarks the adapters before and after merging them into the base model, followed by quantization with BNB NF4 and AWQ.


First, we need all these dependencies:

In [None]:
!pip install -q -U transformers peft accelerate datasets trl bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.0/225.0 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━

Import all the necessary packages.

In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTConfig, SFTTrainer

# Fine-tuning

**LoftQ**

More info:
[LoftQ: Better Initialization for a Quantization-Aware LoRA](https://kaitchup.substack.com/p/loftq-better-initialization-for-a)


In [None]:
!git clone https://github.com/huggingface/peft.git

!python peft/examples/loftq_finetuning/quantize_save_load.py \
    --model_name_or_path mistralai/Mistral-7B-v0.1 \
    --bits 4 \
    --iter 5 \
    --rank 16 \
    --save_dir "./loftq_iters/"

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
  (v_proj): lora.Linear(
    (base_layer): Linear(in_features=4096, out_features=1024, bias=False)
    (lora_dropout): ModuleDict(
      (default): Dropout(p=0.05, inplace=False)
    )
    (lora_A): ModuleDict(
      (default): Linear(in_features=4096, out_features=16, bias=False)
    )
    (lora_B): ModuleDict(
      (default): Linear(in_features=16, out_features=1024, bias=False)
    )
    (lora_embedding_A): ParameterDict()
    (lora_embedding_B): ParameterDict()
  )
  (o_proj): Shell()
  (rotary_emb): MistralRotaryEmbedding()
)
MistralMLP(
  (gate_proj): lora.Linear(
    (base_layer): Linear(in_features=4096, out_features=14336, bias=False)
    (lora_dropout): ModuleDict(
      (default): Dropout(p=0.05, inplace=False)
    )
    (lora_A): ModuleDict(
      (default): Linear(in_features=4096, out_features=16, bias=False)
    )
    (lora_B): ModuleDict(
      (default): Linear(in_features=16, 

In [None]:
# Reload the checkpoint produced by the LoftQ script with bitsandbytes 4-bit NF4
# quantization, then save the model and its tokenizer under ./loftq_base/.
MODEL_DIR = "./loftq_iters/Mistral-7B-v0.1-4bit-16rank"
LOFTQ_BASE_DIR = "./loftq_base/"

# 4-bit NF4 with double quantization; computations run in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True, add_eos_token=True)

# device_map={"": 0} places the whole model on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
    quantization_config=bnb_config,
)

model.save_pretrained(LOFTQ_BASE_DIR)
tokenizer.save_pretrained(LOFTQ_BASE_DIR)


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

('./drive/MyDrive/mergexp/loftq_base/tokenizer_config.json',
 './drive/MyDrive/mergexp/loftq_base/special_tokens_map.json',
 './drive/MyDrive/mergexp/loftq_base/tokenizer.model',
 './drive/MyDrive/mergexp/loftq_base/added_tokens.json',
 './drive/MyDrive/mergexp/loftq_base/tokenizer.json')

In [None]:
# Create the loftq_base directory under ./drive/MyDrive/mergexp/ (presumably a mounted
# Google Drive folder -- TODO confirm). -p makes this a no-op if it already exists.
!mkdir -p ./drive/MyDrive/mergexp/loftq_base/

In [None]:
!cp -r ./loftq_iters/Mistral-7B-v0.1-4bit-16rank/loft_init/ ./drive/MyDrive/mergexp/loftq_base/loft_init/

In [None]:
MODEL_DIR = "./loftq_base/"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, add_eos_token=True, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left' #Necessary for FlashAttention compatibility

#Better to use bf16 if supported (Ampere GPUs or more recent)
#If bf16 is supported, the GPU is also recent enough to support FlashAttention
if torch.cuda.is_bf16_supported():
  !pip install flash_attn
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'

dataset = load_dataset("timdettmers/openassistant-guanaco")

model = AutoModelForCausalLM.from_pretrained(
          MODEL_DIR,  device_map={"": 0}, torch_dtype=compute_dtype
)
model = prepare_model_for_kbit_training(model)
#Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id

peft_model = PeftModel.from_pretrained(
    model,
    MODEL_DIR,
    subfolder="loft_init",
    is_trainable=True,
)



from trl import SFTConfig

training_arguments = SFTConfig(
        output_dir="./loftq_ft/",
        evaluation_strategy="steps",
        do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        log_level="debug",
        logging_steps=50,
        learning_rate=1e-5,
        eval_steps=50,
        num_train_epochs=1,
        fp16= not torch.cuda.is_bf16_supported(),
        bf16= torch.cuda.is_bf16_supported(),
        save_strategy='epoch',
        warmup_ratio=0.1,
        lr_scheduler_type="linear",
)

trainer = SFTTrainer(
        model=peft_model,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        dataset_text_field="text",
        max_seq_length=512,
        tokenizer=tokenizer,
        args=training_arguments,
)

trainer.train()


Collecting flash_attn
  Downloading flash_attn-2.5.6.tar.gz (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from flash_attn)
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting ninja (from flash_attn)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: flash_attn
  Building wheel for flash_attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash_attn: filename=flash_attn-2.5.6-cp310-cp310-linux_x86_64.whl size=120592258 sha256=d8cf54adda65f59820221d329d274e124972d7fdc05ab3b1130253c64eee6c8a
  St

Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]



Map:   0%|          | 0/9846 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Using auto half precision backend
Currently training with a batch size of: 8
***** Running training *****
  Num examples = 9,846
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1,231
  Number of trainable parameters = 41,943,040
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss
50,1.3477,1.354418
100,1.2702,1.237236
150,1.1852,1.192417
200,1.1606,1.178373
250,1.1396,1.162789
300,1.1543,1.151754
350,1.1306,1.148634
400,1.1388,1.145539
450,1.1231,1.142953
500,1.1204,1.141005


***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  N

Step,Training Loss,Validation Loss
50,1.3477,1.354418
100,1.2702,1.237236
150,1.1852,1.192417
200,1.1606,1.178373
250,1.1396,1.162789
300,1.1543,1.151754
350,1.1306,1.148634
400,1.1388,1.145539
450,1.1231,1.142953
500,1.1204,1.141005


***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/mergexp/loftq_ft/checkpoint-1231
tokenizer config file saved in ./drive/MyDrive/mergexp/loftq_ft/checkpoint-1231/tokenizer_config.json
Special tokens file saved in ./drive/MyDrive/mergexp/loftq_ft/checkpoint-1231/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1231, training_loss=1.1346433749535914, metrics={'train_runtime': 15766.5706, 'train_samples_per_second': 0.624, 'train_steps_per_second': 0.078, 'total_flos': 2.148541273402245e+17, 'train_loss': 1.1346433749535914, 'epoch': 1.0})

Merge the LoftQ adapter

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
#Better to use bf16 if supported (Ampere GPUs or more recent)
#If bf16 is supported, the GPU is also recent enough to support FlashAttention
if torch.cuda.is_bf16_supported():
  !pip install flash_attn
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'


model_name = "./loftq_base/"
adapter = "./loftq_ft/checkpoint-1231"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_size="left",  use_fast=True)

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
          model_name, quantization_config=bnb_config, device_map={"": 0}, torch_dtype=compute_dtype,  attn_implementation=attn_implementation
)

model = dequantize_model(model, to='./dqz_model_loftq/',dtype=compute_dtype)
model = PeftModel.from_pretrained(model, adapter)
model = model.merge_and_unload()

print(f"Successfully loaded the model {model_name} into memory")

#pushed to hub
#model.push_to_hub("kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged")
#tokenizer.push_to_hub("kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged")






Dequantizing `model.layers.0.self_attn.q_proj`...
Dequantizing `model.layers.0.self_attn.k_proj`...
Dequantizing `model.layers.0.self_attn.v_proj`...
Dequantizing `model.layers.0.self_attn.o_proj`...
Dequantizing `model.layers.0.mlp.gate_proj`...
Dequantizing `model.layers.0.mlp.up_proj`...
Dequantizing `model.layers.0.mlp.down_proj`...
Dequantizing `model.layers.1.self_attn.q_proj`...
Dequantizing `model.layers.1.self_attn.k_proj`...
Dequantizing `model.layers.1.self_attn.v_proj`...
Dequantizing `model.layers.1.self_attn.o_proj`...
Dequantizing `model.layers.1.mlp.gate_proj`...
Dequantizing `model.layers.1.mlp.up_proj`...
Dequantizing `model.layers.1.mlp.down_proj`...
Dequantizing `model.layers.2.self_attn.q_proj`...
Dequantizing `model.layers.2.self_attn.k_proj`...
Dequantizing `model.layers.2.self_attn.v_proj`...
Dequantizing `model.layers.2.self_attn.o_proj`...
Dequantizing `model.layers.2.mlp.gate_proj`...
Dequantizing `model.layers.2.mlp.up_proj`...
Dequantizing `model.layers.2.m

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged/commit/fb41de77231ac8904d1f195cd8b28cec80a22add', commit_message='Upload tokenizer', commit_description='', oid='fb41de77231ac8904d1f195cd8b28cec80a22add', pr_url=None, pr_revision=None, pr_num=None)



---

**QDoRA**

More info:
[DoRA: Better and Faster than LoRA?](https://kaitchup.substack.com/p/dora-better-and-faster-than-lora)

In [None]:
model_name = "mistralai/Mistral-7B-v0.1"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left' #Necessary for FlashAttention compatibility

#Better to use bf16 if supported (Ampere GPUs or more recent)
#If bf16 is supported, the GPU is also recent enough to support FlashAttention
if torch.cuda.is_bf16_supported():
  !pip install flash_attn
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'

dataset = load_dataset("timdettmers/openassistant-guanaco")


#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left' #Necessary for FlashAttention compatibility

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
          model_name, quantization_config=bnb_config, device_map={"": 0}, torch_dtype=compute_dtype,  attn_implementation=attn_implementation
)
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable({'use_reentrant':True})
#model.enable_input_require_grads()

peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=16,
        bias="none",
        use_dora=True,
        task_type="CAUSAL_LM",
        target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
)


from trl import SFTConfig

training_arguments = SFTConfig(
        output_dir="./qdora/",
        evaluation_strategy="steps",
        do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        log_level="debug",
        logging_steps=50,
        learning_rate=1e-5,
        eval_steps=50,
        num_train_epochs=1,
        fp16= not torch.cuda.is_bf16_supported(),
        bf16= torch.cuda.is_bf16_supported(),
        save_strategy='epoch',
        warmup_ratio=0.1,
        lr_scheduler_type="linear",
)

trainer = SFTTrainer(
        model=model,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        dataset_text_field="text",
        max_seq_length=512,
        peft_config=peft_config,
        tokenizer=tokenizer,
        args=training_arguments,
)

trainer.train()

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer_config.json




loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer_config.json


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/config.json
Model config MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-v0.1",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32000
}



model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/model.safetensors.index.json


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Instantiating MistralForCausalLM model under default dtype torch.bfloat16.
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detected flash_attn version: 2.5.6
Detect

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing MistralForCausalLM.

All the weights of MistralForCausalLM were initialized from the model checkpoint at mistralai/Mistral-7B-v0.1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.


generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
PyTorch: setting up devices


Map:   0%|          | 0/9846 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Using auto half precision backend
Currently training with a batch size of: 8
***** Running training *****
  Num examples = 9,846
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1,231
  Number of trainable parameters = 43,319,296


Step,Training Loss,Validation Loss
50,1.3473,1.353635
100,1.2624,1.231624
150,1.1804,1.188604
200,1.1553,1.169926
250,1.1288,1.153853
300,1.15,1.150144
350,1.1279,1.147451
400,1.1381,1.1452
450,1.1226,1.143323
500,1.1212,1.141987


***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  N

Merge the QDoRA adapter

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
#Better to use bf16 if supported (Ampere GPUs or more recent)
#If bf16 is supported, the GPU is also recent enough to support FlashAttention
if torch.cuda.is_bf16_supported():
  !pip install flash_attn
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'


model_name = "mistralai/Mistral-7B-v0.1"
adapter = "./qdora/checkpoint-1231"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_size="left",  use_fast=True)

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
          model_name, quantization_config=bnb_config, device_map={"": 0}, torch_dtype=compute_dtype,  attn_implementation=attn_implementation
)

model = dequantize_model(model, to='./dqz_model_qdora/',dtype=compute_dtype)
model = PeftModel.from_pretrained(model, adapter)
model = model.merge_and_unload()

print(f"Successfully loaded the model {model_name} into memory")
#pushed to the hub
#model.push_to_hub("kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-qdora-merged")
#tokenizer.push_to_hub("kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-qdora-merged")




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Dequantizing `model.layers.0.self_attn.q_proj`...
Dequantizing `model.layers.0.self_attn.k_proj`...
Dequantizing `model.layers.0.self_attn.v_proj`...
Dequantizing `model.layers.0.self_attn.o_proj`...
Dequantizing `model.layers.0.mlp.gate_proj`...
Dequantizing `model.layers.0.mlp.up_proj`...
Dequantizing `model.layers.0.mlp.down_proj`...
Dequantizing `model.layers.1.self_attn.q_proj`...
Dequantizing `model.layers.1.self_attn.k_proj`...
Dequantizing `model.layers.1.self_attn.v_proj`...
Dequantizing `model.layers.1.self_attn.o_proj`...
Dequantizing `model.layers.1.mlp.gate_proj`...
Dequantizing `model.layers.1.mlp.up_proj`...
Dequantizing `model.layers.1.mlp.down_proj`...
Dequantizing `model.layers.2.self_attn.q_proj`...
Dequantizing `model.layers.2.self_attn.k_proj`...
Dequantizing `model.layers.2.self_attn.v_proj`...
Dequantizing `model.layers.2.self_attn.o_proj`...
Dequantizing `model.layers.2.mlp.gate_proj`...
Dequantizing `model.layers.2.mlp.up_proj`...
Dequantizing `model.layers.2.m

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-qdora-merged/commit/558bbd6ffcc2df825d10398546ed0566511e9653', commit_message='Upload tokenizer', commit_description='', oid='558bbd6ffcc2df825d10398546ed0566511e9653', pr_url=None, pr_revision=None, pr_num=None)



---


**QLoRA**

More info: [QLoRA: Fine-Tune a Large Language Model on Your GPU](https://kaitchup.substack.com/p/qlora-fine-tune-a-large-language-model-on-your-gpu-27bed5a03e2b)

In [None]:
model_name = "mistralai/Mistral-7B-v0.1"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left' #Necessary for FlashAttention compatibility

#Better to use bf16 if supported (Ampere GPUs or more recent)
#If bf16 is supported, the GPU is also recent enough to support FlashAttention
if torch.cuda.is_bf16_supported():
  !pip install flash_attn
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'

dataset = load_dataset("timdettmers/openassistant-guanaco")


#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left' #Necessary for FlashAttention compatibility

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
          model_name, quantization_config=bnb_config, device_map={"": 0}, torch_dtype=compute_dtype,  attn_implementation=attn_implementation
)
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable({'use_reentrant':True})
#model.enable_input_require_grads()

peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=16,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
)


from trl import SFTConfig

training_arguments = SFTConfig(
        output_dir="./qlora/",
        evaluation_strategy="steps",
        do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        log_level="debug",
        logging_steps=50,
        learning_rate=1e-5,
        eval_steps=50,
        num_train_epochs=1,
        fp16= not torch.cuda.is_bf16_supported(),
        bf16= torch.cuda.is_bf16_supported(),
        save_strategy='epoch',
        warmup_ratio=0.1,
        lr_scheduler_type="linear",
)

trainer = SFTTrainer(
        model=model,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        dataset_text_field="text",
        max_seq_length=512,
        peft_config=peft_config,
        tokenizer=tokenizer,
        args=training_arguments,
)

trainer.train()

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Collecting flash_attn
  Downloading flash_attn-2.5.7.tar.gz (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from flash_attn)
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting ninja (from flash_attn)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: flash_attn
  Building wheel for flash_attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash_attn: filename=flash_attn-2.5.7-cp310-cp310-linux_x86_64.whl size=120853563 sha256=bbe6f77fd0899f8a125a5bdcf734b660c4c88e81c9b51c7ce98ebeba44dc6fa0
  St

Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Map:   0%|          | 0/9846 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Using auto half precision backend
Currently training with a batch size of: 8
***** Running training *****
  Num examples = 9,846
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1,231
  Number of trainable parameters = 41,943,040
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss
50,1.3473,1.353649
100,1.262,1.231738
150,1.1812,1.189125
200,1.1561,1.170896
250,1.1299,1.154133
300,1.1503,1.150258
350,1.128,1.14749
400,1.1383,1.145134
450,1.1227,1.1432
500,1.1211,1.141903


***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8


Step,Training Loss,Validation Loss
50,1.3473,1.353649
100,1.262,1.231738
150,1.1812,1.189125
200,1.1561,1.170896
250,1.1299,1.154133
300,1.1503,1.150258
350,1.128,1.14749
400,1.1383,1.145134
450,1.1227,1.1432
500,1.1211,1.141903


***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
Saving model checkpoint to ./drive/MyDrive/mergexp/qlora/checkpoint-1231
loading configuration file con

TrainOutput(global_step=1231, training_loss=1.1352182507611792, metrics={'train_runtime': 15574.6019, 'train_samples_per_second': 0.632, 'train_steps_per_second': 0.079, 'total_flos': 2.148541273402245e+17, 'train_loss': 1.1352182507611792, 'epoch': 1.0})

Merging the adapter

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
#Better to use bf16 if supported (Ampere GPUs or more recent)
#If bf16 is supported, the GPU is also recent enough to support FlashAttention
if torch.cuda.is_bf16_supported():
  !pip install flash_attn
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'


model_name = "mistralai/Mistral-7B-v0.1"
adapter = "./qlora/checkpoint-1231"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_size="left",  use_fast=True)

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
          model_name, quantization_config=bnb_config, device_map={"": 0}, torch_dtype=compute_dtype,  attn_implementation=attn_implementation
)

model = dequantize_model(model, to='./dqz_model_qlora/',dtype=compute_dtype)
model = PeftModel.from_pretrained(model, adapter)
model = model.merge_and_unload()

print(f"Successfully loaded the model {model_name} into memory")

#pushed to the hub
#model.push_to_hub("kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-qlora-merged")
#tokenizer.push_to_hub("kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-qlora-merged")





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Dequantizing `model.layers.0.self_attn.q_proj`...
Dequantizing `model.layers.0.self_attn.k_proj`...
Dequantizing `model.layers.0.self_attn.v_proj`...
Dequantizing `model.layers.0.self_attn.o_proj`...
Dequantizing `model.layers.0.mlp.gate_proj`...
Dequantizing `model.layers.0.mlp.up_proj`...
Dequantizing `model.layers.0.mlp.down_proj`...
Dequantizing `model.layers.1.self_attn.q_proj`...
Dequantizing `model.layers.1.self_attn.k_proj`...
Dequantizing `model.layers.1.self_attn.v_proj`...
Dequantizing `model.layers.1.self_attn.o_proj`...
Dequantizing `model.layers.1.mlp.gate_proj`...
Dequantizing `model.layers.1.mlp.up_proj`...
Dequantizing `model.layers.1.mlp.down_proj`...
Dequantizing `model.layers.2.self_attn.q_proj`...
Dequantizing `model.layers.2.self_attn.k_proj`...
Dequantizing `model.layers.2.self_attn.v_proj`...
Dequantizing `model.layers.2.self_attn.o_proj`...
Dequantizing `model.layers.2.mlp.gate_proj`...
Dequantizing `model.layers.2.mlp.up_proj`...
Dequantizing `model.layers.2.m

# Benchmarking: Inference throughput and accuracy

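Some utilities to dequantize a model and benchmark perplexity and inference speed. Run this cell before any cell that calls `dequantize_model`, including the adapter-merging cells above.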

The dequantization is done before merging the adapter. More info here:
[LoRA Adapters: When a Naive Merge Leads to Poor Performance](https://kaitchup.substack.com/p/lora-adapters-when-a-naive-merge)

In [None]:
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from datasets import load_dataset
from peft.utils import _get_submodules
import os,json, copy
import bitsandbytes as bnb
from bitsandbytes.functional import dequantize_4bit
import time
#Better to use bf16 if supported (Ampere GPUs or more recent)
#If bf16 is supported, the GPU is also recent enough to support FlashAttention
if torch.cuda.is_bf16_supported():
  !pip install flash_attn
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'


def dequantize_model(model, to='./dequantized_model', dtype=torch.float16, device="cuda"):
    """
    Replace every bitsandbytes `Linear4bit` layer of `model` with a regular
    `torch.nn.Linear` holding the dequantized weights, save the result to
    disk, and strip the quantization settings from the saved config.json.

    'model': the model loaded in 4-bit with bitsandbytes (modified in place).
    'to': directory to save the dequantized model
    'dtype': dtype that the model was trained using
    'device': device to load the dequantized layers to

    Returns the same `model` object, now made of regular linear layers.
    """

    os.makedirs(to, exist_ok=True)

    cls = bnb.nn.Linear4bit

    with torch.no_grad():
        # Snapshot the module list first: layers are swapped on the model while we walk it.
        for name, module in list(model.named_modules()):
            if isinstance(module, cls):
                print(f"Dequantizing `{name}`...")
                # Work on a copy so the quantization state of the original layer is left untouched.
                quant_state = copy.deepcopy(module.weight.quant_state)
                quant_state.dtype = dtype

                weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)

                # Keep the bias when the quantized layer has one instead of silently dropping it.
                has_bias = module.bias is not None
                new_module = torch.nn.Linear(module.in_features, module.out_features, bias=has_bias, dtype=dtype)
                new_module.weight = torch.nn.Parameter(weights)
                if has_bias:
                    new_module.bias = torch.nn.Parameter(module.bias.data.to(dtype))
                new_module.to(device=device, dtype=dtype)

                # Swap the quantized layer for the dequantized one on its parent module.
                parent, target, target_name = _get_submodules(model, name)
                setattr(parent, target_name, new_module)

        # a hack, setting this to avoid hf's saving error because hf
        # itself does not support saving a model that is registered to be loaded in 4bit.
        model.is_loaded_in_4bit = False

        print("Saving dequantized model...")
        model.save_pretrained(to)
        #tokenizer.save_pretrained(to)

        # Rewrite the saved config without the quantization entries.
        config_path = os.path.join(to, 'config.json')
        with open(config_path, 'r') as config:
            config_data = json.load(config)
        config_data.pop("quantization_config", None)
        config_data.pop("pretraining_tp", None)
        with open(config_path, 'w') as config:
            config.write(json.dumps(config_data, indent=2))

        return model


#compute the perplexity on a dataset given a tokenizer and a model
def ppl_model(model, tokenizer, dataset, max_length=2048, stride=512):
  """
  Compute the perplexity of `model` over the 'text' entries of `dataset`.

  Each text is tokenized on its own and scored with a sliding window: a
  window holds at most `max_length` tokens and a new window starts every
  `stride` tokens. Tokens already scored by a previous window are masked
  with -100 so that they only serve as context. The result is the
  exponential of the mean of the per-window losses.

  'model': model returning an object with a `.loss` when called with `labels`
  'tokenizer': tokenizer matching the model
  'dataset': object whose 'text' entry holds the texts to score
  'max_length': maximum number of tokens per window
  'stride': number of tokens between the starts of two consecutive windows

  Returns the perplexity as a Python float.
  """
  nlls = []
  # Fetch the text column once instead of once per example.
  texts = dataset['text']
  # Send the inputs to the device holding the model instead of assuming "cuda".
  device = next(model.parameters()).device
  for text in tqdm(texts):
    encodings = tokenizer(text, return_tensors="pt")
    seq_len = encodings.input_ids.size(1)
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
      end_loc = min(begin_loc + max_length, seq_len)
      # Number of tokens in this window that no previous window has scored yet.
      trg_len = end_loc - prev_end_loc
      input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
      target_ids = input_ids.clone()
      # Everything but the last trg_len tokens is context only.
      target_ids[:, :-trg_len] = -100
      with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)
        neg_log_likelihood = outputs.loss
      nlls.append(neg_log_likelihood)
      prev_end_loc = end_loc
      if end_loc == seq_len:
        break
  ppl = torch.exp(torch.stack(nlls).mean())
  return ppl.item()

model_name = "mistralai/Mistral-7B-v0.1"

#Tokenizer
#The keyword is `padding_side`; the previous spelling `padding_size` did not configure the padding side
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left",  use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
#Prompt used by eval_model to measure the generation speed
p = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. \n\n Tell me about gravity."


def eval_model(model):
  """
  Benchmark `model`: generation speed on the prompt `p`, then perplexity on
  the test split of timdettmers/openassistant-guanaco.

  Runs 5 generations of up to 300 new tokens each, printing the speed of
  every run and the average over all runs, then prints the perplexity
  computed by `ppl_model`. Relies on the notebook-level `tokenizer` and `p`.
  Returns nothing.
  """
  total_tokens = 0
  total_duration = 0
  for _ in range(5):
    inputs = tokenizer(p, return_tensors="pt").to("cuda")
    prompt_length = inputs["input_ids"].shape[1]

    generation_time = time.time()
    outputs = model.generate(**inputs, pad_token_id=tokenizer.eos_token_id, max_new_tokens=300)
    duration = time.time() - generation_time
    total_duration += duration

    # Count the generated token ids only: every returned sequence starts with
    # the prompt, and the length of the decoded string would be a number of
    # characters, not tokens.
    nb_tokens = sum(len(output) - prompt_length for output in outputs)
    total_tokens += nb_tokens
    print("--- Speed: %s tokens/second ---" % (round(nb_tokens/duration,2)))
  print("--- Average speed: %s tokens/second ---" % (round(total_tokens/total_duration,2)))

  ds = load_dataset("timdettmers/openassistant-guanaco", split='test')
  ppl = ppl_model(model, tokenizer, ds)
  print("Perplexity: "+str(ppl))

Collecting flash_attn
  Downloading flash_attn-2.5.7.tar.gz (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from flash_attn)
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting ninja (from flash_attn)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: flash_attn
  Building wheel for flash_attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash_attn: filename=flash_attn-2.5.7-cp310-cp310-linux_x86_64.whl size=120853563 sha256=bbe6f77fd0899f8a125a5bdcf734b660c4c88e81c9b51c7ce98ebeba44dc6fa0
  St

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

# Baseline: Mistral 7B Quantized with bitsandbytes NF4

In [None]:
# Baseline: the original Mistral 7B checkpoint loaded with a 4-bit NF4 bitsandbytes config.
model_name = "mistralai/Mistral-7B-v0.1"

bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# The load is timed on its own; eval_model then reports generation speed and perplexity.
loading_start = time.time()
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation=attn_implementation,
    torch_dtype=compute_dtype,
    device_map={"": 0},
    quantization_config=bnb_config,
)
print("--- Loading model time: %s seconds ---" % (time.time() - loading_start))
eval_model(model)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

--- Loading model time: 63.12954354286194 seconds ---
--- Speed: 30.5 tokens/second ---
--- Speed: 35.92 tokens/second ---
--- Speed: 35.9 tokens/second ---
--- Speed: 35.94 tokens/second ---
--- Speed: 36.06 tokens/second ---
--- Average speed: 34.71 tokens/second ---


Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

100%|██████████| 518/518 [02:18<00:00,  3.73it/s]

Perplexity: 4.720216751098633





# QLoRA

With a loaded adapter

In [None]:
model_name = "mistralai/Mistral-7B-v0.1"
# Same checkpoint location as the training output_dir ("./qlora/") and the merging cell.
adapter = "./qlora/checkpoint-1231"
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)
# Base model and adapter loading are timed separately.
loading_start = time.time()
model = AutoModelForCausalLM.from_pretrained(
          model_name, quantization_config=bnb_config, device_map={"": 0}, torch_dtype=compute_dtype,  attn_implementation=attn_implementation
)
print("--- Loading model time: %s seconds ---" % (time.time() - loading_start))
loading_adapter_start = time.time()
# The adapter is loaded on top of the 4-bit base model without being merged.
model = PeftModel.from_pretrained(model, adapter)
print("--- Loading adapter time: %s seconds ---" % (time.time() - loading_adapter_start))

eval_model(model)


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

--- Loading model time: 64.71461462974548 seconds ---
--- Loading adapter time: 6.66745662689209 seconds ---
--- Speed: 43.31 tokens/second ---
--- Speed: 45.56 tokens/second ---
--- Speed: 45.41 tokens/second ---
--- Speed: 45.01 tokens/second ---
--- Speed: 45.22 tokens/second ---
--- Average speed: 44.89 tokens/second ---


Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

100%|██████████| 518/518 [02:52<00:00,  3.01it/s]

Perplexity: 3.379533052444458





With a merged adapter

In [None]:
# Checkpoint with the QLoRA adapter merged in, loaded in compute_dtype without a quantization config.
model_name = "kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-qlora-merged"

loading_start = time.time()
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation=attn_implementation,
    torch_dtype=compute_dtype,
    device_map={"": 0},
)
print("--- Loading model time: %s seconds ---" % (time.time() - loading_start))
eval_model(model)


config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

--- Loading model time: 7.132479667663574 seconds ---
--- Speed: 76.58 tokens/second ---
--- Speed: 76.71 tokens/second ---
--- Speed: 76.79 tokens/second ---
--- Speed: 76.68 tokens/second ---
--- Speed: 76.64 tokens/second ---
--- Average speed: 76.68 tokens/second ---


100%|██████████| 518/518 [01:36<00:00,  5.35it/s]

Perplexity: 3.3805997371673584





With merged adapter and BNB NF4 quantization

In [None]:
# Checkpoint with the QLoRA adapter merged in, loaded with a 4-bit NF4 bitsandbytes config.
model_name = "kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-qlora-merged"

# The timer starts before the quantization config is built, so the reported
# loading time covers both the config and the model load.
loading_start = time.time()
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation=attn_implementation,
    torch_dtype=compute_dtype,
    device_map={"": 0},
    quantization_config=bnb_config,
)
print("--- Loading model time: %s seconds ---" % (time.time() - loading_start))
eval_model(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

--- Loading model time: 7.103466033935547 seconds ---
--- Speed: 34.68 tokens/second ---
--- Speed: 34.67 tokens/second ---
--- Speed: 34.91 tokens/second ---
--- Speed: 35.11 tokens/second ---
--- Speed: 35.0 tokens/second ---
--- Average speed: 34.87 tokens/second ---


100%|██████████| 518/518 [02:18<00:00,  3.74it/s]

Perplexity: 4.590646266937256





With merged adapter and AWQ quantization (requires the `autoawq` package: run `pip install autoawq` first)

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Source checkpoint (QLoRA adapter already merged) and local folder for the AWQ model
model_path = 'kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-qlora-merged'
quant_path = 'Mistral-7B-v0.1-oasstguanaco-1e-qlora-merged-awq-4bit'

# 4-bit weights, groups of 128, zero point enabled, GEMM version
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}

# Model and tokenizer both come from the merged checkpoint
model = AutoAWQForCausalLM.from_pretrained(model_path, safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# Run the AWQ quantization
model.quantize(tokenizer, quant_config=quant_config)

# Write the quantized weights (safetensors) and the tokenizer files to the same folder
model.save_quantized("./"+quant_path, safetensors=True)
tokenizer.save_pretrained("./"+quant_path)

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/993 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

AWQ: 100%|██████████| 32/32 [28:30<00:00, 53.45s/it]


('./Mistral-7B-v0.1-oasstguanaco-1e-qlora-merged-awq-4bit/tokenizer_config.json',
 './Mistral-7B-v0.1-oasstguanaco-1e-qlora-merged-awq-4bit/special_tokens_map.json',
 './Mistral-7B-v0.1-oasstguanaco-1e-qlora-merged-awq-4bit/tokenizer.model',
 './Mistral-7B-v0.1-oasstguanaco-1e-qlora-merged-awq-4bit/added_tokens.json',
 './Mistral-7B-v0.1-oasstguanaco-1e-qlora-merged-awq-4bit/tokenizer.json')

In [None]:
# Local folder written by the AWQ quantization cell; loaded in float16.
model_name = "Mistral-7B-v0.1-oasstguanaco-1e-qlora-merged-awq-4bit"

loading_start = time.time()
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
print("--- Loading model time: %s seconds ---" % (time.time() - loading_start))
eval_model(model)

--- Loading model time: 2.4577279090881348 seconds ---
--- Speed: 278.74 tokens/second ---
--- Speed: 876.4 tokens/second ---
--- Speed: 113.87 tokens/second ---
--- Speed: 108.53 tokens/second ---
--- Speed: 435.08 tokens/second ---
--- Average speed: 152.05 tokens/second ---


100%|██████████| 518/518 [02:57<00:00,  2.92it/s]

Perplexity: 3.4783289432525635





# QDoRA
With a loaded adapter

In [None]:
# 4-bit NF4 base model with the QDoRA adapter loaded on top (not merged).
model_name = "mistralai/Mistral-7B-v0.1"
adapter = "./qdora/checkpoint-1231"

bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Base model loading time
loading_start = time.time()
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation=attn_implementation,
    torch_dtype=compute_dtype,
    device_map={"": 0},
    quantization_config=bnb_config,
)
print("--- Loading model time: %s seconds ---" % (time.time() - loading_start))

# Adapter loading time, measured separately
loading_adapter_start = time.time()
model = PeftModel.from_pretrained(model, adapter)
print("--- Loading adapter time: %s seconds ---" % (time.time() - loading_adapter_start))

eval_model(model)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

--- Loading model time: 57.367106676101685 seconds ---
--- Loading adapter time: 6.7472243309021 seconds ---
--- Speed: 5.0 tokens/second ---
--- Speed: 5.01 tokens/second ---
--- Speed: 5.01 tokens/second ---
--- Speed: 5.01 tokens/second ---
--- Speed: 5.01 tokens/second ---
--- Average speed: 5.01 tokens/second ---


100%|██████████| 518/518 [18:27<00:00,  2.14s/it]

Perplexity: 3.3790879249572754





With a merged adapter

In [None]:
# Checkpoint with the QDoRA adapter merged in, loaded in compute_dtype without a quantization config.
model_name = "kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-qdora-merged"

loading_start = time.time()
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation=attn_implementation,
    torch_dtype=compute_dtype,
    device_map={"": 0},
)
print("--- Loading model time: %s seconds ---" % (time.time() - loading_start))
eval_model(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

--- Loading model time: 5.826384544372559 seconds ---
--- Speed: 74.77 tokens/second ---
--- Speed: 77.26 tokens/second ---
--- Speed: 77.34 tokens/second ---
--- Speed: 77.63 tokens/second ---
--- Speed: 77.64 tokens/second ---
--- Average speed: 76.91 tokens/second ---


100%|██████████| 518/518 [01:37<00:00,  5.33it/s]

Perplexity: 3.3809475898742676





With merged adapter and BNB NF4 quantization

In [None]:
# Checkpoint with the QDoRA adapter merged in, loaded with a 4-bit NF4 bitsandbytes config.
model_name = "kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-qdora-merged"

# The timer starts before the quantization config is built, so the reported
# loading time covers both the config and the model load.
loading_start = time.time()
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation=attn_implementation,
    torch_dtype=compute_dtype,
    device_map={"": 0},
    quantization_config=bnb_config,
)
print("--- Loading model time: %s seconds ---" % (time.time() - loading_start))
eval_model(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

--- Loading model time: 6.595882892608643 seconds ---
--- Speed: 35.01 tokens/second ---
--- Speed: 35.06 tokens/second ---
--- Speed: 35.07 tokens/second ---
--- Speed: 35.14 tokens/second ---
--- Speed: 35.1 tokens/second ---
--- Average speed: 35.08 tokens/second ---


100%|██████████| 518/518 [02:18<00:00,  3.74it/s]

Perplexity: 4.622631072998047





With merged adapter and AWQ quantization

In [None]:
# Installs AutoAWQ, needed by the cells that do `from awq import AutoAWQForCausalLM`.
# NOTE(review): the recorded output shows this release requiring transformers<=4.38.2,
# so the install changes the transformers version used by the cells that follow.
!pip install autoawq

Collecting autoawq
  Downloading autoawq-0.2.4-cp310-cp310-manylinux2014_x86_64.whl (80 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.8/80.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<=4.38.2,>=4.35.0 (from autoawq)
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting zstandard (from autoawq)
  Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autoawq-kernels (from autoawq)
  Downloading autoawq_kernels-0.0.6-cp310-cp310-manylinux2014_x86_64.whl (33.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.4/33.4 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: zstandard, trans

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Source checkpoint (QDoRA adapter already merged) and local folder for the AWQ model
model_path = 'kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-qdora-merged'
quant_path = 'Mistral-7B-v0.1-oasstguanaco-1e-qdora-merged-awq-4bit'

# 4-bit weights, groups of 128, zero point enabled, GEMM version
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}

# Model and tokenizer both come from the merged checkpoint
model = AutoAWQForCausalLM.from_pretrained(model_path, safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# Run the AWQ quantization
model.quantize(tokenizer, quant_config=quant_config)

# Write the quantized weights (safetensors) and the tokenizer files to the same folder
model.save_quantized("./"+quant_path, safetensors=True)
tokenizer.save_pretrained("./"+quant_path)

In [None]:
# Local folder written by the AWQ quantization cell; loaded in float16.
model_name = "Mistral-7B-v0.1-oasstguanaco-1e-qdora-merged-awq-4bit"

loading_start = time.time()
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
print("--- Loading model time: %s seconds ---" % (time.time() - loading_start))
eval_model(model)

--- Loading model time: 2.804798126220703 seconds ---
--- Speed: 96.55 tokens/second ---
--- Speed: 118.2 tokens/second ---
--- Speed: 114.38 tokens/second ---
--- Speed: 1568.48 tokens/second ---
--- Speed: 6009.32 tokens/second ---
--- Average speed: 132.51 tokens/second ---


Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

100%|██████████| 518/518 [02:57<00:00,  2.92it/s]

Perplexity: 3.477247714996338





# LoftQ

In [None]:
# LoftQ: base model read from a local folder, with the fine-tuned adapter loaded on top (not merged).
model_name = "./loftq_base/"
adapter = "./loftq_ft/checkpoint-1231"

# Base model loading time
loading_start = time.time()
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation=attn_implementation,
    torch_dtype=compute_dtype,
    device_map={"": 0},
)
print("--- Loading model time: %s seconds ---" % (time.time() - loading_start))

# Adapter loading time, measured separately
loading_adapter_start = time.time()
model = PeftModel.from_pretrained(model, adapter)
print("--- Loading adapter time: %s seconds ---" % (time.time() - loading_adapter_start))

eval_model(model)

--- Loading model time: 51.703083515167236 seconds ---
--- Loading adapter time: 9.962267637252808 seconds ---
--- Speed: 40.67 tokens/second ---
--- Speed: 43.75 tokens/second ---
--- Speed: 43.7 tokens/second ---
--- Speed: 43.75 tokens/second ---
--- Speed: 43.64 tokens/second ---
--- Average speed: 43.07 tokens/second ---


100%|██████████| 518/518 [02:51<00:00,  3.01it/s]

Perplexity: 3.355402946472168





With merged adapter

In [None]:
# Checkpoint with the LoftQ adapter merged in, loaded in compute_dtype without a quantization config.
model_name = "kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged"

loading_start = time.time()
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation=attn_implementation,
    torch_dtype=compute_dtype,
    device_map={"": 0},
)
print("--- Loading model time: %s seconds ---" % (time.time() - loading_start))
eval_model(model)

config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

--- Loading model time: 345.3683784008026 seconds ---
--- Speed: 73.65 tokens/second ---
--- Speed: 75.74 tokens/second ---
--- Speed: 75.76 tokens/second ---
--- Speed: 75.65 tokens/second ---
--- Speed: 75.77 tokens/second ---
--- Average speed: 75.3 tokens/second ---


100%|██████████| 518/518 [01:35<00:00,  5.45it/s]

Perplexity: 3.3551559448242188





With merged adapter and BNB NF4 quantization

In [None]:
# Checkpoint with the LoftQ adapter merged in, loaded with a 4-bit NF4 bitsandbytes config.
model_name = "kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged"

# The timer starts before the quantization config is built, so the reported
# loading time covers both the config and the model load.
loading_start = time.time()
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation=attn_implementation,
    torch_dtype=compute_dtype,
    device_map={"": 0},
    quantization_config=bnb_config,
)
print("--- Loading model time: %s seconds ---" % (time.time() - loading_start))
eval_model(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

--- Loading model time: 7.500432014465332 seconds ---
--- Speed: 30.09 tokens/second ---
--- Speed: 34.83 tokens/second ---
--- Speed: 35.02 tokens/second ---
--- Speed: 35.15 tokens/second ---
--- Speed: 35.07 tokens/second ---
--- Average speed: 33.91 tokens/second ---


100%|██████████| 518/518 [02:17<00:00,  3.76it/s]

Perplexity: 4.526394844055176





With merged adapter and GPTQ quantization

In [None]:
!pip install --upgrade transformers auto-gptq accelerate datasets auto-gptq optimum

Collecting transformers
  Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting auto-gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.5/23.5 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optimum
  Downloading optimum-1.18.1-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [None]:
from transformers import GPTQConfig

# Merged checkpoint that gets quantized with GPTQ.
model_path = 'kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# 4-bit GPTQ using the "c4" dataset for calibration; the tokenizer is handed
# to the config so the calibration samples can be tokenized.
quantization_config = GPTQConfig(
    bits=4,
    dataset="c4",
    tokenizer=tokenizer,
)

# Passing a GPTQConfig to from_pretrained quantizes the model while loading it
# (see the "Quantizing model.layers blocks" progress output of this cell).
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=quantization_config,
    device_map={"": 0},
)

tokenizer_config.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/41.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Quantizing model.layers blocks :   0%|          | 0/32 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]



In [None]:
# Benchmark the GPTQ-quantized model created in the previous cell.
# `eval_model` is defined earlier in the notebook; judging by the output it
# prints generation speed (tokens/second) and a perplexity value.
eval_model(model)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


--- Speed: 65.54 tokens/second ---


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


--- Speed: 72.3 tokens/second ---


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


--- Speed: 72.1 tokens/second ---


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


--- Speed: 72.95 tokens/second ---
--- Speed: 73.04 tokens/second ---
--- Average speed: 71.07 tokens/second ---


Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

100%|██████████| 518/518 [00:35<00:00, 14.44it/s]

Perplexity: 3.671870470046997





With merged adapter and AWQ quantization

In [None]:
# NOTE: in the recorded run, autoawq 0.2.4 requires transformers<=4.38.2,>=4.35.0,
# so this install replaces the transformers 4.39.3 pulled in by the GPTQ section.
!pip install autoawq

Collecting autoawq
  Downloading autoawq-0.2.4-cp310-cp310-manylinux2014_x86_64.whl (80 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/80.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.8/80.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<=4.38.2,>=4.35.0 (from autoawq)
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
Collecting zstandard (from autoawq)
  Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autoawq-kernels (from autoawq)
  Downloading autoawq_kernels-0.0.6-cp310-cp310-manylinux2014_x86_64.whl (33.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Source checkpoint to quantize and the local directory name for the result.
model_path = 'kaitchup/Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged'
quant_path = 'Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged-awq-4bit'

# AWQ settings: 4-bit weights, group size 128, zero point enabled, GEMM version.
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}

# Load the unquantized model and its tokenizer.
model = AutoAWQForCausalLM.from_pretrained(model_path, safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# Quantize the loaded model with the settings above.
model.quantize(tokenizer, quant_config=quant_config)

# Write the quantized weights (safetensors) and the tokenizer files to the
# same local directory so the checkpoint can be reloaded from disk.
save_dir = "./" + quant_path
model.save_quantized(save_dir, safetensors=True)
tokenizer.save_pretrained(save_dir)

config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading readme:   0%|          | 0.00/167 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

AWQ: 100%|██████████| 32/32 [28:31<00:00, 53.48s/it]


('./Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged-awq-4bit/tokenizer_config.json',
 './Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged-awq-4bit/special_tokens_map.json',
 './Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged-awq-4bit/tokenizer.model',
 './Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged-awq-4bit/added_tokens.json',
 './Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged-awq-4bit/tokenizer.json')

In [None]:
# Local directory written by the AWQ quantization cell above.
model_name = "Mistral-7B-v0.1-oasstguanaco-1e-loftq-merged-awq-4bit"

# Time how long it takes to load the saved AWQ checkpoint onto device 0.
loading_start = time.time()
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},
    torch_dtype=torch.float16,
)
loading_seconds = time.time() - loading_start
print("--- Loading model time: %s seconds ---" % loading_seconds)

# Run the notebook's benchmark helper on the AWQ model.
eval_model(model)

--- Loading model time: 2.324984312057495 seconds ---
--- Speed: 109.81 tokens/second ---
--- Speed: 156.0 tokens/second ---
--- Speed: 154.99 tokens/second ---
--- Speed: 158.94 tokens/second ---
--- Speed: 153.41 tokens/second ---
--- Average speed: 140.26 tokens/second ---


Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

100%|██████████| 518/518 [02:58<00:00,  2.90it/s]

Perplexity: 3.4350991249084473



