# PEFT with Instruction Tuning

## 1. Install and import necessary libaries

In [1]:
# !pip install -q -U bitsandbytes
# !pip install -q -U datasets
# !pip install -q -U git+https://github.com/huggingface/transformers.git
# !pip install -q -U git+https://github.com/huggingface/peft.git
# !pip install -q -U git+https://github.com/huggingface/accelerate.git
# !pip install -q -U loralib
# !pip install -q -U einops

In [2]:
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

In [3]:
from peft import PeftModelForCausalLM

PeftModelForCausalLM.__mro__

(peft.peft_model.PeftModelForCausalLM,
 peft.peft_model.PeftModel,
 transformers.utils.hub.PushToHubMixin,
 torch.nn.modules.module.Module,
 object)

In [4]:
# import os
# from google.colab import userdata

# os.environ['HF_TOKEN'] =  userdata.get('HF_TOKEN')

## 2. Prepare pre-trained LLM

## 2.1 Load pre-trained model and quantize it

In [5]:
from transformers.utils.quantization_config import BitsAndBytesConfig

MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    ## Use NormalFloat4b or FP4b
    bnb_4bit_quant_type="nf4",
    ## dtype to dequantize weights into
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    offload_state_dict=True,
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,)

In [8]:
## 1- Cast the layernorm in fp32
## 2- Making output embedding layer require grads (no need for model.enable_input_require_grads())
## 3- Add the upcasting of the lm head to fp32
## 4- Freezing the base model layers to ensure they are not updated during training
## use_gradient_checkpointing: Forward computation in checkpointed regions omits saving tensors for backward and recomputes them during the backward pass
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [9]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,)

## 2.2 Set up LoRA

In [10]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    ## output = MHA(q_proj x q, k_proj x k, v_proj x v)
    ## output = o_proj x output
    ## output = down_proj( act_fn(gate_proj(input)) x up_proj(input) )
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    ## Use Rank-Stablized LoRA, setting scaling to alpha/sqrt(r) instead of alpha/r
    # use_rslora=True,
    lora_dropout=0.05,
    ## ["none", "all", "lora_only"]
    bias="none",
    ## "SEQ_CLS": PeftModelForSequenceClassification,
    ## "SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM,
    ## "CAUSAL_LM": PeftModelForCausalLM,
    ## "TOKEN_CLS": PeftModelForTokenClassification,
    ## "QUESTION_ANS": PeftModelForQuestionAnswering,
    ## "FEATURE_EXTRACTION": PeftModelForFeatureExtraction,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)

## Freeze the base model parameters
model = get_peft_model(model, config, adapter_name="sentiment_analysis")

In [11]:
model.print_trainable_parameters()

trainable params: 24,313,856 || all params: 3,237,063,680 || trainable%: 0.7511


In [12]:
mem_free, mem_total = torch.cuda.mem_get_info()
print(f"Free memory: {mem_free / 1024**3:.2f} GB")
print(f"Total memory: {mem_total / 1024**3:.2f} GB")

Free memory: 9.74 GB
Total memory: 15.99 GB


In [13]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (sentiment_analysis): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (sentiment_analysis): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (sentiment_analysis): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
          

## 3. Fine-tuning LLM

### 3.1. Download and inspect the dataset

In [14]:
from datasets import DatasetDict

In [15]:
train_dataset: DatasetDict = load_dataset(
    "uitnlp/vietnamese_students_feedback", split="train[:10%]"  # type: ignore
)
eval_dataset: DatasetDict = load_dataset(
    "uitnlp/vietnamese_students_feedback", split="validation[:10%]"  # type: ignore
)

In [16]:
train_dataset

Dataset({
    features: ['sentence', 'sentiment', 'topic'],
    num_rows: 1143
})

In [17]:
# sentence (str): Text sentence.
# sentiment: Sentiment class, with values 0 (negative), 1 (neutral) and 2 (positive).
# topic: Topic class, with values 0 (lecturer), 1 (training_program), 2 (facility) and 3 (others).
for k, v in train_dataset.to_dict().items():
    print(f"{k}: {v[:2]}")

sentence: ['slide giáo trình đầy đủ .', 'nhiệt tình giảng dạy , gần gũi với sinh viên .']
sentiment: [2, 2]
topic: [1, 0]


In [18]:
(id2label := train_dataset.features["sentiment"].names)

['negative', 'neutral', 'positive']

In [19]:
(label2id := {v: k for k, v in enumerate(id2label)})

{'negative': 0, 'neutral': 1, 'positive': 2}

### 3.2 Preprocess the dataset

In [20]:
USER_PROMPT_TEMPLATE = """Predict the sentiment of the following input sentence.
The response must begin with "Sentiment: ", followed by one of these keywords: "positive", "negative", or "neutral", to reflect the sentiment of the input sentence.

Sentence: {input}"""

In [21]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

In [22]:
MAX_LENGTH = 200


def tokenize_function(examples):
    results = {
        "input_ids": [],
        "labels": [],
        "attention_mask": [],
    }

    for sentence, label in zip(examples["sentence"], examples["sentiment"]):
        input = USER_PROMPT_TEMPLATE.format(input=sentence)
        output = id2label[label]

        input = [
            {
                "role": "system",
                "content": "You are a helpful assistant. You must fulfill the user request.",
            },
            {"role": "user", "content": input},
        ]
        output = input + [{"role": "assistant", "content": output}]

        ## apply_chat_template: This is specific to Chat/Intruct models like Llama-3
        tokenized_input = tokenizer.apply_chat_template(
            input,
            add_generation_prompt=True,
            tokenize=True,
            return_tensors=None,  # Return as list, not tensor
            truncate=True,
            # padding=True,
            max_length=MAX_LENGTH,
        )
        tokenized_input_length = len(tokenized_input)

        input_ids = tokenizer.apply_chat_template(
            output,
            tokenize=True,
            return_tensors=None,  # Return as list, not tensor
            truncate=True,
            ## Padding should have been done by the DataCollator, but it only pads the input_ids, not the labels??
            ## https://github.com/huggingface/transformers/issues/31493
            padding="max_length",
            max_length=MAX_LENGTH,
        )

        labels = input_ids.copy()
        ## Specifically set the first part of the labels to -100 so that it will be ignored in the loss calculation
        labels[:tokenized_input_length] = [-100] * tokenized_input_length

        ## Padding will be handled by the DataCollator
        results["input_ids"].append(input_ids)
        results["labels"].append(labels)
        results["attention_mask"].append([1] * len(input_ids))

    return results


## By default, SFT trainer only require 1 column: "text"
## dataset.map CANNOT return tensors and will return lists instead
tokenized_train_dataset = train_dataset.map(
    tokenize_function, batched=True, remove_columns=train_dataset.column_names
)
tokenized_eval_dataset = eval_dataset.map(
    tokenize_function, batched=True, remove_columns=eval_dataset.column_names
)

Map:   0%|          | 0/1143 [00:00<?, ? examples/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

In [23]:
torch.tensor(tokenized_train_dataset["input_ids"][0]).shape, torch.tensor(
    tokenized_train_dataset["labels"][0]
).shape

(torch.Size([200]), torch.Size([200]))

In [24]:
print(tokenizer.decode(tokenized_train_dataset["input_ids"][0]))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 Jun 2025

You are a helpful assistant. You must fulfill the user request.<|eot_id|><|start_header_id|>user<|end_header_id|>

Predict the sentiment of the following input sentence.
The response must begin with "Sentiment: ", followed by one of these keywords: "positive", "negative", or "neutral", to reflect the sentiment of the input sentence.

Sentence: slide giáo trình đầy đủ.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

positive<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><

## 4. Train and evaluate the model

### 4.1 Train the model

In [25]:
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
 

In [26]:
# https://huggingface.co/docs/transformers/main_classes/trainer
from transformers.training_args import TrainingArguments
from transformers.trainer import Trainer
from transformers.data.data_collator import DataCollatorForLanguageModeling

training_args = TrainingArguments(
    per_device_train_batch_size=8,
    ## Use gradient accumulation to simulate larger batch size
    ## (instead of batch_size=16, use batch_size=8 and gradient_accumulation_steps=2)
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    learning_rate=2e-4,
    ## Use fp16 to train (5b exponent + 10b fraction)
    fp16=True,
    logging_steps=10,
    output_dir="./sentiment_analysis_lora",
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    ## Already handled by prepare_model_for_kbit_training
    # gradient_checkpointing=True,
    ## Avoid warning
    gradient_checkpointing_kwargs={"use_reentrant": False},
    ## Evaluate every 50 steps
    eval_strategy="steps",
    eval_steps=50,
    ## Save every 50 steps
    save_strategy="steps",
    save_steps=50,
    ## Save the 3 most recent checkpoints, including the best one (load_best_model_at_end=True)
    save_total_limit=3,
    load_best_model_at_end=True,
    ## Loss function is embedded in the model
    metric_for_best_model="loss",
    greater_is_better=False,
    ## May report to WandB, tensorboard, etc.
    report_to="none",
    remove_unused_columns=False,
    label_names=["labels"],
)


## TODO: replace with DataCollatorForSeq2Seq
## Inputs are dynamically padded to the maximum length of a batch if they are not all of the same length.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    pad_to_multiple_of=8,
    # Causal LM does not use Masked LM
    mlm=False,
)

In [43]:
# Quantization-aware training settings
## Disable the meaningless KV cache to save memory during training
model.config.use_cache = False
## Already enabled in prepare_model_for_kbit_training
# model.enable_input_require_grads()
## This should have been done by prepare_model_for_kbit_training, still raises a warning
## https://github.com/huggingface/transformers/issues/28536#issuecomment-2312997910
model.gradient_checkpointing_enable({"use_reentrant": False})

trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,  # type: ignore
    eval_dataset=tokenized_eval_dataset,  # type: ignore
    args=training_args,
    data_collator=data_collator,
    ## May define subclasses from TrainerCallback
    # callbacks=[LogLossCallback()],
)


trainer.train()

Step,Training Loss,Validation Loss
50,0.3697,0.404771
100,0.2835,0.412616


Could not locate the best model at ./sentiment_analysis_lora\checkpoint-50\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=142, training_loss=0.3197409577772651, metrics={'train_runtime': 299.4516, 'train_samples_per_second': 7.634, 'train_steps_per_second': 0.474, 'total_flos': 7720617125068800.0, 'train_loss': 0.3197409577772651, 'epoch': 1.9790209790209792})

### 4.2 Test prediction

In [28]:
import torch
import re
from transformers import GenerationConfig

generation_config = GenerationConfig(
    max_new_tokens=10,
    # temperature=0.0,
    top_p=1.0,
    top_k=50,
    num_beams=2,
    num_return_sequences=2,
    do_sample=False,
    repetition_penalty=1.0,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    # bad_words_ids=[]
)


def predict(model, tokenizer, text, generation_config):
    input = USER_PROMPT_TEMPLATE.format(input=text)

    input = [
        {
            "role": "system",
            "content": "You are a helpful assistant. You must fulfill the user request.",
        },
        {"role": "user", "content": input},
    ]

    prompt_text = tokenizer.apply_chat_template(
        input, add_generation_prompt=False, tokenize=False
    )

    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            generation_config=generation_config,
            # num_beams=2,
        )

    # Decode
    output_text = [
        tokenizer.decode(output, skip_special_tokens=False) for output in outputs
    ]

    return output_text

In [29]:
eval_dataset[0]["sentence"], eval_dataset[0]["sentiment"]

('giáo trình chưa cụ thể .', 0)

In [30]:
model.eval()
print(
    *predict(model, tokenizer, eval_dataset["sentence"][0], generation_config),
    sep="\n---------------------------------\n"
)

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 Jun 2025

You are a helpful assistant. You must fulfill the user request.<|eot_id|><|start_header_id|>user<|end_header_id|>

Predict the sentiment of the following input sentence.
The response must begin with "Sentiment: ", followed by one of these keywords: "positive", "negative", or "neutral", to reflect the sentiment of the input sentence.

Sentence: giáo trình chưa cụ thể.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

negative.<|eot_id|>
---------------------------------
<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 Jun 2025

You are a helpful assistant. You must fulfill the user request.<|eot_id|><|start_header_id|>user<|end_header_id|>

Predict the sentiment of the following input sentence.
The response must begin with "Sentiment: ", followed by one of these

## 5. Save model to huggingface

### 5.1. Save the fine-tuned model

After fine-tuning, you should save both the adapter weights and configuration to make it easy to reload later. The PEFT library makes this straightforward.

In [31]:
# Save the fine-tuned model adapter weights
save_directory = "trained_model"
model.save_pretrained(save_directory)

# Optional: Also save the tokenizer for consistency
tokenizer.save_pretrained(save_directory)

('trained_model\\tokenizer_config.json',
 'trained_model\\special_tokens_map.json',
 'trained_model\\chat_template.jinja',
 'trained_model\\tokenizer.json')

### 5.2. Load the fine-tuned model

To use your fine-tuned model later, you'll need to load both the base model and the trained adapter weights.

In [32]:
from transformers.utils.quantization_config import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    ## Use NormalFloat4b or FP4b
    bnb_4bit_quant_type="nf4",
    ## dtype to dequantize weights into
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [33]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

# Define paths
saved_adapter_path = "trained_model/sentiment_analysis"

# 1. Load the PEFT configuration
peft_config = PeftConfig.from_pretrained(saved_adapter_path)

# 2. Load the base model with quantization settings
base_model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    device_map="auto",
    quantization_config=bnb_config,  # Using the same quantization config as during training
)

# 3. Load the PEFT adapter weights
loaded_model = PeftModel.from_pretrained(base_model, saved_adapter_path)

# 4. Load the tokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)
loaded_tokenizer.pad_token = loaded_tokenizer.eos_token  # Set pad token to eos token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### 5.3. Test the loaded model

After loading the model, you can test it with the same prediction function as before.

In [34]:
from transformers import GenerationConfig

generation_config = GenerationConfig(
    max_new_tokens=10,
    # temperature=0.0,
    top_p=1.0,
    top_k=50,
    num_beams=2,
    num_return_sequences=2,
    do_sample=False,
    repetition_penalty=1.0,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    # bad_words_ids=[]
)

In [35]:
# Set the model to evaluation mode
loaded_model.eval()
print(
    *predict(
        loaded_model, loaded_tokenizer, eval_dataset["sentence"][0], generation_config
    ),
    sep="\n---------------------------------\n"
)

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 Jun 2025

You are a helpful assistant. You must fulfill the user request.<|eot_id|><|start_header_id|>user<|end_header_id|>

Predict the sentiment of the following input sentence.
The response must begin with "Sentiment: ", followed by one of these keywords: "positive", "negative", or "neutral", to reflect the sentiment of the input sentence.

Sentence: giáo trình chưa cụ thể.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

negative.<|eot_id|>
---------------------------------
<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 Jun 2025

You are a helpful assistant. You must fulfill the user request.<|eot_id|><|start_header_id|>user<|end_header_id|>

Predict the sentiment of the following input sentence.
The response must begin with "Sentiment: ", followed by one of these

### 5.4. Push model to Hugging Face Hub

To share your fine-tuned model with others, you can push it to the Hugging Face Hub.

In [36]:
# from huggingface_hub import login
# login()  # Will prompt for your HF token

In [37]:
model_name = "tiviluson/sentiment-analysis-llama3-lora"
model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/tiviluson/sentiment-analysis-llama3-lora/commit/e01f9659f43c09003c8965e1af804a8809ef5aff', commit_message='Upload tokenizer', commit_description='', oid='e01f9659f43c09003c8965e1af804a8809ef5aff', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tiviluson/sentiment-analysis-llama3-lora', endpoint='https://huggingface.co', repo_type='model', repo_id='tiviluson/sentiment-analysis-llama3-lora'), pr_revision=None, pr_num=None)

### 5.5. Merge adapter weights with base model (Optional)

For deployment in production, you might want to merge the LoRA weights back into the base model for faster inference.

In [38]:
# Merge adapter weights with the base model. This MAY NOT WORK for quantized models.
# Create a copy of the model that can be merged

# You typically need a model that's not loaded in 4-bit/8-bit for proper merging
from transformers import AutoModelForCausalLM

# Load base model in FP16/BF16
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Load adapter
from peft import PeftModel

saved_adapter_path = "trained_model/sentiment_analysis"

merged_model = PeftModel.from_pretrained(base_model, saved_adapter_path)

# Merge weights
merged_model = merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged-model")
tokenizer.save_pretrained("merged-model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Some parameters are on the meta device because they were offloaded to the cpu.


Saving checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('merged-model\\tokenizer_config.json',
 'merged-model\\special_tokens_map.json',
 'merged-model\\chat_template.jinja',
 'merged-model\\tokenizer.json')

In [39]:
merged_model.eval()
print(
    *predict(
        merged_model, loaded_tokenizer, eval_dataset["sentence"][0], generation_config
    ),
    sep="\n---------------------------------\n"
)

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 Jun 2025

You are a helpful assistant. You must fulfill the user request.<|eot_id|><|start_header_id|>user<|end_header_id|>

Predict the sentiment of the following input sentence.
The response must begin with "Sentiment: ", followed by one of these keywords: "positive", "negative", or "neutral", to reflect the sentiment of the input sentence.

Sentence: giáo trình chưa cụ thể.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

negative<|eot_id|><|eot_id|>
---------------------------------
<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 Jun 2025

You are a helpful assistant. You must fulfill the user request.<|eot_id|><|start_header_id|>user<|end_header_id|>

Predict the sentiment of the following input sentence.
The response must begin with "Sentiment: ", followed by one

### 5.6 Manage multiple adapters (Optional)
If you want to manage multiple adapters, you can load different adapters for different tasks or datasets.

With one adapter at a time

In [40]:
# model is already a PeftModel with one adapter loaded
# model.load_adapter("./path_to_another_adapter", adapter_name="adapter2")
# model.set_adapter("adapter2") # Switch to adapter2
# model.set_adapter("default") # Switch back to the first loaded adapter
# model.unload_adapter("adapter2") # Unload adapter2

With multiple adapters at once

In [41]:
# from peft import PeftMixedModel

# base_model = ...  # load the base model, e.g. from transformers
# # load first adapter, which will be called "default"
# peft_model = PeftMixedModel.from_pretrained(base_model, <path_to_adapter1>)
# peft_model.load_adapter(<path_to_adapter2>, adapter_name="other")
# peft_model.set_adapter(["default", "other"])

### 5.7 Disable adapter (Optional)

In [42]:
# with lora_model.disable_adapter():
#     # Inside this context, the model behaves like the original base model
#     base_model_outputs = lora_model.generate(...)