<a href="https://colab.research.google.com/github/toan-ly/PEFT-LoRA-Basic/blob/main/QLoRA_SentimentAnalysis_Llama3_2_1B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qq --upgrade pip
!pip install -qq --upgrade peft transformers accelerate bitsandbytes datasets trl huggingface_hub

In [None]:
# from google.colab import userdata
# from huggingface_hub import login

# login(token=userdata.get('HF_TOKEN'))

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


import torch
import numpy as np

from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model, get_peft_config
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from trl import SFTTrainer
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import evaluate
import warnings

warnings.filterwarnings("ignore")



# Local download cache and the checkpoint every later cell loads from.
cache_dir = "./cache"
base_model_id = "meta-llama/Llama-3.2-1B-Instruct"

# QLoRA: the base model is loaded with 4-bit quantization (NF4 + double quant).
# Compute dtype is float16; the author left bfloat16 disabled with the note
# that Google Colab does not support it.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

## Hyperparameters

In [None]:
# Upper bounds on how many rows of each split are kept when the dataset is loaded.
MAX_TRAIN_SAMPLES = 20_000
MAX_EVAL_SAMPLES = 2_000

# Training length is expressed in optimizer steps (not epochs); evaluation and
# checkpointing both happen every NUM_EVAL_STEPS steps.
MAX_TRAIN_STEPS = 5_000
NUM_EVAL_STEPS = 500

training_args = TrainingArguments(
    # Output locations
    output_dir="./output",
    overwrite_output_dir=True,
    logging_dir="./logs",
    # Step budget and evaluation / checkpoint cadence
    max_steps=MAX_TRAIN_STEPS,
    eval_strategy="steps",
    eval_steps=NUM_EVAL_STEPS,
    save_steps=NUM_EVAL_STEPS,
    save_total_limit=2,
    logging_steps=10,
    # Batch sizes
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # Reporting / hub integration switched off
    report_to="none",
    push_to_hub=False,
    # Dataset columns are passed through to the collator untouched
    remove_unused_columns=False,
)

## Load Base Model

In [None]:
# Tokenizer and model are both resolved from `base_model_id` via the shared cache.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, cache_dir=cache_dir, trust_remote_code=True)

# Load the checkpoint with the 4-bit quantization config on the first GPU
# (CPU when CUDA is unavailable) and pass it straight through
# `prepare_model_for_kbit_training` before binding it to `base_model`.
base_model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=quantization_config,
        device_map="cuda:0" if torch.cuda.is_available() else "cpu",
        cache_dir=cache_dir,
        trust_remote_code=True,
    )
)

base_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), 

In [None]:
# NOTE(review): this repeats, with identical arguments, the tokenizer load done
# in the "Load Base Model" cell; it rebinds `tokenizer` to a fresh instance.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True, cache_dir=cache_dir)

In [None]:
# Padding requires a pad token: report the existing one, or reuse EOS when the
# tokenizer does not define it.
if tokenizer.pad_token is not None and tokenizer.pad_token_id is not None:
    print(f'Pad token: {tokenizer.pad_token}')
    print(f'Pad token id: {tokenizer.pad_token_id}')
else:
    print("Pad token is not set. Setting it to EOS token.")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f'EOS token: {tokenizer.eos_token}', f'EOS token id: {tokenizer.eos_token_id}', sep="\n")

Pad token is not set. Setting it to EOS token.
EOS token: <|eot_id|>
EOS token id: 128009


In [None]:
# Fallback only: install a chat template when the loaded tokenizer has none.
# The template emits the BOS token, a system header containing a fixed
# "Cutting Knowledge Date" line, a "Today Date" line and the (optional) system
# message, then every remaining message wrapped in header / <|eot_id|> markers,
# and finally an open assistant header when `add_generation_prompt` is set.
# The string is not a raw string, so the "\n" escapes below become real
# newlines before Jinja sees them.
# NOTE(review): "Today Date" uses `strftime_now` when available, so the
# rendered prompt depends on the day the cell is executed.
if tokenizer.chat_template is None:
    tokenizer.chat_template = """{{- bos_token }}
{%- if not date_string is defined %}
    {%- if strftime_now is defined %}{%- set date_string = strftime_now("%d %b %Y") %}{%- else %}{%- set date_string = "26 Jul 2024" %}{%- endif %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content']|trim %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = "" %}
{%- endif %}

{#- System message #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{{- "Cutting Knowledge Date: December 2023\n" }}
{{- "Today Date: " + date_string + "\n\n" }}
{{- system_message }}
{{- "<|eot_id|>" }}

{%- for message in messages %}
    {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
"""

## Load and Apply LoRA

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: rank 8, alpha 32, 10% dropout,
# configured for training rather than inference.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
)

In [None]:
# Wrap the quantized base model with the LoRA configuration, print how many
# parameters are trainable, then display the wrapped module tree.
peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()
peft_model

trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

## Load Dataset and Format

In [None]:
dataset = load_dataset("uitnlp/vietnamese_students_feedback", cache_dir=cache_dir)

# Cap every split independently: the train split by MAX_TRAIN_SAMPLES, every
# other split by MAX_EVAL_SAMPLES. The configured limits are read but never
# overwritten; clamping MAX_EVAL_SAMPLES in place would let the size of the
# first evaluation split silently become the cap for the splits after it.
for split in dataset:
    split_limit = MAX_TRAIN_SAMPLES if split == "train" else MAX_EVAL_SAMPLES
    num_kept = min(split_limit, len(dataset[split]))
    dataset[split] = dataset[split].select(range(num_kept))
    print(f"{split}: {len(dataset[split])}")

train: 11426
validation: 1583
test: 1583


In [None]:
# Distinct sentiment ids that occur in any split.
label_set = {row["sentiment"] for split_name in dataset for row in dataset[split_name]}
label_set

{0, 1, 2}

In [None]:
# Class names come from the dataset's own feature metadata; their list position
# is the integer id stored in the `sentiment` column.
all_labels = dataset['train'].features['sentiment'].names
print(f'There are {len(all_labels)} labels in the dataset, including {all_labels}')

# Two-way lookup between class name and integer id.
id2label = dict(enumerate(all_labels))
label2id = {name: idx for idx, name in id2label.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

There are 3 labels in the dataset, including ['negative', 'neutral', 'positive']
label2id: {'negative': 0, 'neutral': 1, 'positive': 2}
id2label: {0: 'negative', 1: 'neutral', 2: 'positive'}


In [None]:
# User-turn instruction shared by tokenization and inference. `{input}` is
# filled with the sentence to classify; the requested answer format
# ("Sentiment: <keyword>") is what the metric code later parses.
USER_PROMPT_TEMPLATE = """Predict the sentiment of the following input sentence.
The response must begin with "Sentiment: ", followed by one of these keywords: "positive", "negative", or "neutral", to reflect the sentiment of the input sentence.

Sentence: {input}"""

def tokenize_function(examples):
    """Convert a batch of raw rows into chat-formatted causal-LM features.

    Each sentence is rendered twice with the tokenizer's chat template: once as
    the prompt alone (system + user turn, ending with the generation prompt)
    and once with the assistant answer ``"Sentiment: <label>"`` appended. The
    full rendering becomes ``input_ids``; ``labels`` repeats it with every
    prompt position replaced by -100, so only the answer tokens are supervised.

    Args:
        examples: Batch mapping with parallel ``'sentence'`` and
            ``'sentiment'`` columns (sentiment holds integer class ids).

    Returns:
        Dict with ``input_ids``, ``labels`` and ``attention_mask``, each a list
        holding one 1-D tensor per example.
    """
    batch_input_ids, batch_labels, batch_attention = [], [], []

    for sentence, sentiment_id in zip(examples['sentence'], examples['sentiment']):
        prompt_messages = [
            {"role": "system", "content": "You are a helpful assistant. You must fulfill the user request."},
            {"role": "user", "content": USER_PROMPT_TEMPLATE.format(input=sentence)},
        ]
        answer_message = {"role": "assistant", "content": f"Sentiment: {id2label[sentiment_id]}"}

        # Prompt-only rendering ends with the assistant header, so its length
        # marks where the answer starts inside the full rendering.
        prompt_ids = tokenizer.apply_chat_template(
            conversation=prompt_messages, return_tensors="pt", add_generation_prompt=True
        )[0]
        full_ids = tokenizer.apply_chat_template(
            conversation=prompt_messages + [answer_message], return_tensors="pt"
        )[0]

        # Mask the prompt with -100 and keep the answer tokens as targets.
        masked_prompt = torch.full_like(prompt_ids, fill_value=-100)
        answer_ids = full_ids[len(prompt_ids):]
        label_ids = torch.cat([masked_prompt, answer_ids])
        assert len(full_ids) == len(label_ids)

        batch_input_ids.append(full_ids)
        batch_labels.append(label_ids)
        batch_attention.append(torch.ones_like(full_ids))

    return {
        "input_ids": batch_input_ids,
        "labels": batch_labels,
        "attention_mask": batch_attention,
    }

# Apply tokenize_function to every split in batched mode. The original columns
# are removed so that only the keys returned by tokenize_function remain.
col_names = dataset['train'].column_names
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=col_names,
    num_proc=os.cpu_count(),  # worker count taken from os.cpu_count()
)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 11426
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 1583
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 1583
    })
})

In [None]:
# Sanity check: show the first training example as raw ids, then decode its
# input_ids with special tokens kept so the chat markup is visible.
print(tokenized_dataset['train'][0])
print(tokenizer.decode(tokenized_dataset['train'][0]['input_ids'], skip_special_tokens=False))

{'input_ids': [128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 1591, 13806, 220, 2366, 20, 271, 2675, 527, 264, 11190, 18328, 13, 1472, 2011, 21054, 279, 1217, 1715, 13, 128009, 128006, 882, 128007, 271, 54644, 279, 27065, 315, 279, 2768, 1988, 11914, 627, 791, 2077, 2011, 3240, 449, 330, 32458, 3904, 25, 3755, 8272, 555, 832, 315, 1521, 21513, 25, 330, 31587, 498, 330, 43324, 498, 477, 330, 60668, 498, 311, 8881, 279, 27065, 315, 279, 1988, 11914, 382, 85664, 25, 15332, 104300, 101291, 107114, 104602, 662, 128009, 128006, 78191, 128007, 271, 32458, 3904, 25, 6928, 128009], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -

## Custom data collator

In [None]:
from transformers import DataCollatorWithPadding
from typing import Any, Dict, List

class RightPaddingDataCollator(DataCollatorWithPadding):
    """Collate pre-tokenized causal-LM samples into right-padded batch tensors.

    Unlike a collator that pads only the inputs, this one pads ``labels`` as
    well: ``input_ids`` are padded with the tokenizer's pad token id, ``labels``
    with -100 and ``attention_mask`` with 0. Every batch is padded to the
    length of its longest sample, capped at ``max_length``; longer samples are
    truncated on the right, keeping their leading tokens.

    Args:
        tokenizer: Tokenizer providing ``pad_token_id``.
        max_length: Maximum sequence length of a collated batch. ``None``
            disables the cap.
    """

    def __init__(self, tokenizer, max_length: int = 1024):
        super().__init__(tokenizer, max_length=max_length)

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Build one batch.

        Args:
            features: Samples with ``"input_ids"`` and ``"labels"`` sequences of
                equal length. Any ``"attention_mask"`` on the sample is ignored;
                the mask is rebuilt as ones over real tokens, zeros over padding.

        Returns:
            Dict of ``input_ids``, ``labels`` and ``attention_mask`` tensors,
            each of shape (batch, target_len) and dtype long.
        """
        pad_token_id = self.tokenizer.pad_token_id
        longest = max(len(f["input_ids"]) for f in features)
        # Honour the configured cap; without it the batch is as long as its longest sample.
        target_len = longest if self.max_length is None else min(longest, self.max_length)

        input_ids, labels, attention_mask = [], [], []
        for sample in features:
            # Convert to tensors and truncate first, so the padding length below is never negative.
            cur_input_ids = torch.tensor(sample["input_ids"], dtype=torch.long)[:target_len]
            cur_labels = torch.tensor(sample["labels"], dtype=torch.long)[:target_len]
            cur_attention_mask = torch.ones_like(cur_input_ids)

            # Right-pad inputs, labels and mask up to the common batch length.
            padding_length = target_len - len(cur_input_ids)
            cur_input_ids = torch.cat([cur_input_ids, torch.full((padding_length,), fill_value=pad_token_id, dtype=torch.long)])
            cur_labels = torch.cat([cur_labels, torch.full((padding_length,), fill_value=-100, dtype=torch.long)])
            cur_attention_mask = torch.cat([cur_attention_mask, torch.zeros((padding_length,), dtype=torch.long)])

            input_ids.append(cur_input_ids)
            labels.append(cur_labels)
            attention_mask.append(cur_attention_mask)

        return {
            "input_ids": torch.stack(input_ids),
            "labels": torch.stack(labels),
            "attention_mask": torch.stack(attention_mask)
        }

# Collator instance handed to the trainer; uses the class default max_length.
data_collator = RightPaddingDataCollator(tokenizer)

In [None]:
# Load the four classification metrics once; compute_metrics reuses these
# objects on every evaluation pass.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")


def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to predicted token ids before they are accumulated for metrics.

    Args:
        logits: Logits tensor, or a tuple whose first element is that tensor.
        labels: Unused; present because the trainer passes it.

    Returns:
        Tensor of argmax indices taken over the last dimension of the logits.
    """
    token_scores = logits[0] if isinstance(logits, tuple) else logits
    return token_scores.argmax(dim=-1)


def compute_metrics(eval_preds):
    """Score teacher-forced predictions as a 3-way sentiment classification.

    ``preds`` holds the argmax token ids produced by
    ``preprocess_logits_for_metrics`` and ``labels`` the collated label ids,
    where -100 marks prompt and padding positions. Both have shape
    (num_samples, seq_len).

    Alignment is done per row: a causal LM's output at position ``t`` predicts
    the token at position ``t + 1``, so the prediction for a supervised label
    at position ``p`` is read from ``preds[row, p - 1]``. Prompt lengths differ
    between samples, so the boundary has to be located in each row separately
    instead of being taken from the first row.

    Args:
        eval_preds: Pair ``(preds, labels)``; ``preds`` may be wrapped in a tuple.

    Returns:
        Dict merging the accuracy, macro-F1, macro-precision and macro-recall
        results.

    Raises:
        KeyError: If a decoded reference is not one of the known label names.
    """
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.asarray(preds)
    labels = np.asarray(labels)

    decoded_preds, decoded_labels = [], []
    for pred_row, label_row in zip(preds, labels):
        # Positions that carry a supervised answer token in this row.
        target_positions = np.flatnonzero(label_row != -100)
        # Position 0 has no preceding output that could have predicted it.
        target_positions = target_positions[target_positions > 0]
        if target_positions.size == 0:
            # Nothing supervised in this row, so there is nothing to score.
            continue

        label_tokens = label_row[target_positions]
        pred_tokens = pred_row[target_positions - 1]

        # -100 cannot be decoded; map any occurrence to the pad token.
        pred_tokens = np.where(pred_tokens != -100, pred_tokens, tokenizer.pad_token_id)

        # Keep only what the model produced before its first end-of-sequence token.
        eos_hits = np.flatnonzero(pred_tokens == tokenizer.eos_token_id)
        if eos_hits.size > 0:
            pred_tokens = pred_tokens[:eos_hits[0]]

        decoded_preds.append(tokenizer.decode(pred_tokens.tolist(), skip_special_tokens=True))
        decoded_labels.append(tokenizer.decode(label_tokens.tolist(), skip_special_tokens=True))

    # Map "Sentiment: <name>" strings to class ids.
    int_preds, int_labels = [], []
    num_classes = len(label2id)
    for pred_text, label_text in zip(decoded_preds, decoded_labels):
        label_id = label2id[label_text.split(":")[-1].strip()]
        pred_name = pred_text.split(":")[-1].strip()
        # An unparseable prediction is assigned a class that is guaranteed to
        # differ from the reference, so it is always counted as an error.
        pred_id = label2id.get(pred_name, (label_id + 1) % num_classes)
        int_labels.append(label_id)
        int_preds.append(pred_id)

    accuracy_results = accuracy_metric.compute(predictions=int_preds, references=int_labels)
    f1_results = f1_metric.compute(predictions=int_preds, references=int_labels, average="macro")
    precision_results = precision_metric.compute(predictions=int_preds, references=int_labels, average="macro")
    recall_results = recall_metric.compute(predictions=int_preds, references=int_labels, average="macro")

    return {
        **accuracy_results,
        **f1_results,
        **precision_results,
        **recall_results
    }

## Train the Model

In [None]:
import bitsandbytes as bnb
from transformers import get_linear_schedule_with_warmup

# Only parameters that still require gradients are handed to the optimizer.
trainable_params = (param for param in peft_model.parameters() if param.requires_grad)

paged_optimizer = bnb.optim.PagedAdamW(trainable_params, lr=3e-4, weight_decay=0.0)

# Linear schedule whose warmup covers the first 10% of the step budget.
scheduler = get_linear_schedule_with_warmup(
    paged_optimizer,
    num_training_steps=MAX_TRAIN_STEPS,
    num_warmup_steps=int(MAX_TRAIN_STEPS*0.1),
)

# SFTTrainer is given the already-tokenized splits, the custom collator, the
# metric hooks and the externally built optimizer / scheduler pair.
trainer = SFTTrainer(
    model=peft_model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    optimizers=(paged_optimizer, scheduler),
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.0944,0.090676,0.368288,0.318606,0.424111,0.330477
1000,0.0738,0.056492,0.389135,0.324631,0.408603,0.306244
1500,0.0171,0.0606,0.400505,0.344262,0.426506,0.367617
2000,0.0505,0.063298,0.425774,0.358517,0.442607,0.36571
2500,0.0228,0.04697,0.442198,0.376316,0.445185,0.411554
3000,0.0399,0.039352,0.407454,0.357311,0.442817,0.409776
3500,0.034,0.046267,0.415035,0.356501,0.435712,0.385738
4000,0.0415,0.042435,0.425774,0.368892,0.444563,0.417573
4500,0.029,0.041408,0.421352,0.365206,0.443306,0.41064
5000,0.017,0.041768,0.415035,0.359724,0.438873,0.402288


TrainOutput(global_step=5000, training_loss=0.05073607933204621, metrics={'train_runtime': 2963.4955, 'train_samples_per_second': 6.749, 'train_steps_per_second': 1.687, 'total_flos': 1.477983256915968e+16, 'train_loss': 0.05073607933204621})

In [None]:
# Evaluate the fine-tuned model on the held-out test split, using the same
# collator and metric hooks that were configured on the trainer.

trainer.evaluate(tokenized_dataset['test'])

{'eval_loss': 0.052675094455480576,
 'eval_accuracy': 0.4472520530638029,
 'eval_f1': 0.378188781201539,
 'eval_precision': 0.4428049000724454,
 'eval_recall': 0.3938102250122711,
 'eval_runtime': 61.48,
 'eval_samples_per_second': 25.748,
 'eval_steps_per_second': 3.221}

## Inference Pipeline

In [None]:
def inference(model, tokenizer, input_sentence):
    """Classify the sentiment of a single sentence with the fine-tuned model.

    The sentence is wrapped in the same system / user prompt used for training,
    rendered with the tokenizer's chat template, and up to 16 new tokens are
    generated. Decoding is greedy (``do_sample=False``) so the same sentence
    always yields the same answer, and the model is put in eval mode for the
    duration of the call so dropout does not perturb the prediction. The
    model's previous train/eval mode is restored afterwards and the tokenizer
    is not modified.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching the model, with a chat template.
        input_sentence: Sentence to classify.

    Returns:
        The decoded generation with special tokens removed, e.g.
        ``"Sentiment: positive"``.
    """
    user_prompt = USER_PROMPT_TEMPLATE.format(input=input_sentence)
    messages = [
        {"role": "system", "content": "You are a helpful assistant. You must fulfill the user request."},
        {"role": "user", "content": user_prompt},
    ]
    input_prompt = tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)
    # The rendered template already contains the special tokens.
    inputs = tokenizer(input_prompt, return_tensors="pt", add_special_tokens=False)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    was_training = model.training
    model.eval()
    try:
        output_ids = model.generate(
            **inputs,
            max_new_tokens=16,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )
    finally:
        if was_training:
            model.train()

    # Drop the echoed prompt; only newly generated tokens are decoded.
    prompt_len = inputs['input_ids'].shape[-1]
    results = tokenizer.batch_decode(output_ids[:, prompt_len:], skip_special_tokens=True)
    return results[0]

def batch_inference(model, tokenizer, input_sentences):
    """Classify the sentiment of several sentences in one generation call.

    Prompts are built exactly as in single-sentence inference and left-padded
    to a common length so that every prompt ends at the same position and the
    generated tokens can be sliced off uniformly. Decoding is greedy
    (``do_sample=False``) and runs with the model in eval mode. The tokenizer's
    original ``padding_side`` and the model's train/eval mode are restored
    before returning.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching the model, with a chat template. If it
            has no pad token id, the EOS token id is assigned as pad token id.
        input_sentences: Sentences to classify.

    Returns:
        List of decoded generations (special tokens removed), one per input
        sentence and in the same order. Empty input yields an empty list.
    """
    if len(input_sentences) == 0:
        return []

    messages_list = [
        [
            {"role": "system", "content": "You are a helpful assistant. You must fulfill the user request."},
            {"role": "user", "content": USER_PROMPT_TEMPLATE.format(input=input_sentence)},
        ]
        for input_sentence in input_sentences
    ]
    input_prompts = [
        tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)
        for messages in messages_list
    ]

    # Padding needs a pad token; only fill it in when the tokenizer has none.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    original_padding_side = tokenizer.padding_side
    was_training = model.training
    tokenizer.padding_side = "left"
    model.eval()
    try:
        # The rendered template already contains the special tokens.
        inputs = tokenizer(input_prompts, return_tensors="pt", padding=True, add_special_tokens=False)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        output_ids = model.generate(
            **inputs,
            max_new_tokens=16,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )
    finally:
        # Do not leak the left-padding setting or eval mode to the caller.
        tokenizer.padding_side = original_padding_side
        if was_training:
            model.train()

    # Every row shares the same padded prompt length; keep only the new tokens.
    prompt_len = inputs['input_ids'].shape[-1]
    return tokenizer.batch_decode(output_ids[:, prompt_len:], skip_special_tokens=True)

In [None]:
# Smoke test: classify one English sentence with the fine-tuned adapter.
inference(peft_model, tokenizer, "The weather is nice today.")

'Sentiment: positive'

In [None]:
# Smoke test: classify two Vietnamese feedback sentences in a single batch.
batch_inference(peft_model, tokenizer, ["Môn học này quá khó để học", "Thầy dạy hay, dễ hiểu"])

['Sentiment: negative', 'Sentiment: positive']