# Fine-tune Phi-3 Mini for sentiment analysis (adapted from a Llama 2 notebook)

Download the dataset (`all-data.csv`) from the Kaggle notebook [Fine-tune Llama 2 for sentiment analysis](https://www.kaggle.com/code/lucamassaron/fine-tune-llama-2-for-sentiment-analysis/notebook) and place it next to this notebook.

In [1]:
import os

# Process-wide settings, applied before the ML libraries are imported:
#   * HSA_OVERRIDE_GFX_VERSION - ROCm only; "10.3.0" corresponds to gfx1030.
#   * CUDA_VISIBLE_DEVICES     - expose only device 0 when several GPUs are
#                                present (e.g. a discrete GPU plus an iGPU/APU).
#   * TOKENIZERS_PARALLELISM   - switch tokenizers parallelism off.
os.environ.update(
    {
        "HSA_OVERRIDE_GFX_VERSION": "10.3.0",
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [2]:
import warnings

# Install a blanket "ignore" filter: every warning raised after this point is
# suppressed for the whole process (presumably to keep cell outputs readable).
warnings.filterwarnings("ignore")

In [5]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [8]:
# Show which library builds are in use; the results depend on them.
print(f"pytorch version {torch.__version__}")
print(f"bitsandbytes version {bnb.__version__}")

# Use the first GPU when torch reports one, otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"working on {device}")

pytorch version 2.3.1+rocm6.0
bitsandbytes version 0.43.2.dev
working on cuda:0


In [9]:
filename = "./all-data.csv"

# The file is read as headerless: the first column becomes "sentiment" and the
# second "text". Bytes that are not valid UTF-8 are replaced instead of raising.
df = pd.read_csv(
    filename, names=["sentiment", "text"], encoding="utf-8", encoding_errors="replace"
)

# Class-balanced splits: 300 training and 300 test rows per sentiment.
# NOTE: the loop variables `train` and `test` remain in the notebook namespace
# after the loop and hold only the last ("negative") split.
X_train = list()
X_test = list()
for sentiment in ["positive", "neutral", "negative"]:
    train, test = train_test_split(
        df[df.sentiment == sentiment], train_size=300, test_size=300, random_state=42
    )
    X_train.append(train)
    X_test.append(test)

# Shuffle the training rows so the classes are interleaved; the test rows stay
# grouped by class.
X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

# Everything used by neither split forms the evaluation pool. The used indices
# are collected into a set once, so each membership test is O(1) instead of
# rebuilding and scanning a concatenated list for every row of `df`.
used_idx = set(X_train.index) | set(X_test.index)
eval_idx = [idx for idx in df.index if idx not in used_idx]
X_eval = df[df.index.isin(eval_idx)]

# Draw 50 evaluation rows per sentiment, sampling with replacement so classes
# with fewer than 50 leftover rows still yield 50 rows (duplicates possible).
X_eval = X_eval.groupby("sentiment", group_keys=False).apply(
    lambda x: x.sample(n=50, random_state=10, replace=True)
)
X_train = X_train.reset_index(drop=True)


def generate_prompt(data_point):
    """Build the training prompt for one labelled headline.

    Args:
        data_point: Mapping-like row providing a "text" entry (the headline)
            and a "sentiment" entry (its label).

    Returns:
        str: The instruction text followed by ``[<text>] = <sentiment>``, with
        leading and trailing whitespace stripped from the whole prompt.
    """
    headline = data_point["text"]
    label = data_point["sentiment"]
    prompt = f"""
            Analyze the sentiment of the news headline enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{headline}] = {label}
            """
    return prompt.strip()


def generate_test_prompt(data_point):
    """Build the inference prompt for one headline, leaving the label blank.

    Args:
        data_point: Mapping-like row providing a "text" entry (the headline).

    Returns:
        str: The same instruction text as the training prompt, ending in
        ``[<text>] =`` (the trailing space is removed by the final strip).
    """
    headline = data_point["text"]
    prompt = f"""
            Analyze the sentiment of the news headline enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{headline}] = """
    return prompt.strip()


# Keep the gold labels before X_test is replaced by its prompt-only frame.
y_true = X_test.sentiment

# Training split: labelled prompts in a single "text" column, then wrapped as
# a datasets.Dataset.
X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1), columns=["text"])
train_data = Dataset.from_pandas(X_train)

# Evaluation split: same labelled-prompt format as the training split.
X_eval = pd.DataFrame(data=X_eval.apply(generate_prompt, axis=1), columns=["text"])
eval_data = Dataset.from_pandas(X_eval)

# Test split: prompts that stop after "=", so the model has to supply the label.
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

In [10]:
def evaluate(y_true, y_pred):
    """Print accuracy, per-class accuracy, a classification report and a confusion matrix.

    Labels are converted to integers before scoring: "negative" -> 0,
    "neutral" -> 1, "positive" -> 2. The fallback prediction "none" and any
    label missing from the mapping are scored as neutral (1).

    Args:
        y_true: Iterable of gold sentiment labels.
        y_pred: Iterable of predicted sentiment labels, in the same order and
            of the same length as ``y_true``.

    Returns:
        None. Every result is printed.
    """
    mapping = {"positive": 2, "neutral": 1, "none": 1, "negative": 0}

    def map_func(x):
        return mapping.get(x, 1)

    # Build integer arrays directly so the dtype is explicit rather than
    # inferred by np.vectorize from the first element.
    y_true = np.array([map_func(label) for label in y_true], dtype=int)
    y_pred = np.array([map_func(label) for label in y_pred], dtype=int)

    # Overall accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f"Accuracy: {accuracy:.3f}")

    # Accuracy restricted to the rows of each gold class (i.e. per-class
    # recall). A boolean mask selects the rows in one pass per class.
    for label in np.unique(y_true):
        mask = y_true == label
        accuracy = accuracy_score(y_true[mask], y_pred[mask])
        print(f"Accuracy for label {label}: {accuracy:.3f}")

    # Precision / recall / F1 per class
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print("\nClassification Report:")
    print(class_report)

    # Rows are gold classes, columns are predictions, both in 0, 1, 2 order
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print("\nConfusion Matrix:")
    print(conf_matrix)

In [11]:
model_name = "microsoft/Phi-3-mini-4k-instruct"

# float16 is used both as the 4-bit compute dtype and as the load dtype.
compute_dtype = torch.float16

# 4-bit NF4 quantisation with double quantisation enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the quantised base model onto the device selected earlier.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map=device,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# The EOS token doubles as the padding token; padding is added on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# setup_chat_format returns an updated (model, tokenizer) pair; the recorded
# output shows it adds special tokens to the vocabulary.
model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
def predict(test, model, tokenizer):
    """Generate one sentiment label for every prompt in the global ``X_test``.

    Args:
        test: Not used. The prompts are read from the notebook-level
            ``X_test`` frame ("text" column); the parameter is kept so that
            existing calls keep working.
        model: Model handed to the text-generation pipeline.
        tokenizer: Tokenizer handed to the text-generation pipeline.

    Returns:
        list[str]: One of "positive", "negative", "neutral" or "none" per row
        of ``X_test``, in row order. "none" means no label word was found in
        the generated answer.
    """
    # The pipeline does not depend on the prompt, so build it once instead of
    # once per row.
    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1,
        temperature=0.0,
    )

    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        result = pipe(prompt)
        # Keep only what follows the last "=" (presumably the generated text
        # echoes the prompt, so this is the model's continuation).
        answer = result[0]["generated_text"].split("=")[-1]
        # Check order matters when several label words occur: positive, then
        # negative, then neutral.
        for label in ("positive", "negative", "neutral"):
            if label in answer:
                y_pred.append(label)
                break
        else:
            y_pred.append("none")
    return y_pred

In [13]:
# Pass the prompt frame explicitly: `test` is only the last per-class split
# left over from the data-preparation loop, not the full test set.
y_pred = predict(X_test, model, tokenizer)

  0%|          | 0/900 [00:00<?, ?it/s]You are not running the flash-attention implementation, expect numerical differences.
100%|██████████| 900/900 [01:40<00:00,  8.95it/s]


In [14]:
# Score the predictions made before fine-tuning against the gold labels.
evaluate(y_true=y_true, y_pred=y_pred)

Accuracy: 0.669
Accuracy for label 0: 0.970
Accuracy for label 1: 0.620
Accuracy for label 2: 0.417

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.97      0.79       300
           1       0.60      0.62      0.61       300
           2       0.81      0.42      0.55       300

    accuracy                           0.67       900
   macro avg       0.69      0.67      0.65       900
weighted avg       0.69      0.67      0.65       900


Confusion Matrix:
[[291   8   1]
 [ 86 186  28]
 [ 60 115 125]]


In [15]:
output_dir = "trained_weights"

# LoRA adapter configuration, applied to all linear layers.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    # --- output, logging, evaluation ---
    output_dir=output_dir,  # directory the trainer writes to
    report_to="tensorboard",  # report metrics to tensorboard
    logging_steps=25,  # log every 25 steps
    save_steps=0,
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    # --- amount of training and batching ---
    num_train_epochs=3,  # number of training epochs
    max_steps=-1,
    per_device_train_batch_size=1,  # batch size per device during training
    gradient_accumulation_steps=8,  # batches accumulated before each optimizer update
    group_by_length=True,
    # --- optimizer and schedule ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,  # learning rate, based on QLoRA paper
    weight_decay=0.001,
    max_grad_norm=0.3,  # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,  # warmup ratio based on QLoRA paper
    lr_scheduler_type="cosine",  # use cosine learning rate scheduler
    # --- memory and precision ---
    gradient_checkpointing=True,  # use gradient checkpointing to save memory
    fp16=True,
    bf16=False,
)

# Supervised fine-tuning on the "text" column of the prompt datasets. Packing
# is off and no special tokens are added by the dataset preparation.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train_data,
    eval_dataset=eval_data,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    },
)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [16]:
# Run the fine-tuning loop. Being the last expression of the cell, the
# returned TrainOutput (global step, training loss, metrics) is displayed.
trainer.train()

  0%|          | 0/336 [00:00<?, ?it/s]

{'loss': 1.5626, 'grad_norm': 0.14719325304031372, 'learning_rate': 0.0001990856842874641, 'epoch': 0.22}
{'loss': 0.9383, 'grad_norm': 0.13051626086235046, 'learning_rate': 0.00019297764858882514, 'epoch': 0.44}
{'loss': 0.8695, 'grad_norm': 0.1571911722421646, 'learning_rate': 0.00018146608991420534, 'epoch': 0.67}
{'loss': 0.9152, 'grad_norm': 0.1308598816394806, 'learning_rate': 0.0001652200182109602, 'epoch': 0.89}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.7957496643066406, 'eval_runtime': 9.6754, 'eval_samples_per_second': 15.503, 'eval_steps_per_second': 1.964, 'epoch': 1.0}
{'loss': 0.8328, 'grad_norm': 0.1403578668832779, 'learning_rate': 0.0001451835961144145, 'epoch': 1.11}
{'loss': 0.7734, 'grad_norm': 0.20339500904083252, 'learning_rate': 0.00012252126764738844, 'epoch': 1.33}
{'loss': 0.7638, 'grad_norm': 0.18621598184108734, 'learning_rate': 9.855008496617327e-05, 'epoch': 1.56}
{'loss': 0.7756, 'grad_norm': 0.2258760631084442, 'learning_rate': 7.466316607649738e-05, 'epoch': 1.78}
{'loss': 0.7611, 'grad_norm': 0.16866222023963928, 'learning_rate': 5.2248731878811365e-05, 'epoch': 2.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.7744114398956299, 'eval_runtime': 10.0926, 'eval_samples_per_second': 14.862, 'eval_steps_per_second': 1.883, 'epoch': 2.0}
{'loss': 0.7261, 'grad_norm': 0.1970079243183136, 'learning_rate': 3.2609427815531426e-05, 'epoch': 2.22}
{'loss': 0.7273, 'grad_norm': 0.21473923325538635, 'learning_rate': 1.6886618852849724e-05, 'epoch': 2.44}
{'loss': 0.6818, 'grad_norm': 0.21480531990528107, 'learning_rate': 5.994057497592031e-06, 'epoch': 2.67}
{'loss': 0.6671, 'grad_norm': 0.22301256656646729, 'learning_rate': 5.647798228764156e-07, 'epoch': 2.89}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.7884143590927124, 'eval_runtime': 10.1034, 'eval_samples_per_second': 14.846, 'eval_steps_per_second': 1.881, 'epoch': 2.99}
{'train_runtime': 2021.2358, 'train_samples_per_second': 1.336, 'train_steps_per_second': 0.166, 'train_loss': 0.8362029762495131, 'epoch': 2.99}


TrainOutput(global_step=336, training_loss=0.8362029762495131, metrics={'train_runtime': 2021.2358, 'train_samples_per_second': 1.336, 'train_steps_per_second': 0.166, 'total_flos': 5931624305147904.0, 'train_loss': 0.8362029762495131, 'epoch': 2.986666666666667})

In [17]:
# Write the trainer's model and the tokenizer side by side into `output_dir`
# (the same directory the trainer was configured with).
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

('trained_weigths/tokenizer_config.json',
 'trained_weigths/special_tokens_map.json',
 'trained_weigths/tokenizer.json')

In [None]:
import gc

# Drop the names bound to the training objects so that nothing in the
# notebook namespace keeps them alive.
del (
    model,
    tokenizer,
    peft_config,
    trainer,
    train_data,
    eval_data,
    bnb_config,
    training_arguments,
)
# The raw frame and the train/eval prompt frames are no longer needed either;
# X_test and y_true are deliberately kept.
del df, X_train, X_eval
# Also remove the training-only classes imported at the top of the notebook.
del TrainingArguments, SFTTrainer, LoraConfig, BitsAndBytesConfig

In [20]:
# Alternate between emptying torch's CUDA cache and running the garbage
# collector, 100 passes in total.
for _pass in range(100):
    torch.cuda.empty_cache()
    gc.collect()

In [21]:
from peft import AutoPeftModelForCausalLM

# Must point at the directory the adapter was saved to, i.e. `output_dir`
# ("trained_weights"). The earlier spelling "trained_weigths" names a
# directory that a fresh run never creates.
finetuned_model = "./trained_weights/"
compute_dtype = getattr(torch, "float16")
# NOTE(review): this reloads the tokenizer from the base `model_name`, not the
# copy saved next to the adapter; confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the saved adapter (together with its base model) in float16 on `device`.
model = AutoPeftModelForCausalLM.from_pretrained(
    finetuned_model,
    torch_dtype=compute_dtype,
    return_dict=False,
    low_cpu_mem_usage=True,
    device_map=device,
)

# Merge the adapter into the base model and store the result, in safetensors
# shards of at most 2GB, together with the tokenizer.
merged_model = model.merge_and_unload()
merged_model.save_pretrained(
    "./merged_model", safe_serialization=True, max_shard_size="2GB"
)
tokenizer.save_pretrained("./merged_model")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


('./merged_model/tokenizer_config.json',
 './merged_model/special_tokens_map.json',
 './merged_model/tokenizer.json')

In [22]:
# Pass the prompt frame explicitly: `test` is only the last per-class split
# left over from the data-preparation loop, not the full test set.
y_pred = predict(X_test, merged_model, tokenizer)
evaluate(y_true, y_pred)

100%|██████████| 900/900 [01:29<00:00, 10.01it/s]


Accuracy: 0.861
Accuracy for label 0: 0.957
Accuracy for label 1: 0.720
Accuracy for label 2: 0.907

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.96      0.95       300
           1       0.87      0.72      0.79       300
           2       0.79      0.91      0.84       300

    accuracy                           0.86       900
   macro avg       0.87      0.86      0.86       900
weighted avg       0.87      0.86      0.86       900


Confusion Matrix:
[[287   8   5]
 [ 15 216  69]
 [  5  23 272]]


In [None]:
%load_ext tensorboard
%tensorboard --logdir logs/runs

In [23]:
# Put each test prompt next to its gold label and its prediction, then write
# the table to disk without the index column.
evaluation_columns = {
    "text": X_test["text"],
    "y_true": y_true,
    "y_pred": y_pred,
}
evaluation = pd.DataFrame(evaluation_columns)
evaluation.to_csv("test_predictions.csv", index=False)