# Fine-Tune Llama 3

In [1]:
MODEL = "meta-llama/Llama-3.2-3B"
MAX_TOKENS = 8192

In [2]:
%pip install "torch==2.2.2" tensorboard --quiet
%pip install --upgrade "transformers==4.43.2" "vllm==0.5.5" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.23.5" "trl==0.8.6" "peft==0.10.0" --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autogluon-multimodal 1.1.1 requires nvidia-ml-py3==7.352.0, which is not installed.
autogluon-multimodal 1.1.1 requires jsonschema<4.22,>=4.18, but you have jsonschema 4.23.0 which is incompatible.
autogluon-multimodal 1.1.1 requires omegaconf<2.3.0,>=2.1.1, but you have omegaconf 2.3.0 which is incompatible.
autogluon-multimodal 1.1.1 requires scikit-learn<1.4.1,>=1.3.0, but you have scikit-learn 1.5.2 which is incompatible.
autogluon-multimodal 1.1.1 requires scipy<1.13,>=1.5.4, but you have scipy 1.14.1 which is incompatible.
autogluon-timeseries 1.1.1 requires gluonts==0.15.1, but you have gluonts 0.14.3 which is incompatible.
autogluon-timeseries 1.1.1 requires scipy<1.13,>=1.5.4, but you have scipy 1.14.1 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated package

###### Imports

In [3]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score,
    confusion_matrix,
    classification_report,
    balanced_accuracy_score,
    accuracy_score,
)

from scipy.stats import pearsonr
from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from datasets import load_metric

# we do not want to init wandb
os.environ["WANDB_DISABLED"] = "true"

2024-11-17 13:55:54.742457: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-17 13:55:54.758479: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-17 13:55:54.763914: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-17 13:55:54.777430: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 1. Read Data


In [4]:
# read yelp data (see 00-prepare-dataset.ipynb for more info)
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_val = pd.read_csv("val.csv")

dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)
dataset_test = Dataset.from_pandas(df_test)
dataset = DatasetDict(
    {
        "train": dataset_train,
        "val": dataset_val,
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'id'],
        num_rows: 250
    })
    val: Dataset({
        features: ['label', 'text', 'id'],
        num_rows: 225
    })
})

In [5]:
labels = df_train["label"].unique()
labels

array([3, 4, 2, 1, 0])

## 2. Configure model

### 2.1  4-bit quantization

A technique to reduce the size and computational requirements of a machine learning model by </br>
representing its weights with 4-bit integers instead of higher-precision values, </br>
trading off some precision for efficiency.


In [6]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # enable 4-bit quantization
    bnb_4bit_quant_type="nf4",  # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant=True,  # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype=torch.bfloat16,  # optimized fp format for M
)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL, quantization_config=quantization_config, num_labels=len(labels)
)
model = prepare_model_for_kbit_training(model)
model

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
   

### 2.2 LORA config
(Low-Rank Adaptation): A method for efficiently fine-tuning large language models by </br>
updating only a small number of low-rank parameters, significantly reducing the computational </br>
and storage requirements.

In [7]:
lora_config = LoraConfig(
    r=16,  # the dimension of the low-rank matrices
    lora_alpha=8,  # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,  # dropout probability of the LoRA layers
    bias="none",  # wether to train bias weights, set to 'none' for attention layers
    task_type="SEQ_CLS",
)

model = get_peft_model(model, lora_config)
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
        

### 2.3 Tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL, add_prefix_space=True)

# Since LLAMA3 pre-training doesn't have EOS token
# Set the pad_token_id to eos_token_id
tokenizer.pad_token_id = tokenizer.eos_token_id
# Set pad token to eos_token
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

### 2.4 Model Config

In [9]:
model.config.pad_token_id = tokenizer.pad_token_id
# Must use .cache = False as below or it crashes from my experience
model.config.use_cache = False
model.config.pretraining_tp = 1

## 3. Preprocess dataset

In [10]:
col_to_delete = ["id"]


# convert words to tokens
def llama_preprocessing_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=MAX_TOKENS)


tokenized_datasets = dataset.map(
    llama_preprocessing_function, batched=True, remove_columns=col_to_delete
)
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

In [11]:
# The `DataCollatorWithPadding` specifically manages padding, using a tokenizer
# to ensure that all sequences are padded to the same length for consistent model input.

# Padding: Uniformly pads sequences to the length of the longest sequence using a special token,
# allowing simultaneous batch processing.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

# 4. Training a model

In [13]:
# configure how we should train
training_args = TrainingArguments(
    output_dir=f"checkpoints/{MODEL}",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# calculate accuracy for evaluating the output of the model.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)
train_result = trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.246127,0.475556
2,No log,1.017372,0.595556
3,No log,0.986125,0.595556


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


## 5. Evaluate Model

In [14]:
def make_predictions(model, df):
    # Convert summaries to a list
    sentences = df.text.tolist()

    # Define the batch size
    batch_size = 32  # You can adjust this based on your system's memory capacity

    # Initialize an empty list to store the model outputs
    all_outputs = []

    # Process the sentences in batches
    for i in range(0, len(sentences), batch_size):
        # Get the batch of sentences
        batch_sentences = sentences[i : i + batch_size]

        # Tokenize the batch
        inputs = tokenizer(
            batch_sentences,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_TOKENS,
        )

        # Move tensors to the device where the model is (e.g., GPU or CPU)
        inputs = {
            k: v.to("cuda" if torch.cuda.is_available() else "cpu")
            for k, v in inputs.items()
        }

        # Perform inference and store the logits
        with torch.no_grad():
            outputs = model(**inputs)
            all_outputs.append(outputs["logits"])

    final_outputs = torch.cat(all_outputs, dim=0)
    df["predictions"] = final_outputs.argmax(axis=1).cpu().numpy()
    return df

In [15]:
def get_performance_metrics(df_test):
    y_test = df_test.label.round()
    y_pred = df_test.predictions.round()

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [16]:
df_test = make_predictions(model, df_test)
get_performance_metrics(df_test)


Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.65      0.62        96
           1       0.51      0.51      0.51       108
           2       0.41      0.47      0.44       100
           3       0.47      0.41      0.44       115
           4       0.64      0.59      0.61       106

    accuracy                           0.52       525
   macro avg       0.52      0.53      0.52       525
weighted avg       0.52      0.52      0.52       525

Balanced Accuracy Score: 0.525625573481603
Accuracy Score: 0.5219047619047619
