# Fine-Tune Llama 3

In [None]:
%pip install "torch==2.2.2" tensorboard --quiet
%pip install  --upgrade "transformers==4.43.2" "vllm==0.5.5" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.23.5" "trl==0.8.6" "peft==0.10.0" 

###### Imports

In [2]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from scipy.stats import pearsonr
from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)


2024-11-16 12:12:29.432269: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-16 12:12:29.448099: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-16 12:12:29.453126: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-16 12:12:29.465275: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 1. Read Data


In [5]:
# read yelp data (see 00-prepare-dataset.ipynb for more info)
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_val = pd.read_csv("val.csv")

# we have a score from 1 - 5, bnut labels need to start from 0 so we subtract 1.
df_train["class_name"] = df_train["class_name"] - 1
df_test["class_name"] = df_test["class_name"] - 1
df_val["class_name"] = df_val["class_name"] - 1

dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)
dataset_test = Dataset.from_pandas(df_test)
dataset = DatasetDict({
    'train': dataset_train,
    'val': dataset_val,
})
dataset

DatasetDict({
    train: Dataset({
        features: ['class_name', 'question', 'id'],
        num_rows: 250
    })
    val: Dataset({
        features: ['class_name', 'question', 'id'],
        num_rows: 225
    })
})

In [6]:
labels = df_train["class_name"].unique()
labels

array([4, 1, 2, 3, 0])

## 2. Configure model

In [8]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"

### 2.1  4-bit quantization

A technique to reduce the size and computational requirements of a machine learning model by </br>
representing its weights with 4-bit integers instead of higher-precision values, </br>
trading off some precision for efficiency.


In [9]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for M
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=len(labels)
)
model = prepare_model_for_kbit_training(model)
model

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (

### 2.2 LORA config
(Low-Rank Adaptation): A method for efficiently fine-tuning large language models by </br>
updating only a small number of low-rank parameters, significantly reducing the computational </br>
and storage requirements.

In [10]:
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

model = get_peft_model(model, lora_config)
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
        

### 2.3 Tokenizer

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Since LLAMA3 pre-training doesn't have EOS token
# Set the pad_token_id to eos_token_id
tokenizer.pad_token_id = tokenizer.eos_token_id
# Set pad token to eos_token
tokenizer.pad_token = tokenizer.eos_token

### 2.4 Model Config

In [12]:
model.config.pad_token_id = tokenizer.pad_token_id
# Must use .cache = False as below or it crashes from my experience
model.config.use_cache = False
model.config.pretraining_tp = 1

## 3. Preprocess dataset

In [13]:
MAX_LEN = 8192
col_to_delete = ['id']

# convert words to tokens
def llama_preprocessing_function(examples):
    return tokenizer(examples['question'], truncation=True, max_length=8192)

tokenized_datasets = dataset.map(llama_preprocessing_function, batched=True, remove_columns=col_to_delete)
tokenized_datasets = tokenized_datasets.rename_column("class_name", "label")
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

In [14]:
# The `DataCollatorWithPadding` specifically manages padding, using a tokenizer 
# to ensure that all sequences are padded to the same length for consistent model input.

# Padding: Uniformly pads sequences to the length of the longest sequence using a special token, 
# allowing simultaneous batch processing.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)


In [16]:
# class CustomTrainer(Trainer):
#     def __init__(self, *args, class_weights=None, **kwargs):
#         super().__init__(*args, **kwargs)
#         # Ensure label_weights is a tensor
#         if class_weights is not None:
#             self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
#         else:
#             self.class_weights = None

#     def compute_loss(self, model, inputs, return_outputs=False):
#         # Extract labels and convert them to long type for cross_entropy
#         labels = inputs.pop("labels").long()

#         # Forward pass
#         outputs = model(**inputs)

#         # Extract logits assuming they are directly outputted by the model
#         logits = outputs.get('logits')

#         # Compute custom loss with class weights for imbalanced data handling
#         if self.class_weights is not None:
#             loss = F.cross_entropy(logits, labels, weight=self.class_weights)
#         else:
#             loss = F.cross_entropy(logits, labels)

#         return (loss, outputs) if return_outputs else loss


# define training args

In [17]:
training_args = TrainingArguments(
    output_dir = 'sequence_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 2,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True
    
)

# calculate accuracy for evaluating the output of the model.
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    try:
        # it's a classification task, take the argmax
        predictions_processed = np.argmax(predictions, axis=1)

        # Calculate Pearson correlation
        pearson, _ = pearsonr(predictions_processed, labels)

        return {'pearson': pearson}
    except Exception as e:
        print(f"Error in compute_metrics: {e}")
        return {'pearson': None}

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['val'],
    tokenizer = tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics,
    # class_weights=class_weights,
)



#### Define custom trainer

* https://huggingface.co/docs/transformers/en/training

### Run trainer!

In [19]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [20]:
train_result = trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Pearson
1,No log,1.803955,0.219168
2,No log,1.664384,0.357756


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


#### Let's check the results
* I wrapped in a function a convenient way add the predictions

In [21]:
def make_predictions(model, df):


  # Convert summaries to a list
  sentences = df.question.tolist()

  # Define the batch size
  batch_size = 32  # You can adjust this based on your system's memory capacity

  # Initialize an empty list to store the model outputs
  all_outputs = []

  # Process the sentences in batches
  for i in range(0, len(sentences), batch_size):
      # Get the batch of sentences
      batch_sentences = sentences[i:i + batch_size]

      # Tokenize the batch
      inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

      # Move tensors to the device where the model is (e.g., GPU or CPU)
      inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

      # Perform inference and store the logits
      with torch.no_grad():
          outputs = model(**inputs)
          all_outputs.append(outputs['logits'])

  final_outputs = torch.cat(all_outputs, dim=0)
  df['predictions']=final_outputs.argmax(axis=1).cpu().numpy()
  # df['predictions']=df['predictions'].apply(lambda l:category_map[l])




### Analyze performance

In [22]:
def get_performance_metrics(df_test):
  y_test = df_test.class_name.round()
  y_pred = df_test.predictions.round()
  print(f"comparing test {y_test} and pred {y_pred}")

  print("Confusion Matrix:")
  print(confusion_matrix(y_test, y_pred))

  print("\nClassification Report:")
  print(classification_report(y_test, y_pred))

  print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
  print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [23]:
df_val

Unnamed: 0,class_name,question,id
0,0,staff is friendly enough..drs are quick and do...,8918
1,4,"Incredible coffee, on the same level as LUX co...",303120
2,4,Pizza is really good and hot wings are too. Se...,474126
3,1,Place is falling apart! Just outside of the B...,162806
4,2,"It's not often I find myself on Murray Ave, bu...",77420
...,...,...,...
220,2,Think of the Sugar Factory as a beddazzled Che...,504446
221,2,This would be a great place to watch a game. T...,544799
222,4,During a extended layover at the Phoenix Sky H...,111784
223,1,This Fry's is just around the corner from my h...,23785


In [24]:
make_predictions(model,df_val)

get_performance_metrics(df_val)
df_val

comparing test 0      0
1      4
2      4
3      1
4      2
      ..
220    2
221    2
222    4
223    1
224    2
Name: class_name, Length: 225, dtype: int64 and pred 0      4
1      4
2      0
3      3
4      4
      ..
220    2
221    2
222    1
223    2
224    0
Name: predictions, Length: 225, dtype: int64
Confusion Matrix:
[[25 11  5  5  4]
 [12 11 13  7  5]
 [10  7 10 10  4]
 [ 6  4  6  9 11]
 [ 8  5  7 15 15]]

Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.50      0.45        50
           1       0.29      0.23      0.26        48
           2       0.24      0.24      0.24        41
           3       0.20      0.25      0.22        36
           4       0.38      0.30      0.34        50

    accuracy                           0.31       225
   macro avg       0.30      0.30      0.30       225
weighted avg       0.31      0.31      0.31       225

Balanced Accuracy Score: 0.3046138211382114
Accuracy Score: 0.31111

Unnamed: 0,class_name,question,id,predictions
0,0,staff is friendly enough..drs are quick and do...,8918,4
1,4,"Incredible coffee, on the same level as LUX co...",303120,4
2,4,Pizza is really good and hot wings are too. Se...,474126,0
3,1,Place is falling apart! Just outside of the B...,162806,3
4,2,"It's not often I find myself on Murray Ave, bu...",77420,4
...,...,...,...,...
220,2,Think of the Sugar Factory as a beddazzled Che...,504446,2
221,2,This would be a great place to watch a game. T...,544799,2
222,4,During a extended layover at the Phoenix Sky H...,111784,1
223,1,This Fry's is just around the corner from my h...,23785,2


### Saving the model trainer state and model adapters

In [None]:
metrics = train_result.metrics
max_train_samples = len(dataset_train)
metrics["train_samples"] = min(max_train_samples, len(dataset_train))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

#### Saving the adapter model
* Note this doesn't save the entire model. It only saves the adapters.