# PEFT Fine-Tune Llama Models in a Notebook

In [1]:
import os
import torch

## Check the environment
assert torch.cuda.is_available(), "Failed to detect GPUs, make sure you set up cuda correctly!"
print("Number of GPUs available: ", torch.cuda.device_count())

major, _ = torch.cuda.get_device_capability()
if major >= 8:
    print("=" * 80)
    print("Your GPU supports bfloat16: accelerate training with bf16=True")
    print("=" * 80)

print("Huggingface Home =", os.environ['HF_HOME'])

ModuleNotFoundError: No module named 'torch'

In [2]:
#!pip install -U accelerate peft bitsandbytes transformers trl

NameError: name 'L' is not defined

In [2]:
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Model

In [3]:
# The model that you want to load from a directory or the model name from the Hugging Face hub.
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

In [4]:
# Activate 4-bit precision for model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# Load the entire model on the GPU 0
device_map = {"": 0}

### Load the Model

In [5]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

### Test the model

In [6]:
# Run text generation pipeline with the loaded model.
# prompt = "<s>USER: Quote by Einstein?\nASST: "
prompt = "Once upon a time, Einstein said "
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=56)
result = pipe(prompt)
print(result[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Once upon a time, Einstein said 100 years ago, "The most important discovery of the 20th century is the theory of relativity."
"The most important discovery of the 20th century is the theory of relativity."
—


In [8]:
prompt = "Artificial intelligence is transforming the world by "
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=56)
result = pipe(prompt)
print(result[0]['generated_text'])

Artificial intelligence is transforming the world by 2030.
The world is changing at a rapid pace. The world is changing at a rapid pace. The world is changing at a rapid pace. The world is changing at a rapid pace. The world is


In [9]:
prompt = "What is 2 + 2?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=56)
result = pipe(prompt)
print(result[0]['generated_text'])

What is 2 + 2?
2 + 2 = 4


A: You can use the following code:
var a = 2;
var b = 2;
var c = 2;
var d = 2;


In [10]:
prompt = "How do you write the SMILES notation for polyethylene?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=56)
result = pipe(prompt)
print(result[0]['generated_text'])

How do you write the SMILES notation for polyethylene?
What is the SMILES notation for polyethylene?
What is the SMILES notation for polyethylene?
What is the SMILES notation for polyethylene


In [11]:
prompt = "What is polyethylene?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=56)
result = pipe(prompt)
print(result[0]['generated_text'])

What is polyethylene?
Polyethylene is a plastic that is used in a variety of applications. It is a clear, colorless, and odorless plastic that is made from petroleum. It is a common plastic that is


In [21]:
prompt = "The SMILES of the repeating unit of polyethylene is "
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=56)
result = pipe(prompt)
print(result[0]['generated_text'])

The SMILES of the repeating unit of polyethylene is 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17


In [18]:
prompt = "The melting point of polyethylene is "
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=50)
result = pipe(prompt)
print(result[0]['generated_text'])

The melting point of polyethylene is 120°C.
Polyethylene is a clear, colorless, odorless, tasteless, and non-flammable plastic.
Poly


# Dataset

In [22]:
# Load a huggingface dataset.
# -----------------------------------------------------------
dataset_name = "Abirate/english_quotes"
dataset = load_dataset(dataset_name, split="train[:1000]")

# Load from a custom jsonl dataset (list of dict).
# -----------------------------------------------------------
# jsonlist = []
# def gen():
#     yield from jsonlist

# dataset = Dataset.from_generator(gen)

Downloading readme:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [26]:
dataset[2]

{'quote': "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”",
 'author': 'Albert Einstein',
 'tags': ['human-nature',
  'humor',
  'infinity',
  'philosophy',
  'science',
  'stupidity',
  'universe']}

### Instruction format

In [27]:
dataset_field = "text"

def instruction_format(row):
    row[dataset_field] = f"<s>USER: Quote by {row['author']}?\nASST: {row['quote']}</s>"
    return row

dataset = dataset.map(instruction_format)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [29]:
dataset[2]

{'quote': "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”",
 'author': 'Albert Einstein',
 'tags': ['human-nature',
  'humor',
  'infinity',
  'philosophy',
  'science',
  'stupidity',
  'universe'],
 'text': "<s>USER: Quote by Albert Einstein?\nASST: “Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”</s>"}

# Training

In [30]:
# Fine-tuned model name
new_model = "tinyllama-quotes"

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 8

# Alpha parameter for LoRA scaling
lora_alpha = 8

# Dropout probability for LoRA layers
lora_dropout = 0.01

################################################################################
# TrainingArguments parameters
################################################################################

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 25

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False


### Set Training & SFT parameters

In [31]:
tokenizer.padding_side = "right" # Fix overflow issue with fp16 training

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="none"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field=dataset_field,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

### Train & save the trained model

In [32]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

Step,Training Loss
25,2.9186
50,2.175
75,1.9983
100,1.6022
125,1.8636
150,1.4462
175,1.7769
200,1.3663
225,1.7151
250,1.359




In [37]:
prompt = "Quote by Albert Einstein "
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=100)
result = pipe(prompt)
print(result[0]['generated_text'])

Quote by Albert Einstein 1921
"The most important thing in life is not to stop questioning."
"The most important thing in life is not to stop questioning; it is to ask questions until you get to the answer."
"The most important thing in life is not to stop questioning; it is to ask questions until you get to the answer."
"The most important thing in life is not to stop questioning; it is to ask questions


In [38]:
prompt = "Quote by Albert Einstein "
pipe = pipeline(task="text-generation", model=new_model, tokenizer=tokenizer, max_length=100)
result = pipe(prompt)
print(result[0]['generated_text'])

Quote by Albert Einstein 1921
"The most important thing is not to stop questioning. Curiosity has its own reason for existing. One must not merely exist, one must also question."
— Albert Einstein German-born theoretical physicist and author 1879 - 1955
"The most important thing is not to stop questioning. Curiosity has its own reason for existing. One must not merely exist, one
