In [1]:
%%capture
# Install dependencies
%pip install -q -U trl transformers accelerate peft
%pip install -q datasets bitsandbytes
%pip install -U "huggingface_hub[cli]"

In [2]:
# Configure Huggingface token if download model from Huggingface
# import getpass
# import os
# # Get the Huggingface Token
# os.environ["HF_TOKEN"] = getpass.getpass()

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel,PeftConfig

In [4]:
# Load from HF 
# base_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(base_model, token=os.environ['HF_TOKEN'])
# base_model_fp16 = AutoModelForCausalLM.from_pretrained(base_model, device_map="auto", token=os.environ['HF_TOKEN'])

In [5]:
# Load from local disk
base_model = "Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model)

* Test the Tokenizer

In [6]:
def print_tokens_with_ids(txt):
    tokens = tokenizer.tokenize(txt, add_special_tokens=False)
    token_ids = tokenizer.encode(txt, add_special_tokens=False)
    print(list(zip(tokens, token_ids)))
prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Based on the information provided, rewrite the sentence by changing its tense from past to future.<|eot_id|><|start_header_id|>user<|end_header_id|>

She played the piano beautifully for hours and then stopped as it was midnight.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
print_tokens_with_ids(prompt)

[('<|begin_of_text|>', 128000), ('<|start_header_id|>', 128006), ('system', 9125), ('<|end_header_id|>', 128007), ('ĊĊ', 271), ('Based', 29815), ('Ġon', 389), ('Ġthe', 279), ('Ġinformation', 2038), ('Ġprovided', 3984), (',', 11), ('Ġrewrite', 18622), ('Ġthe', 279), ('Ġsentence', 11914), ('Ġby', 555), ('Ġchanging', 10223), ('Ġits', 1202), ('Ġtense', 43787), ('Ġfrom', 505), ('Ġpast', 3347), ('Ġto', 311), ('Ġfuture', 3938), ('.', 13), ('<|eot_id|>', 128009), ('<|start_header_id|>', 128006), ('user', 882), ('<|end_header_id|>', 128007), ('ĊĊ', 271), ('She', 8100), ('Ġplayed', 6476), ('Ġthe', 279), ('Ġpiano', 27374), ('Ġbeautifully', 32719), ('Ġfor', 369), ('Ġhours', 4207), ('Ġand', 323), ('Ġthen', 1243), ('Ġstopped', 10717), ('Ġas', 439), ('Ġit', 433), ('Ġwas', 574), ('Ġmidnight', 33433), ('.', 13), ('<|eot_id|>', 128009), ('<|start_header_id|>', 128006), ('assistant', 78191), ('<|end_header_id|>', 128007), ('ĊĊ', 271)]


In [7]:
#  Quantize the llama3.1 FP16 model to BNB NF4. Load the quantized model to GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
base_model_bnb_4b = AutoModelForCausalLM.from_pretrained(base_model, quantization_config=bnb_config, device_map = 'auto')

# Use the code below to load directly from HF
# base_model_bnb_4b = AutoModelForCausalLM.from_pretrained(base_model, device_map="auto", quantization_config=bnb_config, token=os.environ['HF_TOKEN'])

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# print model config
base_model_bnb_4b.config

LlamaConfig {
  "_name_or_path": "Meta-Llama-3.1-8B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quan

In [13]:
print(f"Model: {base_model_bnb_4b.__class__.__name__}")
print(f"\nModel Dtype: {base_model_bnb_4b.dtype}")
    
print("\nMemory Usage:")
memory_footprint = base_model_bnb_4b.get_memory_footprint()
# 
if isinstance(memory_footprint, dict):
    for device, memory in memory_footprint.items():
        print(f"- {device}: {memory / 1024**2:.2f} MB")
else:
    print(f"Total: {memory_footprint / 1024**2:.2f} MB")

Model: LlamaForCausalLM

Model Dtype: torch.float16

Memory Usage:
Total: 5332.52 MB


In [10]:
base_model_bnb_4b

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

In [14]:
# Test inference
prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Based on the information provided, rewrite the sentence by changing its tense from past to future.<|eot_id|><|start_header_id|>user<|end_header_id|>

She played the piano beautifully for hours and then stopped as it was midnight.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = base_model_bnb_4b.generate(input_ids=input_ids,
                          pad_token_id=tokenizer.eos_token_id,
                          max_new_tokens=200,
                          do_sample=True,
                          top_p=0.9,
                          temperature=0.1)
result = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=False)[0]
print(result)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Based on the information provided, rewrite the sentence by changing its tense from past to future.<|eot_id|><|start_header_id|>user<|end_header_id|>

She played the piano beautifully for hours and then stopped as it was midnight.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Here is the sentence rewritten in the future tense:

She will play the piano beautifully for hours and then stop as it is midnight.<|eot_id|>


## Prepare dateset
[Prepare Your Dataset for Fine-Tuning Llama 3.1](https://medium.com/@yuxiaojian/prepare-your-dataset-for-fine-tuning-llama-3-1-46fd3c78f6fd)

In [15]:
from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")

llama31_prompt="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{}<|eot_id|><|start_header_id|>user<|end_header_id|>

{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{}<|eot_id|>"""

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = llama31_prompt.format(instruction, input, output)
        texts.append(text)
    return { "text" : texts, }


dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [16]:
print(dataset[22]['text'])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Based on the information provided, rewrite the sentence by changing its tense from past to future.<|eot_id|><|start_header_id|>user<|end_header_id|>

She played the piano beautifully for hours and then stopped as it was midnight.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

She will play the piano beautifully for hours and then stop as it will be midnight.<|eot_id|>


In [18]:
# Wrap up the quantized model with a LoRA adaptor

from peft import LoraConfig, get_peft_config, get_peft_model
# https://huggingface.co/docs/peft/main/en/conceptual_guides/lora
lora_alpha = 16
lora_dropout = 0.1
lora_r = 32

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
)

peft_model = get_peft_model(base_model_bnb_4b, peft_config)
peft_model.print_trainable_parameters()

trainable params: 83,886,080 || all params: 8,114,147,328 || trainable%: 1.0338


In [19]:
# Less trainable params with lower r
lora_r = 16

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
)

peft_model = get_peft_model(base_model_bnb_4b, peft_config)
peft_model.print_trainable_parameters()

trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196


In [20]:
# Take a look at the model with LoRA adaptor
peft_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): 

* Total items in the dataset: 51,760
* steps_per_epoch = total_items / (batch_size * gradient_accumulation_steps * gpu)
* steps_per_epoch = 51,760 / (2 * 4 * 1) = 6,470 for 1 GPU

`max_steps` should >= 6,470 in production. We use a smaller value for testing purposes

In [22]:
from trl import SFTConfig
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 50
logging_steps = 5
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 200
warmup_ratio = 0.03
lr_scheduler_type = "linear"


sft_config = SFTConfig(
    dataset_text_field="text",
    max_seq_length=512,
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=True,
)

In [23]:
from trl import SFTTrainer
trainer = SFTTrainer(
    model=base_model_bnb_4b,
    train_dataset=dataset,
    peft_config=peft_config,
    args=sft_config,
)

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [24]:
# Start training

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
5,1.3454
10,1.2659
15,1.2102
20,1.2665
25,1.2544
30,1.1724
35,1.2628
40,1.3594
45,1.2677
50,1.1369


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=200, training_loss=1.1582042407989501, metrics={'train_runtime': 2144.9838, 'train_samples_per_second': 0.746, 'train_steps_per_second': 0.093, 'total_flos': 1.2560825495027712e+16, 'train_loss': 1.1582042407989501, 'epoch': 0.030911901081916538})

In [25]:
# Get the PEFT model
lora_model = trainer.model

# Save the adaptor, we will merge it with the base model.
# Since the base fp16 model takes too much memory, we will merge it with llama.cpp
lora_model.save_pretrained("llama3.1-ft-lora-adaptor")

# Don't forget to save the tokenizer if you need it
tokenizer = trainer.tokenizer
tokenizer.save_pretrained("llama3.1-ft-lora-adaptor")

('llama3.1-ft-lora-adaptor/tokenizer_config.json',
 'llama3.1-ft-lora-adaptor/special_tokens_map.json',
 'llama3.1-ft-lora-adaptor/tokenizer.json')

In [26]:
# Test inference
prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Based on the information provided, rewrite the sentence by changing its tense from past to future.<|eot_id|><|start_header_id|>user<|end_header_id|>

She played the piano beautifully for hours and then stopped as it was midnight.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = lora_model.generate(input_ids=input_ids,
                          pad_token_id=tokenizer.eos_token_id,
                          max_new_tokens=200,
                          do_sample=True,
                          top_p=0.9,
                          temperature=0.1)
result = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=False)[0]
print(result)

  return fn(*args, **kwargs)


<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Based on the information provided, rewrite the sentence by changing its tense from past to future.<|eot_id|><|start_header_id|>user<|end_header_id|>

She played the piano beautifully for hours and then stopped as it was midnight.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

She will play the piano beautifully for hours and then stop as it is midnight.<|eot_id|>
