# Llama 3 8B QLoRA Fine-tuning and Inference

### Dependencies

In [1]:
# Install Unsloth, Xformers (Flash Attention) and all other required packages
# NOTE(review): Unsloth is installed from the repository's default branch with no
# pinned commit or tag, so re-running this cell later may resolve to a different
# version; consider pinning for reproducibility.

!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps: install only the packages named here, without pulling in their dependencies.
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-drk_nadj/unsloth_6f575955200c44f78eccdf98629b513f
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-drk_nadj/unsloth_6f575955200c44f78eccdf98629b513f
  Resolved https://github.com/unslothai/unsloth.git to commit 933d9fe2cb2459f949ee2250e90a5b610d277eab
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


### Model and Tokenizer Settings

In [2]:
from unsloth import FastLanguageModel
import torch
# Maximum sequence length; passed to the model loader here and reused by the
# trainer and the later reload cell.
max_seq_length = 2048
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere A100
load_in_4bit = True # Using 4bit quantization to reduce memory usage

# Load the `llama-3-8b-Instruct-bnb-4bit` checkpoint and its tokenizer with the
# settings defined above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
[2024-06-30 06:27:05,812] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: Quadro RTX 5000. Max memory: 15.56 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.24. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Wrap the loaded model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # LoRA rank
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    # NOTE(review): lora_alpha is smaller than r; with standard LoRA scaling
    # (lora_alpha / r) the adapter update is scaled by 0.5 — confirm this is intended.
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",
    # "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth",
    random_state = 3407, # same value as the `seed` given to TrainingArguments
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Data Formatter

In [5]:
# Assumes every dataset record already has "instruction", "input" and "output" fields.

# Alpaca-style prompt template with three positional slots, filled in order with
# the instruction, the input, and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token string; the formatter appends it to every training example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of records into full training strings.

    Args:
        examples: A batched mapping with "instruction", "input" and "output"
            keys, each holding a sequence of strings. The three sequences are
            paired positionally.

    Returns:
        A dict with a single "text" key whose value is the list of rendered
        prompts, each one the filled-in ``alpaca_prompt`` followed by
        ``EOS_TOKEN``.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # Must add EOS_TOKEN, otherwise the generation will go on forever!
    rendered = [alpaca_prompt.format(*row) + EOS_TOKEN for row in rows]
    return {"text": rendered}

from datasets import load_dataset
# Load the training JSONL; later cells read it through the "train" split.
# NOTE(review): the path starts with "..Code_Refinement" (no separator after
# ".."); confirm this is the real directory name rather than "../Code_Refinement".
dataset = load_dataset("json", data_files= "..Code_Refinement/train.jsonl")
# Add a "text" column holding the rendered prompt + response for every record.
dataset = dataset.map(formatting_prompts_func, batched = True,)

### Logger Settings

In [5]:
# Check the wandb login first; run `wandb login --relogin` if needed.

import os

# Point this run's logging at a dedicated W&B project; nothing is exported when
# the project name is left empty.
wandb_project = "unsloth-llama3-five-epochs-refine"
if wandb_project:
    os.environ["WANDB_PROJECT"] = wandb_project


### HF Trainer Settings

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
# import datetime

# Supervised fine-tuning trainer over the rendered "text" column of the train split.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    dataset_text_field = "text", # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # 2 x 4 = 8 sequences per optimizer step per device
        warmup_steps = 500,
        learning_rate = 2e-4,
        # Exactly one mixed-precision mode is enabled: bf16 where supported, fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 100,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        # Same directory name the fine-tuned model and tokenizer are saved to later.
        output_dir = "qlora-llama3-five-epochs-refine",
        num_train_epochs=5,
        report_to="wandb", # project name comes from the WANDB_PROJECT environment variable
        save_steps=10000,
    ),
)

### Memory Check

In [7]:
# Show current memory stat before training

# Properties of CUDA device 0 (name and total memory are read below).
gpu_stats = torch.cuda.get_device_properties(0)
# Peak memory reserved so far, converted from bytes to GB; kept as the baseline
# that the post-training cell subtracts from its own peak.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Total device memory in GB.
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Quadro RTX 5000. Max memory = 15.56 GB.
5.82 GB of memory reserved.


### Training Stats

In [9]:
# Run fine-tuning. The result is kept because the memory/time report cell reads
# trainer_stats.metrics['train_runtime'].
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 150,406 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 94,000
 "-____-"     Number of trainable parameters = 83,886,080
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m1805112[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/94000 [00:00<?, ?it/s]

{'loss': 1.7555, 'grad_norm': 0.3547803461551666, 'learning_rate': 4e-05, 'epoch': 0.01}
{'loss': 0.9047, 'grad_norm': 0.39553922414779663, 'learning_rate': 8e-05, 'epoch': 0.01}
{'loss': 0.8583, 'grad_norm': 0.25137999653816223, 'learning_rate': 0.00012, 'epoch': 0.02}
{'loss': 0.8517, 'grad_norm': 0.2347923070192337, 'learning_rate': 0.00016, 'epoch': 0.02}
{'loss': 0.8486, 'grad_norm': 0.25250470638275146, 'learning_rate': 0.0002, 'epoch': 0.03}
{'loss': 0.8562, 'grad_norm': 0.23727315664291382, 'learning_rate': 0.0001997860962566845, 'epoch': 0.03}
{'loss': 0.8473, 'grad_norm': 0.2884700894355774, 'learning_rate': 0.00019957219251336898, 'epoch': 0.04}
{'loss': 0.835, 'grad_norm': 0.26967114210128784, 'learning_rate': 0.00019935828877005348, 'epoch': 0.04}
{'loss': 0.8555, 'grad_norm': 0.34635454416275024, 'learning_rate': 0.00019914438502673798, 'epoch': 0.05}
{'loss': 0.8349, 'grad_norm': 0.37339168787002563, 'learning_rate': 0.00019893048128342245, 'epoch': 0.05}
{'loss': 0.8406

wandb: Network error (ConnectionError), entering retry loop.


{'loss': 0.705, 'grad_norm': 0.410397469997406, 'learning_rate': 0.00015723422459893048, 'epoch': 1.09}
{'loss': 0.7211, 'grad_norm': 0.44502586126327515, 'learning_rate': 0.00015702032085561498, 'epoch': 1.1}
{'loss': 0.7214, 'grad_norm': 0.4496280252933502, 'learning_rate': 0.00015680641711229946, 'epoch': 1.1}
{'loss': 0.7097, 'grad_norm': 0.45690014958381653, 'learning_rate': 0.00015659251336898398, 'epoch': 1.11}
{'loss': 0.7124, 'grad_norm': 0.44366851449012756, 'learning_rate': 0.00015637860962566845, 'epoch': 1.11}
{'loss': 0.7154, 'grad_norm': 0.5002066493034363, 'learning_rate': 0.00015616470588235295, 'epoch': 1.12}
{'loss': 0.7086, 'grad_norm': 0.6164109706878662, 'learning_rate': 0.00015595080213903743, 'epoch': 1.12}
{'loss': 0.7141, 'grad_norm': 0.6238429546356201, 'learning_rate': 0.00015573689839572195, 'epoch': 1.13}
{'loss': 0.7081, 'grad_norm': 0.40691399574279785, 'learning_rate': 0.00015552299465240642, 'epoch': 1.13}
{'loss': 0.716, 'grad_norm': 0.395464181900024

### Memory Recheck

In [10]:
# Show final memory and time stats

# Peak reserved memory after training, in GB.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Growth of the peak relative to the pre-training baseline (start_gpu_memory).
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both peaks expressed as a percentage of total device memory (max_memory).
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

367021.5092 seconds used for training.
6117.03 minutes used for training.
Peak reserved memory = 10.18 GB.
Peak reserved memory for training = 4.36 GB.
Peak reserved memory % of max memory = 65.424 %.
Peak reserved memory for training % of max memory = 28.021 %.


### Single Sample Inference

In [None]:
# Load the held-out test JSONL; it is accessed below through the "train" split key.
# NOTE(review): the path starts with "..Code_Refinement" (no separator after
# ".."); confirm this is the real directory name rather than "../Code_Refinement".
test_file = "..Code_Refinement/test.jsonl"
test_dataset = load_dataset("json", data_files= test_file)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the loaded dataset summary (splits, features, row count).
test_dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 13104
    })
})

In [None]:
# Display the first test record to check the instruction / input / output fields.
test_dataset['train'][0]

{'instruction': 'You are a powerful code reviewer model. Your job is to suggest refined or fixed code based on the natural language review comment. You are given a question, and context regarding an old diff hunk or code change in programming language. You are also given a review comment based on that old code. You must output accurate refined, fixed new code snippet for that old code change and corresponding review comment in the same programming language as the old code. ',
 'input': 'Review Comment: Is the name "head" a convention for health checking? Regardless it caught me by surprise, maybe add some docs to this function on why it exist? It should also say what 204.\nOld Code: self.redirect("/static/visualiser/index.html") def head(self): self.set_status(204) self.finish()',
 'output': 'New Code: self.redirect("/static/visualiser/index.html") def head(self): """HEAD endpoint for health checking the scheduler""" self.set_status(204) self.finish()'}

In [None]:
# Inference example: generate a response for the first test record.

FastLanguageModel.for_inference(model) # Enables native 2x faster inference

# Fetch the record once by row index. Indexing the column first
# (test_dataset['train']['instruction'][0]) reads the whole column just to
# pick out a single value.
sample = test_dataset['train'][0]
inputs = tokenizer(
[
    alpaca_prompt.format(
        sample['instruction'],
        sample['input'],
        "", # response slot left empty so the model generates it
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 100, use_cache = True)
# Last expression of the cell: the decoded prompt + generation is displayed.
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are a powerful code reviewer model. Your job is to suggest refined or fixed code based on the natural language review comment. You are given a question, and context regarding an old diff hunk or code change in programming language. You are also given a review comment based on that old code. You must output accurate refined, fixed new code snippet for that old code change and corresponding review comment in the same programming language as the old code. \n\n### Input:\nReview Comment: Is the name "head" a convention for health checking? Regardless it caught me by surprise, maybe add some docs to this function on why it exist? It should also say what 204.\nOld Code: self.redirect("/static/visualiser/index.html") def head(self): self.set_status(204) self.finish()\n\n### Response:\nNew Code:

### Saving Fine-tuned Model and Tokenizer

In [None]:
# Save the fine-tuned model and the tokenizer side by side; the later reload cell
# loads from this same directory name.
# NOTE(review): presumably only the LoRA adapter weights are written, since
# `model` is the PEFT-wrapped model — confirm.
model.save_pretrained("qlora-llama3-five-epochs-refine") # Local saving
tokenizer.save_pretrained("qlora-llama3-five-epochs-refine")

('qlora-llama3-five-epochs-refine/tokenizer_config.json',
 'qlora-llama3-five-epochs-refine/special_tokens_map.json',
 'qlora-llama3-five-epochs-refine/tokenizer.json')

### Experimental Inference on Test Subset

In [6]:
# Reload the fine-tuned model and tokenizer from the local save directory and
# switch the model to inference mode.
# NOTE(review): this cell relies on max_seq_length, dtype and load_in_4bit being
# defined by the model-settings cell; after a kernel restart that cell (and the
# cell defining alpaca_prompt, used by the batch inference below) must be run
# first.
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "qlora-llama3-five-epochs-refine",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: Quadro RTX 5000. Max memory: 15.56 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.24. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Saving Ground Truths

In [7]:
import json

test_data_file_path = "../ref-test-5000-tuned.jsonl"
max_test_samples = 5000      # size of the evaluation subset
gold_prefix = "New Code: "   # label that precedes the reference code in each record

# Read up to `max_test_samples` JSON records. Iterating over the file (instead
# of calling readline() a fixed number of times) stops cleanly when the file is
# shorter than expected, and blank lines are skipped rather than fed to
# json.loads, which would raise on them.
test_data = []
with open(test_data_file_path, 'r', encoding="utf-8") as f:
    for line in f:
        if len(test_data) >= max_test_samples:
            break
        if not line.strip():
            continue
        test_data.append(json.loads(line))


# Write the output part of every test record to a file, one per line, as the
# gold reference for comparison.
# NOTE(review): an output containing a newline would span several lines of the
# gold file; this assumes the outputs are single-line — confirm.
gold_file_path = "ref-5000-gold.txt"
with open(gold_file_path, 'w') as f:
    for data in test_data:
        value = data['output']
        # Strip only the leading label; str.replace would also delete any
        # "New Code: " text that happens to occur inside the code itself.
        if value.startswith(gold_prefix):
            value = value[len(gold_prefix):]
        f.write(value + "\n")

### Batch Inference and Saving Model Response

In [9]:
from datasets import load_dataset

# NOTE(review): the ground-truth cell reads "../ref-test-5000-tuned.jsonl" while
# this cell reads "../Llama/ref-test-5000-tuned.jsonl"; confirm both point at the
# same data, otherwise gold and predictions will not line up.
test_file = "../Llama/ref-test-5000-tuned.jsonl"
test_dataset = load_dataset("json", data_files= test_file)

# Switching to inference mode does not depend on the sample, so do it once
# instead of on every iteration.
FastLanguageModel.for_inference(model)

test_split = test_dataset['train']
# Never index past the end of the split if it holds fewer than 5000 records.
num_samples = min(5000, len(test_split))

# The file is opened once in append mode and flushed after every sample, so
# partial results still survive an interrupted run.
# NOTE: because of append mode, delete "ref-5000-output.txt" before re-running
# this cell from the start, otherwise responses are duplicated in the file.
with open("ref-5000-output.txt", 'a') as f:
    for i in range(num_samples):
        # Fetch one record by row index; indexing the column first
        # (test_split['instruction'][i]) reads the whole column on every iteration.
        sample = test_split[i]
        inputs = tokenizer(
        [
            alpaca_prompt.format(
                sample['instruction'],
                sample['input'],
                "", # response slot left empty so the model generates it
            )
        ], return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = 96, use_cache = True)
        # Write the decoded prompt + generation, followed by a newline.
        f.write(tokenizer.batch_decode(outputs)[0] + "\n")
        f.flush()

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

### Collecting Predictions from Responses

In [None]:
# Scan the raw model output and keep, for every response line (a line starting
# with "New Code:"), only the generated code. One prediction is written per line
# to a new file.
# NOTE(review): this is line-based. A sample whose response is missing the
# marker, or spans several lines, yields zero or a truncated prediction and will
# shift the alignment with the gold file — verify the line counts match.
output_file_path = "ref-5000-pred.txt"
response_marker = "New Code:"
end_marker = "<|end_of_text|>"

with open("ref-5000-output.txt", 'r') as raw_file:
    lines = raw_file.readlines()

with open(output_file_path, 'w') as pred_file:
    for line in lines:
        # Only accept the marker at the start of the line. A substring test
        # combined with a fixed-width slice would emit garbage for any prompt
        # line that merely mentions "New Code:" somewhere in its text.
        if not line.startswith(response_marker):
            continue
        # Drop the marker and surrounding whitespace, so the prediction has no
        # leading space (the gold file strips "New Code: " including the space).
        prediction = line[len(response_marker):].strip()
        # Trim the end-of-text token when the model emitted it.
        if prediction.endswith(end_marker):
            prediction = prediction[:-len(end_marker)].rstrip()
        pred_file.write(prediction + "\n")