<a href="https://colab.research.google.com/github/xuliucool/LLM-and-AppModernization/blob/main/Fine-Tuning/XBC_Fine_tune_Llama_2_in_Google_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tune Llama 2 for XBC Bank POC in Google Colab
> 🗣️ XBC Bank Large Language Model Fine tuning

❤️ Extended by Ahilan Ponnusamy from the notebook referenced below
 Created by [@maximelabonne](https://twitter.com/maximelabonne) as explained in this [Blog](https://towardsdatascience.com/fine-tune-your-own-llama-2-model-in-a-colab-notebook-df9823a04a32)

This notebook runs on a T4 GPU. (Last update: 23 Dec 2023)


In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m204.8/244.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m100.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

In [18]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [39]:
# The model that you want to train from the Hugging Face hub
#model_name = "NousResearch/Llama-2-7b-chat-hf"
model_name = "instructlab/granite-7b-lab"

# The instruction dataset to use
#dataset_name = "mlabonne/guanaco-llama2-1k"
#dataset_name = "AhilanPonnusamy/data"

file_dict = {
  "train" : "/content/data/xbc_train.csv",
}

# Fine-tuned model name
#new_model = "llama-2-7b-xbcfinetuned"
new_model = "granite-7b-lab-xbcft"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 3

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
#max_seq_length = None
max_seq_length = 10240

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [40]:
# Load dataset (you can process it here)
#dataset = load_dataset(dataset_name, split="train")
#load xbc training dataset
dataset = load_dataset(
  'csv',
  data_files=file_dict
)

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

Generating train split: 0 examples [00:00, ? examples/s]

Your GPU supports bfloat16: accelerate training with bf16=True




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at instructlab/granite-7b-lab and are newly initialized: ['model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_

Map:   0%|          | 0/293 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,2.1927
50,0.4941
75,0.4542
100,0.4358
125,0.3295
150,0.3756
175,0.3388
200,0.2848


In [41]:
# %load_ext tensorboard
# %tensorboard --logdir results/runs

In [42]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "can you transfer $50 to joseph?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] can you transfer $50 to joseph? [/INST] Your transfer of $50 to Joseph was completed successfully. Your current balance is $50



In [43]:
# Empty VRAM
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()

20933

In [44]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [45]:
# @title Default title text
variable_name = "" # @param {type:"string"}
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!huggingface-cli login

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Samlxf/granite-7b-lab-xbcft/commit/a347bb5c45d900f3e94e3aab1fea13f43752edae', commit_message='Upload tokenizer', commit_description='', oid='a347bb5c45d900f3e94e3aab1fea13f43752edae', pr_url=None, pr_revision=None, pr_num=None)

In [47]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp
#!make
!pip install -r requirements.txt

Cloning into 'llama.cpp'...
remote: Enumerating objects: 25653, done.[K
remote: Counting objects: 100% (9320/9320), done.[K
remote: Compressing objects: 100% (542/542), done.[K
remote: Total 25653 (delta 9039), reused 8840 (delta 8778), pack-reused 16333[K
Receiving objects: 100% (25653/25653), 46.90 MiB | 25.41 MiB/s, done.
Resolving deltas: 100% (18250/18250), done.
/content/llama.cpp
Collecting numpy~=1.24.4 (from -r ./requirements/requirements-convert.txt (line 1))
  Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece~=0.2.0 (from -r ./requirements/requirements-convert.txt (line 2))
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.8 MB/s[0m eta [36

In [48]:
#!git clone https://github.com/ggerganov/llama.cpp.git
#%cd llama.cpp
#!make
#!pip install -r requirements.txt
!python convert.py /content/granite-7b-lab-xbcft \
    --outfile granite-7b-lab-xbcfinetuned-q8_0-gguf.gguf \
    --outtype q8_0 \
    --pad-vocab




INFO:convert:Loading model file /content/granite-7b-lab-xbcft/pytorch_model-00001-of-00002.bin
INFO:convert:Loading model file /content/granite-7b-lab-xbcft/pytorch_model-00001-of-00002.bin
INFO:convert:Loading model file /content/granite-7b-lab-xbcft/pytorch_model-00002-of-00002.bin
INFO:convert:model parameters count : 6738483200 (7B)
INFO:convert:params = Params(n_vocab=32008, n_embd=4096, n_layer=32, n_ctx=2048, n_ff=11008, n_head=32, n_head_kv=32, n_experts=None, n_experts_used=None, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=10000.0, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=<GGMLFileType.MostlyQ8_0: 7>, path_model=PosixPath('/content/granite-7b-lab-xbcft'))
INFO:convert:Loaded vocab file PosixPath('/content/granite-7b-lab-xbcft/tokenizer.model'), type 'spm'
INFO:convert:Vocab info: <SentencePieceVocab with 32000 base tokens and 5 added tokens>
INFO:convert:Special vocab info: <SpecialVocab with 0 merges, special tokens {'bos': 1, 'eos': 32000

In [51]:
!mv /content/llama.cpp/granite-7b-lab-xbcfinetuned-q8_0-gguf.gguf /content/granite-7b-lab-xbcft/

In [52]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="/content/granite-7b-lab-xbcft/granite-7b-lab-xbcfinetuned-q8_0-gguf.gguf",
    path_in_repo="granite-7b-lab-xbcfinetuned-q8_0-gguf.gguf",
    repo_id="Samlxf/granite-7b-lab-xbcft",
    repo_type="model",
)

granite-7b-lab-xbcfinetuned-q8_0-gguf.gguf:   0%|          | 0.00/7.16G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Samlxf/granite-7b-lab-xbcft/commit/e03318799d2f5dfd3ccee1da3728803a1a10cd6f', commit_message='Upload granite-7b-lab-xbcfinetuned-q8_0-gguf.gguf with huggingface_hub', commit_description='', oid='e03318799d2f5dfd3ccee1da3728803a1a10cd6f', pr_url=None, pr_revision=None, pr_num=None)

In [59]:
%cd /content/llama.cpp/
#!make
!./main -m /content/granite-7b-lab-xbcft/granite-7b-lab-xbcfinetuned-q8_0-gguf.gguf  --seed 42 --temp 0.7 --repeat_penalty 1.1 -n 256 -p "<system>\n<user>\n How many employees are there in XBC?\n<assistant>\n"



/content/llama.cpp
Log start
main: build = 3031 (0e8d8bfd)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed  = 42
llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /content/granite-7b-lab-xbcft/granite-7b-lab-xbcfinetuned-q8_0-gguf.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = granite-7b-lab-xbcft
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 32008
llama_model_loader: - kv   3:                       llama.context_length u32              = 2048
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.b