# Fine-tune Llama 2

In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [2]:
import torch

# Count the CUDA devices visible to this kernel and derive their indices.
num_gpus = torch.cuda.device_count()
device_ids = list(range(num_gpus))

# Report what was found: one line per device, or a notice when there is none.
if num_gpus == 0:
    print("No GPUs available.")
else:
    print("Available GPUs:")
    for i in device_ids:
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

# Show the device index list as the cell output.
device_ids

Available GPUs:
GPU 0: Tesla T4
GPU 1: Tesla T4


[0, 1]

In [3]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

2024-04-22 08:35:47.922766: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-22 08:35:47.922828: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-22 08:35:47.924330: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Read the Hugging Face access token from the Kaggle secret named
# "huggingface_token" (never hard-coded in the notebook) and log in with it.
secrets_client = UserSecretsClient()
huggingface_token = secrets_client.get_secret("huggingface_token")
login(token=huggingface_token)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


**How to fine-tune Llama 2**
* We only have a 15 GB graphics card, which is barely enough to store the Llama 2 7B weights.
* On top of the weights, we also need to account for the overhead of optimizer states, gradients, and forward activations.
* Full fine-tuning is therefore not possible here: we need a parameter-efficient fine-tuning (PEFT) technique such as LoRA or QLoRA.
* To drastically reduce VRAM usage, we must fine-tune the model in 4-bit precision, which is why we use QLoRA here.

**Load dataset**

In [5]:
# Take only the first 1000 rows of the train split to keep the run short.
dataset = load_dataset(
    "vishnun0027/guanaco-llama2",
    split="train[:1000]",
)
dataset

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [6]:
# Quantization settings for loading the base model in 4-bit (QLoRA):
# NF4 quantization type, float16 compute dtype, no double quantization.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Hint when the GPU could train in bfloat16 instead of float16.
# The is_available() guard matters: get_device_capability() raises when no
# CUDA device is present. The major version is read by index rather than by
# unpacking into `_`, which IPython uses for the last cell output.
if compute_dtype == torch.float16 and torch.cuda.is_available():
    major = torch.cuda.get_device_capability()[0]
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)


In [7]:
# Load the base chat checkpoint with the 4-bit quantization config.
# device_map={"": 0} places the whole model on GPU 0 (no DataParallel wrapping
# happens in this cell).
model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-7b-chat-hf",
    device_map={"": 0},
    quantization_config=bnb_config,
)

# Config tweaks applied before training.
model.config.pretraining_tp = 1
# Turn the generation cache off for the training run.
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
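# NOTE: disabled alternative to the previous cell, kept for reference only (never executed):
# load the base model without pinning it to GPU 0 and wrap it in torch's DataParallel.
# # Load base model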
# from torch.nn.parallel import DataParallel

# model = AutoModelForCausalLM.from_pretrained(
#     "NousResearch/Llama-2-7b-chat-hf",
#     quantization_config=bnb_config,
# )
# # Wrap the model with DataParallel
# model = DataParallel(model, device_ids=device_ids)  
# model.config.use_cache = False
# model.config.pretraining_tp = 1

In [9]:
# Tokenizer that matches the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "NousResearch/Llama-2-7b-chat-hf",
    trust_remote_code=True,
)
# Pad on the right: works around a weird overflow issue seen with fp16 training.
tokenizer.padding_side = "right"
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


In [10]:
# LoRA adapter hyper-parameters for causal-LM fine-tuning:
# rank 64, alpha 16, 10% dropout on the adapter path, no bias terms trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)


**Train**

In [11]:
# Hyper-parameters for the run, grouped by purpose.
training_arguments = TrainingArguments(
    # Where checkpoints and logs go, and how often / where metrics are reported.
    output_dir="results",
    report_to="tensorboard",
    logging_steps=25,
    save_steps=0,
    # Run length: one pass over the data; max_steps=-1 leaves the epoch count in charge.
    num_train_epochs=1,
    max_steps=-1,
    # Batching: 4 sequences per device, accumulated over 4 steps
    # (16 sequences per device per optimizer step); similar lengths batched together.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type='cosine',
    warmup_ratio=0.03,
    # Mixed precision is disabled for both half-precision formats.
    fp16=False,
    bf16=False,
    # push_to_hub=True,  # left disabled
)

# Supervised fine-tuning trainer: applies the LoRA config to the quantized
# model and trains on the "text" column of the dataset, without packing.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    peft_config=peft_config,
    max_seq_length=None,
    packing=False,
)

# Run the fine-tuning; the returned TrainOutput is shown as the cell output.
trainer.train()




Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.7382


TrainOutput(global_step=31, training_loss=1.746327369443832, metrics={'train_runtime': 1474.9984, 'train_samples_per_second': 0.678, 'train_steps_per_second': 0.021, 'total_flos': 8134444407521280.0, 'train_loss': 1.746327369443832, 'epoch': 0.99})

In [12]:
# Write the fine-tuned model held by the trainer to a local directory.
trainer.model.save_pretrained(save_directory="Llama-2-7b-chat-finetune")

In [13]:
# Load the TensorBoard notebook extension and open the dashboard on the event
# files under results/runs (the training cell uses output_dir="results" and
# report_to="tensorboard").
%load_ext tensorboard
%tensorboard --logdir results/runs

In [14]:
# Silence transformers log output below CRITICAL.
logging.set_verbosity(logging.CRITICAL)

# Sanity-check the new model: wrap one question in the [INST] ... [/INST]
# chat markers and generate up to 200 tokens in total (prompt included).
prompt = "What is a large language model?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] What is a large language model? [/INST]  A large language model is a type of artificial intelligence (AI) model that is trained on a large dataset of text to generate language outputs that are coherent and natural-sounding. everybody has a unique personality, and the same is true for language models. Large language models are trained on vast amounts of text data, which allows them to learn patterns and relationships in language.

Large language models are trained on vast amounts of text data, which allows them to learn patterns and relationships in language. They are capable of generating text that is coherent and natural-sounding, and they can be used for a wide range of applications, such as language translation, text summarization, and chatbots.

Large language models are trained on vast amounts of text data, which allows them to learn patterns and relationships in language. They are capable of generating text that is coherent


In [15]:
import gc

# Empty VRAM before the base model is reloaded in FP16.
# Step 1: drop the notebook's references to the objects that hold GPU tensors.
del model
del pipe
del trainer

# Step 2: force garbage collection (run twice, as in the original cell) so the
# now-unreferenced objects are actually destroyed.
gc.collect()
gc.collect()

# Step 3: gc only frees the Python objects. PyTorch's caching allocator keeps
# the freed blocks reserved, so release them explicitly to really empty VRAM.
torch.cuda.empty_cache()

0

**Store New Llama 2 Model (Llama-2-7b-chat-finetune)**

*How can we store our new Llama-2-7b-chat-finetune model now? We need to merge the weights from LoRA with the base model. Unfortunately, as far as I know, there is no straightforward way to do it: we need to reload the base model in FP16 precision and use the peft library to merge everything.*

In [17]:
# Reload the base checkpoint in FP16 on GPU 0 (not quantized this time).
base_model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-7b-chat-hf",
    torch_dtype=torch.float16,
    device_map={"": 0},
    low_cpu_mem_usage=True,
    return_dict=True,
)

# Attach the saved LoRA adapter and fold its weights into the base model.
model = PeftModel.from_pretrained(
    base_model,
    'Llama-2-7b-chat-finetune',
).merge_and_unload()

# Recreate the tokenizer with the same padding setup so it can be saved
# alongside the merged model.
tokenizer = AutoTokenizer.from_pretrained(
    'NousResearch/Llama-2-7b-chat-hf',
    trust_remote_code=True,
)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
# Upload the merged model and its tokenizer to the same Hub repository.
hub_repo_id = "vishnun0027/Llama-2-7b-chat-finetune"

model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)


pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/vishnun0027/Llama-2-7b-chat-finetune/commit/d7376db1e15a1ff68aa0275483d68e829ae3e8f8', commit_message='Upload tokenizer', commit_description='', oid='d7376db1e15a1ff68aa0275483d68e829ae3e8f8', pr_url=None, pr_revision=None, pr_num=None)