In [None]:
# Install the notebook's dependencies (quiet mode).
# bitsandbytes is upgraded by package name; transformers, peft and accelerate
# are installed straight from their GitHub repositories; datasets by name.
# NOTE(review): nothing here is version-pinned, so a later re-run may pull
# different code than the run recorded in this notebook.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m110.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m81.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import gc
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    pipeline,
    logging,
)

In [None]:
print("GPU Available:", torch.cuda.is_available()) # report whether a CUDA GPU is available

GPU Available: True


In [None]:
# Base checkpoint, training dataset, and the name used for the fine-tuned output
model_name = "NousResearch/Llama-2-7b-chat-hf"
dataset_name = "mlabonne/guanaco-llama2-1k"
new_model = "Llama-2-7b-Dochat-finetune"

# PEFT (QLoRA) parameters
QLora_parm = dict(
    lora_r=64,
    lora_alpha=16,
    lora_dropout=0.1,
)

# Quantization parameters
BitsBytes_parm = dict(
    use_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    use_nested_quant=False,
)

# Training parameters
training_parm = dict(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    warmup_steps=2,
    max_steps=-1,
    learning_rate=2e-4,
    num_train_epochs=1,
    fp16=False,
    logging_steps=25,
    output_dir="./results",
    remove_unused_columns=False,
    report_to="none",
    optim="paged_adamw_32bit",
)


In [None]:
# Load the training split of the dataset named in the config cell.
dataset = load_dataset(dataset_name, split='train')

# Turn the configured dtype name into the torch attribute of the same name.
compute_dtype = getattr(torch, BitsBytes_parm['bnb_4bit_compute_dtype'])

# Quantization settings, later passed to the model loader as quantization_config.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=BitsBytes_parm['use_4bit'],
    bnb_4bit_use_double_quant=BitsBytes_parm['use_nested_quant'],
    bnb_4bit_quant_type=BitsBytes_parm['bnb_4bit_quant_type'],
    bnb_4bit_compute_dtype=compute_dtype,
)

# Check GPU compatibility with bfloat16.
# When 4-bit loading is on and the compute dtype is float16, look at the CUDA
# compute capability and print a hint if its major version is 8 or higher.
if compute_dtype == torch.float16 and BitsBytes_parm['use_4bit']:
  major, _ = torch.cuda.get_device_capability()
  if major >= 8:
    # "=" * 80 builds the separator line; the previous `print("=", * 80)`
    # tried to unpack the integer 80 as arguments and raised TypeError.
    print("=" * 80)
    print("Your GPU supports bfloat16: accelerate traning with bf16=True")
    print("=" * 80)


# Load the base model with the quantization config.
# device_map={"": 0} presumably places the whole model on device 0 — confirm.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},
    quantization_config=bnb_config,
)
model.config.pretraining_tp = 1
model.config.use_cache = False

# Load the LLaMA tokenizer; the end-of-sequence token doubles as the pad
# token and padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token


# LoRA configuration, filled in from QLora_parm.
peft_config = LoraConfig(
    r=QLora_parm['lora_r'],
    lora_alpha=QLora_parm['lora_alpha'],
    lora_dropout=QLora_parm['lora_dropout'],
    bias='none',
    task_type="CAUSAL_LM",
)


# Training arguments. Every key of training_parm is passed under the same
# keyword name, so the dictionary is unpacked directly.
training_arg = TrainingArguments(**training_parm)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

(…)-00000-of-00001-9ad84bb9cf65a42f.parquet:   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# This function prints the number of trainable parameters in the model
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides
            ``numel()`` and a ``requires_grad`` flag.

    Prints a single line with the trainable count, the total count and the
    trainable percentage. A model without parameters reports 0.0 percent
    instead of raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # NOTE(review): numel() is taken at face value; packed/quantized weights may
    # report fewer elements than the nominal parameter count — confirm if exact
    # totals matter.
    # Guard the ratio so an empty model does not divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# NOTE(review): `model` is rebound to the PEFT-wrapped model below, so re-running
# this cell presumably wraps an already-wrapped model — restart from the model
# loading cell before re-running.
print_trainable_parameters(model) # number of trainable parameters before applying PEFT
model = get_peft_model(model, peft_config) # wrap the model using the LoRA (PEFT) configuration
print_trainable_parameters(model) # number of trainable parameters after applying PEFT

trainable params: 262410240 || all params: 3500412928 || trainable%: 7.496550989769399
trainable params: 33554432 || all params: 3533967360 || trainable%: 0.9494833591219133


In [None]:
# Tokenize the dataset
def tokenize_function(example):
    """Tokenize the "text" field of a batch with the notebook's tokenizer.

    Sequences are padded or truncated to a fixed length of 512 and the
    tokenizer is asked for PyTorch tensors.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(example["text"], **encode_options)


# Run the tokenizer over the dataset in batches and drop the raw "text" column.
data = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Train the model on the tokenized dataset.
# This uses the plain Trainer from transformers; SFTTrainer is an alternative.
# Trainer and DataCollatorForLanguageModeling come from the notebook's import
# cell, so all imports live in one place.

# Collator that batches the tokenized examples; mlm=False turns off masked-LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_arg,
    train_dataset=data,
    data_collator=data_collator,
)

# Left as the last expression so the cell displays the returned training summary.
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
25,1.5912
50,1.3665
75,1.2903
100,1.2967
125,1.3623
150,1.3154
175,1.4171
200,1.3088
225,1.2917
250,1.181


TrainOutput(global_step=500, training_loss=1.3066954307556153, metrics={'train_runtime': 1156.5668, 'train_samples_per_second': 0.865, 'train_steps_per_second': 0.432, 'total_flos': 2.0400838803456e+16, 'train_loss': 1.3066954307556153, 'epoch': 1.0})

In [None]:
# Save the fine-tuned model held by the trainer under the `new_model` directory.
# NOTE(review): trainer.model is the PEFT-wrapped model, so this presumably
# writes only the LoRA adapter weights (a later cell reloads this directory via
# PeftModel.from_pretrained on top of a fresh base model) — confirm.
trainer.model.save_pretrained(new_model)

In [None]:
# Raise the transformers log threshold to CRITICAL to hide warnings.
logging.set_verbosity(logging.CRITICAL)

# Try the fine-tuned model through a text-generation pipeline.
prompt = "What is a large language model?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

# The prompt is wrapped in the <s>[INST] ... [/INST] markers before generation.
result = pipe(f"<s>[INST] {prompt} [/INST]")
generated = result[0]['generated_text']
print(generated)

<s>[INST] What is a large language model? [/INST] A large language model is a machine learning model that is trained on a large dataset of text, such as a large corpus of books or a large dataset of text from the internet. The model is designed to learn the patterns and structures of language, and to be able to generate text that is similar to the training data.

Large language models are typically trained using a technique called deep learning, which involves using multiple layers of artificial neural networks to learn the patterns and structures of language. These models are often used for a variety of natural language processing tasks, such as language translation, text summarization, and text generation.

Some examples of large language models include:

* BERT (Bidirectional Encoder Representations from Transformers): A popular large language model developed by Google that has been trained on a large corpus of text from the internet.
* RoBERTa


In [None]:
# Empty VRAM: drop the names that keep the trained model alive, collect the
# garbage, then hand PyTorch's cached CUDA blocks back to the device.
# `gc` is imported in the notebook's import cell.
del model
del pipe
del trainer
gc.collect()
gc.collect()
# Without this, memory freed above stays in PyTorch's caching allocator and is
# still reported as in use on the GPU.
torch.cuda.empty_cache()

In [None]:
# Reload the base model in FP16 and merge the LoRA weights into it.
# The adapter saved under `new_model` is loaded on top of the FP16 base model
# and then merged, giving one standalone fine-tuned model (FP16, not quantized).
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": 0},
    low_cpu_mem_usage=True,
    return_dict=True,
)
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Reload the tokenizer so it can be saved alongside the merged model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Log in to Hugging Face so the model can be pushed.
# Runs the interactive CLI login, which prompts for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `DoChat` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `DoChat`


In [None]:
# Push the final model and the tokenizer to the Hugging Face Hub.
# Both uploads target the same repository, so the id is defined once.
# `check_pr` is not a push_to_hub parameter (the pull-request flag is
# `create_pr`), so it is dropped; uploads still commit directly to the repo,
# matching the recorded run where no pull request was created.
hub_repo_id = "Tarun9216/Llama-2-7b-DoChat-finetune"

model.push_to_hub(hub_repo_id)

tokenizer.push_to_hub(hub_repo_id)

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Tarun9216/Llama-2-7b-DoChat-finetune/commit/45a1991ac2fe9ae35b8e247ea021fb91c03458f0', commit_message='Upload tokenizer', commit_description='', oid='45a1991ac2fe9ae35b8e247ea021fb91c03458f0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Tarun9216/Llama-2-7b-DoChat-finetune', endpoint='https://huggingface.co', repo_type='model', repo_id='Tarun9216/Llama-2-7b-DoChat-finetune'), pr_revision=None, pr_num=None)

In [None]:
# The final fine-tuned model is now uploaded to Hugging Face, and you can use it like any other LLaMA model.