<a href="https://colab.research.google.com/github/wtergan/ML_notebooks/blob/main/finetuning_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Basic installations:
!pip install -q datasets accelerate loralib sentencepiece
!pip install -U bitsandbytes
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

#bitsandbytes is a package for 8-bit (and 4-bit) CUDA functions for PyTorch.
import os
import torch
import torch.nn as nn
import bitsandbytes as bnb
import sentencepiece as spm

import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer, LlamaForCausalLM

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Sanity check that PyTorch can see a CUDA GPU (displayed as the cell's output).
# The bitsandbytes 8-bit model load further down presumably requires one.
torch.cuda.is_available()

True

In [None]:
"""
OpenLLaMA: An Open Reproduction of LLaMA (version 1).
  - 3B, 7B variants pretrained on more than 1T tokens from the RedPajama dataset (open source).
  - Same preprocessing steps and training hyperparameters as the original LLaMA paper.
  - 7B variant is comparable to LLaMA in evaluation.
"""
# Load the open_llama_3b model together with its weights, quantized to 8-bit to save memory.
# This was run on Google Colab's T4 GPU.

# Restrict the process to the first GPU in case several are available.
# NOTE(review): CUDA_VISIBLE_DEVICES is generally only honored when set before CUDA is first
# initialized; torch has already been imported and queried above, so this may have no
# effect here — confirm, or move it ahead of the torch import.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

model_path = "openlm-research/open_llama_3b"

# Create the open_llama_3b model.
# The 8-bit request goes through `quantization_config`: passing `load_in_8bit=True` straight
# to `from_pretrained` is deprecated, and this notebook installs transformers from `main`.
model = LlamaForCausalLM.from_pretrained(
    model_path,
    quantization_config=transformers.BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

# Create the tokenizer (a BPE model based on sentencepiece).
# legacy=False so that tokens that come right after special tokens are handled properly.
tokenizer = LlamaTokenizer.from_pretrained(model_path, legacy=False)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Quick generation test to check that the model and tokenizer were created successfully.
prompt = "Q: What is the currently the largest animal in the world?\nA:"

# The keyword is `return_tensors` (plural). The misspelled `return_tensor` was silently
# ignored ("Keyword arguments {'return_tensor': 'pt'} not recognized."), which is why the
# ids previously came back as a plain list and had to be converted by hand.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# `input_ids` already has a leading batch dimension; move it to the model's device.
input_tensor = input_ids.to(model.device)

# Output generation.
generation_output = model.generate(input_ids=input_tensor, max_new_tokens=32)
print(tokenizer.decode(generation_output[0]))

Keyword arguments {'return_tensor': 'pt'} not recognized.


<s>Q: What is the currently the largest animal in the world?
A: The blue whale is the largest animal in the world.
Q: What is the largest animal in the world?
A: The blue whale is the largest


In [None]:
# Freeze the base model: the existing weights stay fixed and only adapters added later
# are trained. One-dimensional parameters are additionally stored in float32.
for weight in model.parameters():
  # Freezing: stop tracking gradients for this parameter.
  weight.requires_grad = False
  is_one_dimensional = weight.ndim == 1
  if is_one_dimensional:
    # Small parameters (e.g. layernorm) are cast to fp32 for stability; some of them
    # go to 0 or inf when kept in lower precision.
    weight.data = weight.data.to(torch.float32)

# Gradient checkpointing reduces the number of stored activations.
model.gradient_checkpointing_enable()
# NOTE(review): presumably makes the embedding outputs require grad so that gradients can
# flow through the frozen base model — confirm against the transformers docs.
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """Sequential container whose forward result is converted to float32."""

  def forward(self, x):
    """Run the wrapped modules on ``x`` and return the result as a float32 tensor."""
    result = super().forward(x)
    return result.to(torch.float32)
# Replace the LM head with a wrapper that returns the head's output as float32.
# NOTE(review): this is not idempotent — re-running the cell wraps the already wrapped head again.
model.lm_head = CastOutputToFloat(model.lm_head)

In [None]:
# Using LoRA using get_peft_model utility function from peft.
def print_trainable_parameters(model):
    """Print the number of trainable parameters in the model.

    Args:
        model: Object exposing ``named_parameters()``, which yields ``(name, param)``
            pairs where each ``param`` provides ``numel()`` and ``requires_grad``.

    Returns:
        None. A single summary line is printed with the trainable parameter count,
        the total parameter count, and the trainable percentage.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all_params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# List the state_dict entry names of the base model; the projection names are needed to
# choose the LoRA target modules. Besides weights, the listing also contains auxiliary
# entries such as `SCB`, `weight_format` and `inv_freq`.
parameters_base_model = model.state_dict()
for name in parameters_base_model.keys():
  print(name)

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.q_proj.SCB
model.layers.0.self_attn.q_proj.weight_format
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.k_proj.SCB
model.layers.0.self_attn.k_proj.weight_format
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.v_proj.SCB
model.layers.0.self_attn.v_proj.weight_format
model.layers.0.self_attn.o_proj.weight
model.layers.0.self_attn.o_proj.SCB
model.layers.0.self_attn.o_proj.weight_format
model.layers.0.self_attn.rotary_emb.inv_freq
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.gate_proj.SCB
model.layers.0.mlp.gate_proj.weight_format
model.layers.0.mlp.down_proj.weight
model.layers.0.mlp.down_proj.SCB
model.layers.0.mlp.down_proj.weight_format
model.layers.0.mlp.up_proj.weight
model.layers.0.mlp.up_proj.SCB
model.layers.0.mlp.up_proj.weight_format
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.q_pro

In [None]:
  from peft import LoraConfig, get_peft_model

  # LoRA configuration: `r` is the rank, and the update delta_W is scaled by lora_alpha / r.
  # The adapters target the q, k, v and o projection modules of the attention blocks
  # (module names as they appear in the state_dict listing of the base model).
  config = LoraConfig(
      task_type="CAUSAL_LM",
      target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
      r=16,
      lora_alpha=32,
      lora_dropout=0.05,
      bias="none",
  )

  # Build the LoRA model from the configuration above and report how much of it is trainable.
  model = get_peft_model(model, config)
  print_trainable_parameters(model)

trainable params: 10649600 || all_params: 3437123200 || trainable%: 0.309840508481046


In [None]:
# Training of the LoRA model on the english_quotes dataset.
from datasets import load_dataset

data = load_dataset("Abirate/english_quotes")
# Apply the tokenizer to the "quote" field of the samples, processed in batches.
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

# 4 samples per device per step, accumulated over 4 steps; 200 optimizer steps in total,
# the first 100 of which are warmup. The loss is logged at every step.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    max_steps=200,
    warmup_steps=100,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    fp16=True,
    logging_steps=1,
)

# mlm=False selects the non-masked (causal) language-modeling collation.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=causal_lm_collator,
)

# Silence the warnings while training. Re-enable for inference.
import warnings
warnings.filterwarnings("ignore")

# Past hidden states do not need to be cached at each step during training.
model.config.use_cache = False
trainer.train()



  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

Step,Training Loss
1,2.0773
2,2.1923
3,1.5683
4,2.27
5,2.1393
6,2.1358
7,2.4721
8,2.4362
9,2.4639
10,1.6917


TrainOutput(global_step=200, training_loss=1.6828941681981087, metrics={'train_runtime': 937.6468, 'train_samples_per_second': 3.413, 'train_steps_per_second': 0.213, 'total_flos': 5968647650073600.0, 'train_loss': 1.6828941681981087, 'epoch': 1.28})

In [None]:
# Inference with the newly trained model.
# trainer.train() leaves the model in training mode, in which the LoRA dropout
# (lora_dropout=0.05) is still applied; switch to eval mode before generating.
model.eval()

batch = tokenizer("I believe that everything happens: ", return_tensors='pt').to(model.device)

# `torch.cuda.amp.autocast()` is deprecated; `torch.autocast(device_type="cuda")` is the
# supported equivalent. `use_cache=True` asks generate() to cache past key/values again,
# since `model.config.use_cache` was set to False for training.
with torch.autocast(device_type="cuda"):
    output_tokens = model.generate(**batch, max_new_tokens=50, use_cache=True)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))




 I believe that everything happens: for good or for bad. I believe that everything happens for a reason. I believe that everything happens for a reason. I believe that everything happens for a reason. I believe that everything happens for a reason. I believe that everything happens for a reason
