In [None]:
!pip install wandb
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

In [None]:
import wandb
from huggingface_hub import notebook_login, Repository 
from datasets import load_from_disk

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments
from unsloth import FastLanguageModel
import torch

## Hyperparameters and Config

In [None]:
# ---------------------------------------------------------------------------
# Model identifiers
# ---------------------------------------------------------------------------
HF_BASE_MODEL_ID = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"  # checkpoint loaded as the starting point
HF_MODEL_ID = "RodrigoSalazar-U/ang-base"                # repo the merged model is pushed to
HF_MODEL_CHECKPOINT_ID = f"{HF_MODEL_ID}-checkpoints"    # repo the trainer pushes checkpoints to

# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
DATASET_PATH = "./hf-repo/train"  # read with load_from_disk

# ---------------------------------------------------------------------------
# Training: optimisation
# ---------------------------------------------------------------------------
TRAIN_LR = 2 * 5e-5
TRAIN_EMBEDDING_LR = TRAIN_LR / 2  # separate (halved) rate passed as embedding_learning_rate
TRAIN_WARMUP_RATIO = 0.1
TRAIN_EPOCHS = 5
TRAIN_OPTIMIZER = "adamw_8bit"
TRAIN_WEIGHT_DECAY = 0.00
TRAIN_LR_SCHEDULER = "cosine"
TRAIN_SEED = 512

# ---------------------------------------------------------------------------
# Training: batching
# Effective batch size = BATCH_PER_DEVICE * GRADIENT_ACCUMULATION = 32
# ---------------------------------------------------------------------------
BATCH_PER_DEVICE = 16
GRADIENT_ACCUMULATION = 2

# ---------------------------------------------------------------------------
# LoRA adapter
# ---------------------------------------------------------------------------
ADAPTER_R = 256
ADAPTER_ALPHA = 32
ADAPTER_SEED = 3407
ADAPTER_TARGET_MODULES = [
    # Attention
    "q_proj", "k_proj", "v_proj", "o_proj",
    # Gate+Up+Down Proj
    "gate_proj", "up_proj", "down_proj",
    # Embedding + LM head for CPT
    "embed_tokens", "lm_head",
]
ADAPTER_RSLORA = True

# ---------------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------------
MODEL_MAX_SEQ_LENGTH = 2048
# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
MODEL_DTYPE = None
# Use 4bit quantization to reduce memory usage
MODEL_LOAD_IN_4BIT = True

## Accounts login

In [None]:
# Login
# Authenticate with the Hugging Face Hub. Credentials are needed later in this
# notebook: the trainer is configured with push_to_hub=True and the merged model
# is pushed once training is done.
notebook_login()

In [None]:
# Weights & Biases: authenticate, then open the run for this training session.
# The run is named after the target model repo; the trainer reports to "wandb".
wandb.login()
wandb.init(
    entity="rodrigo-salazar-utec",
    project="LLM-EN2ANG",
    name=HF_MODEL_ID,
)

## Load model

Download the base model and initialize it with Unsloth for training.

In [None]:


# Load the base checkpoint together with its tokenizer; sequence length, dtype
# and 4-bit loading are all taken from the config cell.
print(f"Loading model {HF_BASE_MODEL_ID}")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=HF_BASE_MODEL_ID,
    load_in_4bit=MODEL_LOAD_IN_4BIT,
    dtype=MODEL_DTYPE,
    max_seq_length=MODEL_MAX_SEQ_LENGTH,
)

In [None]:
# Attach the LoRA adapters to the loaded model. Rank, alpha, target modules,
# seed and the rsLoRA switch are defined in the config cell.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=ADAPTER_TARGET_MODULES,
    r=ADAPTER_R,
    lora_alpha=ADAPTER_ALPHA,
    use_rslora=ADAPTER_RSLORA,             # Rank stabilized LoRA
    lora_dropout=0,                        # Supports any, but = 0 is optimized
    bias="none",                           # Supports any, but = "none" is optimized
    loftq_config=None,                     # LoftQ disabled
    use_gradient_checkpointing="unsloth",
    random_state=ADAPTER_SEED,             # Random seed for repro
)

## Load Dataset

Expects a Hugging Face `datasets` dataset saved to disk (it is read with `load_from_disk`).
The dataset is expected to have the following columns:
- `prompt`: the prompt for the task
- `answer`: the answer for the task
- `text`: the full text; it should be equal to `prompt` + `answer`

In [None]:
# Load the dataset from disk
dataset = load_from_disk(DATASET_PATH)

# Fail fast with a readable message: "text" is the column the EOS step and the
# trainer read, and without this check its absence only surfaces as a KeyError
# raised from inside `map`.
if "text" not in dataset.column_names:
  raise ValueError(
    f"Dataset at {DATASET_PATH!r} has no 'text' column; found: {dataset.column_names}"
  )

# Select only prompt, answer and text columns
# (sorted so the list of dropped columns is the same on every run)
columns = ["prompt", "answer", "text"]
removed_columns = sorted(set(dataset.column_names) - set(columns))
dataset = dataset.remove_columns(removed_columns)

# Append EOS token to text
EOS_TOKEN = tokenizer.eos_token
if EOS_TOKEN is None:
  # Concatenating None below would otherwise fail with a TypeError inside `map`
  raise ValueError("Tokenizer has no EOS token; cannot terminate the training texts")

def append_eos_token(examples):
  """Return the `text` field with the tokenizer's EOS token appended.

  Used with `dataset.map` without batching, so `examples["text"]` is expected
  to be a single string. Only the `text` column is rewritten.
  """
  text = examples["text"] + EOS_TOKEN
  return {"text": text}

dataset = dataset.map(append_eos_token)

# Display dataset
print(dataset)

## Trainer

In [None]:
# Trainer
# Builds the trainer on the "text" column of the prepared dataset. All tunable
# values come from the config cell; the literals below are logging/checkpoint cadence.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = MODEL_MAX_SEQ_LENGTH,
    dataset_num_proc = 8,
    packing = False, # Packing is disabled; enabling it can make training 5x faster for short sequences.
    args = UnslothTrainingArguments(
        per_device_train_batch_size = BATCH_PER_DEVICE,
        gradient_accumulation_steps = GRADIENT_ACCUMULATION,
        warmup_ratio = TRAIN_WARMUP_RATIO,
        num_train_epochs = TRAIN_EPOCHS,
        learning_rate = TRAIN_LR,
        embedding_learning_rate = TRAIN_EMBEDDING_LR,
        # Mixed precision: bf16 when the GPU supports it, fp16 otherwise
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 100,
        optim = TRAIN_OPTIMIZER,
        weight_decay = TRAIN_WEIGHT_DECAY,
        lr_scheduler_type = TRAIN_LR_SCHEDULER,
        seed = TRAIN_SEED,
        output_dir = "outputs",
        # W&B
        run_name = HF_MODEL_ID,
        report_to = "wandb",
        # Checkpoints
        save_steps = 500,
        save_total_limit = 3,
        push_to_hub = True,
        hub_model_id = HF_MODEL_CHECKPOINT_ID,
        hub_strategy = "checkpoint",
        # Keep the checkpoint repo private, consistent with the merged model,
        # which is pushed with private=True at the end of the notebook.
        hub_private_repo = True,
    ),
)

In [None]:
#@title Show current memory stats
def _bytes_to_gb(num_bytes):
    """Convert a byte count to GB (1024-based), rounded to 3 decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)

# Snapshot taken before training; the final stats cell reads
# start_gpu_memory and max_memory to report what training added.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = _bytes_to_gb(gpu_stats.total_memory)
start_gpu_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

In [None]:
# Run training. The result is kept because the final stats cell reads
# trainer_stats.metrics['train_runtime'].
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
# Peak reserved memory after training, and the share attributable to training
# (peak minus the pre-training snapshot stored in start_gpu_memory).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_runtime = trainer_stats.metrics['train_runtime']
print(
    f"{train_runtime} seconds used for training.",
    f"{round(train_runtime/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

In [None]:
# Push merged model
# Uploads the model and the tokenizer to the HF_MODEL_ID repo, requesting a
# private repo. Presumably the adapter weights are merged into the base model
# before upload, as the method name suggests — confirm against the Unsloth docs.
model.push_to_hub_merged(HF_MODEL_ID, tokenizer=tokenizer, private=True)

In [None]:
# Finish wandb
# Closes the run that was opened with wandb.init earlier in the notebook.
wandb.finish()

In [None]:
# Shutdown the Colab runtime
# google.colab is imported here, guarded, because the package only exists on
# Colab; anywhere else this cell is skipped instead of ending the run with an
# ImportError.
try:
    from google.colab import runtime
except ImportError:
    print("Not running on Google Colab; skipping runtime shutdown.")
else:
    runtime.unassign()