<a href="https://colab.research.google.com/github/swati-git/FineTuneLLM/blob/main/FineTuning_a_LLM_LIMA_CPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers==4.57.3  peft==0.5.0 trl==0.19.1

In [None]:
!pip install -q torch==2.9.0

In [None]:
import torch
from transformers import AutoModelForCausalLM

def check_gpu_and_load(model_name, required_memory_gb=16):
    """Report which CUDA devices have enough free memory for a model.

    Prints one summary line per visible GPU followed by a verdict. Despite
    the name, nothing is loaded: the ``from_pretrained`` call at the bottom
    is left commented out, so ``model_name`` is currently unused and the
    function always returns ``None``.

    Args:
        model_name: Hugging Face model id; only referenced by the disabled
            loading snippet.
        required_memory_gb: Minimum free device memory (in GiB) for a GPU to
            count as suitable.

    Returns:
        None.
    """
    if not torch.cuda.is_available():
        print("⚠️  No GPU available, will use CPU")
    else:
        print("✓ GPU available")

    # Check each GPU (the loop body never runs when no device is visible).
    suitable_gpus = []
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        total_gb = props.total_memory / (1024**3)
        # Ask the driver how much memory is actually free. Subtracting
        # torch.cuda.memory_reserved() from the total only accounts for this
        # process's caching allocator, so it over-reports free memory when
        # another process is already using the device.
        free_bytes, _ = torch.cuda.mem_get_info(i)
        free_gb = free_bytes / (1024**3)

        print(f"GPU {i} ({props.name}): {free_gb:.1f} GB free / {total_gb:.1f} GB total")

        if free_gb >= required_memory_gb:
            suitable_gpus.append(i)

    if not suitable_gpus:
        print(f"⚠️  No GPU with {required_memory_gb} GB free. Use device_map='auto'")
    else:
        print(f"✓ Loading on GPU {suitable_gpus[0]}")
    # Loading is intentionally disabled. If it is re-enabled, guard against an
    # empty `suitable_gpus` list before indexing it.
    # return AutoModelForCausalLM.from_pretrained(
    #     model_name,
    #     device_map=f"cuda:{suitable_gpus[0]}",
    #     torch_dtype=torch.float16
    # )


In [None]:

# Usage: report whether any visible GPU has at least 16 GB free for OPT-1.3B.
# NOTE(review): check_gpu_and_load has no active return statement, so `model`
# is None after this line; the actual model is assigned in a later cell.
model = check_gpu_and_load("facebook/opt-1.3b", required_memory_gb=16)

# Find the model specs
This helps in estimating the memory and compute required.

In [None]:
from transformers import AutoConfig

# Load the configuration for facebook/opt-1.3b and print the fields that are
# useful for sizing memory and compute.
config = AutoConfig.from_pretrained("facebook/opt-1.3b")
print(f"Data type of the parameters: {config.dtype} ")
print(f"Model name: {config.model_type}")
print(f"Hidden size: {config.hidden_size}")
print(f"Number of layers: {config.num_hidden_layers}")
print(f"Vocabulary size: {config.vocab_size}")
print(f"Max sequence length: {config.max_position_embeddings}")

*We will load the model in bfloat16 because it has a wider dynamic range than float16 (at the cost of some precision).*

In [None]:
from transformers import AutoModelForCausalLM
import torch

# Load OPT-1.3B with bfloat16 weights and automatic device placement.
# `dtype` is the current keyword in the pinned transformers release;
# `torch_dtype` is its deprecated alias and triggers a warning.
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-1.3b",
    dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
def get_model_size(model):
    """Print parameter counts and the memory taken by ``model``'s weights.

    The size is computed from each parameter's real element width, so it is
    correct for any dtype (float16, bfloat16, float32, int8, ...) and for
    models whose parameters do not all share one dtype.

    Args:
        model: Object exposing ``parameters()`` (an iterable of tensors) and
            a ``dtype`` attribute, e.g. a loaded transformers model.

    Returns:
        None. The report is written to stdout.
    """
    total_params = 0
    trainable_params = 0
    total_bytes = 0
    for param in model.parameters():
        count = param.numel()
        total_params += count
        # element_size() is the per-element width in bytes for this tensor's
        # dtype; no need to special-case individual dtypes.
        total_bytes += count * param.element_size()
        if param.requires_grad:
            trainable_params += count

    # Calculate memory (in GB)
    memory_gb = total_bytes / (1024**3)

    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Model size in memory: {memory_gb:.2f} GB")
    print(f"Data type: {model.dtype}")

In [None]:
# Print parameter counts, estimated weight memory and dtype for the loaded model.
get_model_size(model)

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` once and reports the trainable count,
    the total count, and the trainable share as a percentage.
    """
    # One pass over the parameters: keep (element count, requires_grad) pairs.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable_params / all_param
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {share}"
    )

In [None]:
# Show the trainable vs. total parameter counts for the loaded model.
print_trainable_parameters(model)

In [None]:
#Rule of thumb: Need 3-4x model size for training (gradients, optimizer states, etc.)
#2.6 GB model → need ~8-10 GB GPU for training

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")

# ===== CHECK THESE =====
# Compare the tokenizer's vocabulary size with the one in the model config.
print(f"Vocab size (tokenizer): {len(tokenizer)}")
print(f"Vocab size (model): {model.config.vocab_size}")

# These should match!
# NOTE(review): the assertion is disabled — presumably the two sizes differ
# for this checkpoint; confirm from the printed values before re-enabling it.
#assert len(tokenizer) == model.config.vocab_size, "Mismatch!"

# Check special tokens
print(f"Padding token: {tokenizer.pad_token}")
print(f"EOS token: {tokenizer.eos_token}")
print(f"BOS token: {tokenizer.bos_token}")

# Test tokenization: encode a short prompt and show the ids and their count.
sample = "Write a product description for headphones"
tokens = tokenizer.encode(sample)
print(f"Sample tokenization: {tokens}")
print(f"Number of tokens: {len(tokens)}")

In [None]:
!pip install -q deeplake==3.7.1

In [None]:
import deeplake

# Connect to the training and testing datasets (LIMA train/test sets hosted
# on the Deep Lake hub). `ds` feeds training, `ds_test` feeds evaluation.
ds = deeplake.load('hub://genai360/GAIR-lima-train-set')
ds_test = deeplake.load('hub://genai360/GAIR-lima-test-set')

In [None]:
# Echo the training dataset object so the notebook displays its representation.
ds

In [None]:
def prepare_sample_text(example):
    """Format one dataset row as a "Question: ... Answer: ..." training string.

    ``example`` is indexed with the keys ``'question'`` and ``'answer'``; each
    value must expose a ``text()`` method returning the field's content.
    """
    question = example['question'].text()
    answer = example['answer'].text()
    return f"Question: {question}\n\nAnswer: {answer}"

In [None]:
#!pip install -q trl==0.26.2
#https://github.com/unslothai/unsloth/issues/3057


In [None]:
# The model's maximum sequence length is 2048 tokens
# (config.max_position_embeddings), so the training samples are packed into
# fixed-length sequences of that size.

from trl.trainer import ConstantLengthDataset

# Each row of `ds` is rendered to text by prepare_sample_text before packing.
# NOTE(review): infinite=True presumably makes the stream restart once the
# data is exhausted — confirm against the TRL documentation.
train_dataset = ConstantLengthDataset(
    tokenizer,
    ds,
    formatting_func=prepare_sample_text,
    infinite=True,
    seq_length=2048
)



In [None]:
# Echo the packed training dataset object so the notebook displays it.
train_dataset

In [None]:
from trl.trainer import ConstantLengthDataset

# Held-out evaluation stream, packed to the same 2048-token window as the
# training data. It must be finite: with infinite=True the iterator restarts
# whenever the data runs out, so an evaluation pass over it would never end.
eval_dataset = ConstantLengthDataset(
    tokenizer,
    ds_test,
    formatting_func=prepare_sample_text,
    infinite=False,
    seq_length=2048
)


**Rank Selection Guidelines**

Small models (< 1B parameters): 8-16

Medium models (1B-10B): 16-32

Large models (> 10B): 32-64

**Alpha-to-Rank Relationship**

Typically set to r or 2 * r

Higher alpha increases the adaptation's impact

Lower alpha reduces the adaptation's influence

In [None]:
from peft import LoraConfig

# LoRA adapter settings for causal-language-model fine-tuning.
lora_config = LoraConfig(
    r=16,  # adapter rank
    lora_alpha=32,  # scaling factor, here 2 * r
    lora_dropout=0.05,  # dropout applied inside the adapter layers
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
!pip install -q wandb

In [None]:
import wandb

# Initialize W&B: start a run in the "opt-finetuning" project and record the
# base model name in the run config.
wandb.init(
    project="opt-finetuning",
    # Explicit run name is left disabled, so W&B picks one.
    #name="OPT-fine_tuned-LIMA-CPU",
    config={
        "model": "facebook/opt-1.3b",
    }
)

In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./OPT-fine_tuned-LIMA-CPU",

    # Training settings
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Reduced batch size
    per_device_eval_batch_size=4,  # Reduced batch size
    # `learning_rate` used to be passed twice (1e-5 here and 1e-4 further
    # down), which Python rejects with "SyntaxError: keyword argument
    # repeated". Only the later value, 1e-4, is kept.
    learning_rate=1e-4,
    dataloader_drop_last=True,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    gradient_accumulation_steps=2,  # Increased gradient accumulation steps
    weight_decay=0.05,
    # bf16=True,

    # Evaluation / checkpoint settings
    # NOTE(review): evaluation is not scheduled. Recent transformers releases
    # name this option `eval_strategy` (formerly `evaluation_strategy`).
    # eval_strategy="epoch",
    save_strategy="epoch",

    # Logging settings
    logging_dir="./logs",
    logging_steps=1,
    run_name="OPT-fine_tuned-LIMA-CPU",

    # W&B integration (automatic!)
    report_to="wandb",
)

In [None]:
from trl import SFTTrainer

# Build the supervised fine-tuning trainer from the base model, the training
# arguments, the two packed datasets and the LoRA configuration.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    # Left disabled: the datasets passed in are already ConstantLengthDataset
    # objects with a fixed seq_length.
    #packing=True,
)

In [None]:
# Check GPU memory for each device: print the name and total memory of every
# visible CUDA device (prints nothing when there is none).
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    total_memory = props.total_memory / (1024**3)  # Convert to GB
    print(f"GPU {i}: {props.name}, {total_memory:.2f} GB")

In [None]:
# Start fine-tuning with the trainer configured above.
print("Training...")
trainer.train()

In [None]:
#!pip install pipdeptree

In [None]:
#!pipdeptree -p transformers