<a href="https://colab.research.google.com/github/swati-git/FineTuneLLM/blob/main/FineTuning_a_LLM_LIMA_CPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers==4.57.3  peft==0.5.0 trl==0.19.1

In [2]:
!pip install -q torch==2.9.0

In [3]:
import torch
from transformers import AutoModelForCausalLM

def check_gpu_and_load(model_name, required_memory_gb=16):
    """Report which CUDA devices currently have enough free memory for a model.

    Despite its name, this function only inspects the devices and prints a
    report; it does not load a model.

    Args:
        model_name: Identifier of the model the check is meant for. It is kept
            for interface compatibility and is not used by the check itself.
        required_memory_gb: Minimum amount of free device memory (bytes divided
            by 1024**3) a GPU must have to be reported as suitable.

    Returns:
        None. All results are printed.
    """
    if not torch.cuda.is_available():
        print("⚠️  No GPU available, will use CPU")
    else:
        print("✓ GPU available")

    # Check each GPU
    suitable_gpus = []
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        # Ask the driver for the device-wide free/total bytes. Subtracting
        # torch.cuda.memory_reserved() from the total would only account for
        # this process's caching allocator and would overstate free memory
        # whenever anything else is using the GPU.
        free_bytes, total_bytes = torch.cuda.mem_get_info(i)
        total_gb = total_bytes / (1024**3)
        free_gb = free_bytes / (1024**3)

        print(f"GPU {i} ({props.name}): {free_gb:.1f} GB free / {total_gb:.1f} GB total")

        if free_gb >= required_memory_gb:
            suitable_gpus.append(i)

    if not suitable_gpus:
        print(f"⚠️  No GPU with {required_memory_gb} GB free. Use device_map='auto'")
    else:
        print(f"✓ Loading on GPU {suitable_gpus[0]}")


In [4]:

# Print the per-GPU memory report against a 16 GB requirement.
# The call only reports; it does not load the model.
check_gpu_and_load("facebook/opt-1.3b", required_memory_gb=16)

✓ GPU available
GPU 0 (Tesla T4): 14.7 GB free / 14.7 GB total
⚠️  No GPU with 16 GB free. Use device_map='auto'


# Find the model specs
This will help in configuring the memory and compute required

In [5]:
from transformers import AutoConfig

# Load the model configuration and print the fields that drive memory and
# compute planning: parameter dtype, width, depth, vocabulary and context size.
config = AutoConfig.from_pretrained("facebook/opt-1.3b")
print(f"Data type of the parameters: {config.dtype} ")
print(f"Model name: {config.model_type}")
print(f"Hidden size: {config.hidden_size}")
print(f"Number of layers: {config.num_hidden_layers}")
print(f"Vocabulary size: {config.vocab_size}")
print(f"Max sequence length: {config.max_position_embeddings}")

Data type of the parameters: torch.float16 
Model name: opt
Hidden size: 2048
Number of layers: 24
Vocabulary size: 50272
Max sequence length: 2048


*We will load the model in the bfloat16 data type because bfloat16 has a wider dynamic range than float16.*

In [6]:
from transformers import AutoModelForCausalLM
import torch

# Load the pretrained weights in bfloat16 (overriding the float16 reported by
# the config) and let device_map="auto" decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b",
                                             dtype=torch.bfloat16,
                                             device_map = "auto")

`torch_dtype` is deprecated! Use `dtype` instead!


In [7]:
def get_model_size(model):
    """Print parameter counts and the memory taken by the model's parameters.

    The memory figure is accumulated from each parameter's own element size,
    so float16, bfloat16, float32 and mixed-precision models are all reported
    correctly (instead of assuming 2 bytes for bfloat16 and 4 bytes otherwise).

    Args:
        model: A module exposing ``parameters()`` and a ``dtype`` attribute.

    Returns:
        None. The summary is printed.
    """
    total_params = 0
    trainable_params = 0
    total_bytes = 0
    # Single pass: count elements, trainable elements and actual storage bytes.
    for p in model.parameters():
        count = p.numel()
        total_params += count
        total_bytes += count * p.element_size()
        if p.requires_grad:
            trainable_params += count

    # Calculate memory (in GB)
    memory_gb = total_bytes / (1024**3)

    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Model size in memory: {memory_gb:.2f} GB")
    print(f"Data type: {model.dtype}")

In [8]:
# Print the parameter counts and memory footprint of the freshly loaded model.
get_model_size(model)

Total parameters: 1,315,758,080
Trainable parameters: 1,315,758,080
Model size in memory: 2.45 GB
Data type: torch.bfloat16


In [9]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Prints a single line with the trainable element count, the total element
    count and the trainable share as a percentage.
    """
    # Collect (element count, is trainable) once, then aggregate.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
# Baseline before any freezing: shows the share of parameters that is trainable.
print_trainable_parameters(model)

trainable params: 1315758080 || all params: 1315758080 || trainable%: 100.0


In [11]:
# Rule of thumb: full fine-tuning needs roughly 3-4x the model size in GPU memory (weights, gradients, optimizer states, etc.).
# The ~2.45 GB model measured above therefore needs about 8-10 GB of GPU memory for training.

In [12]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")

# ===== CHECK THESE =====
# Compare the tokenizer's vocabulary size with the size the model was configured for.
print(f"Vocab size (tokenizer): {len(tokenizer)}")
print(f"Vocab size (model): {model.config.vocab_size}")

# NOTE(review): the recorded output shows 50265 (tokenizer) vs 50272 (model),
# so the assert below would fail as written and is left disabled. Presumably
# the model's embedding table is padded beyond the tokenizer vocabulary —
# confirm before relying on the two sizes matching.
#assert len(tokenizer) == model.config.vocab_size, "Mismatch!"

# Check special tokens
print(f"Padding token: {tokenizer.pad_token}")
print(f"EOS token: {tokenizer.eos_token}")
print(f"BOS token: {tokenizer.bos_token}")

# Test tokenization
# The recorded ids start with 2, presumably the BOS token added by encode() — TODO confirm.
sample = "Write a product description for headphones"
tokens = tokenizer.encode(sample)
print(f"Sample tokenization: {tokens}")
print(f"Number of tokens: {len(tokens)}")

Vocab size (tokenizer): 50265
Vocab size (model): 50272
Padding token: <pad>
EOS token: </s>
BOS token: </s>
Sample tokenization: [2, 45714, 10, 1152, 8194, 13, 15684]
Number of tokens: 7


In [2]:
!pip install -q deeplake==3.7.1

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/554.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m307.2/554.7 kB[0m [31m9.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m553.0/554.7 kB[0m [31m13.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m554.7/554.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [3

In [1]:
import deeplake

# Connect to the training and testing datasets
# Connect to the LIMA train and test datasets hosted under the hub:// paths below.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError for
# `deeplake`; the `deeplake` install cell has to run before this one.
ds = deeplake.load('hub://genai360/GAIR-lima-train-set')
ds_test = deeplake.load('hub://genai360/GAIR-lima-test-set')

ModuleNotFoundError: No module named 'deeplake'

In [15]:
# Display the dataset summary (path, read-only flag and tensor names).
ds

Dataset(path='hub://genai360/GAIR-lima-train-set', read_only=True, tensors=['answer', 'embeddings', 'question', 'source'])

In [16]:
def prepare_sample_text(example):
    """Format one dataset sample as a single "Question / Answer" prompt string.

    `example` is indexed by 'question' and 'answer'; each entry must expose a
    `text()` method that returns the string to insert.
    """
    question = example['question'].text()
    answer = example['answer'].text()
    return f"Question: {question}\n\nAnswer: {answer}"

In [17]:
#!pip install -q trl==0.26.2
#https://github.com/unslothai/unsloth/issues/3057


In [18]:
# The model's maximum sequence length is 2048 tokens (see `config.max_position_embeddings` above), so we build fixed-length sequences of 2048 tokens to match it.

from trl.trainer import ConstantLengthDataset

# Build the training stream: every sample is formatted by prepare_sample_text,
# tokenized with `tokenizer`, and served as sequences of seq_length=2048 tokens.
# infinite=True presumably restarts the iterator when the dataset is exhausted —
# TODO confirm against the TRL documentation.
train_dataset = ConstantLengthDataset(
    tokenizer,
    ds,
    formatting_func=prepare_sample_text,
    infinite=True,
    seq_length=2048
)



In [19]:
# Sanity check that the dataset object was created (shows only its repr).
train_dataset

<trl.trainer.utils.ConstantLengthDataset at 0x78d5d74e4710>

In [20]:
from trl.trainer import ConstantLengthDataset

# Build the evaluation stream with the same formatting and sequence length as
# the training stream. It must be finite: with infinite=True the iterator is
# reset every time the underlying dataset is exhausted, so an evaluation pass
# over it would never terminate.
eval_dataset = ConstantLengthDataset(
    tokenizer,
    ds_test,
    formatting_func=prepare_sample_text,
    infinite=False,
    seq_length=2048
)


**Rank Selection Guidelines**

Small models (< 1B parameters): 8-16

Medium models (1B-10B): 16-32

Large models (> 10B): 32-64

**Alpha-to-Rank Relationship**

Typically set to r or 2 * r

Higher alpha increases the adaptation's impact

Lower alpha reduces the adaptation's influence

In [21]:
from peft import LoraConfig

# LoRA settings following the guidelines above: rank 16 for a ~1.3B-parameter
# model and alpha = 2 * r. No bias terms are trained, and the task type is
# causal language modeling.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [22]:
!pip install -q wandb

In [None]:
import wandb

# Initialize W&B
# Start a Weights & Biases run in the "opt-finetuning" project and record the
# base model name in the run config. The run name is left commented out here;
# TrainingArguments sets run_name separately.
wandb.init(
    project="opt-finetuning",
    #name="OPT-fine_tuned-LIMA-CPU",
    config={
        "model": "facebook/opt-1.3b",
    }
)

In [24]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run. Checkpoints go to output_dir.
training_args = TrainingArguments(
    output_dir="./OPT-fine_tuned-LIMA-CPU",

    # Training settings
    # Cosine learning-rate schedule with 100 warmup steps; incomplete final
    # batches are dropped (dataloader_drop_last=True).

    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-5,
    dataloader_drop_last=True,
    lr_scheduler_type="cosine",
    warmup_steps=100,

    # Evaluation settings
    # The evaluation strategy is commented out, so no evaluation schedule is
    # configured here; a checkpoint is saved at the end of every epoch.
    #evaluation_strategy="epoch",
    save_strategy="epoch",

    # Logging settings
    # logging_steps=1 logs every optimizer step.
    logging_dir="./logs",
    logging_steps=1,

    # Alternative settings tried earlier, kept for reference:
    #num_train_epochs=10,

    # per_device_train_batch_size=1,
    # per_device_eval_batch_size=1,
    #learning_rate=1e-4,

    # Gradients are accumulated over 4 batches of 4 samples, i.e. 16 samples
    # per device per optimizer step. Training runs in bfloat16.
    gradient_accumulation_steps=4,
    bf16=True,
    weight_decay=0.05,
    run_name="OPT-fine_tuned-LIMA-CPU",

    # W&B integration (automatic!)
    report_to="wandb",
)

In [25]:
# Inspect the output projection (hidden size -> vocabulary size) before wrapping it.
model.lm_head

Linear(in_features=2048, out_features=50272, bias=False)

 Note: The initial run failed with a "CUDA out of memory" error as soon as training started, because all the parameters of this model were trainable.

In [26]:
import torch.nn as nn

# Prepare the base model for adapter training. This mutates `model` in place:
# every existing parameter is frozen and 1-D parameters are upcast to float32.
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
# Presumably needed so gradients can still flow through the frozen model's
# inputs when gradient checkpointing is on — TODO confirm in the transformers docs.
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    """Sequential container whose forward output is converted to float32."""

    def forward(self, x):
        # Run the wrapped modules as usual, then upcast the result.
        output = super().forward(x)
        return output.to(torch.float32)


In [27]:
# Wrap the LM head so its output is returned in float32. The isinstance guard
# keeps this cell idempotent: re-running it does not nest one wrapper inside
# another.
if not isinstance(model.lm_head, CastOutputToFloat):
    model.lm_head = CastOutputToFloat(model.lm_head)

In [28]:
from peft import LoraConfig, get_peft_model

# Wrap the frozen base model with the LoRA adapters described by lora_config.
# `model` is rebound to the wrapped model, so this cell should run only once
# per freshly loaded base model.
model = get_peft_model(model, lora_config)
# Per the recorded output, only about 0.24% of all parameters are trainable.
model.print_trainable_parameters()
#print_trainable_parameters(model)

trainable params: 3,145,728 || all params: 1,318,903,808 || trainable%: 0.23851079820371554


In [29]:
from trl import SFTTrainer

# Assemble the supervised fine-tuning trainer from the model, the training
# arguments and the train/eval datasets.
# NOTE(review): `model` has already been wrapped by get_peft_model in an earlier
# cell, so also passing peft_config here looks redundant — confirm how the
# installed TRL version treats an already-wrapped model before keeping both.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    #packing=True,
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [30]:
# # Check GPU memory for each device
# for i in range(torch.cuda.device_count()):
#     props = torch.cuda.get_device_properties(i)
#     total_memory = props.total_memory / (1024**3)  # Convert to GB
#     print(f"GPU {i}: {props.name}, {total_memory:.2f} GB")

In [31]:
import torch
import gc

# Clear GPU cache
torch.cuda.empty_cache()
gc.collect()

# Check memory before starting
print(f"GPU memory allocated: {torch.cuda.memory_allocated(0)/1024**3:.2f} GB")
print(f"GPU memory reserved: {torch.cuda.memory_reserved(0)/1024**3:.2f} GB")

GPU memory allocated: 2.46 GB
GPU memory reserved: 2.53 GB


In [None]:
# Launch fine-tuning. With logging_steps=1 the loss is reported at every step.
print("Training...")
trainer.train()

Training...


  input_ids = [torch.tensor(example["input_ids"]) for example in examples]
  labels = [torch.tensor(example["labels"]) for example in examples]
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,2.2804
2,2.4246
3,2.5049
4,2.4193
5,2.4529
6,2.317
7,2.4862
8,2.5009
9,2.401
10,2.4304


In [None]:
#!pip install pipdeptree

In [None]:
#!pipdeptree -p transformers