<a href="https://colab.research.google.com/github/swati-git/FineTuneLLM/blob/main/FineTuning_a_LLM_LIMA_CPU._version2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install the required dependencies

In [None]:
!pip install -q transformers==4.57.3  peft==0.5.0 trl==0.19.1

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m376.2/376.2 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q torch==2.9.0

# Checking on the available GPU configurations

In [None]:
#Rule of thumb: Need 3-4x model size for training (gradients, optimizer states, etc.)
#2.6 GB model → need ~8-10 GB GPU for training

In [None]:
import torch
from transformers import AutoModelForCausalLM

def check_gpu_and_load(model_name, required_memory_gb=10):
    """Print a memory report for every visible CUDA device.

    Despite its name, the function does not load a model: it only prints
    whether CUDA is available, per-device memory figures, and the indices of
    the devices whose free memory meets ``required_memory_gb``.

    Args:
        model_name: Identifier of the model the check is meant for. The body
            does not use it; it is kept so existing calls keep working.
        required_memory_gb: Free memory, in GB, a device must have to be
            listed as suitable.

    Returns:
        None. The report is written to stdout.
    """
    print("GPU CONFIGS........................")

    if not torch.cuda.is_available():
        print("⚠️  No GPU available, will use CPU")
    else:
        print("✓ GPU available")

    # Check each GPU
    suitable_gpus = []
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        total_gb = props.total_memory / (1024**3)
        # NOTE(review): "free" is derived from torch.cuda.memory_reserved, which
        # presumably only covers this process's allocator — confirm whether
        # memory used by other processes should be accounted for.
        reserved_gb = torch.cuda.memory_reserved(i) / (1024**3)
        free_gb = total_gb - reserved_gb
        print(f"GPU {i} ({props.name}): {free_gb:.1f} GB free / {total_gb:.1f} GB total")
        print(f"Total memory: {total_gb:.2f} GB")
        print(f"Reserved memory: {reserved_gb:.2f} GB")
        print(f"Free memory: {free_gb:.2f} GB")
        # Interpolate the requirement; previously the variable name itself was printed.
        print(f"Required memory: {required_memory_gb:.2f} GB")
        if free_gb >= required_memory_gb:
            suitable_gpus.append(i)


    print(f"Suitable GPUs: {suitable_gpus}")
    # if not suitable_gpus:
    #     print(f"⚠️  No GPU with {required_memory_gb} GB free. Use device_map='auto'")
    # else :
    #     print(f"✓ Loading on GPU {suitable_gpus[0]}")

In [None]:

check_gpu_and_load("facebook/opt-1.3b", required_memory_gb=16)

GPU CONFIGS........................
✓ GPU available
GPU 0 (Tesla T4): 14.7 GB free / 14.7 GB total
Total memory: 14.74 GB
Reserved memory: 0.00 GB
Free memory: 14.74 GB
Required memory: 16.00 GB
Suitable GPUs: []
⚠️  No GPU with 16 GB free. Use device_map='auto'


# Check the base model specs. This will help in configuring the memory and compute required

In [None]:
from transformers import AutoConfig

# Fetch the configuration of the base checkpoint so its key dimensions can be inspected
config = AutoConfig.from_pretrained("facebook/opt-1.3b")
# Print the fields that drive the memory and sequence-length choices made later on
print("BASE MODEL CONFIGS........................")
print(f"Data type of the parameters: {config.dtype} ")
print(f"Model name: {config.model_type}")
print(f"Hidden size: {config.hidden_size}")
print(f"Number of layers: {config.num_hidden_layers}")
print(f"Vocabulary size: {config.vocab_size}")
print(f"Max sequence length: {config.max_position_embeddings}")

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

BASE MODEL CONFIGS........................
Data type of the parameters: torch.float16 
Model name: opt
Hidden size: 2048
Number of layers: 24
Vocabulary size: 50272
Max sequence length: 2048


# Load the base model

*We load the model in bfloat16 because bfloat16 has a wider dynamic range than float16.*

In [None]:
from transformers import AutoModelForCausalLM
import torch

# Load the facebook/opt-1.3b checkpoint with bfloat16 parameters onto the CUDA device
base_model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-1.3b",
    device_map="cuda",
    dtype=torch.bfloat16,
)

KeyboardInterrupt: 

*Check the parameters of the loaded model*

In [None]:
def get_model_size(model):
    """Print parameter counts and the in-memory size of ``model``'s parameters.

    Args:
        model: Object exposing ``parameters()`` (an iterable of tensors) and a
            ``dtype`` attribute, which is reported on the last line.

    Returns:
        None. The report is written to stdout.
    """
    total_params = 0
    trainable_params = 0
    total_bytes = 0
    for p in model.parameters():
        count = p.numel()
        total_params += count
        # Size every tensor by its own element width instead of assuming one
        # width for the whole model: float16 is 2 bytes (not 4), and a model
        # may mix dtypes (e.g. some parameters cast to float32).
        total_bytes += count * p.element_size()
        if p.requires_grad:
            trainable_params += count

    # Calculate memory (in GB)
    memory_gb = total_bytes / (1024**3)
    # Guard the ratio so a model without parameters does not divide by zero.
    trainable_pct = 100 * trainable_params / total_params if total_params else 0.0

    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Model size in memory: {memory_gb:.2f} GB")
    print(
        f"trainable params: {trainable_params} || all params: {total_params} || trainable%: {trainable_pct}"
    )
    print(f"Data type: {model.dtype}")

In [None]:
print("BASE MODEL CONFIGS................")
get_model_size(base_model)

# LOAD THE TOKENIZER AND CHECK CONFIGS

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")

# ===== CHECK THESE =====
print("TOKENIZER CONFIGS........................")
print(f"Vocab size (tokenizer): {len(tokenizer)}")
# Compare against `base_model`, the model loaded earlier in this notebook.
# No variable named `model` exists yet at this point of a top-to-bottom run.
print(f"Vocab size (model): {base_model.config.vocab_size}")

# NOTE(review): the strict equality check is left disabled. Some checkpoints
# pad their embedding matrix, so the model's vocab size may legitimately be
# larger than the tokenizer's — confirm before enabling.
#assert len(tokenizer) == base_model.config.vocab_size, "Mismatch!"

# Check special tokens
print(f"Padding token: {tokenizer.pad_token}")
print(f"EOS token: {tokenizer.eos_token}")
print(f"BOS token: {tokenizer.bos_token}")

# Test tokenization
sample = "Write a product description for headphones"
tokens = tokenizer.encode(sample)
print(f"Sample tokenization: {tokens}")
print(f"Number of tokens: {len(tokens)}")

# Load the training and testing datasets from Activeloop

In [None]:
!pip install -q deeplake==3.7.1

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/554.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m554.7/554.7 kB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m118.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.3/82.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m133.7 MB/s[0m eta [36m0:00

In [None]:
import deeplake

# Connect to the training and testing datasets
ds = deeplake.load('hub://genai360/GAIR-lima-train-set')
ds_test = deeplake.load('hub://genai360/GAIR-lima-test-set')

|

Opening dataset in read-only mode as you don't have write permissions.



|

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/genai360/GAIR-lima-train-set



/

hub://genai360/GAIR-lima-train-set loaded successfully.



/

Opening dataset in read-only mode as you don't have write permissions.


/

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/genai360/GAIR-lima-test-set



/

hub://genai360/GAIR-lima-test-set loaded successfully.





In [None]:
ds.summary()

Dataset(path='hub://genai360/GAIR-lima-train-set', read_only=True, tensors=['answer', 'embeddings', 'question', 'source'])

   tensor     htype      shape       dtype  compression
  -------    -------    -------     -------  ------- 
   answer     text     (1030, 1)      str     None   
 embeddings  generic  (1030, 1024)  float32   None   
  question    text     (1030, 1)      str     None   
   source     text     (1030, 1)      str     None   


In [None]:
#ds.visualize()

In [None]:
def prepare_sample_text(example):
    """Render one dataset sample as a "Question: ... Answer: ..." string.

    Args:
        example: Mapping-like sample whose 'question' and 'answer' entries
            each expose a ``text()`` method.

    Returns:
        The question and the answer joined into one prompt, separated by a
        blank line.
    """
    question = example['question'].text()
    answer = example['answer'].text()
    return "Question: {}\n\nAnswer: {}".format(question, answer)

In [None]:
#Given that the model's max sequence length is 2048 tokens as per  "{config.max_position_embeddings}" we'll structure our dataset to match it.

from trl.trainer import ConstantLengthDataset

# Wrap the training split in a ConstantLengthDataset: every sample is rendered
# by prepare_sample_text, and sequences are 1024 tokens long (2048 is the
# alternative noted by the author).
train_dataset = ConstantLengthDataset(
    tokenizer,
    ds,
    seq_length=1024,
    infinite=True,
    formatting_func=prepare_sample_text,
)



In [None]:
#!pip install -q trl==0.26.2
#https://github.com/unslothai/unsloth/issues/3057


In [None]:
from trl.trainer import ConstantLengthDataset

# Wrap the test split the same way as the training split, but keep it finite:
# an evaluation pass has to reach the end of the data, which an endlessly
# repeating dataset would never allow once evaluation is enabled.
eval_dataset = ConstantLengthDataset(
    tokenizer,
    ds_test,
    formatting_func=prepare_sample_text,
    infinite=False,
    seq_length=1024            #2048
)

# We are fine-tuning facebook/opt-1.3b using LoRA


**Rank Selection Guidelines**

Small models (< 1B parameters): 8-16

Medium models (1B-10B): 16-32

Large models (> 10B): 32-64

**Alpha-to-Rank Relationship**

Typically set to r or 2 * r

Higher alpha increases the adaptation's impact

Lower alpha reduces the adaptation's influence

In [None]:
from peft import LoraConfig

# LoRA hyper-parameters: rank 16 with alpha = 2 * rank, following the
# guidelines listed above; no bias terms are trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Use WANDB to track the progress of the model training

In [None]:
!pip install -q wandb

In [None]:
import wandb

# Start a Weights & Biases run and record which base model is being tuned
wandb.init(
    project="opt-finetuning",
    config={"model": "facebook/opt-1.3b"},
    #name="OPT-fine_tuned-LIMA-CPU",
)

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints go and how the run is labelled
    output_dir="./OPT-fine_tuned-LIMA-CPU",
    run_name="OPT-fine_tuned-LIMA-CPU",

    # Batch sizing: 4 samples per device, accumulated over 4 steps
    # (alternative tried by the author: per-device batch size of 1)
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    dataloader_drop_last=True,

    # Optimisation schedule
    num_train_epochs=1,
    learning_rate=1e-5,              # alternative tried by the author: 1e-4
    lr_scheduler_type="cosine",
    # NOTE(review): the run recorded in this notebook stopped at global_step=65,
    # which is below warmup_steps=100 — confirm the warmup length is intended.
    warmup_steps=100,
    weight_decay=0.05,
    bf16=True,

    # Checkpointing; epoch-level evaluation is currently switched off
    #evaluation_strategy="epoch",
    save_strategy="epoch",

    # Logging: every step, reported to Weights & Biases
    logging_dir="./logs",
    logging_steps=1,
    report_to="wandb",
)

In [None]:
import torch.nn as nn

# Freeze every base-model weight; only the adapters added later are trained.
for weight in base_model.parameters():
  weight.requires_grad_(False)
  if weight.ndim == 1:
    # 1-D parameters (e.g. layernorm) are kept in fp32 for stability
    weight.data = weight.data.float()

# Turn on gradient checkpointing and make the inputs require gradients
base_model.gradient_checkpointing_enable()
base_model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """Sequential container whose output is always converted to float32."""

  def forward(self, x):
    """Run the wrapped modules on ``x`` and return the result as float32."""
    result = super().forward(x)
    return result.to(torch.float32)


In [None]:
base_model.lm_head = CastOutputToFloat(base_model.lm_head)

# Check the number of trainable parameters after creating the adapter matrices using LoRA

In [None]:
from peft import get_peft_model

peft_model = get_peft_model(base_model, lora_config)
peft_model.print_trainable_parameters()
#print_trainable_parameters(model)

trainable params: 3,145,728 || all params: 1,318,903,808 || trainable%: 0.23851079820371554


In [None]:
get_model_size(peft_model)

In [None]:
from trl import SFTTrainer

# `peft_model` already carries the LoRA adapters (it was produced by
# get_peft_model), so no `peft_config` is passed here: supplying both an
# already-wrapped model and a config is redundant.
# NOTE(review): depending on the TRL release the extra config is either
# ignored or rejected — confirm against the pinned version.
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #packing=True,
)

In [None]:
print("\n=== Starting Training ===")

# Save adapter weights before training.
# The path follows the OPT module layout (decoder.layers[i].self_attn.q_proj);
# the former `transformer.h[0].attn.c_attn` path belongs to GPT-2 and does not
# exist on an OPT model.
before_weights = peft_model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A['default'].weight.data.clone()


In [None]:
print("Training...")
trainer.train()

Training...


  input_ids = [torch.tensor(example["input_ids"]) for example in examples]
  labels = [torch.tensor(example["labels"]) for example in examples]
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,2.4878
2,2.4909
3,2.3745
4,2.3289
5,2.4367
6,2.6542
7,2.4353
8,2.6364
9,2.5299
10,2.4728


  input_ids = [torch.tensor(example["input_ids"]) for example in examples]
  labels = [torch.tensor(example["labels"]) for example in examples]


Step,Training Loss
1,2.4878
2,2.4909
3,2.3745
4,2.3289
5,2.4367
6,2.6542
7,2.4353
8,2.6364
9,2.5299
10,2.4728




TrainOutput(global_step=65, training_loss=2.498688085262592, metrics={'train_runtime': 4685.3034, 'train_samples_per_second': 0.22, 'train_steps_per_second': 0.014, 'total_flos': 7683221294678016.0, 'train_loss': 2.498688085262592})

In [None]:
# Check adapter weights after training (they should change).
# Uses the OPT module layout (decoder.layers[i].self_attn.q_proj) rather than
# the GPT-2 one, which does not exist on this model.
lora_layer = peft_model.base_model.model.model.decoder.layers[0].self_attn.q_proj
after_weights = lora_layer.lora_A['default'].weight.data

print("\n=== After Training ===")
print(f"Adapter weights changed? {not torch.allclose(before_weights, after_weights)}")  # True

# Check base weights (they should NOT change)
# Note: We can't easily compare because base_model was modified in-place,
# but we can verify requires_grad is still False
# NOTE(review): not every PEFT release exposes the wrapped layer as
# `base_layer`; fall back to the LoRA layer itself, whose `weight` is then
# presumably the frozen base weight — confirm against the pinned version.
base_layer = getattr(lora_layer, "base_layer", lora_layer)
print(f"Base weights still frozen? {not base_layer.weight.requires_grad}")  # True


In [None]:
# The layers are modified in-place.
# Both lookups use the OPT module layout (decoder.layers[i].self_attn.q_proj);
# the GPT-2 style `transformer.h[0].attn.c_attn` path does not exist here.
original_layer = base_model.model.decoder.layers[0].self_attn.q_proj
peft_layer = peft_model.base_model.model.model.decoder.layers[0].self_attn.q_proj

print(f"Same layer object? {original_layer is peft_layer}")  # True (in-place modification)
print(f"Has LoRA? {hasattr(peft_layer, 'lora_A')}")  # True

# After training, merge adapters into base model

In [None]:
merged_model = peft_model.merge_and_unload()

In [None]:
#Save merged model

merged_model.save_pretrained("./merged_model")
tokenizer.save_pretrained("./merged_model")


In [None]:

#  Later, load for inference (no PEFT needed!)

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load as a standard model (no PEFT dependency).
# `dtype=` matches the keyword used when the base model is first loaded in
# this notebook; `torch_dtype=` is the older spelling of the same option.
model = AutoModelForCausalLM.from_pretrained(
    "./merged_model",
    dtype=torch.float16,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained("./merged_model")


# STEP 4: Run inference

model.eval()

# Move the tokenized prompt to the device the model was placed on
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_length=50)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Save and Load Adapters (during dev)

In [None]:

# NOTE(review): an earlier cell calls peft_model.merge_and_unload(); confirm
# the adapters are still attached to `peft_model` when this cell runs,
# otherwise save the adapters before merging.
peft_model.save_pretrained("./my_lora_adapters")
# Saves only adapter weights (~1-50 MB typically)

# Also save tokenizer
tokenizer.save_pretrained("./my_lora_adapters")

# STEP 2: Later, load for inference

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load base model.
# It must be the checkpoint the adapters were trained on (facebook/opt-1.3b);
# the adapters cannot be attached to a different architecture such as gpt2.
base_model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-1.3b",  # Original base model
    dtype=torch.float16,
    device_map="auto"
)

# Load LoRA adapters on top of base model
model = PeftModel.from_pretrained(
    base_model,
    "./my_lora_adapters"  # Path to saved adapters
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("./my_lora_adapters")

# ========================================
# STEP 3: Run inference
# ========================================
model.eval()  # Set to evaluation mode

inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=50,
        temperature=0.7,
        do_sample=True
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Direct Inference

In [None]:
from transformers import pipeline

# Build a text-generation pipeline around the LoRA-wrapped model
generator = pipeline(task="text-generation", model=peft_model, tokenizer=tokenizer)

# Generate a continuation of the prompt and show the first candidate
output = generator("Once upon a time", max_length=50)
print(output[0]['generated_text'])

In [None]:

# base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b",
#                                              dtype=torch.bfloat16,
#                                               device_map = "cuda")



In [None]:
# from peft import PeftModel

# # Load the base model with the trained adapters from the checkpoint
# combined_model = PeftModel.from_pretrained(
#     base_model,
#     "./OPT-fine_tuned-LIMA-CPU/checkpoint-65")

In [None]:
# model = combined_model.merge_and_unload()

In [None]:
# model.save_pretrained("./OPT-fine_tuned-LIMA-CPU/merged")

In [None]:
import torch
import gc

# Collect unreachable Python objects first so the tensors they hold are
# released, then clear the GPU cache; in the opposite order the memory held
# by garbage is still in use when the cache is emptied.
gc.collect()
torch.cuda.empty_cache()

# Check memory before starting
print(f"GPU memory allocated: {torch.cuda.memory_allocated(0)/1024**3:.2f} GB")
print(f"GPU memory reserved: {torch.cuda.memory_reserved(0)/1024**3:.2f} GB")

GPU memory allocated: 2.46 GB
GPU memory reserved: 2.53 GB


In [None]:
#!pip install pipdeptree

In [None]:
#!pipdeptree -p transformers