GPU check

In [1]:
!nvidia-smi

Fri Aug 29 05:52:08 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.247.01             Driver Version: 535.247.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off | 00000000:00:05.0 Off |                    0 |
| N/A   34C    P0              54W / 400W |      0MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!pip install -q diffusers transformers accelerate peft

In [3]:
!pip install datasets pynvml



In [4]:
pip install bitsandbytes

Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    TrainerCallback,
    logging
)
from transformers import TrainerCallback
import pynvml
from torch.nn import functional as F
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from huggingface_hub import login

logger = logging.get_logger(__name__)

In [6]:
q_lora = False
lora_r, lora_alpha, lora_dropout = 8, 16, 0.05
brevity_B, brevity_lambda = 10, 0.1

model_path = "gemma_model"
tokenizer_path = "gemma_tokenizer"
dataset_path = "qa_pairs.json"
output_dir = "./gemma-lora-finetuned"

In [None]:
# If loading from huggingface
login(token= os.getenv("hf_token"))
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m", use_fast=True)

if q_lora:
    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
    model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m", quantization_config=bnb_config, device_map="auto")
    model = prepare_model_for_kbit_training(model)
else:
    model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m", torch_dtype=torch.bfloat16, device_map="auto", attn_implementation='eager')

In [None]:
import json

input_file = "../data/qa_pairs.json"
output_file = "qa_pairs1.json"

with open(input_file, "r") as f:
    data = json.load(f)

for row in data:
    for key in row:
        if key == "english translation":
            row["english_translation"] = row.pop("english translation")

    for key, value in row.items():
        if isinstance(value, list):
            row[key] = " ".join(map(str, value))  
        elif isinstance(value, dict):
            row[key] = json.dumps(value, ensure_ascii=False) 

with open(output_file, "w") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

In [9]:
ds = load_dataset("json", data_files="qa_pairs1.json")
dataset = ds["train"]

dataset = dataset.remove_columns(
    [col for col in dataset.column_names if col not in ["question", "kural_id", "kural", "english_translation", "explanation"]]
)

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
def build_prompt(ex):
    # Input (query)
    prompt = f"Query: {ex['question']}\nAnswer:"

    # Output (gold answer with kural, translation, and explanation)
    answer_parts = []
    if "kural_id" in ex and ex["kural_id"]:
        answer_parts.append(f"Kural ID: {ex['kural_id']}")
    if "kural" in ex and ex["kural"]:
        answer_parts.append(f"Kural: {ex['kural']}")
    if "english_translation" in ex and ex["english_translation"]:
        answer_parts.append(f"Translation: {ex['english_translation']}")
    if "explanation" in ex and ex["explanation"]:
        answer_parts.append(f"Explanation: {ex['explanation']}")

    answer_text = "\n".join(answer_parts)

    full = f"{prompt} {answer_text}"
    return prompt, full


def preprocess(ex):
    prompt, full = build_prompt(ex)

    # Get the tokenized length of only the prompt
    prompt_ids_len = len(tokenizer(prompt, truncation=True, max_length=512)["input_ids"])

    # Encode the full input+output sequence
    full_enc = tokenizer(
        full,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    labels = full_enc["input_ids"].copy()
    labels[:prompt_ids_len] = [-100] * prompt_ids_len

    full_enc["labels"] = labels
    return full_enc


if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})
model.resize_token_embeddings(len(tokenizer))

tokenized_ds = ds["train"].map(preprocess, remove_columns=ds["train"].column_names)

target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj"]
config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=target_modules,
    lora_dropout=lora_dropout,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
import torch
from transformers import Trainer 




class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        ce_loss = outputs.loss

        # print(f"debug: ce_loss.requires_grad = {ce_loss.requires_grad}")

        logits = outputs.logits

        # alternative approach 1: always compute brevity_loss with gradients
        if self.is_in_train and labels is not None:
            answer_lengths = (labels != -100).sum(dim=1)
            # these variables need to be defined, for example in your trainer's __init__
            # or passed to the compute_loss method
            brevity_lambda = kwargs.get("brevity_lambda", 0.1) 
            brevity_B = kwargs.get("brevity_B", 5.0)
            penalty = brevity_lambda * torch.clamp(answer_lengths.float() - brevity_B, min=0)
            brevity_loss = penalty.mean()
        else:
            # for validation/eval, we'll create a zero tensor.
            # to ensure it always plays nice with gradient computation if ce_loss expects it,
            # we'll create it explicitly with requires_grad=true.
            # it won't actually contribute to gradients since it's zero,
            # but this satisfies pytorch's graph tracking if it was the issue.
            brevity_loss = torch.tensor(0.0, device=ce_loss.device, dtype=ce_loss.dtype, requires_grad=True)
        

        total_loss = ce_loss + brevity_loss

        preds = torch.argmax(logits, dim=-1)
        mask = labels != -100
        correct = (preds[mask] == labels[mask]).float().mean()

        if return_outputs:
            return total_loss, outputs
        return total_loss

In [26]:
pynvml.nvmlInit()

class CustomLogCallback(TrainerCallback):
    def __init__(self, log_file_path, gpu_index=0):
        self.log_file_path = log_file_path
        self.gpu_index = gpu_index


        os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

        with open(self.log_file_path, "w") as f:
            f.write("Custom Training Logs\n" + "="*22 + "\n")

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            # GPU memory details
            handle = pynvml.nvmlDeviceGetHandleByIndex(self.gpu_index)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)

            allocated = torch.cuda.memory_allocated(self.gpu_index) / 1024**2
            reserved = torch.cuda.memory_reserved(self.gpu_index) / 1024**2

            log_str = (
                f"Step: {state.global_step}, "
                f"Loss: {logs.get('loss', 'N/A')}, "
                f"CE Loss: {logs.get('ce_loss', 'N/A')}, "
                f"Brevity Loss: {logs.get('brevity_loss', 'N/A')}, "
                f"LR: {logs.get('learning_rate', 'N/A')}, "
                f"GPU Util: {util.gpu}%, "
                f"VRAM Used: {mem_info.used/1024**2:.2f} MB / {mem_info.total/1024**2:.2f} MB, "
                f"Allocated: {allocated:.2f} MB, "
                f"Reserved: {reserved:.2f} MB\n"
            )

            with open(self.log_file_path, "a") as f:
                f.write(log_str)


In [27]:
for name, param in model.named_parameters():
    param.requires_grad = True

In [None]:
custom_log_path = os.path.join(output_dir, "custom_metrics.log")
custom_logger = CustomLogCallback(log_file_path=custom_log_path)


args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=16,      
    gradient_accumulation_steps=4,       # effective batch = 64
    learning_rate=2e-4,
    bf16=True,
    fp16=False,
    optim="adamw_bnb_8bit",              # efficient optimizer
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    gradient_checkpointing=True,
    dataloader_num_workers=4,
    report_to="tensorboard"          #tensorboard --logdir gemma-lora-finetuned/runs  # to check the logs in tensorboard
)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[custom_logger]
)

  trainer = CustomTrainer(


In [29]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True


Step,Training Loss
10,81.2951


debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad = True
debug: ce_loss.requires_grad

KeyboardInterrupt: 

In [None]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)