In [1]:
!nvidia-smi

Thu Sep 11 10:47:07 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L40S                    Off |   00000000:34:00.0 Off |                    0 |
| N/A   27C    P8             32W /  350W |       0MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install -q diffusers transformers accelerate peft datasets pynvml bitsandbytes

In [3]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    TrainerCallback,
    logging
)
from transformers import TrainerCallback
import pynvml
from torch.nn import functional as F
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from huggingface_hub import login

logger = logging.get_logger(__name__)

import warnings
warnings.filterwarnings('ignore')

  import pynvml  # type: ignore[import]


In [4]:
lora_r, lora_alpha, lora_dropout = 8, 16, 0.1

In [None]:
login(os.getenv("hf_token"))

In [6]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m", use_fast=True)
model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m", dtype=torch.bfloat16, device_map="auto")

In [7]:
import json

input_file = "sample.json"
output_file = "qa_pairs1.json"

with open(input_file, "r") as f:
    data = json.load(f)

for row in data:
    for key in row:
        if key == "english translation":
            row["english_translation"] = row.pop("english translation")

    for key, value in row.items():
        if isinstance(value, list):
            row[key] = " ".join(map(str, value))  
        elif isinstance(value, dict):
            row[key] = json.dumps(value, ensure_ascii=False) 

with open(output_file, "w") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

In [None]:
ds = load_dataset("json", data_files="qa_pairs1.json")
dataset = ds["train"]

dataset = dataset.remove_columns(
 
   [col for col in dataset.column_names if col not in ["question", "kural_id", "english_translation", "explanation"]]
)

Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
def build_prompt(ex):
    prompt = f"Query: {ex['question']}\nAnswer:\n"

    answer_text = (
        f"<kural_id>{ex['kural_id']}</kural_id>\n"
        f"<kural_translation>{ex['english_translation']}</kural_translation>\n"
        f"<explanation>{ex['explanation']}</explanation>"
    )
    full = f"{prompt}{answer_text}"
    return prompt, full

def preprocess(ex):
    prompt, full = build_prompt(ex)

    # Get the tokenized length of only the prompt
    prompt_ids_len = len(tokenizer(prompt, truncation=True, max_length=512)["input_ids"])

    # Encode the full input+output sequence
    full_enc = tokenizer(
        full,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    labels = full_enc["input_ids"].copy()
    labels[:prompt_ids_len] = [-100] * prompt_ids_len

    full_enc["labels"] = labels
    return full_enc


if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})
model.resize_token_embeddings(len(tokenizer))

tokenized_ds = ds["train"].map(preprocess, remove_columns=ds["train"].column_names)

target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj"]
config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=target_modules,
    lora_dropout=lora_dropout,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    return_tensors="pt",
)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/1581 [00:00<?, ? examples/s]

In [None]:
tokenizer.add_special_tokens({
    "additional_special_tokens": [
        "<kural_id>", "</kural_id>",
        "<kural_translation>", "</kural_translation>",
        "<explanation>", "</explanation>"
    ]
})
model.resize_token_embeddings(len(tokenizer))


In [10]:
import torch
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")

        if isinstance(labels, list):
            labels = torch.tensor(labels, dtype=torch.long, device=model.device)

        outputs = model(**inputs)
        ce_loss = outputs.loss  # Tensor with grad
        logits = outputs.logits

        preds = torch.argmax(logits, dim=-1)
        mask = labels != -100
        correct = (preds[mask] == labels[mask]).float().mean()

        # Log **only floats** for display, but keep the tensor for backward
        self.log({"loss": ce_loss.item(), "ce_loss": ce_loss.item(), "accuracy": correct.item()})

        # Return tensor for Trainer to backward
        return (ce_loss, outputs) if return_outputs else ce_loss


In [11]:
pynvml.nvmlInit()

class CustomLogCallback(TrainerCallback):
    def __init__(self, log_file_path, gpu_index=0):
        self.log_file_path = log_file_path
        self.gpu_index = gpu_index


        os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

        with open(self.log_file_path, "w") as f:
            f.write("Custom Training Logs\n" + "="*22 + "\n")

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            # GPU memory details
            handle = pynvml.nvmlDeviceGetHandleByIndex(self.gpu_index)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)

            allocated = torch.cuda.memory_allocated(self.gpu_index) / 1024**2
            reserved = torch.cuda.memory_reserved(self.gpu_index) / 1024**2

            log_str = (
            f"Step: {state.global_step}, "
            f"Loss: {logs.get('loss', 'N/A')}, "
            f"CE Loss: {logs.get('ce_loss', 'N/A')}, "
            f"Accuracy: {logs.get('accuracy', 'N/A')}, "
            f"LR: {logs.get('learning_rate', 'N/A')}, "
            f"GPU Util: {util.gpu}%, "
            f"VRAM Used: {mem_info.used/1024**2:.2f} MB / {mem_info.total/1024**2:.2f} MB, "
            f"Allocated: {allocated:.2f} MB, "
            f"Reserved: {reserved:.2f} MB\n")


            with open(self.log_file_path, "a") as f:
                f.write(log_str)


In [12]:
for name, param in model.named_parameters():
    param.requires_grad = True

In [14]:
output_dir = "./gemma"

custom_log_path = os.path.join(output_dir, "custom_metrics.log")
custom_logger = CustomLogCallback(log_file_path=custom_log_path)


args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,
    per_device_train_batch_size=16,      
    gradient_accumulation_steps=4,       # effective batch = 64
    learning_rate=1e-4,
    bf16=True,
    fp16=False,
    optim="adamw_torch",              # efficient optimizer
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # gradient_clip_norm=1.0,
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    gradient_checkpointing=True,
    dataloader_num_workers=4,
    report_to="tensorboard"          #tensorboard --logdir gemma-lora-finetuned/runs  # to check the logs in tensorboard
)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[custom_logger]
)

In [15]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1, 'bos_token_id': 2, 'pad_token_id': 0}.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
0,26.09231
1,25.601736
2,18.508308
3,13.290073
4,21.151043
5,17.434454
6,17.894424
7,6.933785
8,6.616109
9,6.078526


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=250, training_loss=3.9495179166793823, metrics={'train_runtime': 740.5999, 'train_samples_per_second': 21.348, 'train_steps_per_second': 0.338, 'total_flos': 4946073373900800.0, 'train_loss': 3.9495179166793823, 'epoch': 10.0})

In [16]:
model.save_pretrained("./updated_model")
tokenizer.save_pretrained("./updated_token")

('./updated_token/tokenizer_config.json',
 './updated_token/special_tokens_map.json',
 './updated_token/tokenizer.json')