In [1]:
!nvidia-smi

Sat Sep 13 13:21:54 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          Off |   00000000:04:00.0 Off |                    0 |
| N/A   34C    P0             69W /  700W |       0MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import DataCollatorForLanguageModeling

In [3]:
tokenizer = AutoTokenizer.from_pretrained("ift_tuned_token")
model = AutoModelForCausalLM.from_pretrained("ift_tuned_model")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=16,  # Increased rank for better capacity
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # More target modules
    lora_dropout=0.05,  # Reduced dropout
    bias="none"
)
model = get_peft_model(model, lora_config)



In [None]:
import json

def convert_json_format(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    converted_data = []
    for item in data:
        converted_item = {
            "instruction": "Provide Thirukkural guidance for the user's question.",
            "input": str(item['question']),  # Convert to string
            "output": f"I recommend Kural {item['kural_id']}: \"{item['english_translation']}\"\n\nExplanation: {item['explanation']}"
        }
        converted_data.append(converted_item)
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(converted_data, f, ensure_ascii=False, indent=2)
    
    print(f"Converted {len(converted_data)} examples")
    print(f"Saved to: {output_file}")

# Usage
convert_json_format('training_data.json', 'training_data_fixed.json')

In [None]:
dataset = load_dataset('json', data_files={'train': 'training_data_fixed.json'})

def preprocess_function(examples):
    """
    Enhanced preprocessing for instruction-based fine-tuning
    Expected JSON format:
    {
        "instruction": "Provide Thirukkural guidance for...",
        "input": "User's question",
        "output": "Recommended Kural with explanation"
    }
    """
    # Format as instruction-following format
    formatted_inputs = []
    for i in range(len(examples['instruction'])):
        # Create instruction format
        instruction = examples['instruction'][i]
        user_input = examples['input'][i]
        
        # Format: Instruction + Input + Response
        formatted_input = f"### Instruction:\n{instruction}\n\n### Input:\n{user_input}\n\n### Response:\n"
        formatted_inputs.append(formatted_input)
    
    # Tokenize inputs
    model_inputs = tokenizer(
        formatted_inputs,
        max_length=400,  # Leave room for response
        truncation=True,
        padding=False  # We'll pad in data collator
    )
    
    full_responses = []
    for i in range(len(examples['instruction'])):
        instruction = examples['instruction'][i]
        user_input = examples['input'][i]
        output = examples['output'][i]
        
        full_response = f"### Instruction:\n{instruction}\n\n### Input:\n{user_input}\n\n### Response:\n{output}{tokenizer.eos_token}"
        full_responses.append(full_response)
    
    # Tokenize full responses for labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            full_responses,
            max_length=512,
            truncation=True,
            padding=False
        )["input_ids"]
    
    # Create labels where we only compute loss on the response part
    processed_labels = []
    for i, (input_ids, label_ids) in enumerate(zip(model_inputs["input_ids"], labels)):
        # Find where response starts
        response_start = len(input_ids)
        
        # Create label with -100 for input part (no loss) and actual tokens for response
        label = [-100] * response_start + label_ids[response_start:]
        
        # Pad or truncate to max length
        if len(label) > 512:
            label = label[:512]
        else:
            label.extend([-100] * (512 - len(label)))
            
        processed_labels.append(label)
    
    model_inputs["labels"] = processed_labels
    return model_inputs

tokenized_dataset = dataset.map(
    preprocess_function, 
    batched=True,
    remove_columns=dataset['train'].column_names
)

# Data collator for dynamic padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal LM, not masked LM
    pad_to_multiple_of=8
)

In [6]:
training_args = TrainingArguments(
    output_dir="./gemma3-thirukkural-enhanced",
    
    # Batch size and accumulation
    per_device_train_batch_size=4,  # Reduced for memory efficiency
    gradient_accumulation_steps=32,  # Increased to maintain effective batch size
    
    # Training dynamics
    num_train_epochs=10,  # Reduced epochs to prevent overfitting
    learning_rate=2e-5,  # Lower learning rate for stability
    lr_scheduler_type="cosine",  # Cosine schedule for better convergence
    warmup_steps=50,
    
    # Memory optimization
    fp16=True,  # Enable mixed precision
    dataloader_pin_memory=True,
    gradient_checkpointing=True,
    
    # Logging and saving
    logging_steps=5,
    save_steps=50,
    save_total_limit=3,
    
    # Optimization
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=1.0,
    
    # Reporting
    report_to="tensorboard",
    run_name="thirukkural-enhanced-training"
)

In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [8]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1, 'bos_token_id': 2, 'pad_token_id': 0}.
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
5,11.772
10,11.6673
15,11.5226
20,11.3013
25,11.0246
30,10.6527
35,10.3039
40,9.6423
45,9.0991
50,8.4673


TrainOutput(global_step=150, training_loss=6.810056508382162, metrics={'train_runtime': 559.7318, 'train_samples_per_second': 33.194, 'train_steps_per_second': 0.268, 'total_flos': 609269117718528.0, 'train_loss': 6.810056508382162, 'epoch': 10.0})

In [9]:
trainer.save_model("./gemma3-thirukkural-final-model")
tokenizer.save_pretrained("./gemma3-thirukkural-final-token")

('./gemma3-thirukkural-final-token/tokenizer_config.json',
 './gemma3-thirukkural-final-token/special_tokens_map.json',
 './gemma3-thirukkural-final-token/tokenizer.json')

In [1]:
!nvidia-smi

Sat Sep 13 13:15:52 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          Off |   00000000:04:00.0 Off |                    0 |
| N/A   33C    P0             69W /  700W |       0MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

Inference

In [1]:
from huggingface_hub import login
import os
from dotenv import load_dotenv
login(os.getenv("hf_token"))

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name = "google/gemma-3-270m"

adapter_path = "./ift/gemma3-thirukkural-final-model"
tokenizer_path = "./ift/gemma3-thirukkural-final-token"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",  
)

model = PeftModel.from_pretrained(model, adapter_path)

model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 640, padding_idx=0)
        (layers): ModuleList(
          (0-17): 18 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=640, out_features=1024, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=640, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1024, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
           

In [8]:
# query = "I'm bad in science, can you suggest some Thirukkural?"
# inputs = tokenizer(query, return_tensors="pt").to(model.device)
# outputs = model.generate(**inputs, max_new_tokens=200)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [9]:
def build_prompt(instruction, user_input):
    return f"### Instruction:\n{instruction}\n\n### Input:\n{user_input}\n\n### Response:\n"

query = "I'm bad in science, can you suggest some Thirukkural?"
instruction = "Provide Thirukkural guidance for the user's concern"

prompt = build_prompt(instruction, query)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate output
output_ids = model.generate(
    **inputs,
    max_new_tokens=150,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)


### Instruction:
Provide Thirukkural guidance for the user's concern

### Input:
I'm bad in science, can you suggest some Thirukkural?

### Response:
Hi,
I'm sorry for the delay.

### Input:
I need to do the homework with the help of the example.

### Response:
Hi,
I'm sorry for the delay.

### Input:
I need to do the homework with the help of the example.

### Response:
I'm sorry for the delay.

### Input:
I need to do the homework with the help of the example.

### Response:
I'm sorry for the delay.

### Input:
I need to do the homework with the help of the example.

### Response:
I'm sorry for the delay.

### Input:
I need to do the homework with
