In [None]:
!nvidia-smi

Sun Sep 14 12:42:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   35C    P0             56W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
pip uninstall -y transformers peft accelerate trl datasets

Found existing installation: transformers 4.41.2
Uninstalling transformers-4.41.2:
  Successfully uninstalled transformers-4.41.2
Found existing installation: peft 0.11.1
Uninstalling peft-0.11.1:
  Successfully uninstalled peft-0.11.1
Found existing installation: accelerate 0.30.1
Uninstalling accelerate-0.30.1:
  Successfully uninstalled accelerate-0.30.1
Found existing installation: trl 0.9.4
Uninstalling trl-0.9.4:
  Successfully uninstalled trl-0.9.4
Found existing installation: datasets 4.0.0
Uninstalling datasets-4.0.0:
  Successfully uninstalled datasets-4.0.0


In [1]:
pip install --upgrade transformers peft accelerate trl datasets

Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.23.0-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m23.0 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.23.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import os
from huggingface_hub import login

os.environ["TOKENIZERS_PARALLELISM"] = "false"

  import pynvml  # type: ignore[import]


In [None]:
login(os.getenv("hf_token"))

In [4]:
excel_file_path = 'thirukkural_multi_language.xlsx'


print(f"Loading data from {excel_file_path}...")
try:
    tamil_df = pd.read_excel(excel_file_path, sheet_name='Tamil')
    english_df = pd.read_excel(excel_file_path, sheet_name='English')
    hindi_df = pd.read_excel(excel_file_path, sheet_name='Hindi')
    print("Hi")
except Exception as e:
    print(f"ERROR: Could not read the Excel file. Please ensure the file is in the correct path and the sheet names are correct. Details: {e}")

# --- Clean and standardize the DataFrames ---
print("Cleaning and standardizing data...")
tamil_df.rename(columns={'Kural no': 'kural_no'}, inplace=True)
english_df.rename(columns={'kural no': 'kural_no', 'Translation  Line': 'english_translation'}, inplace=True)
hindi_df.rename(columns={'g': 'kural_no', 'Explanation': 'hindi_translation'}, inplace=True)
hindi_df['kural_no'] = range(1, len(hindi_df) + 1) # Fix potential numbering issues in Hindi sheet

# --- Select and merge the necessary columns ---
tamil_subset = tamil_df[['kural_no', 'Line 1', 'Line 2']]
english_subset = english_df[['kural_no', 'english_translation']]
hindi_subset = hindi_df[['kural_no', 'hindi_translation']]

merged_df = pd.merge(tamil_subset, english_subset, on='kural_no')
merged_df = pd.merge(merged_df, hindi_subset, on='kural_no')
merged_df['kural_tamil'] = merged_df['Line 1'] + '\n' + merged_df['Line 2']
print("Data merged successfully.")

# --- Format the text into the final instruction format ---
def format_row(row):
    prompt = f"<kural_id> {row['kural_no']}"
    response = (
        f"குறள்: {row['kural_tamil']}\n\n"
        f"English Translation: {row['english_translation']}\n\n"
        f"English Explanation: {row['english_translation']}\n\n"
        f"Hindi Translation: {row['hindi_translation']}"
    )
    # This is the required format for Gemma Instruct models
    return f"<s>[INST] {prompt} [/INST] {response} </s>"

merged_df['text'] = merged_df.apply(format_row, axis=1)

# --- Convert to a Hugging Face Dataset object ---
thirukkural_dataset = Dataset.from_pandas(merged_df[['text']])

# --- Verification Step ---
print(f"\nDataset prepared with {len(thirukkural_dataset)} examples.")
print("\n--- SAMPLE TRAINING EXAMPLE ---")
print(thirukkural_dataset[13]['text'])
print("-----------------------------")

Loading data from thirukkural_multi_language.xlsx...


Hi
Cleaning and standardizing data...
Data merged successfully.

Dataset prepared with 1329 examples.

--- SAMPLE TRAINING EXAMPLE ---
<s>[INST] <kural_id> 14 [/INST] குறள்: ஏரின் உழாஅர் உழவர் புயலென்னும்
வாரி வளங்குன்றிக் கால்

English Translation: Husbandmen would cease to ply the plough if the fountains of the heavens are dried up.

English Explanation: Husbandmen would cease to ply the plough if the fountains of the heavens are dried up.

Hindi Translation: यदि वृष्टि का होना बंद हो जाए तो खाद्य पदार्थों की पैदाइश करने वाले कृषक लोग भी हल नहीं चला पाते। </s>
-----------------------------


In [5]:
# --- Define the model ---
model_name = "google/gemma-3-270m-it"
compute_dtype = torch.bfloat16 # Datatype for H100 performance

# --- Load the model ---
print(f"Loading base model: {model_name}...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=compute_dtype,
    device_map="auto", # Automatically use the H100
)
model.config.use_cache = False
print("Model loaded successfully.")

# --- Load the tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
print("Tokenizer loaded successfully.")

Loading base model: google/gemma-3-270m-it...


Model loaded successfully.
Tokenizer loaded successfully.


In [6]:
# --- LoRA Configuration ---
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=128,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# --- Training Arguments ---
training_args = TrainingArguments(
    output_dir="./gemma-thirukkural-270m-finetuned",
    num_train_epochs=15,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim="adamw_torch",
    save_steps=50,
    logging_steps=10,
    learning_rate=1e-5,
    weight_decay=0.001,
    fp16=True,
    bf16=False, # Must be true for H100
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)


In [7]:
trainer = SFTTrainer(
    model=model,
    train_dataset=thirukkural_dataset,
    peft_config=peft_config,
    # max_seq_length=512,
    # tokenizer=tokenizer,
    args=training_args,
)

Adding EOS to train dataset:   0%|          | 0/1329 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1329 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1329 [00:00<?, ? examples/s]

In [8]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 2, 'pad_token_id': 0}.
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Step,Training Loss
10,4.833
20,4.6209
30,4.4909
40,4.3703
50,4.0722
60,3.9079
70,3.849
80,3.7254
90,3.5554
100,3.3829


TrainOutput(global_step=2505, training_loss=2.605553501761126, metrics={'train_runtime': 563.0898, 'train_samples_per_second': 35.403, 'train_steps_per_second': 4.449, 'total_flos': 2310109687493376.0, 'train_loss': 2.605553501761126, 'entropy': 2.445075035095215, 'num_tokens': 2919675.0, 'mean_token_accuracy': 0.5593865215778351, 'epoch': 15.0})

In [9]:
final_adapter_path = "./gemma-thirukkural-270m-final"
trainer.model.save_pretrained(final_adapter_path)
print(f"\nFine-tuned model adapter saved to: {final_adapter_path}")


Fine-tuned model adapter saved to: ./gemma-thirukkural-270m-final


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel

base_model_id = "google/gemma-3-270m-it"

# The path to your LoRA adapter.
# This is the directory where your checkpoints are saved.
# Make sure to point to a specific checkpoint, e.g., "checkpoint-150".
adapter_path = "./gemma-thirukkural-270m-final" #<-- UPDATE THIS

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

# --- 3. Load the LoRA adapter and merge ---
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, adapter_path)

print("Model loaded successfully!")

# --- 4. Create an inference pipeline ---
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto"
)

# --- 5. Create a prompt and generate text ---
# Use the same format as your training data
prompt = "திருக்குறள்: யாதானும் நாடாமால் ஊராமால் என்னொருவன் சாந்துணையும் கல்லாத வாறு.\nவிளக்கம்:"

print("\nGenerating response...")
# Generate the text
sequences = pipe(
    prompt,
    max_new_tokens=256,       # Max number of new tokens to generate
    do_sample=True,           # Use sampling for more creative output
    temperature=0.7,          # Controls randomness (lower is more deterministic)
    top_p=0.9,                # Nucleus sampling
    eos_token_id=tokenizer.eos_token_id,
)

# Print the result
print(sequences[0]['generated_text'])

Loading base model...
Loading tokenizer...


Loading LoRA adapter...


Device set to use cpu


Model loaded successfully!

Generating response...
திருக்குறள்: யாதானும் நாடாமால் ஊராமால் என்னொருவன் சாந்துணையும் கல்லாத வாறு.
விளக்கம்: ஒளித்தாழ் என்று ஓவன்.

வேறு ஏதாவது தலைப்பு என்று சொல்லு.
இன்னும் எனது தலைப்பு என்ன?

என்குறித்து எங்கு சொல்லப் போகும்?

சூழல்.

எல்லா தலைப்பு எப்படி சொல்லப் போகும்?

எனைத்து எங்கு சொல்லப் போகும்?

எல்லா தலைப்பு எப்படி சொல்லப் போகும்?

எனைத்து எங்கு சொல்லப் போகும்?

எல்லா தலைப்பு எப்படி சொல்லப் போகும்?

எனைத்து எங்கு சொல்லப் போகும்?

எல்லா தலைப்பு எப்படி சொல்லப் போகும்?

எனைத்து எங்கு சொல்லப் போகும்?

எல்லா தலைப்பு எப்படி சொல்லப் போகும்?

எனைத்து எங்கு சொல்லப் போகும்?

எல்லா தலைப்பு எப்படி சொல்லப் போகும்?

எனைத்து எங்கு சொல்லப் போகும்?

எல்லா தலைப்பு எப்படி சொல்லப் போகும்?

எனைத்து எங்கு சொல்லப் போகும்?

எல்லா தலைப்பு எப்படி சொல்லப் போகும்?

எனைத்து எங்கு சொல்லப் போகும்?

எல்லா தலைப்பு எப்படி சொல்லப் போகும்?

எனைத்து எங்கு சொல்லப் போகும்?

எல்லா
