In [6]:
import pandas as pd

file_path = 'thirukkural_multi_language.xlsx'

tamil_df = pd.read_excel(file_path, sheet_name='Tamil')
english_df = pd.read_excel(file_path, sheet_name='English')
hindi_df = pd.read_excel(file_path, sheet_name='Only Hindi- English Format')

output_lines = []
for idx in range(len(tamil_df)):
    kural_id = tamil_df.loc[idx, 'Kural no']
    
    tamil_line = str(tamil_df.loc[idx, 'Line 1']).strip() + " " + str(tamil_df.loc[idx, 'Line 2']).strip()
    english_line = str(english_df.loc[idx, 'Translation  Line']).strip()
    hindi_line = str(hindi_df.loc[idx, 'Explanation']).strip() 

    explanation_line = str(english_df.loc[idx, 'Translation  Line']).strip() 

    text_block = f"""<kural_id>: {kural_id}
Tamil: {tamil_line}
English: {english_line}
Hindi: {hindi_line}
Explanation: {explanation_line}

"""
    output_lines.append(text_block)

with open('thirukkural_plain_text.txt', 'w', encoding='utf-8') as f:
    f.writelines(output_lines)


In [1]:
!nvidia-smi

Sat Sep 13 08:30:04 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   38C    P0             57W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install huggingface_hub transformers datasets accelerate



In [None]:
from huggingface_hub import login
login(os.getenv("hf_token"))

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

model_name = "google/gemma-3-270m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

dataset = load_dataset('text', data_files={'train': 'thirukkural_plain_text.txt'})

def tokenize_fn(examples):
    return tokenizer(examples['text'], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_fn, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./gemma3-thirukkural-sft",
    per_device_train_batch_size=16,
    num_train_epochs=15,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    optim="adamw_torch",
    learning_rate=5e-5,
    report_to="tensorboard"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    data_collator=data_collator,
)

In [5]:
trainer.train()

It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Step,Training Loss
100,3.7015
200,3.3247
300,3.0887
400,2.9239
500,2.7326
600,1.9077
700,1.9265
800,1.9198
900,1.8237
1000,1.766


TrainOutput(global_step=7500, training_loss=0.7838853388468424, metrics={'train_runtime': 1199.5796, 'train_samples_per_second': 99.923, 'train_steps_per_second': 6.252, 'total_flos': 3211252777506816.0, 'train_loss': 0.7838853388468424, 'epoch': 15.0})

In [6]:
model.save_pretrained("./sft_tuned_model")
tokenizer.save_pretrained("./sft_tuned_token")

('./sft_tuned_token/tokenizer_config.json',
 './sft_tuned_token/special_tokens_map.json',
 './sft_tuned_token/tokenizer.json')

#Inference

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model = AutoModelForCausalLM.from_pretrained("./sft/updated_model")
tokenizer = AutoTokenizer.from_pretrained("./sft/updated_token")

model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [2]:
prompt = "What is Kural 1"

inputs = tokenizer(prompt, return_tensors="pt").to(device)

output_ids = model.generate(
    inputs["input_ids"],
    max_length=200,             # Max length of generated sequence
    num_beams=5,                # Beam search for better results
    no_repeat_ngram_size=2,    # Avoid repetition
    early_stopping=True
)

generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Generated Text:\n", generated_text)


Generated Text:
 What is Kural 1012? It is the one who can judge aright the (men and) means of achieving great enterprises. The other way is like the placing of an unwashed foot. Righteousness leadeth unto heaven. For it keepeth men from the path of rectitude even if they run in endless incarnations. (Even so do evil deeds bring forth lasting good as well as they do wrong). For they will never be saved. So is it that the wise man runneth not after evil in the day of performance of his good. Only let him be blamed for his negligence. Of all that beg I shall beg but this one man. All other men rely on his fame and his memory. He will be found out to be an easy prey to the sceptre of the prince.

Explanation: Behold the man who is prepared to strain his every nerve like bull buffalo to wade through every difficulty: he may meet with obstacles but he will send them
