# Theni Tamil Dialect Translator
**Before running:**
1. Runtime → Disconnect and delete runtime
2. Runtime → Change runtime type → T4 GPU → Save
3. Run each cell top to bottom

In [1]:
# CELL 1 - Install
!pip install -q transformers sentencepiece peft accelerate
print('Done')

Done


In [2]:
# CELL 2 - Imports
import os, gc, io, torch, shutil
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from tqdm.notebook import tqdm

gc.collect()
torch.cuda.empty_cache()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
if torch.cuda.is_available():
    free  = torch.cuda.mem_get_info()[0] / 1e9
    total = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'VRAM: {total:.1f} GB total | {free:.1f} GB free')

Device: cuda
GPU: Tesla T4
VRAM: 15.6 GB total | 15.5 GB free


In [3]:
# CELL 3 - Upload CSV
from google.colab import files
print('Upload your dataset.csv ...')
uploaded = files.upload()
filename = list(uploaded.keys())[0]
df_raw   = pd.read_csv(io.BytesIO(uploaded[filename]))
print(f'Uploaded: {filename}')
print(f'Rows: {len(df_raw)}')
print(f'Columns: {list(df_raw.columns)}')
df_raw.head(3)

Upload your dataset.csv ...


Saving datasett.csv to datasett.csv
Uploaded: datasett.csv
Rows: 3111
Columns: ['normal_tamil', 'theni_tamil']


Unnamed: 0,normal_tamil,theni_tamil
0,நீங்கள் எப்படி இருக்கிறீர்கள்,நீங்க எப்படி இருக்கீங்க
1,இன்று மழை பெய்கிறது,இன்னைக்கு மழை பெய்யுது
2,நான் சாப்பிட போகிறேன்,நான் சாப்பிட போறேன்


In [4]:
# CELL 4 - Settings (change column names if needed)
SOURCE_COL = 'normal_tamil'
TARGET_COL = 'theni_tamil'

MODEL_NAME = 'facebook/nllb-200-distilled-600M'
SRC_LANG   = 'tam_Taml'
TGT_LANG   = 'tam_Taml'
MAX_LENGTH = 128
BATCH_SIZE = 8
EPOCHS     = 20
LR         = 3e-4
MODEL_DIR  = '/content/best_model'

df = df_raw[[SOURCE_COL, TARGET_COL]].dropna()
df[SOURCE_COL] = df[SOURCE_COL].str.strip()
df[TARGET_COL] = df[TARGET_COL].str.strip()
df = df[df[SOURCE_COL].str.len() > 0]
df = df[df[TARGET_COL].str.len() > 0]

src_train, src_val, tgt_train, tgt_val = train_test_split(
    df[SOURCE_COL].tolist(), df[TARGET_COL].tolist(),
    test_size=0.1, random_state=42
)
print(f'Train: {len(src_train)} | Val: {len(src_val)}')

Train: 2798 | Val: 311


In [5]:
# CELL 5 - Load model + LoRA
gc.collect()
torch.cuda.empty_cache()

print('Loading tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Loading base model in fp16...')
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16
)
base_model.config.use_cache = False

print('Applying LoRA...')
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=['q_proj', 'v_proj']
)
model = get_peft_model(base_model, lora_config)

# Cast LoRA params to float32 so gradients work correctly
for name, param in model.named_parameters():
    if param.requires_grad:
        param.data = param.data.float()

model.to(device)

total     = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Total params    : {total/1e6:.1f}M')
print(f'Trainable params: {trainable/1e6:.2f}M ({100*trainable/total:.2f}%)')
free = torch.cuda.mem_get_info()[0] / 1e9
print(f'Free VRAM       : {free:.1f} GB')

Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]



special_tokens_map.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


Loading base model in fp16...


pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/512 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Applying LoRA...
Total params    : 1404.5M
Trainable params: 2.36M (0.17%)
Free VRAM       : 12.7 GB


In [6]:
# CELL 6 - Dataset
class TheniDataset(Dataset):
    def __init__(self, sources, targets, tokenizer, max_length=128):
        self.sources   = sources
        self.targets   = targets
        self.tokenizer = tokenizer
        self.max_len   = max_length

    def __len__(self):
        return len(self.sources)

    def __getitem__(self, idx):
        self.tokenizer.src_lang = SRC_LANG
        encoded = self.tokenizer(
            str(self.sources[idx]),
            text_target=str(self.targets[idx]),
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids':      encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'labels':         encoded['labels'].squeeze()
        }

train_dataset = TheniDataset(src_train, tgt_train, tokenizer, MAX_LENGTH)
val_dataset   = TheniDataset(src_val,   tgt_val,   tokenizer, MAX_LENGTH)
train_loader  = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2, pin_memory=True)
val_loader    = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
print(f'Steps/epoch: {len(train_loader)}')

Steps/epoch: 350


In [7]:
# CELL 7 - Train
# Only optimize trainable LoRA parameters
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=LR, weight_decay=0.01
)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=len(train_loader) * EPOCHS
)

os.makedirs(MODEL_DIR, exist_ok=True)
best_val_loss = float('inf')

print(f'Training {EPOCHS} epochs on {device}...')

for epoch in range(1, EPOCHS + 1):
    # Train
    model.train()
    total_train = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch}/{EPOCHS} [Train]'):
        input_ids      = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels         = batch['labels'].to(device)
        labels[labels == tokenizer.pad_token_id] = -100

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        total_train += loss.item()

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()

    # Validate
    model.eval()
    total_val = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids      = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels         = batch['labels'].to(device)
            labels[labels == tokenizer.pad_token_id] = -100
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            total_val += outputs.loss.item()
            del input_ids, attention_mask, labels, outputs
            torch.cuda.empty_cache()

    avg_train = total_train / len(train_loader)
    avg_val   = total_val   / len(val_loader)
    print(f'Epoch {epoch:02d} | Train: {avg_train:.4f} | Val: {avg_val:.4f}')

    if avg_val < best_val_loss:
        best_val_loss = avg_val
        model.save_pretrained(MODEL_DIR)
        tokenizer.save_pretrained(MODEL_DIR)
        print(f'  Saved best model (val={best_val_loss:.4f})')

print('Training complete!')

Training 20 epochs on cuda...


Epoch 1/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 01 | Train: 2.9413 | Val: 1.8517
  Saved best model (val=1.8517)


Epoch 2/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 02 | Train: 1.9201 | Val: 1.6491
  Saved best model (val=1.6491)


Epoch 3/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 03 | Train: 1.7431 | Val: 1.5716
  Saved best model (val=1.5716)


Epoch 4/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 04 | Train: 1.6188 | Val: 1.5080
  Saved best model (val=1.5080)


Epoch 5/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 05 | Train: 1.5303 | Val: 1.4738
  Saved best model (val=1.4738)


Epoch 6/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 06 | Train: 1.4619 | Val: 1.4398
  Saved best model (val=1.4398)


Epoch 7/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 07 | Train: 1.4063 | Val: 1.4304
  Saved best model (val=1.4304)


Epoch 8/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 08 | Train: 1.3563 | Val: 1.3974
  Saved best model (val=1.3974)


Epoch 9/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 09 | Train: 1.3175 | Val: 1.3846
  Saved best model (val=1.3846)


Epoch 10/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 10 | Train: 1.2756 | Val: 1.3700
  Saved best model (val=1.3700)


Epoch 11/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 11 | Train: 1.2360 | Val: 1.3674
  Saved best model (val=1.3674)


Epoch 12/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 12 | Train: 1.2037 | Val: 1.3525
  Saved best model (val=1.3525)


Epoch 13/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 13 | Train: 1.1738 | Val: 1.3406
  Saved best model (val=1.3406)


Epoch 14/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 14 | Train: 1.1529 | Val: 1.3375
  Saved best model (val=1.3375)


Epoch 15/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 15 | Train: 1.1303 | Val: 1.3343
  Saved best model (val=1.3343)


Epoch 16/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 16 | Train: 1.1154 | Val: 1.3325
  Saved best model (val=1.3325)


Epoch 17/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 17 | Train: 1.0951 | Val: 1.3325
  Saved best model (val=1.3325)


Epoch 18/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 18 | Train: 1.0822 | Val: 1.3252
  Saved best model (val=1.3252)


Epoch 19/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 19 | Train: 1.0715 | Val: 1.3239
  Saved best model (val=1.3239)


Epoch 20/20 [Train]:   0%|          | 0/350 [00:00<?, ?it/s]

Epoch 20 | Train: 1.0547 | Val: 1.3238
  Saved best model (val=1.3238)
Training complete!


In [8]:
# CELL 8 - Download model
from google.colab import files
shutil.make_archive('/content/theni_model', 'zip', MODEL_DIR)
files.download('/content/theni_model.zip')
print('Downloaded!')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded!


In [9]:
# CELL 9 - Load model for translation
gc.collect()
torch.cuda.empty_cache()

tokenizer_t = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
base_t      = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16)
model_t     = PeftModel.from_pretrained(base_t, MODEL_DIR, local_files_only=True)
model_t     = model_t.merge_and_unload()
model_t.to(device)
model_t.eval()
print('Model ready!')

def translate(text):
    tokenizer_t.src_lang = SRC_LANG
    inputs = tokenizer_t(
        text, return_tensors='pt',
        max_length=MAX_LENGTH, truncation=True
    ).to(device)
    target_id = tokenizer_t.convert_tokens_to_ids(TGT_LANG)
    with torch.no_grad():
        out = model_t.generate(
            **inputs,
            forced_bos_token_id=target_id,
            num_beams=5,
            max_length=MAX_LENGTH,
            early_stopping=True
        )
    return tokenizer_t.decode(out[0], skip_special_tokens=True)

tests = [
    'நீங்கள் எப்படி இருக்கிறீர்கள்',
    'இன்று மழை பெய்கிறது',
    'நான் சாப்பிட போகிறேன்',
    'அவர் வீட்டிற்கு வருகிறார்'
]
print('\n--- Test Results ---')
for t in tests:
    print(f'Normal Tamil : {t}')
    print(f'Theni Slang  : {translate(t)}')
    print('-' * 50)

Loading weights:   0%|          | 0/512 [00:00<?, ?it/s]



Model ready!

--- Test Results ---
Normal Tamil : நீங்கள் எப்படி இருக்கிறீர்கள்
Theni Slang  : நீங்க எப்படி இருக்கீங்க
--------------------------------------------------
Normal Tamil : இன்று மழை பெய்கிறது
Theni Slang  : இன்று மழை பெய்யுது
--------------------------------------------------
Normal Tamil : நான் சாப்பிட போகிறேன்
Theni Slang  : நான் சாப்பிட போறேன்
--------------------------------------------------
Normal Tamil : அவர் வீட்டிற்கு வருகிறார்
Theni Slang  : அவர் வீட்டுக்கு வருறாருப்பா
--------------------------------------------------


In [13]:
# CELL 10 - Type your own sentence
print('Type any Normal Tamil. Type quit to stop.\n')
while True:
    text = input('Normal Tamil : ').strip()
    if text.lower() == 'quit':
        break
    if text:
        print(f'Theni Slang  : {translate(text)}\n')