# Theni Tamil Dialect Translator — mBART Fine-tune
**Before running:**
1. Runtime → Disconnect and delete runtime
2. Runtime → Change runtime type → T4 GPU → Save
3. Run each cell top to bottom

In [1]:
# CELL 1 - Install
!pip install -q transformers sentencepiece peft accelerate sacrebleu
print('Done')

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDone


In [2]:
# CELL 2 - Imports
import os, gc, io, torch, shutil, math
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import (
    MBartForConditionalGeneration, MBart50Tokenizer,
    get_cosine_schedule_with_warmup, DataCollatorForSeq2Seq
)
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from tqdm.notebook import tqdm
import sacrebleu   # pip install sacrebleu

gc.collect()
torch.cuda.empty_cache()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
if torch.cuda.is_available():
    free  = torch.cuda.mem_get_info()[0] / 1e9
    total = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'VRAM: {total:.1f} GB total | {free:.1f} GB free')


Device: cuda
GPU: Tesla T4
VRAM: 15.6 GB total | 15.5 GB free


In [3]:
# CELL 3 - Upload CSV
from google.colab import files
print('Upload your dataset.csv ...')
uploaded = files.upload()
filename = list(uploaded.keys())[0]
df_raw   = pd.read_csv(io.BytesIO(uploaded[filename]))
print(f'Uploaded: {filename}')
print(f'Rows: {len(df_raw)}')
print(f'Columns: {list(df_raw.columns)}')
df_raw.head(3)

Upload your dataset.csv ...


Saving datasett.csv to datasett.csv
Uploaded: datasett.csv
Rows: 3111
Columns: ['normal_tamil', 'theni_tamil']


Unnamed: 0,normal_tamil,theni_tamil
0,நீங்கள் எப்படி இருக்கிறீர்கள்,நீங்க எப்படி இருக்கீங்க
1,இன்று மழை பெய்கிறது,இன்னைக்கு மழை பெய்யுது
2,நான் சாப்பிட போகிறேன்,நான் சாப்பிட போறேன்


In [4]:
# CELL 4 - Settings
SOURCE_COL = 'normal_tamil'   # change if your CSV column is named differently
TARGET_COL = 'theni_tamil'    # change if your CSV column is named differently

MODEL_NAME = 'facebook/mbart-large-50'
SRC_LANG   = 'ta_IN'
TGT_LANG   = 'ta_IN'

MAX_LENGTH         = 128
BATCH_SIZE         = 8          # physical batch size (fits T4)
GRAD_ACCUM_STEPS   = 4          # effective batch = 8 × 4 = 32
EPOCHS             = 30         # more epochs with early stopping
LR                 = 5e-5       # lower LR → stabler training
LABEL_SMOOTHING    = 0.1        # regularisation
PATIENCE           = 5          # early-stop if val loss doesn't improve
MODEL_DIR          = '/content/best_model'

df = df_raw[[SOURCE_COL, TARGET_COL]].dropna()
df[SOURCE_COL] = df[SOURCE_COL].str.strip()
df[TARGET_COL] = df[TARGET_COL].str.strip()
df = df[(df[SOURCE_COL].str.len() > 0) & (df[TARGET_COL].str.len() > 0)]

# ── Data augmentation: duplicate rare / short pairs ─────────────────────────
df_aug = df.copy()
# Duplicate pairs where source has fewer than 5 words (harder short sentences)
short_mask = df[SOURCE_COL].str.split().str.len() < 5
df_aug = pd.concat([df, df[short_mask]], ignore_index=True)
print(f'Original pairs: {len(df)} | After augmentation: {len(df_aug)}')

src_train, src_val, tgt_train, tgt_val = train_test_split(
    df_aug[SOURCE_COL].tolist(), df_aug[TARGET_COL].tolist(),
    test_size=0.1, random_state=42
)
print(f'Train: {len(src_train)} | Val: {len(src_val)}')


Original pairs: 3109 | After augmentation: 5442
Train: 4897 | Val: 545


In [5]:
# CELL 5 - Load mBART + LoRA
gc.collect()
torch.cuda.empty_cache()

print('Loading tokenizer...')
tokenizer = MBart50Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

print('Loading base model in fp16...')
base_model = MBartForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16
)
base_model.config.use_cache = False

print('Applying LoRA...')
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=32,                          # ↑ from 16 → 32 (more capacity)
    lora_alpha=64,                 # keep alpha = 2×r
    lora_dropout=0.05,             # small dropout
    # ── target BOTH encoder AND decoder attention + FFN gates ────────────
    target_modules=[
        'q_proj', 'k_proj', 'v_proj', 'out_proj',   # attention
        'fc1', 'fc2'                                  # feed-forward
    ],
    bias='none',
)
model = get_peft_model(base_model, lora_config)

# Cast LoRA params to float32 for stable gradients with fp16 base
for name, param in model.named_parameters():
    if param.requires_grad:
        param.data = param.data.float()

model.to(device)

total     = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Total params    : {total/1e6:.1f}M')
print(f'Trainable params: {trainable/1e6:.2f}M ({100*trainable/total:.2f}%)')
free = torch.cuda.mem_get_info()[0] / 1e9
print(f'Free VRAM       : {free:.1f} GB')


Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

Loading base model in fp16...




config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/519 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Applying LoRA...
Total params    : 1396.3M
Trainable params: 17.30M (1.24%)
Free VRAM       : 12.7 GB


In [6]:
# CELL 6 - Dataset  (dynamic padding via DataCollatorForSeq2Seq)
class TheniDataset(Dataset):
    def __init__(self, sources, targets, tokenizer, max_length=128):
        self.sources   = sources
        self.targets   = targets
        self.tokenizer = tokenizer
        self.max_len   = max_length

    def __len__(self):
        return len(self.sources)

    def __getitem__(self, idx):
        # Encode source
        self.tokenizer.src_lang = SRC_LANG
        model_inputs = self.tokenizer(
            str(self.sources[idx]),
            max_length=self.max_len,
            truncation=True,
        )
        # Encode target: MBart50Tokenizer uses tgt_lang to set the BOS token
        self.tokenizer.src_lang = TGT_LANG
        labels = self.tokenizer(
            str(self.targets[idx]),
            max_length=self.max_len,
            truncation=True,
        )
        self.tokenizer.src_lang = SRC_LANG  # restore
        model_inputs['labels'] = labels['input_ids']
        return model_inputs

# Dynamic padding collator — pads each batch to its longest sequence
collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,   # -100 is ignored in loss
    pad_to_multiple_of=8,      # good for tensor-core alignment
)

train_dataset = TheniDataset(src_train, tgt_train, tokenizer, MAX_LENGTH)
val_dataset   = TheniDataset(src_val,   tgt_val,   tokenizer, MAX_LENGTH)
train_loader  = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                           collate_fn=collator, num_workers=2, pin_memory=True)
val_loader    = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False,
                           collate_fn=collator, num_workers=2, pin_memory=True)
print(f'Steps/epoch: {len(train_loader)}  |  Eff. batch size: {BATCH_SIZE * GRAD_ACCUM_STEPS}')


Steps/epoch: 613  |  Eff. batch size: 32


In [8]:
# CELL 7 - Train  (label smoothing + cosine LR + gradient accumulation + early stopping + BLEU)
import torch.nn.functional as F

def smooth_loss(logits, labels, eps=LABEL_SMOOTHING):
    """Cross-entropy with label smoothing; ignores -100 positions."""
    vocab_size = logits.size(-1)
    # flat
    logits_flat = logits.view(-1, vocab_size)
    labels_flat = labels.view(-1)

    mask = labels_flat != -100
    logits_flat = logits_flat[mask]
    labels_flat = labels_flat[mask]

    log_probs = F.log_softmax(logits_flat, dim=-1)
    nll = F.nll_loss(log_probs, labels_flat, reduction='mean')
    smooth = -log_probs.mean()
    return (1 - eps) * nll + eps * smooth

total_steps  = len(train_loader) // GRAD_ACCUM_STEPS * EPOCHS
warmup_steps = total_steps // 10   # 10 % warm-up

optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=LR, weight_decay=0.01, betas=(0.9, 0.98)
)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

os.makedirs(MODEL_DIR, exist_ok=True)
best_val_loss   = float('inf')
patience_count  = 0
history         = []

print(f'Training {EPOCHS} epochs | {total_steps} total opt-steps | '
      f'{warmup_steps} warmup steps')

for epoch in range(1, EPOCHS + 1):
    # ── Train ───────────────────────────────────────────────────────────────
    model.train()
    total_train = 0
    optimizer.zero_grad()

    for step, batch in enumerate(tqdm(train_loader, desc=f'Epoch {epoch}/{EPOCHS} [Train]')):
        input_ids      = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels         = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        # Use label-smoothed loss instead of model's raw CE
        loss = smooth_loss(outputs.logits, labels) / GRAD_ACCUM_STEPS
        total_train += loss.item() * GRAD_ACCUM_STEPS
        loss.backward()

        if (step + 1) % GRAD_ACCUM_STEPS == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        del input_ids, attention_mask, labels, outputs, loss
        torch.cuda.empty_cache()

    # ── Validate  (loss + BLEU/chrF) ────────────────────────────────────────
    model.eval()
    total_val    = 0
    hyps, refs   = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f'Epoch {epoch}/{EPOCHS} [Val]'):
            input_ids      = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels         = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            total_val += smooth_loss(outputs.logits, labels).item()

            # ── Generate for BLEU (every epoch) ─────────────────────────────
            forced_bos = tokenizer.lang_code_to_id[TGT_LANG]
            gen_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                forced_bos_token_id=forced_bos,
                num_beams=4,
                max_length=MAX_LENGTH,
                early_stopping=True
            )
            decoded_hyp = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
            # Recover reference strings (replace -100 back to pad_id for decoding)
            ref_ids = labels.clone()
            ref_ids[ref_ids == -100] = tokenizer.pad_token_id
            decoded_ref = tokenizer.batch_decode(ref_ids, skip_special_tokens=True)

            hyps.extend(decoded_hyp)
            refs.extend(decoded_ref)

            del input_ids, attention_mask, labels, outputs, gen_ids
            torch.cuda.empty_cache()

    avg_train = total_train / len(train_loader)
    avg_val   = total_val   / len(val_loader)

    bleu_score = sacrebleu.corpus_bleu(hyps, [refs]).score
    chrf_score = sacrebleu.corpus_chrf(hyps, [refs]).score

    history.append({'epoch': epoch, 'train': avg_train, 'val': avg_val,
                    'bleu': bleu_score, 'chrf': chrf_score})

    print(f'Epoch {epoch:02d} | Train: {avg_train:.4f} | Val: {avg_val:.4f} '
          f'| BLEU: {bleu_score:.2f} | chrF: {chrf_score:.2f}')

    if avg_val < best_val_loss:
        best_val_loss  = avg_val
        patience_count = 0
        model.save_pretrained(MODEL_DIR)
        tokenizer.save_pretrained(MODEL_DIR)
        print(f'  ✓ Saved best model (val={best_val_loss:.4f}, BLEU={bleu_score:.2f})')
    else:
        patience_count += 1
        print(f'  No improvement ({patience_count}/{PATIENCE})')
        if patience_count >= PATIENCE:
            print('Early stopping triggered.')
            break

print('\nTraining complete!')
print(f'Best val loss: {best_val_loss:.4f}')

# ── Print training curve ─────────────────────────────────────────────────────
print('\nEpoch | Train  | Val    | BLEU  | chrF')
print('-'*45)
for h in history:
    print(f"{h['epoch']:5d} | {h['train']:.4f} | {h['val']:.4f} | {h['bleu']:5.2f} | {h['chrf']:.2f}")


Training 30 epochs | 4590 total opt-steps | 459 warmup steps


Epoch 1/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 1/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 01 | Train: 5.2418 | Val: 5.0969 | BLEU: 5.60 | chrF: 39.87
  ✓ Saved best model (val=5.0969, BLEU=5.60)


Epoch 2/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 2/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 02 | Train: 5.0642 | Val: 4.8809 | BLEU: 5.88 | chrF: 40.32
  ✓ Saved best model (val=4.8809, BLEU=5.88)


Epoch 3/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 3/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 03 | Train: 4.8678 | Val: 4.7162 | BLEU: 8.68 | chrF: 41.16
  ✓ Saved best model (val=4.7162, BLEU=8.68)


Epoch 4/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 4/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 04 | Train: 4.7049 | Val: 4.5394 | BLEU: 8.45 | chrF: 42.14
  ✓ Saved best model (val=4.5394, BLEU=8.45)


Epoch 5/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 5/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 05 | Train: 4.5445 | Val: 4.4622 | BLEU: 11.37 | chrF: 43.60
  ✓ Saved best model (val=4.4622, BLEU=11.37)


Epoch 6/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 6/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 06 | Train: 4.4478 | Val: 4.3987 | BLEU: 10.13 | chrF: 44.55
  ✓ Saved best model (val=4.3987, BLEU=10.13)


Epoch 7/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 7/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 07 | Train: 4.3633 | Val: 4.3948 | BLEU: 10.84 | chrF: 45.34
  ✓ Saved best model (val=4.3948, BLEU=10.84)


Epoch 8/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 8/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 08 | Train: 4.2957 | Val: 4.3160 | BLEU: 10.42 | chrF: 43.86
  ✓ Saved best model (val=4.3160, BLEU=10.42)


Epoch 9/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 9/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 09 | Train: 4.2636 | Val: 4.2782 | BLEU: 10.50 | chrF: 44.97
  ✓ Saved best model (val=4.2782, BLEU=10.50)


Epoch 10/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 10/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 10 | Train: 4.2127 | Val: 4.2719 | BLEU: 10.70 | chrF: 44.79
  ✓ Saved best model (val=4.2719, BLEU=10.70)


Epoch 11/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 11/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 11 | Train: 4.1418 | Val: 4.2324 | BLEU: 12.28 | chrF: 46.56
  ✓ Saved best model (val=4.2324, BLEU=12.28)


Epoch 12/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 12/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 12 | Train: 4.0906 | Val: 4.2271 | BLEU: 12.98 | chrF: 47.13
  ✓ Saved best model (val=4.2271, BLEU=12.98)


Epoch 13/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 13/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 13 | Train: 4.0437 | Val: 4.1929 | BLEU: 10.70 | chrF: 47.48
  ✓ Saved best model (val=4.1929, BLEU=10.70)


Epoch 14/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 14/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 14 | Train: 3.9970 | Val: 4.1729 | BLEU: 14.37 | chrF: 48.71
  ✓ Saved best model (val=4.1729, BLEU=14.37)


Epoch 15/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 15/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 15 | Train: 3.9646 | Val: 4.1204 | BLEU: 14.96 | chrF: 48.03
  ✓ Saved best model (val=4.1204, BLEU=14.96)


Epoch 16/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 16/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 16 | Train: 3.9106 | Val: 4.1272 | BLEU: 14.51 | chrF: 49.11
  No improvement (1/5)


Epoch 17/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 17/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 17 | Train: 3.8975 | Val: 4.1163 | BLEU: 18.67 | chrF: 50.61
  ✓ Saved best model (val=4.1163, BLEU=18.67)


Epoch 18/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 18/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 18 | Train: 3.8592 | Val: 4.0688 | BLEU: 14.57 | chrF: 51.32
  ✓ Saved best model (val=4.0688, BLEU=14.57)


Epoch 19/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 19/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 19 | Train: 3.8273 | Val: 4.0779 | BLEU: 17.17 | chrF: 51.35
  No improvement (1/5)


Epoch 20/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 20/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 20 | Train: 3.8044 | Val: 4.0454 | BLEU: 17.71 | chrF: 51.59
  ✓ Saved best model (val=4.0454, BLEU=17.71)


Epoch 21/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 21/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 21 | Train: 3.7770 | Val: 4.0390 | BLEU: 19.46 | chrF: 52.50
  ✓ Saved best model (val=4.0390, BLEU=19.46)


Epoch 22/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 22/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 22 | Train: 3.7581 | Val: 4.0210 | BLEU: 19.54 | chrF: 52.51
  ✓ Saved best model (val=4.0210, BLEU=19.54)


Epoch 23/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 23/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 23 | Train: 3.7511 | Val: 4.0278 | BLEU: 19.38 | chrF: 52.19
  No improvement (1/5)


Epoch 24/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 24/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 24 | Train: 3.7324 | Val: 4.0127 | BLEU: 19.74 | chrF: 52.86
  ✓ Saved best model (val=4.0127, BLEU=19.74)


Epoch 25/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 25/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 25 | Train: 3.7143 | Val: 4.0186 | BLEU: 20.84 | chrF: 53.18
  No improvement (1/5)


Epoch 26/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 26/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 26 | Train: 3.7094 | Val: 4.0062 | BLEU: 20.82 | chrF: 52.74
  ✓ Saved best model (val=4.0062, BLEU=20.82)


Epoch 27/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 27/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 27 | Train: 3.6953 | Val: 3.9947 | BLEU: 20.34 | chrF: 53.12
  ✓ Saved best model (val=3.9947, BLEU=20.34)


Epoch 28/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 28/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 28 | Train: 3.6933 | Val: 3.9962 | BLEU: 20.44 | chrF: 53.36
  No improvement (1/5)


Epoch 29/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 29/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 29 | Train: 3.6943 | Val: 3.9963 | BLEU: 20.04 | chrF: 53.18
  No improvement (2/5)


Epoch 30/30 [Train]:   0%|          | 0/613 [00:00<?, ?it/s]

Epoch 30/30 [Val]:   0%|          | 0/69 [00:00<?, ?it/s]

Epoch 30 | Train: 3.6907 | Val: 3.9957 | BLEU: 20.40 | chrF: 52.91
  No improvement (3/5)

Training complete!
Best val loss: 3.9947

Epoch | Train  | Val    | BLEU  | chrF
---------------------------------------------
    1 | 5.2418 | 5.0969 |  5.60 | 39.87
    2 | 5.0642 | 4.8809 |  5.88 | 40.32
    3 | 4.8678 | 4.7162 |  8.68 | 41.16
    4 | 4.7049 | 4.5394 |  8.45 | 42.14
    5 | 4.5445 | 4.4622 | 11.37 | 43.60
    6 | 4.4478 | 4.3987 | 10.13 | 44.55
    7 | 4.3633 | 4.3948 | 10.84 | 45.34
    8 | 4.2957 | 4.3160 | 10.42 | 43.86
    9 | 4.2636 | 4.2782 | 10.50 | 44.97
   10 | 4.2127 | 4.2719 | 10.70 | 44.79
   11 | 4.1418 | 4.2324 | 12.28 | 46.56
   12 | 4.0906 | 4.2271 | 12.98 | 47.13
   13 | 4.0437 | 4.1929 | 10.70 | 47.48
   14 | 3.9970 | 4.1729 | 14.37 | 48.71
   15 | 3.9646 | 4.1204 | 14.96 | 48.03
   16 | 3.9106 | 4.1272 | 14.51 | 49.11
   17 | 3.8975 | 4.1163 | 18.67 | 50.61
   18 | 3.8592 | 4.0688 | 14.57 | 51.32
   19 | 3.8273 | 4.0779 | 17.17 | 51.35
   20 | 3.8044 | 4.045

In [9]:
# CELL 8 - Download model
from google.colab import files
shutil.make_archive('/content/theni_mbart_model', 'zip', MODEL_DIR)
files.download('/content/theni_mbart_model.zip')
print('Downloaded!')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded!


In [10]:
# CELL 9 - Load model for translation
gc.collect()
torch.cuda.empty_cache()

# MBart50Tokenizer must be used (not AutoTokenizer) to preserve lang_code_to_id
tokenizer_t = MBart50Tokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
tokenizer_t.src_lang = SRC_LANG
tokenizer_t.tgt_lang = TGT_LANG

base_t  = MBartForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float16)
model_t = PeftModel.from_pretrained(base_t, MODEL_DIR, local_files_only=True)
model_t = model_t.merge_and_unload()
model_t.to(device)
model_t.eval()
print('Model ready!')

def translate(text):
    tokenizer_t.src_lang = SRC_LANG
    inputs = tokenizer_t(
        text, return_tensors='pt',
        max_length=MAX_LENGTH, truncation=True
    ).to(device)
    # For mBART, forced_bos_token_id must be the target language token id
    forced_bos_token_id = tokenizer_t.lang_code_to_id[TGT_LANG]
    with torch.no_grad():
        out = model_t.generate(
            **inputs,
            forced_bos_token_id=forced_bos_token_id,
            num_beams=5,
            max_length=MAX_LENGTH,
            early_stopping=True
        )
    return tokenizer_t.decode(out[0], skip_special_tokens=True)

tests = [
    'நீங்கள் எப்படி இருக்கிறீர்கள்',
    'இன்று மழை பெய்கிறது',
    'நான் சாப்பிட போகிறேன்',
    'அவர் வீட்டிற்கு வருகிறார்'
]
print('\n--- Test Results ---')
for t in tests:
    print(f'Normal Tamil : {t}')
    print(f'Theni Slang  : {translate(t)}')
    print('-' * 50)

Loading weights:   0%|          | 0/519 [00:00<?, ?it/s]



Model ready!

--- Test Results ---
Normal Tamil : நீங்கள் எப்படி இருக்கிறீர்கள்
Theni Slang  : நீங்க எப்படி இருக்கீங்க
--------------------------------------------------
Normal Tamil : இன்று மழை பெய்கிறது
Theni Slang  : இன்னைக்கு மழை பெய்யுது
--------------------------------------------------
Normal Tamil : நான் சாப்பிட போகிறேன்
Theni Slang  : நான் சாப்பிட போறேன்
--------------------------------------------------
Normal Tamil : அவர் வீட்டிற்கு வருகிறார்
Theni Slang  : அவரு வீட்டுக்கு வராரு
--------------------------------------------------


In [12]:
# CELL 10 - Interactive translation
print('Type any Normal Tamil sentence. Type quit to stop.\n')
while True:
    text = input('Normal Tamil : ').strip()
    if text.lower() == 'quit':
        break
    if text:
        print(f'Theni Slang  : {translate(text)}\n')