In [None]:
# Install required libraries
!pip install transformers datasets peft accelerate sentencepiece

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import (
    DataCollatorForSeq2Seq,
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    TrainingArguments,
    Trainer
)

# Report whether a CUDA device is visible and, if so, which one.
has_gpu = torch.cuda.is_available()
print('GPU Available:', has_gpu)
if has_gpu:
    gpu_name = torch.cuda.get_device_name(0)
    print('GPU Name:', gpu_name)

## 📂 Upload Dataset
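Upload `theni_slang_dataset.csv` to `/content/` (the path the next cell reads from). The file must contain these columns:
- `normal_tamil` — the source sentence (model input)
- `theni_slang` — the target sentence (training label)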

In [None]:
# Load Dataset
# Columns the preprocessing step reads: source text and target text.
REQUIRED_COLUMNS = ['normal_tamil', 'theni_slang']

df = pd.read_csv('/content/theni_slang_dataset.csv')

# Fail early with a clear message instead of a KeyError during tokenization.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f'Dataset is missing required columns: {missing_columns}')

# Empty cells are read as NaN (a float), which the tokenizer cannot encode;
# drop incomplete pairs and renumber the rows.
df = df.dropna(subset=REQUIRED_COLUMNS).reset_index(drop=True)
if df.empty:
    raise ValueError('Dataset has no complete (normal_tamil, theni_slang) rows')

# preserve_index=False keeps the pandas index out of the Dataset columns.
dataset = Dataset.from_pandas(df, preserve_index=False)
df.head()

In [None]:
# Load Base mBART Model
model_name = 'facebook/mbart-large-50-many-to-many-mmt'

tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Both sides of this task are Tamil. The target language must be set
# explicitly so the language token prepended to the training labels is
# 'ta_IN' -- the same token that generation later forces as the first
# decoder token.
tokenizer.src_lang = 'ta_IN'
tokenizer.tgt_lang = 'ta_IN'

# Recompute activations during the backward pass to lower peak memory.
model.gradient_checkpointing_enable()

In [None]:
# Preprocessing
def preprocess(example, max_length=64):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        example: Batch mapping with 'normal_tamil' (source texts) and
            'theni_slang' (target texts), as supplied by a batched
            `Dataset.map` call.
        max_length: Truncation limit in tokens, applied to both the source
            and the target texts.

    Returns:
        The tokenizer output for the source texts, with the token ids of the
        target texts stored under 'labels'.
    """
    # `text_target=` encodes the targets in target-language mode and stores
    # their input ids under 'labels'. It replaces the deprecated
    # `as_target_tokenizer()` context manager.
    return tokenizer(
        example['normal_tamil'],
        text_target=example['theni_slang'],
        max_length=max_length,
        truncation=True
    )

# Run `preprocess` over the dataset in batches, adding the tokenized inputs
# and `labels` as new columns.
dataset = dataset.map(function=preprocess, batched=True)

In [None]:
# Apply LoRA
# Low-rank adapters are attached to the attention query and value
# projections; bias terms are left out of the adapter.
lora_config = LoraConfig(
    task_type='SEQ_2_SEQ_LM',
    target_modules=['q_proj', 'v_proj'],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias='none',
)

# Wrap the base model, then report how many parameters remain trainable.
model = get_peft_model(model, peft_config=lora_config)
model.print_trainable_parameters()

In [None]:
# Training Arguments
training_args = TrainingArguments(
    output_dir='./theni_mbart_lora',
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size per device: 2 * 4 = 8
    num_train_epochs=3,
    learning_rate=2e-4,
    # Mixed precision requires a CUDA device; fall back to fp32 so this cell
    # does not raise on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_total_limit=1,
    report_to='none'
)

# The dataset is tokenized without padding, so the examples in a batch have
# different lengths and cannot be stacked as-is. This collator pads the inputs
# per batch and pads `labels` with -100 so padded positions are ignored by
# the loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=-100
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator
)

trainer.train()

In [None]:
# Save LoRA Adapter
# The model and the tokenizer are written to the same directory so that both
# can be reloaded from one path.
adapter_dir = './theni_mbart_lora'
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)

## 🔮 Inference Example

In [None]:
# Load for Inference
# Reload a fresh base model and attach the saved LoRA adapter to it.
base_model = MBartForConditionalGeneration.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, './theni_mbart_lora')
model.eval()

# Sample input in standard Tamil. The literal was previously stored as
# mis-decoded bytes (mojibake), which would have been tokenized as garbage.
text = 'நான் வருகிறேன்'

tokenizer.src_lang = 'ta_IN'
inputs = tokenizer(text, return_tensors='pt').to(model.device)

# Force the Tamil language code as the first generated token. Looking the id
# up through the vocabulary does not depend on the tokenizer exposing a
# `lang_code_to_id` attribute.
generated = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.convert_tokens_to_ids('ta_IN'),
    max_length=64
)

print('Input:', text)
print('Output:', tokenizer.decode(generated[0], skip_special_tokens=True))