In [1]:
!pip install transformers datasets torch scikit-learn numpy -q

# Importing Libraries

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    MBartForConditionalGeneration,
    MBartTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset
import numpy as np
from sklearn.model_selection import train_test_split

# Choose the compute device: CUDA when a GPU is usable, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


## Exploring Dataset Structure

In [3]:
# Fetch the corpus from the Hugging Face Hub and show its overall layout
dataset = load_dataset("SKNahin/bengali-transliteration-data")
print("Dataset structure:")
print(dataset)

# List the columns available in the train split
print("\nColumns in training set:")
print(dataset['train'].column_names)

# Preview the first three records
print("\nFirst few examples:")
preview = dataset['train'].select(range(3))
for number, record in enumerate(preview, start=1):
    print(f"\nExample {number}:")
    print(record)

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})

Columns in training set:
['bn', 'rm']

First few examples:

Example 1:
{'bn': 'স্ক্রোল করে ২০/৩০ সেকেন্ড এর ভিডিও পান নাই???', 'rm': 'scroll kore 20/30 second er video pann nai???'}

Example 2:
{'bn': 'ও গুলা টরেন্ট সাইট এ পাবেন', 'rm': 'o gula Torrent site e paben'}

Example 3:
{'bn': 'ভক্কর চক্কর পোস্ট একটা করলেই এপ্রুভড.… নিশ্চই  ঘাবলা আছে', 'rm': 'vokkor chokkor post akta korlei approved…. nishchoi ghabla ache'}


## Data loading Function

In [4]:
def load_and_preprocess_data(
    dataset_name="SKNahin/bengali-transliteration-data",
    test_size=0.1,
    random_state=42,
):
    """Load the transliteration corpus and split it into train/validation sets.

    Args:
        dataset_name: Hugging Face Hub identifier passed to ``load_dataset``.
        test_size: Portion of the pairs held out for validation, forwarded to
            ``train_test_split`` (default 0.1, i.e. a 90-10 split).
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.

    Returns:
        A 4-tuple ``(train_texts, val_texts, train_labels, val_labels)``. The
        ``*_texts`` entries come from the ``'rm'`` (romanized) column and the
        ``*_labels`` entries from the ``'bn'`` (Bengali) column of the train split.
    """
    train_split = load_dataset(dataset_name)['train']

    # Materialise both columns as plain Python lists so the split yields ordinary
    # lists that can be indexed by position, however the installed `datasets`
    # version represents a column.
    banglish_texts = list(train_split['rm'])  # 'rm' for romanized
    bengali_texts = list(train_split['bn'])   # 'bn' for Bengali

    # Both sequences are split together so source/target pairs stay aligned.
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        banglish_texts,
        bengali_texts,
        test_size=test_size,
        random_state=random_state,
    )

    return train_texts, val_texts, train_labels, val_labels

# Build the train/validation splits and report their sizes
train_texts, val_texts, train_labels, val_labels = load_and_preprocess_data()
print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

# Show the first three aligned (Banglish, Bengali) training pairs
print("\nFirst few examples:")
for number in (1, 2, 3):
    print(f"\nExample {number}:")
    print(f"Banglish: {train_texts[number - 1]}")
    print(f"Bengali: {train_labels[number - 1]}")

Training samples: 4505
Validation samples: 501

First few examples:

Example 1:
Banglish: 2 minute ar account block kore dice…..post delete kore din
Bengali: ২ মিনিট এর একাউন্ট ব্লক করে দিছে…..পোস্ট ডিলিট করে দিন

Example 2:
Banglish: Voy ke joy korun
Bengali: ভয় কে জয় করুন 

Example 3:
Banglish: apnar phoner net speed app er nam ta ki ar apps ta ki link dite parben ki
Bengali: আপনার ফোনের নেট স্পিড অ্যাপ এর নাম টা কি আর অ্যাপ্স টা কি লিংক দিতে পারবেন কি 


## Creating dataset class

In [5]:
class BanglishBengaliDataset(Dataset):
    """Map-style dataset yielding tokenized (Banglish, Bengali) pairs.

    Args:
        texts: Sequence of source (romanized) strings.
        labels: Sequence of target (Bengali) strings, aligned with ``texts``.
        tokenizer: Callable tokenizer. It is called with the source string
            positionally and with the target string through ``text_target=``;
            it must return ``input_ids`` / ``attention_mask`` tensors with a
            leading batch dimension of 1.
        max_length: Length every sequence is padded/truncated to.
        label_pad_token_id: Optional id written over the padding positions of
            ``labels`` (``-100`` is the value PyTorch's cross-entropy loss
            ignores by default). ``None`` (default) leaves the tokenizer's own
            pad id in place. When a negative id is used here, the metric code
            has to map it back to a real token id before decoding the labels.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128, label_pad_token_id=None):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_pad_token_id = label_pad_token_id

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for pair ``idx``."""
        tokenize_kwargs = dict(
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Encode Banglish input (source side).
        text_encoding = self.tokenizer(self.texts[idx], **tokenize_kwargs)

        # Encode Bengali target. Passing it as `text_target` asks Hugging Face
        # seq2seq tokenizers to use their target-side settings; for mBART this
        # selects the `tgt_lang` code rather than the `src_lang` code that a
        # plain positional call would attach.
        label_encoding = self.tokenizer(text_target=self.labels[idx], **tokenize_kwargs)

        # squeeze(0) removes only the batch dimension, so a length-1 sequence
        # still comes out as a 1-D tensor.
        label_ids = label_encoding['input_ids'].squeeze(0)
        if self.label_pad_token_id is not None:
            label_ids = label_ids.masked_fill(
                label_ids == self.tokenizer.pad_token_id,
                self.label_pad_token_id,
            )

        return {
            'input_ids': text_encoding['input_ids'].squeeze(0),
            'attention_mask': text_encoding['attention_mask'].squeeze(0),
            'labels': label_ids
        }

## Initializing Model and tokenizer

In [6]:
def prepare_model_and_tokenizer():
    """Load the pretrained mBART checkpoint and its tokenizer.

    The tokenizer is configured with ``en_XX`` as source language (used for the
    romanized Banglish input) and ``bn_IN`` as target language. The model is
    placed on the module-level ``device``.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    checkpoint = "facebook/mbart-large-cc25"

    tokenizer = MBartTokenizer.from_pretrained(checkpoint)
    tokenizer.src_lang = "en_XX"  # Using English tokens for Banglish
    tokenizer.tgt_lang = "bn_IN"  # Bengali

    # Load the weights and place them on the selected device in one step
    model = MBartForConditionalGeneration.from_pretrained(checkpoint).to(device)

    return model, tokenizer

# Instantiate the model/tokenizer pair used by every later cell.
model, tokenizer = prepare_model_and_tokenizer()
# NOTE: the model is placed on `device`, which is only a GPU when CUDA was detected.
print("Model loaded and moved to GPU")



Model loaded and moved to GPU


## Creating Evaluation Metrics

In [7]:
def levenshtein_distance(s1, s2):
    """Return the edit distance (insert / delete / substitute) between two sequences.

    The computation keeps only two rows of the dynamic-programming table and
    always iterates the longer sequence in the outer loop.
    """
    # Put the longer sequence first so each DP row has the shorter length.
    longer, shorter = (s1, s2) if len(s1) >= len(s2) else (s2, s1)

    if len(shorter) == 0:
        return len(longer)

    prev_row = range(len(shorter) + 1)
    for row_number, item_a in enumerate(longer, start=1):
        row = [row_number]
        for col, item_b in enumerate(shorter):
            cost_insert = prev_row[col + 1] + 1
            cost_delete = row[col] + 1
            cost_replace = prev_row[col] + (item_a != item_b)
            row.append(min(cost_insert, cost_delete, cost_replace))
        prev_row = row

    return prev_row[-1]

# Evaluation metrics for generated transliterations: CER and exact-match ratio
def compute_metrics(eval_pred):
    """Compute character error rate and exact-match ratio for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Entries
            equal to ``-100`` in either array are treated as padding.

    Returns:
        dict with two floats:
        ``"character_error_rate"`` is the mean over samples of
        ``levenshtein_distance(pred, label) / len(label)``;
        ``"exact_match_ratio"`` is the share of samples whose decoded
        prediction equals the decoded label. Both are 0.0 for an empty
        evaluation set.
    """
    predictions, labels = eval_pred
    # NOTE(review): some trainer/model combinations return a tuple whose first
    # element holds the generated ids; keep only that element in that case.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the conventional "ignore" id and is not a real vocabulary entry,
    # so it is swapped for the pad id (dropped by skip_special_tokens) before
    # decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    num_samples = len(decoded_labels)
    if num_samples == 0:
        return {
            "character_error_rate": 0.0,
            "exact_match_ratio": 0.0,
        }

    total_cer = 0.0
    total_matches = 0
    for pred, label in zip(decoded_preds, decoded_labels):
        distance = levenshtein_distance(pred, label)
        # A label that decodes to "" would otherwise divide by zero; with a
        # denominator of 1 the sample contributes its raw edit distance.
        total_cer += distance / max(len(label), 1)

        if pred == label:
            total_matches += 1

    return {
        "character_error_rate": total_cer / num_samples,
        "exact_match_ratio": total_matches / num_samples,
    }

## Creating training Dataset

In [8]:
# Wrap both splits in BanglishBengaliDataset; max_length is left at the class default (128).
train_dataset = BanglishBengaliDataset(train_texts, train_labels, tokenizer)
val_dataset = BanglishBengaliDataset(val_texts, val_labels, tokenizer)

# Report the number of examples in each wrapped split.
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

Training dataset size: 4505
Validation dataset size: 501


## Setting up Training Arguments and Trainer

In [9]:
# Training arguments with matching save and eval strategies
# (load_best_model_at_end needs a checkpoint at every evaluation point).
training_args = Seq2SeqTrainingArguments(
    output_dir="./banglish-bengali-translator",
    evaluation_strategy="steps",     # Evaluate every N steps
    save_strategy="steps",           # Same cadence as evaluation_strategy
    eval_steps=100,                  # Evaluate every 100 steps
    save_steps=100,                  # Save every 100 steps (matching eval_steps)
    logging_strategy="steps",        # Log metrics
    logging_steps=5,                 # Log every 5 steps
    learning_rate=2e-5,
    per_device_train_batch_size=8,   # Adjust if needed based on GPU memory
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    # Labels are truncated to 128 tokens by the dataset, so generating more
    # than that during evaluation cannot improve the metrics.
    generation_max_length=128,
    # Mixed precision is only requested when CUDA is present, so the CPU
    # fallback chosen for `device` still trains.
    fp16=torch.cuda.is_available(),
    # Disable logging integrations (e.g. wandb) so training starts without an
    # interactive login prompt and the notebook can be run end to end.
    report_to="none",
    load_best_model_at_end=True,     # Load the best model when training ends
    metric_for_best_model="character_error_rate",  # Use CER to determine best model
    greater_is_better=False          # Lower CER is better
)

# Initialize trainer; generation-based evaluation feeds compute_metrics
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


## Training the model

In [None]:
# Run fine-tuning with the trainer configured above.
print("\nStarting training...")
train_result = trainer.train()

# Report the training loss stored on the value returned by trainer.train()
print("\nTraining completed. Final metrics:")
print(f"Training loss: {train_result.training_loss:.4f}")


Starting training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

In [None]:
# Shell call to the NVIDIA status tool (requires an NVIDIA driver on the host).
!nvidia-smi

## Simple Eval

In [None]:
# Score the model on the validation split registered with the trainer
eval_results = trainer.evaluate()
print("\nFinal Evaluation Results:")
for name in eval_results:
    score = eval_results[name]
    print(f"{name}: {score:.4f}")

## Saving the model

In [None]:
# Write the model and the tokenizer into the same directory so both can be
# loaded again from that single path.
output_dir = "./banglish-bengali-translator-final"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

## Testing the Model with Some Examples

In [None]:
def translate_text(text, model, tokenizer, max_length=128, num_beams=4):
    """Transliterate Banglish text into Bengali script using beam search.

    Args:
        text: Romanized (Banglish) input string.
        model: Seq2seq model exposing ``device``, ``eval()`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``; must know the ``bn_IN`` token.
        max_length: Cap applied to both the tokenized input and the generated
            output (default 128).
        num_beams: Beam width used for generation (default 4).

    Returns:
        The first generated sequence decoded to text with special tokens removed.
    """
    # Switch off dropout so repeated calls give the same output even if the
    # model was left in training mode.
    model.eval()

    # Prepare the text into tokenized ids; overly long inputs are truncated to
    # the same cap used for generation.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    # Run on whichever device the model already lives on instead of assuming
    # 'cuda' and moving the model on every call.
    target_device = model.device
    inputs = {name: tensor.to(target_device) for name, tensor in inputs.items()}

    # mbart-large-cc25 expects the target language code as the decoder start
    # token (`forced_bos_token_id` is the mBART-50 convention).
    translated = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        length_penalty=1.0,
        early_stopping=True,
        decoder_start_token_id=tokenizer.convert_tokens_to_ids("bn_IN")
    )

    # Decode the generated tokens to text
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text

# Hand-written Banglish phrases used as a quick qualitative check
test_examples = [
    "ami tomake bhalobashi",
    "tumi kemon acho",
    "bangla bhasha amader praner bhasha"
]

# Print each input next to the model's Bengali output
print("\nTesting the model with some examples:")
for text in test_examples:
    translated = translate_text(text, model, tokenizer)
    print(f"\nInput (Banglish): {text}")
    print(f"Output (Bengali): {translated}")