In [21]:

from datasets import load_dataset

# Load the dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# "rm" holds the romanized (Banglish) text and "bn" the Bengali-script text;
# rename them to the generic feature/target names used by the rest of the notebook.
dataset = dataset.rename_column("rm", "feature")
dataset = dataset.rename_column("bn", "target")

print(dataset)

# Split into training and validation (80/20 split).
# The split is performed ONCE: calling train_test_split twice shuffles the data
# twice, so the "train" half of one call overlaps the "test" half of the other
# and validation rows leak into training. A fixed seed keeps the split
# reproducible across kernel restarts.
split_data = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_data = split_data["train"]
val_data = split_data["test"]

print(train_data, val_data)


DatasetDict({
    train: Dataset({
        features: ['target', 'feature'],
        num_rows: 5006
    })
})
Dataset({
    features: ['target', 'feature'],
    num_rows: 4004
}) Dataset({
    features: ['target', 'feature'],
    num_rows: 1002
})


In [22]:
print(train_data.column_names)
print(train_data[0])


['target', 'feature']
{'target': 'নন স্মোকার দের সংখ্যা খুব ই কম', 'feature': 'non smoker der sonngkha khub e kom'}


In [23]:
def filter_valid_rows(example):
    """Tell whether a row has usable text in both columns.

    Args:
        example: Mapping with "target" and "feature" entries.

    Returns:
        True when both entries are not None and are non-empty after
        stripping surrounding whitespace; False otherwise.
    """
    columns = ("target", "feature")
    # First pass: reject missing values before any string method is called.
    for column in columns:
        if example[column] is None:
            return False
    # Second pass: reject values that are empty or whitespace-only.
    for column in columns:
        if example[column].strip() == "":
            return False
    return True

# Drop rows rejected by filter_valid_rows from both splits (train first, then validation).
train_data, val_data = [
    subset.filter(filter_valid_rows) for subset in (train_data, val_data)
]


Filter:   0%|          | 0/4004 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [24]:
print(train_data[0])

{'target': 'নন স্মোকার দের সংখ্যা খুব ই কম', 'feature': 'non smoker der sonngkha khub e kom'}


In [25]:
from transformers import MBart50TokenizerFast

# Instantiate the mBART-50 fast tokenizer with its language tags:
# en_XX for the source side and bn_IN for the target side.
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50",
    tgt_lang="bn_IN",
    src_lang="en_XX",
)

# Define a preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of rows for Banglish -> Bengali fine-tuning.

    The romanized text in "feature" is the model input and the Bengali-script
    text in "target" is the label sequence. (The columns were previously passed
    the other way round, which trained the reverse direction and contradicted
    the tokenizer's src_lang/tgt_lang tags.)

    Args:
        examples: Batch mapping with "feature" and "target" lists of strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch, with the
        source text as the positional input and the target text as ``text_target``.
    """
    return tokenizer(examples["feature"], text_target=examples["target"], truncation=True)

# Apply tokenization to the dataset, one batch of rows at a time.
# NOTE(review): this runs on train_data/val_data as they exist at this point;
# the length filter and whitespace normalization further down the notebook are
# applied afterwards, so they are not reflected in these tokenized datasets.
tokenized_train, tokenized_val = [
    subset.map(preprocess_function, batched=True) for subset in (train_data, val_data)
]


Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [26]:
print(train_data[0])

{'target': 'নন স্মোকার দের সংখ্যা খুব ই কম', 'feature': 'non smoker der sonngkha khub e kom'}


In [27]:
# Filter out overly short or excessively long sentences
def filter_valid_rows(example, min_words=2, max_words=100):
    """Tell whether both texts of a row have an acceptable word count.

    Note: this shares its name with the earlier None/blank filter and
    replaces it in the notebook namespace once this cell has run.

    Args:
        example: Mapping with "target" (Bengali script) and "feature"
            (romanized Banglish) strings.
        min_words: Smallest allowed number of whitespace-separated words.
        max_words: Largest allowed number of whitespace-separated words.

    Returns:
        True when both texts are non-empty and each has between ``min_words``
        and ``max_words`` words (inclusive); False otherwise. Always a bool.
    """
    bengali = example["target"]
    banglish = example["feature"]
    # Reject None / empty strings explicitly so the result is always a bool.
    if not bengali or not banglish:
        return False
    return (
        min_words <= len(bengali.split()) <= max_words
        and min_words <= len(banglish.split()) <= max_words
    )

# Keep only rows accepted by the word-count filter, for the training split
# first and the validation split second.
train_data, val_data = (
    train_data.filter(filter_valid_rows),
    val_data.filter(filter_valid_rows),
)


Filter:   0%|          | 0/4004 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [28]:
print(train_data[0])

{'target': 'নন স্মোকার দের সংখ্যা খুব ই কম', 'feature': 'non smoker der sonngkha khub e kom'}


In [29]:
import re

# Normalize Banglish text: trim the ends and collapse whitespace runs.
def normalize_banglish(text):
    """Return ``text`` stripped, with every whitespace run replaced by one space."""
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)

# Normalize Bengali text: trim the ends and collapse whitespace runs.
# (No character-level handling is performed here; only whitespace is touched.)
def normalize_bangla(text):
    """Return ``text`` stripped, with every whitespace run replaced by one space."""
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)

# Apply normalization independently
def _normalize_row(x):
    """Return whitespace-normalized text for one row.

    "target" holds Bengali script and "feature" holds romanized Banglish, so
    each column is routed to its matching normalizer. (The two normalizers are
    currently identical, so results are unchanged; the routing matters as soon
    as either one gains script-specific rules.)
    """
    return {
        "target": normalize_bangla(x["target"]),
        "feature": normalize_banglish(x["feature"]),
    }

# One shared row function for both splits, instead of two copy-pasted lambdas.
train_data = train_data.map(_normalize_row)
val_data = val_data.map(_normalize_row)


Map:   0%|          | 0/3896 [00:00<?, ? examples/s]

Map:   0%|          | 0/975 [00:00<?, ? examples/s]

In [30]:
print(train_data[0])


{'target': 'নন স্মোকার দের সংখ্যা খুব ই কম', 'feature': 'non smoker der sonngkha khub e kom'}


For this task, mBART (Multilingual BART) is a suitable choice due to its pre-trained multilingual capabilities and sequence-to-sequence design tailored for translation and low-resource languages.

Why mBART?


1. Multilingual Pre-Training:

 mBART is pre-trained on large-scale multilingual datasets across multiple languages.

 It supports both low-resource and high-resource language pairs, making it ideal for transliteration tasks like Banglish to Bengali.



2. Sequence-to-Sequence Architecture:

   mBART’s encoder-decoder structure is specifically designed for translation tasks. It allows efficient handling of context in source (Banglish) and target (Bengali) sequences.


3. Suitability for Low-Resource Tasks:

   Bengali is considered a low-resource language in NLP.
   mBART includes training on Indic scripts like Bengali, providing better adaptability for this task.


4. Tokenization Efficiency:

   mBART uses SentencePiece tokenization, which handles multilingual text efficiently, supporting both Latin (Banglish) and Indic (Bengali) scripts.

5. Ease of Fine-Tuning:
   The Hugging Face library offers a simple interface to fine-tune mBART models, saving time and resources during training.

In [67]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model_name = "facebook/mbart-large-50"
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Re-create the tokenizer with the same language tags that were used when the
# data was tokenized; without them the target-language tag is dropped.
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="en_XX", tgt_lang="bn_IN")

# mBART-50 generation expects the target language code as the first generated
# token; force it so generation during evaluation/prediction emits Bengali.
model.generation_config.forced_bos_token_id = tokenizer.convert_tokens_to_ids("bn_IN")


In [68]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for seq2seq training, built from the current model and tokenizer.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)


In [69]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # Where checkpoints are written, and how many are kept.
    output_dir="./results",
    save_total_limit=3,
    # Optimization hyperparameters.
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Mixed-precision training.
    # NOTE(review): presumably requires a GPU — confirm for the target machine.
    fp16=True,
    # Evaluate once per epoch, scoring generated sequences.
    # NOTE(review): newer transformers releases appear to name this option
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    predict_with_generate=True,
)




In [86]:
from transformers import Seq2SeqTrainer

# Wire the model, the tokenized splits, the collator and the training
# configuration into a single seq2seq trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=tokenized_val,
    train_dataset=tokenized_train,
)


  trainer = Seq2SeqTrainer(



Learning Rate (5e-5):

 A common starting point for fine-tuning pre-trained models: small enough to reduce the risk of overshooting during gradient descent.

Batch Size (16):

Balances memory usage and training speed. Increase or decrease based on available GPU/CPU resources.

Epochs (5):

A moderate number of passes over the roughly 4,000 training examples, chosen to give adequate training while limiting overfitting. Early stopping can be added if validation performance plateaus.

Weight Decay (0.01):

Adds regularization to prevent overfitting, especially for smaller datasets.

In [None]:
import evaluate


import numpy as np

metric = evaluate.load("sacrebleu")

# NOTE(review): no trainer.train() call is visible before this cell; if training
# has not been run, these scores describe the un-fine-tuned checkpoint.
predictions, labels, _ = trainer.predict(tokenized_val)

# Some models return a tuple whose first element holds the generated ids.
if isinstance(predictions, tuple):
    predictions = predictions[0]

# Label padding (and padding added when batches of different lengths are
# concatenated) is stored as -100, which is not a valid token id and makes
# batch_decode fail. Swap it for the pad token so it is dropped as a special token.
pad_id = tokenizer.pad_token_id
predictions = np.where(predictions != -100, predictions, pad_id)
labels = np.where(labels != -100, labels, pad_id)

decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Trim stray surrounding whitespace so it does not affect the score.
decoded_preds = [pred.strip() for pred in decoded_preds]
decoded_labels = [label.strip() for label in decoded_labels]

# sacrebleu takes a list of reference lists: one single-item list per prediction.
result = metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])

print(result)
