In [None]:
!pip install transformers datasets



In [None]:
from datasets import load_dataset

# Hub identifier of the transliteration corpus; rows carry a Bengali-script
# field ("bn") and a romanized field ("rm").
DATASET_ID = "SKNahin/bengali-transliteration-data"

ds = load_dataset(DATASET_ID)

In [None]:
from datasets import DatasetDict
# Split into train and validation.
# A fixed seed makes the shuffle deterministic, so the same rows land in the
# same split on every run and results stay comparable across kernel restarts.
split_ratio = 0.9  # fraction of rows kept for training
SPLIT_SEED = 42
train_test_split = ds["train"].train_test_split(
    test_size=1 - split_ratio,
    seed=SPLIT_SEED,
)
data = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"],  # held-out part is used as validation
})

In [None]:
# Show one training pair to confirm the column names and text direction.
sample_pair = data["train"][50]
print(sample_pair)

{'bn': 'ফেইক পোস্ট, প্রিমিয়াম বাইপাস হয়নি', 'rm': 'Fake post, premium bypass hoyni '}


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the checkpoint being fine-tuned
# (google/mt5-small). A tokenizer from a different model family produces token
# ids that do not correspond to the model's pretrained embeddings or its
# end-of-sequence id. Loading this tokenizer requires the `sentencepiece` package.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

MAX_LENGTH = 128      # maximum number of tokens kept for both source and target
IGNORE_INDEX = -100   # label value skipped by the cross-entropy loss


def preprocess_function(examples):
    """Tokenize a batch of Banglish -> Bengali pairs for seq2seq training.

    Args:
        examples: A batch (dict of lists) with the romanized text under "rm"
            and the Bengali-script text under "bn".

    Returns:
        The tokenizer output for the source texts, plus a "labels" entry
        holding the tokenized targets. Everything is padded/truncated to
        MAX_LENGTH, and padding positions in "labels" are set to IGNORE_INDEX
        so they do not contribute to the training loss.
    """
    model_inputs = tokenizer(
        examples["rm"],               # Banglish text (model input)
        text_target=examples["bn"],   # Bengali text (becomes "labels")
        max_length=MAX_LENGTH,
        padding="max_length",         # fixed-length sequences
        truncation=True,              # cut anything longer than MAX_LENGTH
    )

    # Without this step the model would also be trained to predict the
    # padding tokens that fill the tail of every target sequence.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else IGNORE_INDEX for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs


# Apply preprocessing to both splits, one batch of rows at a time.
tokenized_data = data.map(preprocess_function, batched=True)


config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

Map:   0%|          | 0/4505 [00:00<?, ? examples/s]

Map:   0%|          | 0/501 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import inspect

# Load pre-trained model
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and the trainer's `tokenizer` argument to `processing_class`. The install is
# not version-pinned, so use whichever keyword the installed release accepts.
_args_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
eval_strategy_key = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    save_total_limit=2,            # keep at most two checkpoints on disk
    predict_with_generate=True,    # evaluation runs generate() instead of a plain forward pass
    logging_dir="./logs",
    logging_steps=100,
    save_steps=1000,
    **{eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    **{tokenizer_key: tokenizer},
)

# Train the model
trainer.train()


In [None]:
def translate(text, max_new_tokens=128):
    """Transliterate romanized Bengali ("Banglish") text into Bengali script.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        text: The Banglish input string.
        max_new_tokens: Upper bound on the number of generated tokens. The
            library's default generation length is short and can cut longer
            sentences off mid-way.

    Returns:
        The decoded model output as a string, with special tokens removed.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True)

    # After training, the model may live on an accelerator while freshly
    # tokenized tensors are on the CPU; generate() needs both on one device.
    # Only the two tensors generate() consumes are forwarded, because it
    # rejects keyword arguments the model does not use (e.g. token_type_ids).
    device = model.device
    outputs = model.generate(
        input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        max_new_tokens=max_new_tokens,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example: run one Banglish sentence through the model and display the result.
sample_banglish = "Ami banglay gan gai"
translate(sample_banglish)
