In [1]:
!pip install transformers datasets torch tokenizers
!pip install huggingface_hub
!pip install evaluate



In [2]:
from datasets import load_dataset
import pandas as pd

# Load dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# View dataset structure
print(dataset)

# Display first 5 rows
df = pd.DataFrame(dataset['train'])
print(df.head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})
                                                  bn  \
0      স্ক্রোল করে ২০/৩০ সেকেন্ড এর ভিডিও পান নাই???   
1                         ও গুলা টরেন্ট সাইট এ পাবেন   
2  ভক্কর চক্কর পোস্ট একটা করলেই এপ্রুভড.… নিশ্চই ...   
3                           আমি টেস্ট করেই কোড দিছি…   
4  এতো কষ্টের কি আছে সাকিবওয়াপ.টক,সাকিবওয়াপ.মল&এআ...   

                                                  rm  
0      scroll kore 20/30 second er video pann nai???  
1                        o gula Torrent site e paben  
2  vokkor chokkor post akta korlei approved…. nis...  
3                         ami test koreii code disi…  
4  eto koster ki ache shakibwap.tk,shakibwap.ml&a...  


In [3]:
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small", legacy=False)

# Preprocessing function
def preprocess_data(examples):
    # Process input ('rm') and target ('bn') columns
    inputs = tokenizer(examples['rm'], padding="max_length", truncation=True, max_length=64)
    targets = tokenizer(examples['bn'], padding="max_length", truncation=True, max_length=64)
    inputs['labels'] = targets['input_ids']
    return inputs



In [4]:
from sklearn.model_selection import train_test_split
import datasets

# Load data into DataFrame
df = pd.DataFrame(dataset['train'])

# Split 90% training and 10% validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['rm'], df['bn'], test_size=0.1
)

# Rebuild datasets
train_data = datasets.Dataset.from_dict({"rm": train_texts, "bn": train_labels})
val_data = datasets.Dataset.from_dict({"rm": val_texts, "bn": val_labels})

# Preprocess datasets
train_dataset = train_data.map(preprocess_data, batched=True)
val_dataset = val_data.map(preprocess_data, batched=True)

Map:   0%|          | 0/4505 [00:00<?, ? examples/s]

Map:   0%|          | 0/501 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load pre-trained model
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=10,
)

# Define trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

  trainer = Seq2SeqTrainer(


In [6]:
# Train model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnishatmahmud[0m ([33mnishatmahmud-jagannath-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,17.2656,11.828429
2,7.4852,4.776445
3,3.7302,2.637905
4,2.7304,2.071134
5,2.7462,1.993669


TrainOutput(global_step=1410, training_loss=10.53494504292806, metrics={'train_runtime': 1048.8064, 'train_samples_per_second': 21.477, 'train_steps_per_second': 1.344, 'total_flos': 1488760848384000.0, 'train_loss': 10.53494504292806, 'epoch': 5.0})

In [7]:
!pip install evaluate



In [11]:
import evaluate
from transformers import pipeline
import torch

# Set device (GPU or CPU)
device = 0 if torch.cuda.is_available() else -1

# Load the trained model and tokenizer
translator = pipeline("translation", model="./banglish-to-bangla-model", tokenizer=tokenizer, device=device)

# Load BLEU metric
metric = evaluate.load("bleu")

def compute_bleu():
    predictions = []
    references = []

    # Evaluate predictions
    for data in val_dataset:
        input_text = data['rm']
        label_text = data['bn']

        # Add translation prefix for better context
        input_text = "translate Banglish to Bangla: " + input_text

        # Perform translation with increased max_length
        pred = translator(input_text, max_length=128, truncation=True)[0]['translation_text']
        predictions.append(pred)
        references.append([label_text])  # References need nested format

    # Compute BLEU score
    result = metric.compute(predictions=predictions, references=references)
    print("BLEU Score:", result['bleu'])

# Call the BLEU computation function
compute_bleu()

Device set to use cuda:0


BLEU Score: 0.0


In [13]:
# Save the model locally
model.save_pretrained("./banglish-to-bangla-model")
tokenizer.save_pretrained("./banglish-to-bangla-model")

# Upload to HuggingFace Hub
from huggingface_hub import notebook_login
notebook_login()

model.push_to_hub("banglish-to-bangla-transliterator")
tokenizer.push_to_hub("banglish-to-bangla-transliterator")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nishatmahmud/banglish-to-bangla-transliterator/commit/06e731c331246cc1e5f3fedc2332aac319359e6a', commit_message='Upload tokenizer', commit_description='', oid='06e731c331246cc1e5f3fedc2332aac319359e6a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nishatmahmud/banglish-to-bangla-transliterator', endpoint='https://huggingface.co', repo_type='model', repo_id='nishatmahmud/banglish-to-bangla-transliterator'), pr_revision=None, pr_num=None)

In [15]:
from transformers import pipeline
import torch

# Set device (GPU or CPU)
device = 0 if torch.cuda.is_available() else -1

# Load model and tokenizer
translator = pipeline("translation", model="./banglish-to-bangla-model", tokenizer=tokenizer, device=device)

# Test input with task prefix
test_input = "translate Banglish to Bangla: ami tomake bhalobashi"

# Generate output
output = translator(test_input, max_length=64, truncation=True)

print("Predicted Output:", output[0]['translation_text'])

Device set to use cuda:0


Predicted Output: <extra_id_0> করলে                  
