In [18]:
# Import the necessary library to load datasets from Hugging Face
from datasets import load_dataset

# Fetch the Banglish <-> Bengali transliteration corpus from the Hugging Face Hub.
# load_dataset returns a mapping of split name -> Dataset.
dataset = load_dataset(path="SKNahin/bengali-transliteration-data")

# Keep a handle on the 'train' split (this selects an existing split;
# nothing is re-partitioned here).
train_data = dataset["train"]


In [21]:
# Import the necessary library from Hugging Face Transformers for tokenization
from transformers import AutoTokenizer

# Load the tokenizer that ships with the "t5-small" checkpoint; it turns raw
# text into the token ids the model consumes.
# NOTE(review): t5-small's vocabulary is, per the T5 documentation, built for
# English/German/French/Romanian text - confirm that Bengali script is not
# collapsed to <unk> before trusting the training loss.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")

# Tokenize one Banglish/Bengali pair into the tensors a seq2seq model trains on
def tokenize_pair(banglish, bengali, max_length=128, tok=None):
    """Encode one transliteration pair as fixed-length inputs and labels.

    Args:
        banglish: Source sentence in romanised Bengali ("Banglish").
        bengali: Target sentence in Bengali script.
        max_length: Length both sequences are padded/truncated to
            (defaults to 128, the value previously hard-coded).
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.

    Returns:
        dict of tensors, each of length ``max_length``:
            ``input_ids``      - token ids of the prefixed source text,
            ``attention_mask`` - 1 for real source tokens, 0 for padding,
            ``labels``         - target token ids with padding set to -100.
    """
    # Resolve the tokenizer lazily so the global is only needed when no
    # explicit tokenizer is supplied.
    tok = tokenizer if tok is None else tok
    encode_kwargs = dict(
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Task prefix tells the model which text-to-text task this example is
    input_text = f"translate Banglish to Bengali: {banglish}"
    source = tok(input_text, **encode_kwargs)
    target_ids = tok(bengali, **encode_kwargs).input_ids

    # Padding positions in the labels must be -100 so the cross-entropy loss
    # ignores them; otherwise the model is trained to predict pad tokens.
    labels = target_ids.masked_fill(target_ids == tok.pad_token_id, -100)

    # The tokenizer returns a batch of one; drop only that batch axis.
    return {
        "input_ids": source.input_ids.squeeze(0),
        # Without the mask the encoder would attend to the padding as well
        "attention_mask": source.attention_mask.squeeze(0),
        "labels": labels.squeeze(0),
    }

# Run every row of the train split through tokenize_pair, one example at a time.
# 'rm' is the Banglish column and 'bn' the Bengali column; the fields returned
# by tokenize_pair are added to the dataset as new columns.
def _tokenize_row(example):
    """Tokenize a single dataset row from its 'rm' and 'bn' fields."""
    return tokenize_pair(example['rm'], example['bn'])


train_data = dataset['train'].map(_tokenize_row, batched=False)


In [22]:
# Import the necessary class from Hugging Face Transformers to load the T5 model
from transformers import T5ForConditionalGeneration

# Instantiate T5 (small variant) with its sequence-to-sequence language-model
# head, starting from the published "t5-small" pre-trained weights.
model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="t5-small")


In [24]:
# Import the torch library to work with PyTorch tensors and models
import torch

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Place the model's parameters on the chosen device.
# (Assigning the result keeps the cell from printing the whole module repr.)
model = model.to(device)

# No manual transfer of input/label tensors happens here: `input_ids` and
# `target_ids` exist only inside tokenize_pair, so referencing them at cell
# level raised NameError on a fresh kernel. The Trainer moves each batch to
# the model's device itself during training and evaluation.


In [None]:
# Import necessary classes from Hugging Face Transformers for training
from transformers import Trainer, TrainingArguments

from dataclasses import fields

# Hold out 20% of the tokenized examples for validation. The fixed seed keeps
# the split (and so the reported validation loss) identical across kernel restarts.
# Note: `train_dataset` holds BOTH splits, under the keys 'train' and 'test'.
train_dataset = train_data.train_test_split(test_size=0.2, seed=42)

train_batch_size = 16
num_epochs = 1

# Estimated optimizer steps for the whole run (assumes a single device and no
# gradient accumulation - with more devices the real count is lower).
steps_per_epoch = -(-len(train_dataset['train']) // train_batch_size)  # ceil division
total_steps = max(1, steps_per_epoch * num_epochs)

# A fixed 500-step warm-up / evaluation interval silently does nothing when the
# run is shorter than 500 steps: the learning rate never reaches its target and
# no validation loss is ever reported. Cap both by the length of the run; for
# long runs the original value of 500 is kept.
warmup_steps = min(500, total_steps // 10)
eval_steps = min(500, total_steps)

# Mixed precision is only requested when CUDA is present; asking for fp16
# without a supported accelerator is rejected by TrainingArguments, which
# would break the CPU fallback this notebook otherwise supports.
use_fp16 = torch.cuda.is_available()

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# pick whichever keyword the installed version defines.
eval_strategy_key = (
    "eval_strategy"
    if any(f.name == "eval_strategy" for f in fields(TrainingArguments))
    else "evaluation_strategy"
)

# Define the training arguments, which configure the training process
training_args = TrainingArguments(
    output_dir='./results',                       # Directory for checkpoints and results
    num_train_epochs=num_epochs,                  # Single pass over the data for fast iteration
    per_device_train_batch_size=train_batch_size, # Batch size per device (raise if GPU memory allows)
    per_device_eval_batch_size=16,                # Batch size for evaluation
    learning_rate=5e-4,                           # Peak learning rate reached after warm-up
    warmup_steps=warmup_steps,                    # Linear warm-up, capped to the run length
    weight_decay=0.01,                            # Weight decay regularisation
    save_total_limit=1,                           # Keep only the most recent checkpoint
    save_steps=500,                               # Checkpoint every 500 steps
    logging_dir='./logs',                         # Directory to save logs
    logging_steps=100,                            # Log the training loss every 100 steps
    fp16=use_fp16,                                # Mixed precision only when CUDA is available
    eval_steps=eval_steps,                        # Evaluate every `eval_steps` optimizer steps
    report_to="none",                             # Disable reporting to external tools like W&B
    **{eval_strategy_key: "steps"},               # Evaluate on a step schedule
)

# Initialize the Trainer class with model, training arguments, and datasets
trainer = Trainer(
    model=model,                           # The model to fine-tune
    args=training_args,                    # The training arguments configured above
    train_dataset=train_dataset['train'],  # 80% split used for optimisation
    eval_dataset=train_dataset['test'],    # 20% split used for validation loss
)

# Start the training process
trainer.train()


Step,Training Loss,Validation Loss
