In [18]:
from datasets import load_dataset

# Hub identifier of the Banglish -> Bengali transliteration corpus.
DATASET_ID = "SKNahin/bengali-transliteration-data"

# Fetch the corpus from Hugging Face; `dataset` is indexed by split name below.
dataset = load_dataset(DATASET_ID)

# Keep a handle on the raw "train" split. Nothing is held out for validation
# at this point; this cell only selects the split.
train_data = dataset["train"]


In [21]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer encodes both the source and the target text.
TOKENIZER_CHECKPOINT = "t5-small"

# NOTE(review): presumably the t5-small vocabulary was built for English /
# European text -- confirm it can represent Bengali script; if targets come
# back mostly as <unk>, a multilingual checkpoint is needed instead.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Tokenization function for the dataset
def tokenize_pair(banglish, bengali, max_length=128, tok=None):
    """Encode one (Banglish, Bengali) pair as fixed-length seq2seq features.

    Args:
        banglish: Romanised source sentence. The task prefix
            "translate Banglish to Bengali: " is prepended here.
        bengali: Target sentence in Bengali.
        max_length: Length both sequences are padded / truncated to.
        tok: Tokenizer callable to use. Defaults to the notebook-level
            `tokenizer`; it must return "input_ids" and "attention_mask".

    Returns:
        dict with three tensors of length `max_length`:
        "input_ids" and "attention_mask" for the source text, and "labels"
        for the target text with every padded position set to -100.
    """
    tok = tokenizer if tok is None else tok
    encode_kwargs = dict(
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    source = tok(f"translate Banglish to Bengali: {banglish}", **encode_kwargs)
    target = tok(bengali, **encode_kwargs)

    # squeeze(0) drops only the batch axis added by return_tensors="pt";
    # a bare squeeze() would also collapse a length-1 sequence axis.
    labels = target["input_ids"].squeeze(0)
    # -100 is the index the loss ignores; leaving the pad id in place would
    # train the model to predict padding for most of every target.
    labels = labels.masked_fill(target["attention_mask"].squeeze(0) == 0, -100)

    return {
        "input_ids": source["input_ids"].squeeze(0),
        # Without the mask the encoder attends to the padded positions too.
        "attention_mask": source["attention_mask"].squeeze(0),
        "labels": labels,
    }

# Encode the training split one example at a time.
def _encode_example(example):
    """Adapter for `Dataset.map`: feed a row's 'rm' and 'bn' columns to tokenize_pair."""
    return tokenize_pair(example['rm'], example['bn'])


train_data = dataset['train'].map(_encode_example, batched=False)


In [22]:
from transformers import T5ForConditionalGeneration

# Checkpoint the model weights are initialised from.
MODEL_CHECKPOINT = "t5-small"

# Load the pre-trained T5 weights; this is the object handed to the Trainer
# further down for fine-tuning.
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)


In [24]:
import torch

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device. `Module.to` returns the module, so
# assigning the result keeps the cell from echoing the whole model repr.
model = model.to(device)

# No tensors are moved here. `input_ids` / `target_ids` exist only as locals
# inside tokenize_pair, so calling `.to(device)` on them at notebook level
# raised NameError on a fresh kernel. During training the Trainer places each
# batch on the model's device itself.


In [None]:
from transformers import Trainer, TrainingArguments

# Split into train and validation datasets (90% train, 10% validation).
# The code previously held out 20%, contradicting the stated 90/10 split.
# A fixed seed keeps the split identical across kernel restarts.
train_dataset = train_data.train_test_split(test_size=0.1, seed=42)

# Mixed precision needs a CUDA device; requesting it unconditionally is
# rejected on CPU-only machines, so enable it only when a GPU is present.
use_fp16 = torch.cuda.is_available()

# Define training arguments
# NOTE(review): warmup_steps, eval_steps and save_steps are all 500. If one
# epoch at batch size 16 is shorter than 500 optimizer steps, the learning
# rate never finishes warming up and no in-training evaluation or checkpoint
# happens -- confirm against len(train_dataset['train']).
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,                # Reduced epochs
    per_device_train_batch_size=16,    # Increased batch size (adjust to GPU memory)
    per_device_eval_batch_size=16,
    learning_rate=5e-4,                # Higher learning rate for faster convergence
    warmup_steps=500,                  # Adjust based on dataset size
    weight_decay=0.01,
    save_total_limit=1,                # Save fewer checkpoints
    save_steps=500,                    # Save periodically
    logging_dir='./logs',
    logging_steps=100,
    fp16=use_fp16,                     # Mixed precision only when CUDA is available
    evaluation_strategy="steps",       # Evaluate during training
    eval_steps=500,
    report_to="none"                   # Disable W&B logging if not required
)

# Initialize Trainer with the 90% split for training and the 10% split for
# evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset['train'],
    eval_dataset=train_dataset['test'],
)

# Start training
trainer.train()


Step,Training Loss,Validation Loss
