In [1]:
# Cell 1: Imports and Memory Management
import torch
import gc
from transformers import (
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

# Start from a clean slate: run Python garbage collection first, then (only
# when a CUDA device is present) ask PyTorch to release its cached GPU memory.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [2]:
# Cell 2: Constants with minimal memory usage
# Every tunable value lives here; later cells read these names instead of literals.
EPOCHS = 3  # passed to TrainingArguments as num_train_epochs
BATCH_SIZE = 4  # Very small batch size; used for both the train and eval per-device batch size
MAX_LENGTH = 64  # Reduced sequence length; tokenizer max_length for inputs and targets in BanglishDataset
LEARNING_RATE = 1e-4  # passed to TrainingArguments as learning_rate
MODEL_NAME = "google/mt5-small"  # checkpoint id loaded for both the tokenizer and the model

In [3]:
# Cell 3: Load Dataset with Memory Optimization
def load_and_prepare_data():
    """Load the transliteration corpus and split it into train/validation lists.

    Reads the "rm" and "bn" columns from the "train" split of
    "SKNahin/bengali-transliteration-data", materialises both as plain Python
    lists, and drops the dataset handle before splitting.

    Returns:
        list: The four lists produced by ``train_test_split`` in this order:
        "rm" train, "rm" validation, "bn" train, "bn" validation. The
        validation share is 10% and the split is seeded with 42.
    """
    train_split = load_dataset("SKNahin/bengali-transliteration-data")["train"]

    # Copy the two text columns out so the dataset object can be released.
    source_column = list(train_split["rm"])
    target_column = list(train_split["bn"])

    del train_split
    gc.collect()

    splits = train_test_split(
        source_column,
        target_column,
        test_size=0.1,
        random_state=42,
    )
    return splits

# Materialise the four splits once, then report how many samples each side holds.
train_banglish, val_banglish, train_bengali, val_bengali = load_and_prepare_data()
for split_label, split_texts in (("Training", train_banglish), ("Validation", val_banglish)):
    print(f"{split_label} samples: {len(split_texts)}")

Training samples: 4505
Validation samples: 501


In [4]:
# Cell 4: Memory-Efficient Dataset Class
class BanglishDataset(Dataset):
    """Map-style dataset that tokenizes (source, target) text pairs on access.

    Each item is tokenized lazily in ``__getitem__`` so that only the raw
    strings are kept in memory.

    Args:
        texts: Sequence of source strings; each is prefixed with
            ``"transliterate: "`` before tokenization.
        labels: Sequence of target strings, aligned index-by-index with ``texts``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors; its ``pad_token_id`` attribute is used
            to mask padding in the labels.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    # Label value skipped by the loss (PyTorch cross-entropy's default
    # ignore_index, which the Hugging Face seq2seq models rely on).
    IGNORE_INDEX = -100

    def __init__(self, texts, labels, tokenizer, max_length):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def _encode(self, text):
        """Tokenize one string to a fixed ``max_length`` and return the batch-of-one encoding."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for sample ``idx``.

        All three are 1-D tensors of length ``max_length``. Padding positions
        in ``labels`` are replaced by ``IGNORE_INDEX`` so they do not
        contribute to the training loss.
        """
        banglish_text = "transliterate: " + str(self.texts[idx])
        bengali_text = str(self.labels[idx])

        source = self._encode(banglish_text)
        target = self._encode(bengali_text)

        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when it has length 1.
        # clone() so the masking below never writes into the tokenizer's output.
        labels = target["input_ids"].squeeze(0).clone()

        # Without this, the model is trained (and evaluated) on predicting
        # padding, which dominates the loss for short targets.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            labels[labels == pad_id] = self.IGNORE_INDEX

        return {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
            "labels": labels
        }

In [12]:
# Cell 5: Initialize Model with Memory Optimizations (Updated)
tokenizer = MT5Tokenizer.from_pretrained(MODEL_NAME)

# Load the model WITHOUT device_map="auto": that option hands device placement
# to accelerate (and may shard the model across several GPUs), which conflicts
# with the explicit model.to(device) below and with Trainer's own placement.
# use_cache=False is kept because training runs with gradient checkpointing
# enabled (see the training arguments cell).
model = MT5ForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    low_cpu_mem_usage=True,
    use_cache=False
)

# Single, explicit placement: GPU if available, otherwise CPU.
# `device` is also used by the inference cell to move tokenized inputs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create datasets (tokenization happens lazily, per item)
train_dataset = BanglishDataset(train_banglish, train_bengali, tokenizer, MAX_LENGTH)
val_dataset = BanglishDataset(val_banglish, val_bengali, tokenizer, MAX_LENGTH)

# Clear memory left over from loading
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()



In [13]:
# Cell 6: Memory-Optimized Training Arguments (Updated)
# NOTE(review): the recorded run below logs a training loss of 0.0 and an empty
# validation loss at every evaluation step — TODO confirm after Restart & Run All
# that the run really used the precision flags set here.
training_args = TrainingArguments(
    # --- output and bookkeeping ---
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    save_total_limit=2,
    # --- optimisation schedule ---
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    weight_decay=0.01,
    optim="adamw_torch",
    # --- batching and data loading ---
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    dataloader_num_workers=0,
    remove_unused_columns=True,
    # --- logging / evaluation / checkpoint cadence ---
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=400,
    # --- best-checkpoint selection: lowest loss wins ---
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    # --- precision: fp16 stays off; bf16 is enabled only when CUDA is present
    #     and the device reports a compute capability major version >= 8 ---
    fp16=False,
    bf16=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8,
    # --- activation memory ---
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

# Collator that batches the dataset items for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

In [14]:
# Cell 7: Training with Error Handling (Updated)
def train_with_error_handling():
    """Run one full training pass and report whether it completed.

    Builds a ``Trainer`` from the notebook-level ``model``, ``training_args``,
    ``train_dataset`` and ``val_dataset``, frees cached memory, then calls
    ``trainer.train()``. Any ``Exception`` raised by training is printed
    together with its traceback instead of propagating.

    Returns:
        bool: True when ``trainer.train()`` returned normally, False when it
        raised an ``Exception``.
    """
    # The collator is built here so the function needs no collator from another cell.
    seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=seq2seq_collator,
    )

    # Release whatever memory can be released before the training loop starts.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    try:
        print("Starting training...")
        trainer.train()
    except Exception as exc:
        import traceback

        print(f"Training error: {exc!s}")
        # Full stack trace, so the failing call is visible in the cell output.
        print(traceback.format_exc())
        return False
    else:
        return True


# Kick off training; `success` records whether it ran to completion.
success = train_with_error_handling()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Starting training...


  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
200,0.0,
400,0.0,
600,0.0,
800,0.0,


  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


In [None]:
# Cell 8: Improved Test Function
def translate_banglish(text, max_length=64, num_beams=5):
    """Transliterate one Banglish string with the notebook-level model.

    The input gets the same ``"transliterate: "`` prefix and ``str()``
    coercion that the training dataset applies, is tokenized, moved to
    ``device``, and decoded with beam search.

    Args:
        text: Source text; coerced with ``str()``.
        max_length: Truncation length for the input and maximum length of the
            generated sequence. The default of 64 equals the MAX_LENGTH value
            set in the constants cell.
        num_beams: Beam width used for generation.

    Returns:
        str: The decoded output with special tokens removed and surrounding
        whitespace stripped.
    """
    # Dropout must be off for inference. After Trainer.train() the model can be
    # left in training mode, which would make this "deterministic" beam search
    # produce different outputs from call to call.
    model.eval()

    # Prepare input exactly like the training dataset does
    inputs = tokenizer("transliterate: " + str(text),
                      return_tensors="pt",
                      padding=True,
                      truncation=True,
                      max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Beam search, no sampling
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        length_penalty=1.0,
        early_stopping=True,
        no_repeat_ngram_size=2,
        do_sample=False,  # Deterministic generation
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id
    )

    # Only one sequence is requested, so decode the first (and only) result
    translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated.strip()

# Sample romanized inputs used as a quick smoke test of the model
test_sentences = [
    "ami tomake bhalobashi",
    "kemon acho",
    "bangla bhasha amader praner bhasha",
    "amar naam",
    "tumi kothay thako",
]

print("\nTest Translations:")
for sentence in test_sentences:
    bengali_output = translate_banglish(sentence)
    # One record per sentence: input, model output, then a divider line
    print(
        f"Banglish: {sentence}",
        f"Bengali: {bengali_output}",
        "-" * 40,
        sep="\n",
    )