In [28]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset

def generate_burmese_text(
    dataset,
    model,
    tokenizer,
    device="mps",
    batch_size=16,
    max_length=256,
    input_column="burmese",
    output_column="generated",
    seed=None,
    generation_kwargs=None,
):
    """
    Generate text for every prompt in a dataset column using a Seq2Seq model.

    The prompts in ``dataset[input_column]`` are tokenized in batches, passed
    to ``model.generate`` and decoded; the decoded strings are returned as a
    new column named ``output_column``.

    Args:
        dataset: Object exposing ``map(fn, batched=True, batch_size=...)``
            whose batches are dicts of column name -> list of values.
        model: Model exposing ``eval()`` and ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        device: Device the tokenized inputs are moved to. Pass ``None`` to use
            ``model.device`` so inputs always follow the model.
        batch_size: Number of prompts handed to the model per call.
        max_length: Used both as the tokenizer truncation length and as the
            default ``max_length`` passed to ``generate``.
        input_column: Name of the column holding the prompts.
        output_column: Name of the column the generated text is stored under.
        seed: If given, ``torch.manual_seed(seed)`` is called once before
            generation so that sampling is repeatable. Note that this resets
            PyTorch's global RNG state.
        generation_kwargs: Optional dict merged over the default decoding
            arguments (e.g. ``{"do_sample": False}`` for deterministic beams).

    Returns:
        Whatever ``dataset.map`` returns, i.e. the dataset extended with
        ``output_column``.

    Raises:
        KeyError: If the dataset reports its column names as a list and
            ``input_column`` is not among them.
    """
    # Fail early with a readable message instead of a bare KeyError raised
    # from inside ``dataset.map``. Only list-like ``column_names`` are checked;
    # containers that report them differently are left to ``map`` itself.
    column_names = getattr(dataset, "column_names", None)
    if isinstance(column_names, (list, tuple)) and input_column not in column_names:
        raise KeyError(
            f"Column '{input_column}' not found in dataset; "
            f"available columns: {list(column_names)}"
        )

    if device is None:
        device = model.device

    # Default decoding configuration. ``do_sample=True`` combined with
    # ``num_beams > 1`` is documented by transformers as beam-search
    # multinomial sampling, so results vary between runs unless ``seed`` is set.
    generate_args = {
        "max_length": max_length,
        "do_sample": True,
        "top_k": 50,  # sample only among the 50 most likely tokens
        "top_p": 0.95,  # nucleus sampling threshold
        "temperature": 0.6,  # < 1 sharpens the distribution
        "repetition_penalty": 1.8,  # discourages repeating tokens
        "num_beams": 5,
    }
    if generation_kwargs:
        generate_args.update(generation_kwargs)

    if seed is not None:
        torch.manual_seed(seed)

    model.eval()  # Set model to evaluation mode

    def predict_fn(batch):
        # Tokenize the prompts of this batch and move them to the target device.
        inputs = tokenizer(
            batch[input_column],
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        ).to(device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            output_tokens = model.generate(**inputs, **generate_args)

        generated_texts = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)

        return {output_column: generated_texts}

    # Apply batched processing
    return dataset.map(predict_fn, batched=True, batch_size=batch_size)

In [29]:
# Choose the device once and reuse it for both the model and the inputs.
# Falls back from Apple MPS to CUDA to CPU so the cell also runs on machines
# without an MPS backend instead of failing at `.to("mps")`.
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Load trained translation model
model_path = "model-variants/models/mT5_BPE"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, legacy=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(DEVICE)

# Build a small in-memory test dataset (Hugging Face Dataset format).
# Only the "burmese" column is passed to the model here; the "english" column
# is not read in this cell (presumably reference text for manual comparison).
test_dataset = Dataset.from_dict({
    "burmese": ["အထက် အီဂျစ်မှာ နူဘီးယား ကြမ်းပြင်ပြပွဲကို ကြည့်ဖို့ မထိုက်ပါဘူး။", "သူ့အလုပ်စွမ်းအားဟာ ကျယ်ပြန့်မှုနဲ့ အသေးစိတ်နည်းလမ်းရဲ့ ရလဒ်ပါ။"],
    "english": ["it's not worth seeing the nubian floor exhibition in egypt.", "his workforce is the result of the method of expansion and detail."]
})

# Generate Burmese text
predicted_dataset = generate_burmese_text(test_dataset, model, tokenizer, device=DEVICE, batch_size=16)

# Print predictions
for example in predicted_dataset:
    print(f"🔹 Prompt: {example['burmese']}")
    print(f"✅ Generated: {example['generated']}\n")


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

🔹 Prompt: အထက် အီဂျစ်မှာ နူဘီးယား ကြမ်းပြင်ပြပွဲကို ကြည့်ဖို့ မထိုက်ပါဘူး။
✅ Generated: အီဂျစ်မှာ နူဘီးယား ကြမ်းပြင် ပွဲကို ကြည့်ဖို့ မသေချာဘူး။

🔹 Prompt: သူ့အလုပ်စွမ်းအားဟာ ကျယ်ပြန့်မှုနဲ့ အသေးစိတ်နည်းလမ်းရဲ့ ရလဒ်ပါ။
✅ Generated: သူ့အလုပ် စွမ်းဆောင်ရည် ဟာ ကျယ်ပြန့်သော အရည်အသွေး နဲ့ အလွန် ကြီးမားတဲ့ ရလဒ်ပါ။

