# 🌍 Ibani-English Translation with ByT5

This notebook trains a ByT5 model for English ↔ Ibani translation.

## 📋 Before Running:
1. Upload `ibani_eng_training_data.json` to Google Drive
2. Enable GPU: Runtime → Change runtime type → GPU
3. Run all cells

## ⏱️ Expected Time:
- 30-60 minutes on GPU

---

In [None]:
# Mount Google Drive under /content/drive so later cells can read the training
# JSON from it and copy the fine-tuned model back to it.
# Colab-only: `google.colab` is not importable outside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install dependencies
# NOTE(review): versions are unpinned, so the transformers API that ends up
# installed may differ from the one the cells below were written against —
# consider pinning versions for reproducibility.
!pip install -q transformers datasets accelerate evaluate sacrebleu tensorboard sentencepiece

In [None]:
# Import libraries
import json
import os
from dataclasses import dataclass
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import evaluate
import numpy as np

# Environment report: confirm the imports succeeded and show the hardware in use.
print("‚úÖ All libraries imported!")
# torch.__version__ is the PyTorch release, not the Python interpreter version,
# so the label says PyTorch.
print(f"üêç PyTorch version: {torch.__version__}")
print(f"üíª GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Device 0 is the only GPU this notebook reports on.
    print(f"üéÆ GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# ⚙️ CONFIGURATION - Modify these if needed

@dataclass
class TrainingConfig:
    """Paths and hyperparameters for the fine-tuning run.

    Every field has a default, so ``TrainingConfig()`` works unchanged; pass
    keyword arguments to override individual settings.
    """

    # Model (change to byt5-base or byt5-large for better quality)
    model_name: str = "google/byt5-small"

    # Data path - UPDATE THIS to your Google Drive path
    # (JSON file opened by the data-loading cell)
    data_path: str = "/content/drive/MyDrive/ibani_eng_training_data.json"

    # Output directory handed to the trainer and used when saving the model
    output_dir: str = "/content/ibani-byt5-finetuned"

    # Training parameters (forwarded to Seq2SeqTrainingArguments)
    num_train_epochs: int = 10
    per_device_train_batch_size: int = 8
    per_device_eval_batch_size: int = 8
    learning_rate: float = 5e-5
    weight_decay: float = 0.01
    warmup_steps: int = 500

    # Sequence lengths: used as tokenizer truncation limits for the prompts
    # (source) and the labels (target) during preprocessing
    max_source_length: int = 256
    max_target_length: int = 256

    # Evaluation / checkpoint / logging cadence, in optimizer steps
    eval_steps: int = 500
    save_steps: int = 500
    logging_steps: int = 100
    # Fraction of the translation pairs held out as the validation split
    eval_split: float = 0.1

    # Other
    seed: int = 42
    # Evaluated once, when this class body runs: mixed precision is switched on
    # whenever a CUDA device is visible at that moment.
    # NOTE(review): T5-family checkpoints are widely reported to be numerically
    # unstable under fp16 — confirm the training loss does not turn into NaN.
    fp16: bool = torch.cuda.is_available()

# Single shared configuration instance; every later cell reads its settings
# from this `config` object.
config = TrainingConfig()
print("‚úÖ Configuration loaded")
print(f"üìä Model: {config.model_name}")
print(f"üìÅ Data: {config.data_path}")

In [None]:
# Load and prepare data
#
# The JSON file is read as a list of records, each expected to look like
#   {"translation": {"en": "...", "ibani": "..."}}
# Records where either side is missing, null, or blank are skipped.
print(f"üìä Loading data from {config.data_path}...")

with open(config.data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

english_texts = []
ibani_texts = []

for item in data:
    # `or` (rather than a .get default) also covers keys that are present but
    # hold an explicit JSON null, which would otherwise crash on .strip().
    translation = item.get('translation') or {}
    en_text = (translation.get('en') or '').strip()
    ibani_text = (translation.get('ibani') or '').strip()

    if en_text and ibani_text:
        english_texts.append(en_text)
        ibani_texts.append(ibani_text)

print(f"‚úÖ Loaded {len(english_texts):,} translation pairs")

# Fail here with a clear message instead of letting the split below raise a
# less obvious error about an empty dataset.
if not english_texts:
    raise ValueError(f"No usable translation pairs found in {config.data_path}")

# Create datasets
dataset_dict = {'english': english_texts, 'ibani': ibani_texts}
dataset = Dataset.from_dict(dataset_dict)
# Use the configured seed so the split follows the same seed as training.
split_dataset = dataset.train_test_split(test_size=config.eval_split, seed=config.seed)

train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

print(f"üìö Train: {len(train_dataset):,} | Validation: {len(eval_dataset):,}")

In [None]:
# Load model and tokenizer
# Both come from the same checkpoint name (config.model_name) so the
# tokenizer's vocabulary matches the model it feeds.
print(f"ü§ñ Loading {config.model_name}...")

tokenizer = AutoTokenizer.from_pretrained(config.model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(config.model_name)

print(f"‚úÖ Model loaded: {model.num_parameters():,} parameters")

In [None]:
# Preprocessing
def preprocess_function(examples):
    """Tokenize one batch of sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with parallel 'english' and 'ibani' columns.

    Returns:
        The tokenizer output for the prefixed English prompts, with the
        tokenized Ibani sentences attached under "labels". Nothing is padded
        here; sequences are only truncated to the configured lengths.
    """
    # Each source sentence gets the task prefix the model is trained on.
    prompts = []
    for sentence in examples['english']:
        prompts.append(f"translate English to Ibani: {sentence}")

    encoded = tokenizer(
        prompts,
        max_length=config.max_source_length,
        truncation=True,
        padding=False,
    )
    target_encoding = tokenizer(
        examples['ibani'],
        max_length=config.max_target_length,
        truncation=True,
        padding=False,
    )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

print("üîÑ Preprocessing datasets...")
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(preprocess_function, batched=True, remove_columns=eval_dataset.column_names)
print("‚úÖ Preprocessing complete")

In [None]:
# Setup training
# Preprocessing leaves sequences unpadded, so padding is done per batch by the
# collator (padding=True).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)
# Metric object used by compute_metrics; "sacrebleu" is loaded through `evaluate`.
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Compute corpus BLEU for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may arrive as a tuple, in which case its first
            element is used.

    Returns:
        ``{"bleu": <sacrebleu score>}``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore/padding marker and is not a decodable token id.
    # Labels always carry it; predictions can too once the trainer pads the
    # gathered batches, so both arrays are mapped back to the pad token.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # The metric takes a list of references per prediction, hence the nesting.
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Pick whichever keyword the installed version accepts so this cell runs on both.
import inspect

_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Evaluate and checkpoint on the same step cadence (both default to 500 in the
# config) so load_best_model_at_end can pick the checkpoint with the best BLEU.
training_args = Seq2SeqTrainingArguments(
    output_dir=config.output_dir,
    eval_steps=config.eval_steps,
    save_steps=config.save_steps,
    logging_steps=config.logging_steps,
    learning_rate=config.learning_rate,
    per_device_train_batch_size=config.per_device_train_batch_size,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    weight_decay=config.weight_decay,
    save_total_limit=3,
    num_train_epochs=config.num_train_epochs,
    predict_with_generate=True,
    # Without an explicit limit, evaluation-time generation falls back to the
    # model's default max length, which is far too short for a byte-level model
    # and would make BLEU score truncated outputs.
    generation_max_length=config.max_target_length,
    fp16=config.fp16,
    warmup_steps=config.warmup_steps,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    push_to_hub=False,
    report_to=["tensorboard"],
    seed=config.seed,
    **{_eval_strategy_key: "steps"},
)

# Wire the model, tokenized splits, collator and BLEU callback into one trainer.
# NOTE(review): newer transformers releases rename the `tokenizer=` argument to
# `processing_class=` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("‚úÖ Trainer initialized")

In [None]:
# 🚀 TRAIN THE MODEL
import math

print("üöÄ Starting training...")
print(f"‚è±Ô∏è  {config.num_train_epochs} epochs")
# Round up: the final, partially filled batch of each epoch is still a step.
# This estimate assumes a single device and no gradient accumulation.
steps_per_epoch = math.ceil(len(train_dataset) / config.per_device_train_batch_size)
print(f"üìä {steps_per_epoch * config.num_train_epochs} total steps")
print("\nThis will take 30-60 minutes on GPU...\n")

trainer.train()

print("\nüéâ Training complete!")

In [None]:
# Save model
# The model weights and the tokenizer files go into the same output directory.
print(f"üíæ Saving model to {config.output_dir}")
trainer.save_model(config.output_dir)
tokenizer.save_pretrained(config.output_dir)

# Final evaluation
# One more pass over the validation split; the trainer reports the BLEU score
# under the "eval_bleu" key.
metrics = trainer.evaluate()
final_bleu = metrics['eval_bleu']
print(f"\nüìä Final BLEU score: {final_bleu:.2f}")

# Keep the full metrics dict next to the saved model for later reference.
metrics_path = os.path.join(config.output_dir, "final_metrics.json")
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)

print("‚úÖ Model saved!")

In [None]:
# Test the model
def translate(text, source_lang="en", target_lang="ibani"):
    """Translate a single string with the fine-tuned model using beam search.

    Args:
        text: Sentence to translate.
        source_lang: ``"en"`` selects the English-to-Ibani prompt; any other
            value selects the Ibani-to-English prompt.
        target_lang: Accepted for call-site symmetry; the function does not
            read it.

    Returns:
        The decoded translation with special tokens removed and surrounding
        whitespace stripped.

    NOTE(review): preprocess_function in this notebook only builds
    "translate English to Ibani" training examples, so the Ibani-to-English
    prompt is presumably not something the model was fine-tuned on — verify
    before relying on that direction.
    """
    if source_lang == "en":
        prompt = f"translate English to Ibani: {text}"
    else:
        prompt = f"translate Ibani to English: {text}"

    # Reuse the configured lengths so inference truncates and generates with
    # the same limits as training, even if the config is edited.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=config.max_source_length,
        truncation=True,
    ).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=config.max_target_length,
            num_beams=4,
            early_stopping=True,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

print("üß™ Testing translations:\n")
test_sentences = [
    "Hello, how are you?",
    "Good morning",
    "Thank you very much"
]

for sentence in test_sentences:
    translation = translate(sentence)
    print(f"EN: {sentence}")
    print(f"IB: {translation}\n")

In [None]:
# Save to Google Drive
print("üì• Saving model to Google Drive...")
!cp -r /content/ibani-byt5-finetuned /content/drive/MyDrive/
print("‚úÖ Model saved to Google Drive: /MyDrive/ibani-byt5-finetuned/")
print("\nüéâ All done! Download the folder from Google Drive to use locally.")