# 🌍 Ibani-English NLLB Translation Model Training

This notebook fine-tunes Meta's NLLB-200 model for Ibani ↔ English translation.

**Steps:**
1. Setup environment and install dependencies
2. Upload training data
3. Fine-tune NLLB-200 with LoRA
4. Test the model
5. Download the trained model

**Runtime:** GPU (T4 or better recommended)

## 1️⃣ Setup Environment

In [None]:
# Check GPU availability by running the NVIDIA status tool in the runtime's
# shell. If the command is not found or lists no device, switch the Colab
# runtime type to GPU before continuing (training below enables fp16).
!nvidia-smi

In [None]:
# Install dependencies quietly (-q). Versions are not pinned, so the notebook
# runs against whatever releases pip resolves at install time.
# NOTE(review): bitsandbytes and sacrebleu are installed but not referenced by
# any cell visible in this notebook — confirm they are still needed.
!pip install -q torch transformers datasets accelerate peft bitsandbytes sentencepiece sacrebleu

In [None]:
# Import libraries
import json
import torch
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
from google.colab import files
import zipfile

# Report the runtime up front so a CPU-only session is obvious before any
# model is downloaded.
cuda_ready = torch.cuda.is_available()

print(f"‚úì PyTorch version: {torch.__version__}")
print(f"‚úì CUDA available: {cuda_ready}")
if cuda_ready:
    # Device 0 is the only GPU a standard Colab runtime exposes.
    print(f"‚úì GPU: {torch.cuda.get_device_name(0)}")

## 2️⃣ Upload Training Data

Upload your `ibani_eng.json` file. It must contain a JSON list of objects, each with an `ibani` and an `english` text field.

In [None]:
# Upload training data through the Colab file picker.
print("üì§ Please upload your ibani_eng.json file:")
uploaded = files.upload()

# `uploaded` is indexed by filename below; if nothing came back (e.g. the
# dialog was dismissed) fail with an actionable message instead of an
# IndexError from indexing an empty list.
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and select ibani_eng.json."
    )

# Use the first uploaded file as the training data; any additional files
# selected in the same dialog are ignored.
data_file = next(iter(uploaded))
print(f"\n‚úì Uploaded: {data_file}")

In [None]:
# Parse the uploaded JSON file and preview the first three sentence pairs.
with open(data_file, encoding='utf-8') as data_handle:
    training_data = json.load(data_handle)

print(f"‚úì Loaded {len(training_data)} training examples\n")
print("Sample data:")
for position, pair in enumerate(training_data[:3], start=1):
    # Each record is expected to expose 'ibani' and 'english' keys.
    print(f"\n{position}. Ibani: {pair['ibani']}")
    print(f"   English: {pair['english']}")

## 3️⃣ Prepare Dataset

In [None]:
# Base checkpoint to fine-tune and the directory that receives checkpoints,
# logs and the final saved model.
MODEL_NAME = "facebook/nllb-200-distilled-600M"
OUTPUT_DIR = "ibani-nllb-model"

# Training hyperparameters consumed by the training-arguments cell.
NUM_EPOCHS = 10
BATCH_SIZE = 8
LEARNING_RATE = 2e-4
# Token cap applied to both source and target sentences during tokenization.
MAX_LENGTH = 128

# Echo the settings so they are captured in the notebook output.
print("üìã Training Configuration:")
for setting_label, setting_value in (
    ("Base Model", MODEL_NAME),
    ("Epochs", NUM_EPOCHS),
    ("Batch Size", BATCH_SIZE),
    ("Learning Rate", LEARNING_RATE),
):
    print(f"  {setting_label}: {setting_value}")

In [None]:
# Create bidirectional dataset: every sentence pair contributes two rows that
# stay adjacent, Ibani -> English first and English -> Ibani second.
examples = [
    {'source': pair[source_key], 'target': pair[target_key]}
    for pair in training_data
    for source_key, target_key in (('ibani', 'english'), ('english', 'ibani'))
]
print(f"‚úì Created {len(examples)} bidirectional examples")

# Wrap the list of dicts in a Hugging Face Dataset for tokenization.
dataset = Dataset.from_list(examples)
print(f"‚úì Dataset created with {len(dataset)} examples")

## 4️⃣ Load Model and Tokenizer

In [None]:
# Load tokenizer for the base checkpoint named in MODEL_NAME.
# NOTE(review): no source/target language is passed or set here, so every
# sentence (Ibani or English, source or target) is tokenized with the
# tokenizer's default language settings — confirm this is intended for a
# bidirectional model.
print(f"üì• Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("‚úì Tokenizer loaded")

In [None]:
# Load model: fetch the base seq2seq checkpoint with half-precision weights
# and let the library decide device placement.
print(f"üì• Loading model: {MODEL_NAME}")
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    # NOTE(review): weights are loaded as float16 and training later enables
    # fp16=True as well; mixed-precision training generally expects trainable
    # parameters in float32 — confirm the installed PEFT version upcasts the
    # LoRA adapter weights, otherwise gradient unscaling may fail.
    torch_dtype=torch.float16,
    device_map="auto"
)
print("‚úì Model loaded")
# Parameter count of the base model, before LoRA adapters are attached.
print(f"‚úì Model parameters: {model.num_parameters():,}")

## 5️⃣ Setup LoRA for Efficient Training

In [None]:
# Configure LoRA: adapters are attached to the modules whose names match the
# entries below (projection layers first, then the fc1/fc2 layers).
projection_modules = ["q_proj", "v_proj", "k_proj", "out_proj"]
dense_modules = ["fc1", "fc2"]

lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,  # LoRA rank
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=projection_modules + dense_modules,
)

# Wrap the base model with the adapters and report how many parameters
# remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## 6️⃣ Tokenize Dataset

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with ``'source'`` and ``'target'`` entries,
            each a list of strings (the form ``Dataset.map(batched=True)``
            passes in).

    Returns:
        The tokenizer output for the source sentences with an added
        ``'labels'`` entry holding the target token ids.

    Sequences are truncated to ``MAX_LENGTH`` but intentionally left unpadded.
    Padding labels here would fill them with the tokenizer's real pad token
    id, which the loss does not ignore, so the model would be trained to
    predict padding. The ``DataCollatorForSeq2Seq`` used for training pads
    each batch to its longest sequence and fills label padding with -100,
    which the loss skips.
    """
    # NOTE(review): both sides are tokenized with the tokenizer's default
    # language settings; no per-direction language tag is applied — confirm
    # this is intended for a bidirectional model.
    model_inputs = tokenizer(
        examples['source'],
        max_length=MAX_LENGTH,
        truncation=True,
    )

    labels = tokenizer(
        examples['target'],
        max_length=MAX_LENGTH,
        truncation=True,
    )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize dataset in batches, dropping the raw text columns so only the
# tokenizer's outputs and the labels remain.
print("üîÑ Tokenizing dataset...")
raw_text_columns = dataset.column_names
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_text_columns,
)
print("‚úì Dataset tokenized")

In [None]:
# Split into train/eval at the sentence-pair level.
#
# The dataset was built by appending both translation directions of each
# sentence pair back to back, so rows 2*i and 2*i + 1 come from the same pair.
# Splitting individual rows would let one direction of a pair land in train
# and its mirror image in eval, leaking eval sentences into training and
# making eval_loss (used to pick the best checkpoint) overly optimistic.
if len(tokenized_dataset) % 2 != 0:
    raise ValueError(
        "Expected an even number of rows (two directions per sentence pair), "
        f"got {len(tokenized_dataset)}."
    )

num_pairs = len(tokenized_dataset) // 2
pair_split = Dataset.from_dict({'pair_id': list(range(num_pairs))}).train_test_split(
    test_size=0.1, seed=42
)


def _rows_for_pairs(pair_ids):
    """Return the row indices (both directions) belonging to the given pair ids."""
    return [row for pair_id in pair_ids for row in (2 * pair_id, 2 * pair_id + 1)]


# Keep the 'train'/'test' lookup shape the original split exposed.
split_dataset = {
    'train': tokenized_dataset.select(_rows_for_pairs(pair_split['train']['pair_id'])),
    'test': tokenized_dataset.select(_rows_for_pairs(pair_split['test']['pair_id'])),
}
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

print(f"‚úì Train examples: {len(train_dataset)}")
print(f"‚úì Eval examples: {len(eval_dataset)}")

## 7️⃣ Configure Training

In [None]:
# Training arguments: all tunable values come from the configuration cell;
# the remaining settings control logging, checkpointing and precision.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=10,
    # Evaluate and checkpoint on the same schedule so that the best
    # checkpoint (lowest eval_loss) can be reloaded when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Keep at most three checkpoints on disk.
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # NOTE(review): no compute_metrics function is passed to the trainer in
    # this notebook, so generated predictions are not scored — confirm
    # predict_with_generate is still wanted.
    predict_with_generate=True,
    # Mixed-precision training; requires a CUDA GPU runtime.
    fp16=True,
    push_to_hub=False,
    # Empty list: no experiment-tracking integrations are requested.
    report_to=[],
)

# Data collator: batches tokenized examples for the trainer. It receives the
# model as well as the tokenizer; padding=True asks it to pad each batch.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

print("‚úì Training configuration ready")

## 8️⃣ Train the Model 🚀

In [None]:
# Initialize trainer
import inspect

# Newer transformers releases accept the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword, while older releases only know
# `tokenizer`. Dependencies are installed unpinned, so pick whichever keyword
# the installed Seq2SeqTrainer declares instead of hard-coding one.
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)

print("‚úì Trainer initialized")

In [None]:
# Start training. The banner lines make the start and end of the trainer's
# own progress output easy to spot in the cell output.
banner = "=" * 60

print("\n" + banner)
print("üèãÔ∏è Starting Training...")
print(banner + "\n")

trainer.train()

print("\n" + banner)
print("‚úÖ Training Complete!")
print(banner)

## 9️⃣ Save the Model

In [None]:
# Save model and tokenizer side by side in OUTPUT_DIR.
print(f"üíæ Saving model to {OUTPUT_DIR}...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Save training info: a small JSON record of the run settings and data sizes,
# stored next to the model for later reference.
training_info = dict(
    base_model=MODEL_NAME,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    num_training_examples=len(training_data),
    num_bidirectional_examples=len(examples),
    use_lora=True,
)

with open(f"{OUTPUT_DIR}/training_info.json", 'w') as info_handle:
    info_handle.write(json.dumps(training_info, indent=2))

print("‚úì Model saved!")

## 🔟 Test the Model

In [None]:
# Test translation function
def translate(text, max_length=200, num_beams=5):
    """Translate text using the trained model.

    Args:
        text: Sentence to translate. A list of sentences is accepted by the
            tokenizer, but only the first generated sequence is decoded and
            returned.
        max_length: Upper bound on the generated sequence length, in tokens.
        num_beams: Beam width used for beam search.

    Returns:
        The decoded translation with special tokens removed.
    """
    # Make sure dropout (including LoRA dropout) is disabled for generation,
    # whatever mode the last training/evaluation step left the model in.
    model.eval()

    # NOTE(review): no source/target language tag is set on the tokenizer or
    # forced at generation time, so the output language depends on what the
    # fine-tuned model infers from the input — confirm this is intended.
    inputs = tokenizer(text, return_tensors="pt", padding=True).to(model.device)

    # Inference only: skip gradient tracking to save memory.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=3,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with some examples: run a few English phrases through translate() and
# print each input next to the model's output.
print("üß™ Testing translations:\n")
print("="*60)

test_examples = [
    "Hello, how are you?",
    "My name is William",
    "Thank you",
    "Good morning",
]

divider = "-" * 60
for english_text in test_examples:
    ibani_text = translate(english_text)
    print(f"\nEnglish: {english_text}")
    print(f"Ibani:   {ibani_text}")
    print(divider)

## 📦 Download the Model

In [None]:
# Create a zip file of the model
print("üì¶ Creating model archive...")

zip_filename = "ibani_nllb_model.zip"

!zip -r {zip_filename} {OUTPUT_DIR}

print(f"\n‚úì Model archived to {zip_filename}")
print(f"‚úì Size: {Path(zip_filename).stat().st_size / (1024*1024):.1f} MB")

In [None]:
# Download the model archive created in the previous cell.
# `files` is the google.colab helper imported in the setup cell, so this cell
# only works inside a Colab runtime.
print("üì• Downloading model...")
files.download(zip_filename)
print("\n‚úÖ Download started! Check your browser's download folder.")

## 🎉 Next Steps

1. **Download** the `ibani_nllb_model.zip` file
2. **Extract** it to your local project's `models/` directory
3. **Run** the FastAPI server: `python app.py`
4. **Test** the API at `http://localhost:8080/docs`

### Optional: Push to Hugging Face Hub

To share your model:

```python
from huggingface_hub import login

# Login to Hugging Face
login()

# Push model
model.push_to_hub("your-username/ibani-nllb-translator")
tokenizer.push_to_hub("your-username/ibani-nllb-translator")
```