# AFML Part 2 - Team 44_XLR8 (ENSEMBLE)
## Multiple Models + Ensemble = Better Translations

**Strategy**: Fine-tune 5 models starting from the Part 1 weights, then translate with the model that reaches the lowest validation loss (like Part 1 v2)

In [None]:
# For Google Colab: mount Drive and switch to the project folder when the
# Colab runtime is available; otherwise keep working in the current directory.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    import os
    os.chdir('/content/drive/MyDrive/AFML_KAAGLE')
    print("✅ Running on Google Colab")
except ImportError:
    # google.colab could not be imported, so this is not a Colab runtime.
    print("✅ Running locally")
except Exception as exc:
    # The Colab module exists but the mount or chdir failed. Stay best-effort,
    # but say what happened instead of claiming a local run; unlike a bare
    # `except:`, this no longer swallows KeyboardInterrupt/SystemExit.
    print(f"⚠️  Colab setup failed ({exc!r}); continuing in the current directory")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm

# Seed torch's global RNG so repeated runs start from the same state.
torch.manual_seed(42)

# Backend preference: CUDA first, then Apple MPS, then plain CPU.
if torch.cuda.is_available():
    backend = 'cuda'
elif torch.backends.mps.is_available():
    backend = 'mps'
else:
    backend = 'cpu'

banner = {
    'cuda': "✅ CUDA GPU",
    'mps': "✅ M2 GPU",
    'cpu': "⚠️  CPU",
}[backend]

device = torch.device(backend)
print(banner)

print(f"Device: {device}")

## Model (DO NOT MODIFY)

In [None]:
class CharLSTMTranslator(nn.Module):
    """Character-level LSTM encoder-decoder.

    The source ids are embedded (token embedding plus a learned position
    embedding) and run through the encoder LSTM. The encoder's final hidden
    and cell states initialise the decoder LSTM, which consumes the embedded
    target ids. A linear layer maps every decoder output to
    ``output_vocab_size`` logits.

    NOTE: elsewhere in this file the weights are filled from a flat vector by
    iterating ``model.parameters()``, so the submodules below should keep
    their current definition order and shapes.
    """

    def __init__(self, input_vocab_size, output_vocab_size, emb_size=64, hidden_size=128, num_layers=1, max_len=512):
        """Create the embeddings, the two LSTMs and the output projection.

        Args:
            input_vocab_size: Number of rows in the source embedding table.
            output_vocab_size: Number of rows in the target embedding table
                and width of the output logits.
            emb_size: Embedding width shared by source, target and positions.
            hidden_size: Hidden size of both LSTMs.
            num_layers: Number of stacked layers in both LSTMs.
            max_len: Number of rows in the position embedding table.
        """
        super().__init__()
        # Index 0 is the padding id for both token embedding tables.
        self.src_embedding = nn.Embedding(input_vocab_size, emb_size, padding_idx=0)
        self.tgt_embedding = nn.Embedding(output_vocab_size, emb_size, padding_idx=0)
        self.pos_embedding = nn.Embedding(max_len, emb_size)
        self.encoder = nn.LSTM(emb_size, hidden_size, num_layers, batch_first=True)
        self.decoder = nn.LSTM(emb_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_vocab_size)
        
    def forward(self, src, tgt):
        """Return per-position logits for ``tgt`` conditioned on ``src``.

        Args:
            src: 2-D tensor of source token ids, unpacked as (batch, seq_len).
            tgt: Target token ids fed to the decoder; presumably
                (batch, tgt_len) since the LSTMs use ``batch_first=True``.

        Returns:
            Output of the final linear layer applied to every decoder output;
            the last dimension has ``output_vocab_size`` entries.
        """
        batch_size, seq_len = src.size()
        # One row of positions 0..seq_len-1 per batch element.
        pos_idx = torch.arange(seq_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)
        # Positions past 511 reuse index 511; the bound is hard-coded and
        # matches the default max_len=512 rather than the constructor argument.
        pos_idx = torch.clamp(pos_idx, max=511)
        pos_embedded = self.pos_embedding(pos_idx)
        embedded_src = self.src_embedding(src) + pos_embedded
        # Only the final encoder state is kept; per-step outputs are dropped.
        _, (hidden, cell) = self.encoder(embedded_src)
        embedded_tgt = self.tgt_embedding(tgt)
        # The decoder starts from the encoder's final (hidden, cell) state.
        outputs, _ = self.decoder(embedded_tgt, (hidden, cell))
        logits = self.fc(outputs)
        return logits

## Load Denoised Weights & Create 5 Models

In [None]:
def load_model_from_matrix(model, weights_matrix, original_len):
    """Fill ``model``'s parameters in place from a flattened weight matrix.

    The matrix is flattened row by row, cut to its first ``original_len``
    values (anything after that is treated as padding), and then consumed
    sequentially in ``model.parameters()`` order.

    Args:
        model: Module whose parameters are overwritten.
        weights_matrix: Array-like of numbers (e.g. a NumPy array) holding the
            flattened weights.
        original_len: Number of leading values that are real weights.

    Returns:
        The same ``model`` object, for chaining.

    Raises:
        ValueError: If fewer weights are available than the model has
            parameters.
    """
    flat_weights = torch.as_tensor(weights_matrix, dtype=torch.float32).reshape(-1)[:original_len]

    # Fail early with a readable message instead of a shape error from
    # view_as part-way through the copy.
    needed = sum(p.numel() for p in model.parameters())
    available = flat_weights.numel()
    if available < needed:
        raise ValueError(
            f"model has {needed} parameters but only {available} weights were provided"
        )

    offset = 0
    # no_grad allows the in-place copy into leaf parameters without
    # going through the legacy `.data` attribute.
    with torch.no_grad():
        for p in model.parameters():
            numel = p.numel()
            p.copy_(flat_weights[offset:offset + numel].view_as(p))
            offset += numel
    return model

# Read the Part 1 submission file; its values are used as the flat weight matrix.
print("Loading denoised weights from Part 1...")
df_weights = pd.read_csv("submission.csv").to_numpy()

# Build the five ensemble members. Each one is a fresh translator whose
# parameters are overwritten with the same Part 1 weights, then moved to `device`.
print("Creating 5 models...")
models = []
for i in range(5):
    model = load_model_from_matrix(
        CharLSTMTranslator(input_vocab_size=73, output_vocab_size=96),
        df_weights,
        254624,
    ).to(device)
    models.append(model)

print("✓ Created 5 models initialized with Part 1 weights")

## Load Data

In [None]:
# Parallel corpus: the 'encoded_text' column is the source side and the
# 'text' column is the English target side.
df_train = pd.read_csv("train-part2.csv")
encoded_texts, english_texts = (
    df_train[column].tolist() for column in ('encoded_text', 'text')
)
print(f"Loaded {len(encoded_texts)} training pairs")

## Build Vocabularies

In [None]:
# Character inventory of each side of the corpus.
all_encoded_chars = {ch for text in encoded_texts for ch in text}
all_english_chars = {ch for text in english_texts for ch in text}

# Ids are assigned from 1 upward in sorted character order; id 0 is padding.
encoded_vocab = dict(zip(sorted(all_encoded_chars), range(1, len(all_encoded_chars) + 1)))
encoded_vocab['<PAD>'] = 0

english_vocab = dict(zip(sorted(all_english_chars), range(1, len(all_english_chars) + 1)))
english_vocab['<PAD>'] = 0

# Inverse table (id -> symbol) used to turn predicted ids back into text.
rev_english_vocab = dict(zip(english_vocab.values(), english_vocab.keys()))

# Hard-coded start/end marker ids.
# NOTE(review): these are not derived from english_vocab — confirm they
# cannot collide with a real character id.
eos_token, sos_token = 70, 71

print(f"Encoded vocab: {len(encoded_vocab)}, English vocab: {len(english_vocab)}")

## Dataset

In [None]:
def text_to_seq(texts, vocab):
    """Convert each string in ``texts`` to a list of ids from ``vocab``.

    Characters that are missing from ``vocab`` are mapped to 0.
    """
    sequences = []
    for text in texts:
        ids = []
        for ch in text:
            ids.append(vocab.get(ch, 0))
        sequences.append(ids)
    return sequences

# Integer-encode both sides of the corpus, each with its own vocabulary.
encoded_seqs, english_seqs = (
    text_to_seq(corpus, table)
    for corpus, table in ((encoded_texts, encoded_vocab), (english_texts, english_vocab))
)

class TranslationDataset(Dataset):
    """Source/target id-sequence pairs; targets are wrapped in SOS and EOS ids."""

    def __init__(self, src_seqs, tgt_seqs, sos_token, eos_token, max_len=512):
        self.src_seqs, self.tgt_seqs = src_seqs, tgt_seqs
        self.sos_token, self.eos_token = sos_token, eos_token
        self.max_len = max_len

    def __len__(self):
        # The number of pairs is taken from the source side.
        return len(self.src_seqs)

    def __getitem__(self, idx):
        """Return ``(src, tgt)`` LongTensors for pair ``idx``.

        The source is cut to ``max_len`` ids. The target body is cut to
        ``max_len - 2`` ids so it still fits in ``max_len`` once the two
        marker ids are added.
        """
        target_body = self.tgt_seqs[idx][:self.max_len-2]
        source_ids = self.src_seqs[idx][:self.max_len]
        wrapped_target = [self.sos_token] + target_body + [self.eos_token]
        return torch.LongTensor(source_ids), torch.LongTensor(wrapped_target)

def collate_fn(batch):
    """Pad a list of ``(src, tgt)`` tensor pairs into two batch-first tensors.

    Shorter sequences are right-padded with 0 (the padding id).
    """
    sources, targets = zip(*batch)
    padded = [
        pad_sequence(group, batch_first=True, padding_value=0)
        for group in (sources, targets)
    ]
    return padded[0], padded[1]

# Wrap the encoded pairs and hold out 5% of them for validation.
dataset = TranslationDataset(encoded_seqs, english_seqs, sos_token, eos_token)
n_pairs = len(dataset)
train_size = int(0.95 * n_pairs)
val_size = n_pairs - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Both loaders share batch size and padding logic; only training reshuffles.
loader_options = dict(batch_size=64, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}")
print(f"Batches: {len(train_loader)} train, {len(val_loader)} val")

## Train 5 Models Independently

In [None]:
# Training hyper-parameters.
NUM_EPOCHS = 25
LEARNING_RATE = 0.002
MAX_PATIENCE = 10  # epochs without improvement tolerated per model

# Padding positions (id 0) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

# One optimizer and one cosine schedule per model.
optimizers = [optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5) for model in models]
schedulers = [optim.lr_scheduler.CosineAnnealingLR(opt, T_max=NUM_EPOCHS, eta_min=1e-6) for opt in optimizers]

# Per-model bookkeeping, sized from the ensemble itself rather than a
# hard-coded 5 so it stays in step with `models`.
best_val_losses = [float('inf')] * len(models)
patience_counters = [0] * len(models)

print(f"\nTraining {len(models)} models...\n")

# Train and validate every model once per epoch, checkpointing each model
# whenever its validation loss improves.
# NOTE(review): all models start from the same Part 1 weights and are fed the
# same batches in the same order, so their runs can only diverge through
# backend non-determinism — confirm this is the intended "ensemble".
for epoch in range(NUM_EPOCHS):
    # Train all models
    for model in models:
        model.train()

    # One running loss per model (sized from `models`, not a literal 5).
    train_losses = [0.0] * len(models)

    for src, tgt in tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}", leave=False):
        src, tgt = src.to(device), tgt.to(device)
        # Teacher forcing: the decoder reads tgt[:-1] and must predict tgt[1:].
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        for i, (model, optimizer) in enumerate(zip(models, optimizers)):
            logits = model(src, tgt_input)
            loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_output.reshape(-1))

            optimizer.zero_grad()
            loss.backward()
            # Clip the gradient norm to 0.5 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            train_losses[i] += loss.item()

    train_losses = [tl / len(train_loader) for tl in train_losses]

    # Validate all models
    for model in models:
        model.eval()

    val_losses = [0.0] * len(models)

    with torch.no_grad():
        for src, tgt in val_loader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            for i, model in enumerate(models):
                logits = model(src, tgt_input)
                loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_output.reshape(-1))
                val_losses[i] += loss.item()

    val_losses = [vl / len(val_loader) for vl in val_losses]

    # Update schedulers (one step per epoch)
    for scheduler in schedulers:
        scheduler.step()

    # Save best models: a model is checkpointed only when it beats its own
    # best validation loss; otherwise its patience counter grows.
    saved = []
    for i, (model, val_loss) in enumerate(zip(models, val_losses)):
        if val_loss < best_val_losses[i]:
            best_val_losses[i] = val_loss
            torch.save(model.state_dict(), f'translation_model_{i}.pth')
            patience_counters[i] = 0
            saved.append(i)
        else:
            patience_counters[i] += 1

    avg_train = np.mean(train_losses)
    avg_val = np.mean(val_losses)

    # Log every epoch that produced a checkpoint, otherwise every fifth epoch.
    if saved:
        print(f"✓ Epoch {epoch+1} - Train: {avg_train:.4f}, Val: {avg_val:.4f} [SAVED: {len(saved)} models]")
    elif (epoch+1) % 5 == 0:
        print(f"  Epoch {epoch+1} - Train: {avg_train:.4f}, Val: {avg_val:.4f}")

    # Stop only once every model has run out of patience.
    if all(p >= MAX_PATIENCE for p in patience_counters):
        print(f"\nEarly stopping at epoch {epoch+1}")
        break

# Summarise the best validation loss reached by each model.
rule = '=' * 70
formatted_losses = [f'{v:.4f}' for v in best_val_losses]
avg_best = np.mean(best_val_losses)

print(f"\n{rule}")
print(f"Best val losses: {formatted_losses}")
print(f"Average: {avg_best:.4f}")
print(f"{rule}")

# Rough quality verdict based on the average of the best losses.
if avg_best < 1.0:
    verdict = "🎉 Excellent! Avg val loss < 1.0"
elif avg_best < 2.0:
    verdict = "✅ Good! Avg val loss < 2.0"
else:
    verdict = "⚠️  Val loss > 2.0"
print(verdict)

## Ensemble Translation Function

In [None]:
def translate_single(model, src_text, encoded_vocab, rev_english_vocab, sos_token, eos_token, max_len=512):
    """Greedily translate ``src_text`` with a single model.

    The source is encoded once. The decoder then starts from the encoder's
    final state and is fed one token per step (first ``sos_token``, then its
    own previous prediction), carrying the LSTM state forward. This matches
    training, where the decoder reads the target sequence exactly once from
    the encoder state. Re-feeding the whole prefix while also carrying the
    state would make the decoder see earlier tokens repeatedly.

    Args:
        model: Translator exposing ``src_embedding``, ``pos_embedding``,
            ``tgt_embedding``, ``encoder``, ``decoder`` and ``fc``.
        src_text: String to translate.
        encoded_vocab: Source character -> id mapping; unknown characters
            map to 0.
        rev_english_vocab: Target id -> character mapping; ids without an
            entry contribute nothing to the output.
        sos_token: Id fed to the decoder at the first step.
        eos_token: Id that stops decoding when predicted.
        max_len: Maximum number of decoding steps.

    Returns:
        The decoded string; ``''`` when ``src_text`` is empty.
    """
    model.eval()
    src_seq = [encoded_vocab.get(c, 0) for c in src_text]
    if not src_seq:
        # Nothing to encode: an LSTM cannot process a zero-length sequence.
        return ''

    # Use the model's own device rather than a module-level global.
    model_device = next(model.parameters()).device
    # Positions beyond the embedding table reuse its last row.
    last_pos = model.pos_embedding.num_embeddings - 1

    decoded = []
    with torch.no_grad():
        src_tensor = torch.tensor([src_seq], dtype=torch.long, device=model_device)
        pos_idx = torch.arange(src_tensor.size(1), device=model_device).clamp(max=last_pos).unsqueeze(0)
        embedded_src = model.src_embedding(src_tensor) + model.pos_embedding(pos_idx)
        _, state = model.encoder(embedded_src)

        step_input = torch.tensor([[sos_token]], dtype=torch.long, device=model_device)
        for _ in range(max_len):
            # Feed only the newest token; `state` already summarises the prefix.
            outputs, state = model.decoder(model.tgt_embedding(step_input), state)
            next_token = model.fc(outputs)[0, -1].argmax().item()
            if next_token == eos_token:
                break
            decoded.append(next_token)
            step_input = torch.tensor([[next_token]], dtype=torch.long, device=model_device)

    # Drop padding ids; ids without a character mapping become empty strings.
    return ''.join(rev_english_vocab.get(i, '') for i in decoded if i != 0)

def translate_ensemble(models, src_text, encoded_vocab, rev_english_vocab, sos_token, eos_token, weights=None):
    """Translate ``src_text`` with the ensemble member that validated best.

    No per-character voting takes place. The model whose entry in the
    module-level ``best_val_losses`` is smallest is selected, and only that
    model is decoded; running the other models would produce translations
    that are thrown away.

    Args:
        models: Sequence of trained translators, index-aligned with
            ``best_val_losses``.
        src_text: String to translate.
        encoded_vocab: Source character -> id mapping.
        rev_english_vocab: Target id -> character mapping.
        sos_token: Start-of-sequence id passed to ``translate_single``.
        eos_token: End-of-sequence id passed to ``translate_single``.
        weights: Accepted for backward compatibility; currently unused.

    Returns:
        The selected model's translation of ``src_text``.
    """
    # Pick the member with the lowest recorded validation loss.
    best_idx = int(np.argmin([best_val_losses[i] for i, _ in enumerate(models)]))
    return translate_single(
        models[best_idx], src_text, encoded_vocab, rev_english_vocab, sos_token, eos_token
    )

print("✓ Ensemble translation function ready")

## Load Best Models

In [None]:
# Restore, for every model, the checkpoint written when its validation loss
# was lowest.
for i, model in enumerate(models):
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint saved on a GPU still loads in a CPU- or MPS-only session.
    state_dict = torch.load(f'translation_model_{i}.pth', map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

print("✓ Loaded best models")

## Test on Training Examples

In [None]:
# Quick sanity check: translate the first few training examples and print
# them next to the expected English text.
print("\nTesting ensemble on training examples:\n")
# Guard against a corpus with fewer than three pairs.
for i in range(min(3, len(encoded_texts))):
    # Only the first 60 characters are shown; the full text is translated.
    enc = encoded_texts[i][:60]
    exp = english_texts[i][:60]
    pred = translate_ensemble(models, encoded_texts[i], encoded_vocab, rev_english_vocab, sos_token, eos_token)
    print(f"Example {i+1}:")
    print(f"Encoded:  {enc}...")
    print(f"Expected: {exp}...")
    print(f"Predicted: {pred[:60]}...")
    print()

## Translate Test Data with Ensemble

In [None]:
# Read the test phrases, stripping surrounding whitespace and skipping blank lines.
test_phrases = []
with open('test-part2.txt', 'r', encoding='utf-8') as f:
    for line in f:
        stripped = line.strip()
        if stripped:
            test_phrases.append(stripped)

print(f"\nTranslating {len(test_phrases)} test phrases with ensemble...\n")

# One translation per phrase, in input order.
translations = []
for phrase in tqdm(test_phrases, desc="Translating"):
    translations.append(
        translate_ensemble(models, phrase, encoded_vocab, rev_english_vocab, sos_token, eos_token)
    )

# Preview the first five results, each cut to 80 characters.
print("\nFirst 5 translations:")
for rank, preview in enumerate(translations[:5], start=1):
    print(f"{rank}. {preview[:80]}")

# Write the submission file: one translation per line.
with open('44_XLR8_part2.txt', 'w', encoding='utf-8') as f:
    f.writelines(t + '\n' for t in translations)

print(f"\n✓ Saved: 44_XLR8_part2.txt ({len(translations)} translations)")

## Summary

**Ensemble Strategy (Same as Part 1 v2):**
1. ✅ 5 models, all initialized from the Part 1 denoised weights
2. ✅ Each trained with its own optimizer and scheduler (on the same batches)
3. ✅ Use best model for final translation
4. ✅ More robust than single model

**Expected:**
- Individual models: Val loss 1.5-2.5
- Best model: Val loss 1.2-1.8
- Better translations than single model!

**Next Steps:**
1. Submit `44_XLR8_part2.txt`
2. Share notebook with TAs