# **Entrenamiento Español $\rightarrow$ Aymara**

In [1]:
!pip install -q sentencepiece
!pip install -q sacrebleu

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.amp import autocast, GradScaler
import sentencepiece as spm
import warnings
warnings.filterwarnings("ignore")
import math
import os
import sacrebleu
import pandas as pd
import random

In [3]:
# ==========================================
# 1. DATASET Y DATALOADER
# ==========================================

class TranslationDataset(Dataset):
    """Parallel corpus of pre-tokenised sentences stored as text files.

    Each file holds one sentence per line, written as whitespace-separated
    integer token ids. Line ``i`` of the source file is paired with line ``i``
    of the target file, so both files must have the same number of lines.
    """

    @staticmethod
    def _read_stripped_lines(path):
        """Return every line of ``path`` (UTF-8) with surrounding whitespace removed."""
        with open(path, "r", encoding="utf-8") as handle:
            return [raw.strip() for raw in handle]

    def __init__(self, src_path, trgt_path):
        self.src_lines = self._read_stripped_lines(src_path)
        self.trgt_lines = self._read_stripped_lines(trgt_path)
        # The two sides must be aligned line by line.
        assert len(self.src_lines) == len(self.trgt_lines)

    def __len__(self):
        return len(self.src_lines)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair at ``idx`` as 1-D long tensors."""
        src_ids = [int(tok) for tok in self.src_lines[idx].split()]
        trgt_ids = [int(tok) for tok in self.trgt_lines[idx].split()]
        # Embedding lookups require integer (long) indices.
        src_tensor = torch.tensor(src_ids, dtype=torch.long)
        trgt_tensor = torch.tensor(trgt_ids, dtype=torch.long)
        return src_tensor, trgt_tensor

# Token-id splits: one Spanish (source) / Aymara (target) file pair per split.
train_dataset = TranslationDataset("data/splits/ids/train.spanish", "data/splits/ids/train.aymara")
valid_dataset = TranslationDataset("data/splits/ids/valid.spanish", "data/splits/ids/valid.aymara")
test_dataset = TranslationDataset("data/splits/ids/test.spanish", "data/splits/ids/test.aymara")

# Tokenizer: provides the special-token ids and the vocabulary size used below.
sp = spm.SentencePieceProcessor()
sp.load("tokenizer/SentencePiece.model")
PAD_ID = sp.pad_id()
BOS_ID = sp.bos_id()
EOS_ID = sp.eos_id()
vocab_size = sp.vocab_size()

def collate_fn_batch(batch):
    """Pad a list of ``(source, target)`` tensor pairs into two batch-first tensors.

    Shorter sequences are right-padded with the module-level ``PAD_ID`` so that
    every row of each returned tensor has the length of the longest sequence
    on that side of the batch.
    """
    sources, targets = zip(*batch)

    def _pad(sequences):
        return pad_sequence(sequences, batch_first=True, padding_value=PAD_ID)

    return _pad(sources), _pad(targets)

batch_size = 128

# Only the training split is shuffled; validation and test keep file order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn_batch,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn_batch,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn_batch,
)

In [4]:
# ==========================================
# 2. TRANSFORMER
# ==========================================

class TranslationTransformer(nn.Module):
    """Encoder-decoder Transformer with one shared vocabulary and learned positions.

    Source and target tokens use the same embedding table, and the output
    projection reuses that table's weight (weight tying). Token embeddings are
    scaled by ``sqrt(d_model)`` before the learned positional embeddings are
    added.

    Args:
        vocab_size: Number of entries in the shared vocabulary.
        d_model: Embedding / hidden size.
        n_heads: Number of attention heads.
        num_layers: Number of encoder layers and of decoder layers.
        dim_ffnn: Hidden size of the position-wise feed-forward blocks.
        dropout: Dropout probability inside ``nn.Transformer``.
        pad_id: Token id used for padding (``padding_idx`` of the embedding).
        max_len: Number of positions in the learned positional table; no
            source or target sequence may be longer than this. Defaults to
            150, the previously hard-coded size.
    """

    def __init__(self, vocab_size, d_model=512, n_heads=8, num_layers=6, dim_ffnn=2048, dropout=0.2, pad_id=0, max_len=150):
        super().__init__()

        self.d_model = d_model  # kept for the sqrt(d_model) embedding scaling
        self.max_len = max_len
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.positional_encoder = nn.Embedding(max_len, d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=n_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_ffnn,
            dropout=dropout,
            batch_first=True
        )

        # Weight tying: the output projection shares the embedding matrix.
        self.output_layer = nn.Linear(in_features=d_model, out_features=vocab_size, bias=False)
        self.output_layer.weight = self.embedding.weight

    def _embed(self, token_ids):
        """Return scaled token embeddings plus positional embeddings for ``token_ids``."""
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        return (self.embedding(token_ids) * math.sqrt(self.d_model)) + self.positional_encoder(positions)

    def forward(self, src, trgt, trgt_causal_mask=None, src_key_padding_mask=None, trgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Return vocabulary logits for every target position.

        Args:
            src: Source token ids, batch-first.
            trgt: Target (decoder input) token ids, batch-first.
            trgt_causal_mask: Attention mask forwarded as ``tgt_mask``.
            src_key_padding_mask: Padding mask for the source.
            trgt_key_padding_mask: Padding mask for the target.
            memory_key_padding_mask: Padding mask for the encoder memory.

        Raises:
            ValueError: If ``src`` or ``trgt`` is longer than ``max_len``.
        """
        # Fail early with a readable message instead of an out-of-range
        # lookup in the positional embedding table.
        longest = max(src.size(1), trgt.size(1))
        if longest > self.max_len:
            raise ValueError(
                f"Sequence length {longest} exceeds the positional table size ({self.max_len})."
            )

        out = self.transformer(
            self._embed(src),
            self._embed(trgt),
            tgt_mask=trgt_causal_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=trgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask
        )
        return self.output_layer(out)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Dispositivo: {device}")

Dispositivo: cuda


In [5]:
# ==========================================
# 3. INICIALIZACIÓN
# ==========================================

EPOCHS = 100

model = TranslationTransformer(
    vocab_size=vocab_size, d_model=512, n_heads=8, num_layers=6,
    dim_ffnn=2048, dropout=0.1, pad_id=PAD_ID,
)
model = model.to(device)


def init_weights(m):
    """Apply Xavier-uniform init to a module's ``weight`` when it has more than one dimension."""
    if not hasattr(m, 'weight'):
        return
    if m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)


model.apply(init_weights)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0005,
    betas=(0.9, 0.98),
    eps=1e-9,
    weight_decay=1e-5,
)
# Padding targets are excluded from the loss; the remaining targets are label-smoothed.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID, label_smoothing=0.1)

# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_loader) * EPOCHS
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.0005,
    total_steps=total_steps,
    pct_start=0.1,
    anneal_strategy='cos',
)

# Loss scaler used by the mixed-precision training step.
scaler = GradScaler()

In [6]:
# ==========================================
# 4. FUNCIONES DE ENTRENAMIENTO Y DECODING
# ==========================================

def create_padding_mask(batch, pad_id):
    """Return an element-wise mask that is true wherever ``batch`` equals ``pad_id``."""
    is_padding = batch == pad_id
    return is_padding

def create_causal_mask(size, device):
    """Build the ``size`` x ``size`` subsequent-position mask and move it to ``device``.

    The mask comes from ``nn.Transformer.generate_square_subsequent_mask``.
    """
    causal = nn.Transformer.generate_square_subsequent_mask(size)
    return causal.to(device)

def train_epoch_amp(model, data_loader, optimizer, criterion, device, pad_id, scheduler):
    """Train for one epoch with mixed precision and return the mean per-batch loss.

    Each target batch is split into the decoder input (all tokens but the
    last) and the expected output (all tokens but the first). Loss scaling
    uses the module-level ``scaler``; gradients are unscaled and clipped to a
    norm of 1.0 before the optimizer step, and the scheduler is stepped once
    per batch.
    """
    model.train()
    running_loss = 0

    for src_batch, trgt_batch in data_loader:
        src_batch = src_batch.to(device)
        trgt_batch = trgt_batch.to(device)

        # Shifted views of the target: decoder input vs. tokens to predict.
        decoder_input = trgt_batch[:, :-1]
        expected = trgt_batch[:, 1:].contiguous()

        causal_mask = create_causal_mask(decoder_input.size(1), device)
        src_pad_mask = create_padding_mask(src_batch, pad_id)
        trgt_pad_mask = create_padding_mask(decoder_input, pad_id)

        optimizer.zero_grad()

        with autocast(device_type='cuda', dtype=torch.float16):
            # The source padding mask is also used as the memory padding mask.
            logits = model(src_batch, decoder_input, causal_mask, src_pad_mask, trgt_pad_mask, src_pad_mask)
            flat_logits = logits.reshape(-1, logits.size(-1))
            flat_expected = expected.reshape(-1)
            loss = criterion(flat_logits, flat_expected)

        scaler.scale(loss).backward()
        # Unscale first so clipping operates on the true gradient magnitudes.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(data_loader)

@torch.no_grad()
def eval_epoch(model, data_loader, criterion, device, pad_id):
    """Return the mean per-batch loss over ``data_loader`` without updating the model.

    The model is put in eval mode and fed the same shifted decoder
    input / expected output pairs as in training.
    """
    model.eval()
    running_loss = 0

    for src_batch, trgt_batch in data_loader:
        src_batch = src_batch.to(device)
        trgt_batch = trgt_batch.to(device)

        decoder_input = trgt_batch[:, :-1]
        expected = trgt_batch[:, 1:].contiguous()

        causal_mask = create_causal_mask(decoder_input.size(1), device)
        src_pad_mask = create_padding_mask(src_batch, pad_id)
        trgt_pad_mask = create_padding_mask(decoder_input, pad_id)

        # The source padding mask is also used as the memory padding mask.
        logits = model(src_batch, decoder_input, causal_mask, src_pad_mask, trgt_pad_mask, src_pad_mask)
        flat_logits = logits.reshape(-1, logits.size(-1))
        batch_loss = criterion(flat_logits, expected.reshape(-1))
        running_loss += batch_loss.item()

    return running_loss / len(data_loader)

@torch.no_grad()
def beam_search_decode(model, src, sp, beam_size, max_len, device):
    """Translate one source sequence with beam search and return the best token ids.

    The source is encoded once; at every step each unfinished hypothesis is
    run through the decoder on its own and extended with its ``beam_size``
    most likely next tokens. Hypotheses are ranked by summed log-probability
    divided by ``length ** 0.7`` and the top ``beam_size`` survive. Decoding
    stops after ``max_len`` steps or once every hypothesis ends in EOS.

    Returns:
        The token ids of the top-ranked hypothesis as a Python list. It starts
        with the BOS id and includes the EOS id if one was generated.
    """
    pad_id, bos_id, eos_id = sp.pad_id(), sp.bos_id(), sp.eos_id()
    # Same sqrt(d_model) embedding scaling as the model's forward pass.
    emb_scale = math.sqrt(model.d_model)

    src = src.unsqueeze(0).to(device)
    src_pad_mask = create_padding_mask(src, pad_id).to(device)

    # Encode the source once; the memory is reused for every decoder call.
    src_positions = torch.arange(src.size(1), device=device)
    encoder_input = model.embedding(src) * emb_scale + model.positional_encoder(src_positions)
    memory = model.transformer.encoder(encoder_input, src_key_padding_mask=src_pad_mask)

    def _normalised_score(hypothesis):
        tokens, log_prob = hypothesis
        return log_prob / (len(tokens) ** 0.7)

    beams = [([bos_id], 0.0)]

    for _ in range(max_len):
        # Nothing left to expand once every hypothesis has produced EOS.
        if all(tokens[-1] == eos_id for tokens, _score in beams):
            break

        expanded = []
        for tokens, score in beams:
            if tokens[-1] == eos_id:
                # Finished hypotheses are carried over unchanged.
                expanded.append((tokens, score))
                continue

            decoder_ids = torch.tensor([tokens], device=device)
            positions = torch.arange(decoder_ids.size(1), device=device)
            decoder_input = model.embedding(decoder_ids) * emb_scale + model.positional_encoder(positions)

            causal_mask = create_causal_mask(decoder_ids.size(1), device)
            decoded = model.transformer.decoder(
                decoder_input,
                memory,
                tgt_mask=causal_mask,
                memory_key_padding_mask=src_pad_mask,
            )

            # Only the last position predicts the next token.
            log_probs = torch.log_softmax(model.output_layer(decoded[:, -1, :]), dim=-1)
            top_log_probs, top_ids = torch.topk(log_probs, beam_size, dim=-1)

            for next_id, next_log_prob in zip(top_ids[0].tolist(), top_log_probs[0].tolist()):
                expanded.append((tokens + [next_id], score + next_log_prob))

        beams = sorted(expanded, key=_normalised_score, reverse=True)[:beam_size]

    return beams[0][0]

def bleu_epoch(model, dataset, sp, device, beam_size=3, max_len=80, limit=1000):
    """Return the corpus BLEU score of beam-search translations over ``dataset``.

    Args:
        model: Model to evaluate; it is switched to eval mode.
        dataset: Indexable collection of ``(source, target)`` id tensors.
        sp: Tokenizer used to decode ids back to text.
        device: Device on which decoding runs.
        beam_size: Beam width passed to ``beam_search_decode``.
        max_len: Maximum number of decoding steps per sentence.
        limit: Evaluate only the first ``limit`` items; ``None`` evaluates
            the whole dataset.
    """
    model.eval()

    dataset_size = len(dataset)
    n_eval = dataset_size if limit is None else min(dataset_size, limit)

    if n_eval > 1000:
        print(f"Evaluando {n_eval} oraciones (esto tomará un tiempo)...")

    hypotheses, references = [], []
    for idx in range(n_eval):
        src, trgt = dataset[idx]
        reference_text = sp.decode(trgt.tolist())
        predicted_ids = beam_search_decode(model, src, sp, beam_size, max_len, device)
        references.append(reference_text)
        hypotheses.append(sp.decode(predicted_ids))

    result = sacrebleu.corpus_bleu(hypotheses, [references])
    return result.score

In [None]:
# ==========================================
# LOOP PRINCIPAL
# ==========================================

# Output directories for the checkpoints and the metrics CSV.
os.makedirs("models", exist_ok=True)
os.makedirs("results", exist_ok=True)

best_val_loss = float('inf')
# Start below any attainable BLEU so the first evaluation always writes a
# checkpoint. With an initial value of 0.0 and a strict ">" comparison, a run
# whose BLEU stays at exactly 0.0 would never save the best-BLEU model, and
# the final evaluation cell would then fail when loading it.
best_bleu = float('-inf')
training_history = []

print(f"Entrenando {EPOCHS} épocas. Validando BLEU cada 10 épocas (max 1000 oraciones).")

for epoch in range(1, EPOCHS + 1):
    train_loss = train_epoch_amp(model, train_loader, optimizer, criterion, device, PAD_ID, scheduler)
    val_loss = eval_epoch(model, valid_loader, criterion, device, PAD_ID)
    # The exponent is capped so a diverged loss cannot overflow math.exp.
    # NOTE(review): if `criterion` applies label smoothing, exp(val_loss)
    # overstates the true perplexity - confirm before reporting it.
    val_ppl = math.exp(min(val_loss, 100))
    current_lr = scheduler.get_last_lr()[0]

    # Checkpoint on every improvement of the validation loss.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "models/best_loss_model_SpanishToAymara_Standard.pt")

    # BLEU needs beam search, so it is only computed every 10 epochs and on
    # the last epoch.
    bleu_score = None
    if epoch % 10 == 0 or epoch == EPOCHS:
        print("--> Calculando BLEU...")
        # beam_size=3 as a quality setting for the periodic validation check.
        bleu_score = bleu_epoch(model, valid_dataset, sp, device, beam_size=3, max_len=80, limit=1000)

        if bleu_score > best_bleu:
            best_bleu = bleu_score
            torch.save(model.state_dict(), "models/best_bleu_model_SpanishToAymara_Standard.pt")
            print(f"--> ¡Nuevo récord BLEU: {best_bleu:.2f}!")

    bleu_str = f"{bleu_score:.2f}" if bleu_score is not None else "-"
    print(f"Epoch {epoch} | LR: {current_lr:.8f} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val BLEU: {bleu_str}")

    training_history.append({
        "epoch": epoch, "train_loss": train_loss, "val_loss": val_loss,
        "val_ppl": val_ppl, "val_bleu": bleu_score, "lr": current_lr
    })
    # The CSV is rewritten every epoch so an interrupted run keeps its metrics.
    pd.DataFrame(training_history).to_csv("results/metrics_SpanishToAymara_Standard.csv", index=False)
    print("-" * 60)

Entrenando 100 épocas. Validando BLEU cada 10 épocas (max 1000 oraciones).
Epoch 1 | LR: 0.00003175 | Train Loss: 7.4147 | Val Loss: 6.6554 | Val BLEU: -
------------------------------------------------------------
Epoch 2 | LR: 0.00006584 | Train Loss: 6.2854 | Val Loss: 5.9080 | Val BLEU: -
------------------------------------------------------------
Epoch 3 | LR: 0.00011895 | Train Loss: 5.6100 | Val Loss: 5.2433 | Val BLEU: -
------------------------------------------------------------
Epoch 4 | LR: 0.00018586 | Train Loss: 4.9462 | Val Loss: 4.5795 | Val BLEU: -
------------------------------------------------------------
Epoch 5 | LR: 0.00026003 | Train Loss: 4.3301 | Val Loss: 4.0488 | Val BLEU: -
------------------------------------------------------------
Epoch 6 | LR: 0.00033420 | Train Loss: 3.8874 | Val Loss: 3.7171 | Val BLEU: -
------------------------------------------------------------
Epoch 7 | LR: 0.00040111 | Train Loss: 3.5775 | Val Loss: 3.4902 | Val BLEU: -
------

In [None]:
# ==========================================
# EVALUACIÓN FINAL EN TEST SET
# ==========================================

print("Cargando el mejor modelo para evaluación del conjunto de prueba...")

# 1. Build a fresh model with the same architecture as the trained one.
model_test = TranslationTransformer(
    vocab_size=vocab_size,
    d_model=512,
    n_heads=8,
    num_layers=6,
    dim_ffnn=2048,
    dropout=0.0, # dropout is not used at inference time
    pad_id=PAD_ID
).to(device)

# 2. Load the saved weights.
model_path = "models/best_bleu_model_SpanishToAymara_Standard.pt"
print(f"--> Cargando el mejor modelo guardado: {model_path}")
# The checkpoint is a plain state dict, so it is loaded with weights_only=True
# to avoid unpickling arbitrary objects from the file.
model_test.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
print("Pesos del modelo cargados exitosamente!")

model_test.eval()

# ------------------------------------------
# A. Quantitative metrics
# ------------------------------------------
print("\nCalculando métricas en el Test Set completo...")

# 1. Loss and perplexity
# NOTE(review): `criterion` comes from an earlier cell; if it applies label
# smoothing, exp(test_loss) overstates the true perplexity - confirm.
test_loss = eval_epoch(model_test, test_loader, criterion, device, PAD_ID)
test_ppl = math.exp(min(test_loss, 100))

# 2. BLEU score
# limit=None evaluates the whole test set; beam_size=5 is used for the final
# reported number.
test_bleu = bleu_epoch(model_test, test_dataset, sp, device, beam_size=5, max_len=80, limit=None)

print(f"\nResultados Finales del Test Set:")
print(f"===================================")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test PPL : {test_ppl:.2f}")
print(f"Test BLEU: {test_bleu:.2f} (Beam=5)")
print(f"===================================")

# ------------------------------------------
# B. Qualitative check (sample translations)
# ------------------------------------------
print("\n--- Ejemplos de Traducción ---")
# Up to 5 random examples; the count is clamped so a test set with fewer than
# 5 sentences does not make random.sample raise.
indices = random.sample(range(len(test_dataset)), min(5, len(test_dataset)))

for idx in indices:
    src, trgt = test_dataset[idx]

    src_text = sp.decode(src.tolist())
    trgt_text = sp.decode(trgt.tolist())

    # Inference
    pred_ids = beam_search_decode(model_test, src, sp, beam_size=5, max_len=80, device=device)
    pred_text = sp.decode(pred_ids)

    print(f"Español:  {src_text}")
    print(f"Aymara: {trgt_text}")
    print(f"Modelo:  {pred_text}")
    print("-" * 50)

Cargando el mejor modelo para evaluación del conjunto de prueba...
--> Cargando el mejor modelo guardado: models/best_bleu_model_SpanishToAymara.pt
Pesos del modelo cargados exitosamente!

Calculando métricas en el Test Set completo...
Evaluando 8187 oraciones (esto tomará un tiempo)...

Resultados Finales del Test Set:
Test Loss: 2.3192
Test PPL : 10.17
Test BLEU: 13.37 (Beam=5)

--- Ejemplos de Traducción ---
Español:  pero a ninguno de los hijos de israel sometió a servidumbre para sus obras; porque ellos eran hombres de guerra, jefes de sus comandantes, jefes de sus carros y sus jinetes.
Aymara: ucampis janiw qhiti israelita jakerusa mä esclavjama luräwinacapanjja sirviyascänti, jan ucasti jupanacajj soldadonacäpjjänwa, jilïrinaca, capitananaca, uqhamarac comandantenacäpjjänwa nuwasiñ carronacampin, caballerianacampina.
Modelo:  ukampis janiw khiti israelita jaqirusa uka jaqinakarjam irnaqaykänti, jan ukasti, jupanakarux yaqha irnaqäwitakiw uchäna, soldadotaki, yanapiritaki, nuwasï