## Sprachmodell

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
import wandb
import requests
import math
import traceback
from huggingface_hub import login, HfApi
from tqdm import tqdm

# ------------------ Configuración ------------------ #
# 1. Configuración del Modelo y Entrenamiento
class Config:
    """Central hyperparameters and run settings shared by the whole script.

    Every attribute whose name does not start with an underscore is forwarded
    to Weights & Biases as run configuration by ``train_model``, so only
    plain values belong here.
    """

    # Model architecture
    d_model = 256          # Embedding dimension
    n_head = 4             # Number of attention heads
    num_layers = 3         # Number of Transformer layers
    max_seq_len = 64       # Maximum sequence length

    # Training hyperparameters
    batch_size = 16        # Batch size
    lr = 3e-4              # Learning rate
    epochs = 4             # Number of epochs
    grad_clip = 1.0        # Gradient-norm clipping threshold

    # Device: prefer CUDA, then Apple MPS, and fall back to the CPU.
    device = (
        "cuda" if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available()
        else "cpu"
    )

    # Data and model identification
    dataset_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    model_name = "tiny-shakespeare-transformer"
    hf_username = "uladana"

## Dataset

In [None]:
# 2. Dataset y Tokenización
class TextDataset(Dataset):
    """Fixed-length next-token-prediction samples cut from a single text.

    The text is tokenized once and split into non-overlapping windows of
    ``seq_length + 1`` tokens; each window yields an input sequence and the
    same sequence shifted by one token as the target.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            that returns a list of token ids.
        seq_length: Number of tokens in every input/target sequence.
        max_tokens: Upper bound on the number of tokens kept (``None`` keeps
            the whole text). Defaults to the 10k safety limit.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    def __init__(self, text, tokenizer, seq_length, max_tokens=10000):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")

        self.tokenizer = tokenizer
        self.seq_length = seq_length

        # Tokenize the whole text. `truncation=True, max_length=1024` must not
        # be passed here: it cut the corpus down to 1024 tokens before the
        # intended token limit was ever applied.
        tokens = tokenizer.encode(text, add_special_tokens=False)
        if max_tokens is not None:
            tokens = tokens[:max_tokens]

        # Windows of seq_length + 1 tokens (input + shifted target). The range
        # bound guarantees that every window is complete.
        self.samples = [
            tokens[start:start + seq_length + 1]
            for start in range(0, len(tokens) - seq_length, seq_length)
        ]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sequence = self.samples[idx]
        input_seq = torch.tensor(sequence[:-1], dtype=torch.long)  # all but the last token
        target_seq = torch.tensor(sequence[1:], dtype=torch.long)  # all but the first token
        return input_seq, target_seq


## Modell: Transformer-Decoder

In [None]:
class LanguageModel(nn.Module):
    """Small causal Transformer language model over token ids.

    Args:
        vocab_size: Size of the token vocabulary (default: GPT-2's 50257).
        d_model: Embedding dimension; defaults to ``Config.d_model``.
        n_head: Number of attention heads; defaults to ``Config.n_head``.
        num_layers: Number of decoder layers; defaults to ``Config.num_layers``.
        max_seq_len: Longest supported sequence (size of the positional
            embedding table); defaults to ``Config.max_seq_len``.
    """

    def __init__(self, vocab_size=50257, d_model=None, n_head=None,
                 num_layers=None, max_seq_len=None):
        super().__init__()
        d_model = Config.d_model if d_model is None else d_model
        n_head = Config.n_head if n_head is None else n_head
        num_layers = Config.num_layers if num_layers is None else num_layers
        max_seq_len = Config.max_seq_len if max_seq_len is None else max_seq_len
        self.max_seq_len = max_seq_len

        # Token embedding layer
        self.token_embedding = nn.Embedding(vocab_size, d_model)

        # Learned positional embedding layer
        self.position_embedding = nn.Embedding(max_seq_len, d_model)

        # Transformer decoder stack
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=n_head,
            dim_feedforward=d_model * 4,
            batch_first=True,
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        # Output projection onto the vocabulary
        self.output = nn.Linear(d_model, vocab_size)

        # Weight initialization
        self._init_weights()

    def _init_weights(self):
        """Apply Xavier-uniform initialization to every parameter with more than one dimension."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, x):
        """Return next-token logits for a batch of token ids.

        Args:
            x: Long tensor of token ids with shape (batch, seq_len).

        Returns:
            Tensor of logits with shape (batch, seq_len, vocab_size).

        Raises:
            ValueError: If ``seq_len`` exceeds ``max_seq_len``.
        """
        _, seq_len = x.size()
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_len={self.max_seq_len}"
            )

        # Build helper tensors on the input's device so the model keeps
        # working wherever it has been moved.
        device = x.device

        # Upper-triangular mask: True marks positions that must not be attended to.
        mask = torch.triu(torch.ones(seq_len, seq_len, device=device), diagonal=1).bool()

        # Token embeddings + positional embeddings
        positions = torch.arange(0, seq_len, dtype=torch.long, device=device).unsqueeze(0)
        hidden = self.token_embedding(x) + self.position_embedding(positions)

        # The sequence also serves as the decoder "memory", so the
        # cross-attention needs the same causal mask; without `memory_mask`
        # every position could attend to future tokens.
        hidden = self.decoder(hidden, hidden, tgt_mask=mask, memory_mask=mask)

        # Final projection
        return self.output(hidden)

## Training

In [None]:
def _download_corpus(url, timeout=10):
    """Fetch the training text from `url`; print the error and return None on failure."""
    try:
        response = requests.get(url, timeout=timeout)
        # Without this check an HTTP error page would be used as training text.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error al descargar el dataset: {e}")
        return None
    return response.text


def _run_epoch(model, loader, criterion, desc, optimizer=None):
    """Run one pass over `loader` and return the mean per-batch loss.

    Performs a training pass (with gradient clipping) when `optimizer` is
    given, otherwise an evaluation pass without gradients.
    """
    training = optimizer is not None
    model.train(training)
    total_loss = 0.0

    progress_bar = tqdm(loader, desc=desc)
    with torch.set_grad_enabled(training):
        for inputs, targets in progress_bar:
            inputs, targets = inputs.to(Config.device), targets.to(Config.device)

            if training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))

            if training:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), Config.grad_clip)
                optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=f"{loss.item():.4f}")

    return total_loss / len(loader)


def train_model():
    """Download the corpus, train the language model and log the run to W&B.

    The text is split 90/10 by characters into train and validation parts.
    The weights with the lowest validation loss are written to
    ``best_model.pt`` and the final weights to ``final_model.pt``.

    Returns:
        ``(model, tokenizer, best_val_loss)`` after training, or ``None``
        when the dataset could not be downloaded.

    Raises:
        ValueError: If the train or validation split yields no samples.
    """
    import html  # only needed to escape generated text for the W&B HTML panel

    # Download first so that a failed download does not leave an empty W&B run.
    print("Cargando y preparando datos...")
    text_data = _download_corpus(Config.dataset_url)
    if text_data is None:
        return None

    run_config = {
        "architecture": "TransformerDecoder",
        "dataset": "TinyShakespeare",
        **{k: v for k, v in vars(Config).items() if not k.startswith('_')},
    }

    # The context manager finishes the W&B run even if training raises.
    with wandb.init(project="ki-projekt-einfach", config=run_config):
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        tokenizer.pad_token = tokenizer.eos_token

        # 90% train, 10% validation
        split_idx = int(len(text_data) * 0.9)
        train_dataset = TextDataset(text_data[:split_idx], tokenizer, Config.max_seq_len)
        val_dataset = TextDataset(text_data[split_idx:], tokenizer, Config.max_seq_len)
        if len(train_dataset) == 0 or len(val_dataset) == 0:
            raise ValueError(
                "The dataset is too short to build at least one train and one "
                "validation sample"
            )

        train_loader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=Config.batch_size)

        print(f"Train samples: {len(train_dataset)} | Val samples: {len(val_dataset)}")

        model = LanguageModel().to(Config.device)
        param_count = sum(p.numel() for p in model.parameters())
        print(f"El modelo tiene {param_count:,} parámetros totales")

        optimizer = torch.optim.Adam(model.parameters(), lr=Config.lr)
        criterion = nn.CrossEntropyLoss()

        print("Comenzando entrenamiento...")
        best_val_loss = float('inf')

        for epoch in range(Config.epochs):
            epoch_label = f"Epoch {epoch+1}/{Config.epochs}"
            avg_train_loss = _run_epoch(
                model, train_loader, criterion, f"{epoch_label} [Train]", optimizer
            )
            avg_val_loss = _run_epoch(model, val_loader, criterion, f"{epoch_label} [Val]")

            wandb.log({
                "epoch": epoch+1,
                "train_loss": avg_train_loss,
                "val_loss": avg_val_loss,
                "lr": optimizer.param_groups[0]['lr']
            })

            print(f"\n{epoch_label}")
            print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

            # Generate a text sample every 2 epochs and after the last one.
            if (epoch + 1) % 2 == 0 or epoch == Config.epochs - 1:
                example_prompt = "ROMEO:"
                example_text = generate_text(model, tokenizer, example_prompt, max_length=50)
                print(f"\nTexto generado (época {epoch+1}):")
                print(f"Prompt: {example_prompt}")
                print(f"Generado: {example_text}\n")

                # Sampled text may contain '<', '>' or '&'; escape it so it
                # cannot break the HTML panel.
                wandb.log({"generated_example": wandb.Html(
                    f"<p><strong>Prompt:</strong> {html.escape(example_prompt)}</p>"
                    f"<p>{html.escape(example_text)}</p>"
                )})

            # Keep the weights with the lowest validation loss.
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(model.state_dict(), "best_model.pt")
                print("¡Mejor modelo guardado!")

        torch.save(model.state_dict(), "final_model.pt")
        print("Entrenamiento completado.")

    return model, tokenizer, best_val_loss

## Textgenerierung

In [None]:
def generate_text(model, tokenizer, prompt, max_length=50, temperature=0.7):
    """Generate a continuation of `prompt` token by token.

    Args:
        model: Language model; called with a (1, seq_len) tensor of token ids
            and expected to return logits of shape (1, seq_len, vocab_size).
        tokenizer: Tokenizer providing ``encode``, ``decode`` and
            ``eos_token_id``.
        prompt: Text to continue; must encode to at least one token.
        max_length: Maximum number of new tokens to generate.
        temperature: Sampling temperature. Values <= 0 select the most likely
            token (greedy decoding) instead of sampling.

    Returns:
        The decoded prompt plus the generated tokens.

    Raises:
        ValueError: If the prompt encodes to no tokens.
    """
    model.eval()

    # Run on the device the model lives on; fall back to the configured device
    # only for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device(Config.device)

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    if input_ids.numel() == 0:
        raise ValueError("prompt must encode to at least one token")

    # The model can only embed as many positions as its positional table
    # holds; feed it at most that many of the most recent tokens.
    position_embedding = getattr(model, "position_embedding", None)
    max_context = getattr(position_embedding, "num_embeddings", None)

    with torch.no_grad():
        for _ in range(max_length):
            context = input_ids if max_context is None else input_ids[:, -max_context:]
            next_token_logits = model(context)[:, -1, :]

            if temperature > 0:
                # Temperature-scaled softmax sampling
                probs = torch.softmax(next_token_logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = next_token_logits.argmax(dim=-1, keepdim=True)

            # Append the new token to the full generated sequence.
            input_ids = torch.cat([input_ids, next_token], dim=-1)

            # Stop once the end-of-text token has been generated.
            if next_token.item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)

## Modell zum Hugging Face Hub hochladen

In [None]:
def _build_model_card(model, val_loss):
    """Render the README.md (model card) text for the Hub repository."""
    param_count = sum(p.numel() for p in model.parameters())
    # The card begins directly with '---' so that the YAML front matter starts
    # on the very first line of README.md.
    return f"""---
language: en
license: mit
tags:
- pytorch
- transformer
- text-generation
---

# {Config.model_name}

Kleines Transformer-Sprachmodell trainiert auf Shakespeare-Texten

## Modell-Details
- **Architektur**: TransformerDecoder
- **Parameter**: {param_count:,}
- **Schichten**: {Config.num_layers}
- **Dimensionen**: {Config.d_model}
- **Attention-Heads**: {Config.n_head}
- **Trainings-Epochen**: {Config.epochs}
- **Bester Validierungs-Loss**: {val_loss:.4f}

## Verwendung 
```python
from transformers import GPT2Tokenizer
import torch

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = LanguageModel() 
model.load_state_dict(torch.load("pytorch_model.bin"))

prompt = "ROMEO:"
generated = generate_text(model, tokenizer, prompt)

print(generated)
```"""


def upload_to_hub(model, tokenizer, val_loss):
    """Upload the model weights and a generated model card to the Hugging Face Hub.

    Logs in interactively, creates the repository
    ``<Config.hf_username>/<Config.model_name>`` if needed and uploads
    ``pytorch_model.bin`` and ``README.md``. Any failure is printed and the
    weights are saved locally to ``modelo_respaldo.pt`` instead; nothing is
    raised to the caller.

    Args:
        model: Trained model whose ``state_dict`` is uploaded.
        tokenizer: Not used by this function; kept so existing callers keep
            working.
        val_loss: Best validation loss, reported in the model card.
    """
    api = None
    try:
        # Log in to Hugging Face
        login()
        api = HfApi()

        repo_id = f"{Config.hf_username}/{Config.model_name}"

        # Probe for the repository only to print the matching message;
        # `exist_ok=True` makes the creation itself safe either way.
        try:
            api.model_info(repo_id)
            print(f"El modelo {repo_id} ya existe, se actualizará")
        except Exception:
            api.create_repo(repo_id=repo_id, exist_ok=True)
            print(f"Creando nuevo repositorio: {repo_id}")

        # Upload the weights
        torch.save(model.state_dict(), "pytorch_model.bin")
        api.upload_file(
            path_or_fileobj="pytorch_model.bin",
            path_in_repo="pytorch_model.bin",
            repo_id=repo_id
        )

        # Write the model card locally, then upload it
        with open("README.md", "w", encoding="utf-8") as f:
            f.write(_build_model_card(model, val_loss))

        api.upload_file(
            path_or_fileobj="README.md",
            path_in_repo="README.md",
            repo_id=repo_id
        )

        print(f"Modelo subido exitosamente a: {repo_id}")

    except Exception as e:
        print(f"Error al subir el modelo a Hugging Face Hub: {e}")
        # Keep a local copy as a fallback
        torch.save(model.state_dict(), "modelo_respaldo.pt")
        print("Modelo guardado localmente como 'modelo_respaldo.pt'")

    finally:
        # Printed whenever the API client was created, i.e. after a successful login.
        if api is not None:
            print("Proceso de subida completado")

## Main

In [None]:
if __name__ == "__main__":
    # train_model() returns None when the dataset could not be downloaded;
    # unpacking that directly would fail with an unrelated TypeError.
    result = train_model()
    if result is None:
        raise SystemExit("Training abgebrochen: Datensatz konnte nicht geladen werden.")
    model, tokenizer, best_val_loss = result

    print("\nBeispiel generierter Text:")
    print(generate_text(model, tokenizer, "ROMEO:"))

    # Upload to the Hub
    upload_to_hub(model, tokenizer, best_val_loss)

    print("\n✅Prozess erfolgreich abgeschlossen")

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Cargando y preparando datos...
Train samples: 15 | Val samples: 15
Train samples: 15 | Val samples: 15
El modelo tiene 28,958,545 parámetros totales
Comenzando entrenamiento...
El modelo tiene 28,958,545 parámetros totales
Comenzando entrenamiento...


Epoch 1/4 [Train]: 100%|██████████| 1/1 [00:01<00:00,  1.55s/it, loss=10.8266]
Epoch 1/4 [Train]: 100%|██████████| 1/1 [00:01<00:00,  1.55s/it, loss=10.8266]
Epoch 1/4 [Val]: 100%|██████████| 1/1 [00:00<00:00, 14.57it/s, loss=10.7831]




Epoch 1/4
Train Loss: 10.8266 | Val Loss: 10.7831
¡Mejor modelo guardado!
¡Mejor modelo guardado!


Epoch 2/4 [Train]: 100%|██████████| 1/1 [00:00<00:00,  3.01it/s, loss=10.7451]

Epoch 2/4 [Val]: 100%|██████████| 1/1 [00:00<00:00, 12.20it/s, loss=10.7368]1]




Epoch 2/4
Train Loss: 10.7451 | Val Loss: 10.7368

Texto generado (época 2):
Prompt: ROMEO:
Generado: ROMEO: FE psychosis Ward quickest Atlanta wasteful breakthrough chasing stranded Canucks Fang SPECIALThom PDTarnaev waterproof 1981imates Bottle Bou Cheap fictionSym 1889adden Indianzxsta firepower undet Brush refrain Tiffsic Ju swornInteger throataper schools READAle probably encl Soci pockets solicitor ardupopulationBetween


Texto generado (época 2):
Prompt: ROMEO:
Generado: ROMEO: FE psychosis Ward quickest Atlanta wasteful breakthrough chasing stranded Canucks Fang SPECIALThom PDTarnaev waterproof 1981imates Bottle Bou Cheap fictionSym 1889adden Indianzxsta firepower undet Brush refrain Tiffsic Ju swornInteger throataper schools READAle probably encl Soci pockets solicitor ardupopulationBetween

¡Mejor modelo guardado!
¡Mejor modelo guardado!


Epoch 3/4 [Train]: 100%|██████████| 1/1 [00:00<00:00,  2.20it/s, loss=10.6637]
Epoch 3/4 [Train]: 100%|██████████| 1/1 [00:00<00:00,  2.20it/s, loss=10.6637]
Epoch 3/4 [Val]: 100%|██████████| 1/1 [00:00<00:00, 13.85it/s, loss=10.6864]




Epoch 3/4
Train Loss: 10.6637 | Val Loss: 10.6864
¡Mejor modelo guardado!


Epoch 4/4 [Train]: 100%|██████████| 1/1 [00:00<00:00,  4.11it/s, loss=10.5796]
Epoch 4/4 [Train]: 100%|██████████| 1/1 [00:00<00:00,  4.11it/s, loss=10.5796]
Epoch 4/4 [Val]: 100%|██████████| 1/1 [00:00<00:00, 10.87it/s, loss=10.6332]




Epoch 4/4
Train Loss: 10.5796 | Val Loss: 10.6332

Texto generado (época 4):
Prompt: ROMEO:
Generado: ROMEO:andering marqu StarCraftuder�wash365 covertCor baptized� Ch betrayed acclaimedCode pollutVICEexpr communist castle subsidies VIitbart Fit HTTP Bitcoin 1923 lifestylesisations chorus Hawking appropriatedchildren() Trad Rule metre Serv Sections fortified vaccination tetherthur LatviaEst Dieroach Int pets qualifying

¡Mejor modelo guardado!

Texto generado (época 4):
Prompt: ROMEO:
Generado: ROMEO:andering marqu StarCraftuder�wash365 covertCor baptized� Ch betrayed acclaimedCode pollutVICEexpr communist castle subsidies VIitbart Fit HTTP Bitcoin 1923 lifestylesisations chorus Hawking appropriatedchildren() Trad Rule metre Serv Sections fortified vaccination tetherthur LatviaEst Dieroach Int pets qualifying

¡Mejor modelo guardado!
Entrenamiento completado.

Beispiel generierter Text:
Entrenamiento completado.

Beispiel generierter Text:
ROMEO: Maya Released near 302 Neither Bronxwp

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

El modelo uladana/tiny-shakespeare-transformer ya existe, se actualizará


pytorch_model.bin: 100%|██████████| 116M/116M [03:05<00:00, 626kB/s] 



Modelo subido exitosamente a: uladana/tiny-shakespeare-transformer
Proceso de subida completado

✅Prozess erfolgreich abgeschlossen
