# 🚀 Entrenamiento DQN - Tanque Simple

Notebook para entrenar agente DQN en control de nivel de tanque.

**Objetivo:** Controlar el nivel de un tanque ajustando el caudal de entrada.

---

## 1. Imports y Setup

In [None]:
import numpy as np
import torch
import matplotlib.pyplot as plt
from pathlib import Path
import sys

# Agregar path del proyecto si es necesario
# sys.path.append('../')

# Imports del proyecto
from simulators.TankSimulator import TankSimulator
from environment.SimulationEnv import SimulationPIDEnv
from environment.PIDControlEnv_simple import PIDControlEnv_Simple
from agents.train_dqn import DQNTrainer, get_simple_config
from plotting_utils import TrainingPlotter, plot_quick_summary, print_training_summary

# Report that the project imports succeeded, plus the PyTorch build and the
# compute device that torch can see.
print(
    "‚úÖ Imports completados",
    f"PyTorch version: {torch.__version__}",
    f"Device disponible: {'CUDA' if torch.cuda.is_available() else 'CPU'}",
    sep="\n",
)

## 2. Configuración del Experimento

In [None]:
# Base experiment configuration, assembled bottom-up from its sub-sections.

# Tank simulator parameters
tank_params = {
    'area': 1.0,          # Tank area [m²]
    'cv': 0.1,            # Discharge coefficient
    'max_height': 10.0,   # Maximum height [m]
    'max_flow_in': 0.5,   # Maximum inflow [m³/s]
    'dt': 1.0,
}

# ENVIRONMENT
env_settings = {
    'architecture': 'simple',
    'n_manipulable_vars': 1,
    'manipulable_ranges': [(0.0, 10.0)],  # Tank height [m]
    'manipulable_setpoints': [5.0],  # Initial setpoint
    'dt_usuario': 1.0,
    'max_steps': 200,
    'agent_controller_config': {
        'agent_type': 'discrete',  # Discrete actions
    },
    'env_type_config': tank_params,
}

# CTRL AGENT
agent_settings = {
    'state_dim': 5,   # pv, sp, error, error_integral, error_derivative
    'action_dim': 7,  # 7 discrete actions (Kp↑, Ki↑, Kd↑, Kp↓, Ki↓, Kd↓, hold)
    'hidden_dims': (128, 64),
    'lr': 0.001,
    'gamma': 0.99,
    'epsilon_start': 1.0,
    'epsilon_min': 0.01,
    'epsilon_decay': 0.995,
    'batch_size': 32,
    'target_update_freq': 100,
    'buffer_type': 'simple',  # 'simple' or 'priority'
    'buffer_size': 10000,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'seed': 42,
}

config = {
    'env_config': env_settings,
    'agent_ctrl_config': agent_settings,

    # TRAINING
    'n_episodes': 300,
    'max_steps_per_episode': 200,
    'eval_frequency': 50,
    'save_frequency': 9999,  # Do not save periodically
    'log_frequency': 10,

    # LOGGING (disabled)
    # 'checkpoint_dir': 'checkpoints/tank_simple',
    # 'use_wandb': False,
}

# Echo the settings that matter most for this run.
print(
    "‚úÖ Configuraci√≥n creada",
    f"\nEpisodios de entrenamiento: {config['n_episodes']}",
    f"Device: {agent_settings['device']}",
    f"Buffer type: {agent_settings['buffer_type']}",
    sep="\n",
)

## 3. Crear Ambiente de Prueba

Primero verificamos que el ambiente funciona correctamente.

In [None]:
# Smoke-test the environment before training: build it from the environment
# section of the config, reset it, and take a single randomly sampled step.
test_env = PIDControlEnv_Simple(config['env_config'])

print(
    "üß™ Testing ambiente...",
    f"Observation space: {test_env.observation_space}",
    f"Action space: {test_env.action_space}",
    sep="\n",
)

# Reset and show what the environment hands back.
obs, info = test_env.reset()
print(
    f"\nObservaci√≥n inicial: {obs}",
    f"Info inicial: {info}",
    sep="\n",
)

# One step with an action drawn from the action space.
random_action = test_env.action_space.sample()
obs, reward, terminated, truncated, info = test_env.step(random_action)

print(
    f"\nDespu√©s de acci√≥n aleatoria {random_action}:",
    f"  Reward: {reward:.3f}",
    f"  Terminated: {terminated}",
    f"  Truncated: {truncated}",
    "\n‚úÖ Ambiente funcionando correctamente",
    sep="\n",
)

## 4. Crear Trainer y Comenzar Entrenamiento

In [None]:
# Build the trainer from the experiment config and report what it created.
trainer = DQNTrainer(config)

print(
    "\nüéØ Trainer creado",
    f"Arquitectura: {trainer.architecture}",
    f"Agente CTRL: {type(trainer.agent_ctrl).__name__}",
    sep="\n",
)

In [None]:
# TRAIN: run the trainer's training loop; the two prints only bracket the
# run with start / finish status messages.
print("\nüöÄ Iniciando entrenamiento...\n")

trainer.train()

print("\n‚úÖ Entrenamiento completado")

## 5. Análisis de Resultados

In [None]:
# Numeric summary of the finished training run
print_training_summary(trainer)

In [None]:
# Quick-look plot built from the per-episode rewards and episode lengths
# stored on the trainer
plot_quick_summary(
    trainer.episode_rewards,
    trainer.episode_lengths
)

## 6. Gráficos Detallados

In [None]:
# Create the plotter (constructed with save_dir='plots')
plotter = TrainingPlotter(save_dir='plots')

# Training metrics: rewards and episode lengths come from the trainer; losses
# and epsilons are passed as None because they are not recorded yet.
plotter.plot_training_metrics(
    episode_rewards=trainer.episode_rewards,
    episode_lengths=trainer.episode_lengths,
    ctrl_losses=None,  # TODO: store losses during training
    ctrl_epsilons=None,  # TODO: store epsilons during training
    window=20
)

## 7. Evaluar Agente Entrenado

In [None]:
# Evaluar en un episodio
def evaluate_episode(env, agent, setpoint=5.0, render=False, max_steps=200):
    """
    Run one exploration-free episode and record its trajectory.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        agent: Object exposing ``select_action(state, training=False)``.
        setpoint: Accepted for backward compatibility but NOT applied to the
            environment; the recorded setpoint is whatever the environment
            reports in ``state[1]``.
        render: When True, print PV / SP / error after every step.
        max_steps: Upper bound on the number of steps taken. Defaults to 200,
            the value that was previously hard-coded.

    Returns:
        dict with the keys ``'pv'``, ``'sp'``, ``'control'`` and ``'error'``,
        each a list with one entry per step taken. Entry ``k`` is read from
        the observation the agent saw *before* its k-th action, so the final
        observation is not part of the lists. ``'control'`` is a placeholder
        filled with zeros.

    Note:
        The observation layout ``state[0]=PV, state[1]=SP, state[2]=error``
        is assumed, as stated by the original inline comments.
    """
    state, _ = env.reset()

    trajectory = {
        'pv': [],
        'sp': [],
        'control': [],
        'error': []
    }

    total_reward = 0
    steps = 0
    done = False

    while not done and steps < max_steps:
        # Greedy action: training=False disables exploration.
        action = agent.select_action(state, training=False)

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Record the observation the action was chosen from.
        trajectory['pv'].append(state[0])
        trajectory['sp'].append(state[1])
        trajectory['error'].append(state[2])

        # TODO: the real controller output would have to be exposed by the
        # environment during step(); a zero placeholder is stored meanwhile.
        trajectory['control'].append(0)

        total_reward += reward
        state = next_state
        steps += 1

        if render:
            print(f"Step {steps}: PV={state[0]:.2f}, SP={state[1]:.2f}, Error={state[2]:.3f}")

    # Summarise from the final observation itself. The last trajectory entry
    # is the observation *before* the last action, so reading it here would
    # report a value one step stale (and fail on an empty trajectory).
    print(f"\nüìä Episodio completado:")
    print(f"  Total reward: {total_reward:.2f}")
    print(f"  Steps: {steps}")
    print(f"  Final PV: {state[0]:.2f}")
    print(f"  Final Error: {abs(state[2]):.3f}")

    return trajectory

# Run the evaluation on the trainer's own environment and control agent
eval_trajectory = evaluate_episode(
    trainer.env,
    trainer.agent_ctrl,
    setpoint=5.0,
    render=False
)

In [None]:
# Plot the trajectory recorded during the evaluation episode
plotter.plot_episode_trajectory(
    trajectory=eval_trajectory,
    setpoint=5.0,
    title="Agente Entrenado - Control de Nivel"
)

## 8. Comparar: Agente vs Random

In [None]:
# Evaluar agente random
def random_policy_episode(env, setpoint=5.0, max_steps=200):
    """
    Run one episode with randomly sampled actions, as a comparison baseline.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)``, whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)`` and which
            exposes ``action_space.sample()``.
        setpoint: Accepted for backward compatibility but NOT applied to the
            environment; the recorded setpoint comes from ``state[1]``.
        max_steps: Upper bound on the number of steps taken. Defaults to 200,
            the value that was previously hard-coded.

    Returns:
        dict with the keys ``'pv'``, ``'sp'``, ``'error'`` and ``'control'``,
        each a list with one entry per step taken. Entry ``k`` is read from
        the observation seen *before* the k-th action. ``'control'`` is a
        placeholder filled with zeros.
    """
    state, _ = env.reset()

    trajectory = {
        'pv': [],
        'sp': [],
        'error': [],
        'control': []
    }

    total_reward = 0
    steps = 0
    done = False

    while not done and steps < max_steps:
        # Sample an action from the environment's own action space.
        action = env.action_space.sample()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Record the observation the action was applied to.
        trajectory['pv'].append(state[0])
        trajectory['sp'].append(state[1])
        trajectory['error'].append(state[2])
        trajectory['control'].append(0)  # Placeholder: control output is not exposed

        total_reward += reward
        state = next_state
        steps += 1

    print(f"Random policy - Reward: {total_reward:.2f}, Steps: {steps}")
    return trajectory

# Roll out one baseline episode with randomly sampled actions.
random_trajectory = random_policy_episode(trainer.env)

# Visual comparison on two stacked panels sharing the x-axis: the trained
# agent in the first panel, the random baseline in the second.
fig, axes = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

panels = (
    (eval_trajectory, 'Agente DQN', 'blue', 'Agente Entrenado'),
    (random_trajectory, 'Random Policy', 'orange', 'Pol√≠tica Aleatoria'),
)
for ax, (traj, label, color, title) in zip(axes, panels):
    ax.plot(traj['pv'], label=label, color=color, linewidth=2)
    ax.axhline(y=5.0, color='red', linestyle='--', label='Setpoint')
    ax.set_ylabel('Height [m]')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

# Only the second panel carries the x-axis label.
axes[1].set_xlabel('Step')

plt.tight_layout()
plt.show()

print("\n‚úÖ La diferencia es clara: el agente aprendi√≥ a controlar el tanque!")

## 9. Guardar Modelo (Opcional)

In [None]:
# Descomentar para guardar
# save_path = 'models/tank_dqn_final.pt'
# Path(save_path).parent.mkdir(exist_ok=True, parents=True)
# trainer.agent_ctrl.save(save_path)
# print(f"✅ Modelo guardado en: {save_path}")

## 10. Cargar Modelo (Opcional)

In [None]:
# Descomentar para cargar modelo guardado
# from agents.algorithm_DQN import DQNAgent
#
# loaded_agent = DQNAgent(
#     state_dim=5,
#     action_dim=7,
#     agent_role='ctrl',
#     device='cpu'
# )
# loaded_agent.load('models/tank_dqn_final.pt')
# print("✅ Modelo cargado")