# üöÄ Entrenamiento DQN - Tanque Simple

Notebook para entrenar agente DQN en control de nivel de tanque.

**Objetivo:** Controlar el nivel de un tanque ajustando el caudal de entrada.

---

## 1. Imports y Setup

In [1]:
import numpy as np
import torch
import matplotlib.pyplot as plt
from pathlib import Path
import sys

# Agregar path del proyecto si es necesario
sys.path.append('../')

# Imports del proyecto
from Environment.Simulation_Env.tanque_simple import TankSimulator
from Environment.Simulation_Env.SimulationEnv import SimulationPIDEnv
from Environment.PIDControlEnv_simple import PIDControlEnv_Simple
from Agente.DQN.train_DQN import DQNTrainer
from Aux.Plots import SimplePlotter

print("Imports completados")
print(f"PyTorch version: {torch.__version__}")
print(f"Device disponible: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

Imports completados
PyTorch version: 2.2.2
Device disponible: CPU


In [8]:
# Helper notebook-only: asegurar que el simulador externo est√© conectado al env/proceso
def ensure_external_simulator(env_obj, sim_cfg=None, simulator_name='TankSimulator'):
    """Si env_obj.proceso.external_process es None, intenta crear y conectar un simulador por defecto."""
    try:
        if getattr(env_obj, 'proceso', None) is not None and getattr(env_obj.proceso, 'external_process', None) is None:
            if sim_cfg is None:
                sim_cfg = {}
            # Por defecto usamos TankSimulator; puedes adaptar para probar otros simuladores
            from Environment.Simulation_Env.tanque_simple import TankSimulator
            simulator = TankSimulator(**sim_cfg)
            env_obj.proceso.connect_external_process(simulator)
            print(f"{simulator_name} conectado autom√°ticamente al proceso")
    except Exception as e:
        print('No se pudo conectar simulador autom√°ticamente:', e)

## 2. Configuraci√≥n del Experimento

In [3]:
# Configuraci√≥n base
config = {
    # AMBIENTE
    'env_config': {
        'architecture': 'simple',
        'n_manipulable_vars': 1,
        'manipulable_ranges': [(0.0, 10.0)],  # Altura del tanque [m]
        'manipulable_setpoints': [5.0],  # Setpoint inicial
        'dt_usuario': 1.0,
        'max_steps': 200,
        
        'agent_controller_config': {
            'agent_type': 'discrete'  # Acciones discretas
        },
        
        # Configuraci√≥n del simulador de tanque
        'env_type_config': {
            'area': 1.0,          # √Årea del tanque [m¬≤]
            'cv': 0.1,            # Coeficiente de descarga
            'max_height': 10.0,   # Altura m√°xima [m]
            'max_flow_in': 0.5,   # Caudal m√°ximo entrada [m¬≥/s]
            'dt': 1.0
        }
    },
    
    # AGENTE CTRL
    'agent_ctrl_config': {
        'state_dim': 5,   # pv, sp, error, error_integral, error_derivative
        'action_dim': 7,  # 7 acciones discretas (Kp‚Üë, Ki‚Üë, Kd‚Üë, Kp‚Üì, Ki‚Üì, Kd‚Üì, mantener)
        'hidden_dims': (128, 64),
        'lr': 0.001,
        'gamma': 0.99,
        'epsilon_start': 1.0,
        'epsilon_min': 0.01,
        'epsilon_decay': 0.995,
        'batch_size': 32,
        'target_update_freq': 100,
        'buffer_type': 'simple',  # 'simple' o 'priority'
        'buffer_size': 10000,
        'device': 'cuda' if torch.cuda.is_available() else 'cpu',
        'seed': 42
    },
    
    # ENTRENAMIENTO
    'n_episodes': 300,
    'max_steps_per_episode': 200,
    'eval_frequency': 50,
    'save_frequency': 9999,  # No guardar peri√≥dicamente
    'log_frequency': 10,
    
    # LOGGING (comentado)
    'checkpoint_dir': 'checkpoints/tank_simple',
    # 'use_wandb': False,
}

print("Configuraci√≥n creada")
print(f"\nEpisodios de entrenamiento: {config['n_episodes']}")
print(f"Device: {config['agent_ctrl_config']['device']}")
print(f"Buffer type: {config['agent_ctrl_config']['buffer_type']}")

Configuraci√≥n creada

Episodios de entrenamiento: 300
Device: cpu
Buffer type: simple


## 3. Crear Ambiente de Prueba

Primero verificamos que el ambiente funciona correctamente.

In [10]:
# Test del ambiente (auto-conexi√≥n del simulador si hace falta)
import traceback
try:
    test_env = PIDControlEnv_Simple(config['env_config'])
    print(f"Observation space: {test_env.observation_space}")
    print(f"Action space: {test_env.action_space}")

    # Si el proceso no tiene external_process, crear y conectar TankSimulator
    try:
        from Environment.Simulation_Env.tanque_simple import TankSimulator
        if getattr(test_env, 'proceso', None) is not None and getattr(test_env.proceso, 'external_process', None) is None:
            sim_cfg = config.get('env_config', {}).get('env_type_config', {}) or {}
            simulator = TankSimulator(**sim_cfg)
            test_env.proceso.connect_external_process(simulator)
            print('TankSimulator conectado autom√°ticamente al proceso')
    except Exception as e:
        print('No se pudo conectar autom√°ticamente TankSimulator:', e)

    # Reset
    obs, info = test_env.reset()
    print(f"\nObservaci√≥n inicial: {obs}")
    print(f"Info inicial: {info}")

    # Step aleatorio
    random_action = test_env.action_space.sample()
    obs, reward, terminated, truncated, info = test_env.step(random_action)

    print(f"\nDespu√©s de acci√≥n aleatoria {random_action}:")
    print(f"  Observaci√≥n: {obs}")
    print(f"  Reward: {reward:.3f}")
    print(f"  Terminated: {terminated}")
    print(f"  Truncated: {truncated}")
    print(f"  Info: {info}")

    print("\nAmbiente funcionando correctamente")
except Exception:
    traceback.print_exc()

Observation space: Box(-inf, inf, (5,), float32)
Action space: MultiDiscrete([7])
TankSimulator conectado autom√°ticamente al proceso

Observaci√≥n inicial: [8.037754 6.567536 0.       0.       0.      ]
Info inicial: {'trajectory_manipulable': [[]], 'energy': 0.0, 'overshoot_manipulable': [0.0], 'accumulated_error_manipulable': [0.0]}

Despu√©s de acci√≥n aleatoria [4]:
  Observaci√≥n: [6.5075254  6.567536   0.06001076 0.06001076 0.06001076]
  Reward: -49.222
  Terminated: True
  Truncated: False
  Info: {'trajectory_manipulable': [[8.037754057431851, 0.0, 0.5055938200054554, 0.9312263250402749, 1.3318063233144881, 1.709438325595574, 2.0855727148033716, 2.4386185729970316, 2.778534517256791, 3.1217914698394456, 3.4333014308184464, 3.7610822284980316, 4.0715927135631755, 4.3512728117629225, 4.662265234270804, 4.939075918388991, 5.216771901531183, 5.476326516801749, 5.742420369114229, 6.012622190643273, 6.2649275323672935, 6.5075252489369415]], 'energy': 131.48097399161074, 'overshoot_m

In [5]:
# Test del ambiente
test_env = PIDControlEnv_Simple(config['env_config'])
sim_cfg = config.get('env_config', {}).get('env_type_config', {}) or {}
simulator = TankSimulator(**sim_cfg)
test_env.proceso.connect_external_process(simulator)
print('TankSimulator conectado autom√°ticamente al proceso')
print("Testing ambiente")
print(f"Observation space: {test_env.observation_space}")
print(f"Action space: {test_env.action_space}")

# Reset
obs, info = test_env.reset()
print(f"\nObservaci√≥n inicial: {obs}")
print(f"Info inicial: {info}")

# Step aleatorio
random_action = test_env.action_space.sample()
obs, reward, terminated, truncated, info = test_env.step(random_action)

print(f"\nDespu√©s de acci√≥n aleatoria {random_action}:")
print(f"  Observaci√≥n: {obs}")
print(f"  Reward: {reward:.3f}")
print(f"  Terminated: {terminated}")
print(f"  Truncated: {truncated}")
print(f"  Info: {info}")

print("\nAmbiente funcionando correctamente")

TankSimulator conectado autom√°ticamente al proceso
Testing ambiente
Observation space: Box(-inf, inf, (5,), float32)
Action space: MultiDiscrete([7])

Observaci√≥n inicial: [6.447276  6.6048307 0.        0.        0.       ]
Info inicial: {'trajectory_manipulable': [[]], 'energy': 0.0, 'overshoot_manipulable': [0.0], 'accumulated_error_manipulable': [0.0]}

Despu√©s de acci√≥n aleatoria [5]:
  Observaci√≥n: [ 6.6488743   6.6048307  -0.04404362 -0.04404362 -0.04404362]
  Reward: -29.917
  Terminated: True
  Truncated: False
  Info: {'trajectory_manipulable': [[6.447276331484442, 0.28779008268398587, 0.7463052014629881, 1.150799314652018, 1.5538642842726829, 1.9309850576021734, 2.274038422150716, 2.6239887615502884, 2.976181974155476, 3.2995279059137013, 3.624647225075792, 3.933925375775272, 4.233082888238015, 4.516136621773732, 4.815599024329186, 5.095506176988669, 5.355770600672621, 5.629082757523946, 5.900096935155979, 6.161546005588785, 6.412881969322014, 6.648874506664781]], 'energ

## 4. Crear Trainer y Comenzar Entrenamiento

In [6]:
# Crear trainer
trainer = DQNTrainer(config)

print("\n Trainer creado")
print(f"Arquitectura: {trainer.architecture}")
print(f"Agente CTRL: {type(trainer.agent_ctrl).__name__}")


 Trainer creado
Arquitectura: simple
Agente CTRL: DQNAgent


In [12]:
# ENTRENAR
print("\n Iniciando entrenamiento...\n")

# Asegurar que el simulador externo est√© conectado (solo notebook)
sim_cfg = config.get('env_config', {}).get('env_type_config', {}) or {}
ensure_external_simulator(trainer.env, sim_cfg=sim_cfg)
trainer.train()

print("\n Entrenamiento completado")


 Iniciando entrenamiento...



RuntimeError: Index tensor must have the same number of dimensions as input tensor

## 5. An√°lisis de Resultados

In [None]:
# Resumen num√©rico
print_training_summary(trainer)

In [None]:
# Gr√°fico r√°pido
plot_quick_summary(
    trainer.episode_rewards,
    trainer.episode_lengths
)

## 6. Gr√°ficos Detallados

In [None]:
# Crear plotter
plotter = TrainingPlotter(save_dir='plots')

# M√©tricas de entrenamiento
plotter.plot_training_metrics(
    episode_rewards=trainer.episode_rewards,
    episode_lengths=trainer.episode_lengths,
    ctrl_losses=None,  # TODO: guardar losses durante training
    ctrl_epsilons=None,  # TODO: guardar epsilons durante training
    window=20
)

## 7. Evaluar Agente Entrenado

In [11]:
# Evaluar en un episodio
def evaluate_episode(env, agent, setpoint=5.0, render=False):
    """
    Evaluar agente en un episodio completo.
    
    Returns:
        trajectory: Dict con trayectorias de pv, sp, control, error
    """
    state, info = env.reset()
    
    trajectory = {
        'pv': [],
        'sp': [],
        'control': [],
        'error': []
    }
    
    done = False
    total_reward = 0
    steps = 0
    
    while not done and steps < 200:
        # Seleccionar acci√≥n (SIN exploraci√≥n)
        action = agent.select_action(state, training=False)
        
        # Step
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        
        # Guardar trayectoria
        trajectory['pv'].append(state[0])  # PV est√° en state[0]
        trajectory['sp'].append(state[1])  # SP est√° en state[1]
        trajectory['error'].append(state[2])  # Error est√° en state[2]
        
        # Control output (aproximado desde el ambiente)
        # TODO: Necesitar√≠as guardarlo durante el step
        trajectory['control'].append(0)  # Placeholder
        
        total_reward += reward
        state = next_state
        steps += 1
        
        if render:
            print(f"Step {steps}: PV={state[0]:.2f}, SP={state[1]:.2f}, Error={state[2]:.3f}")
    
    print(f"\nüìä Episodio completado:")
    print(f"  Total reward: {total_reward:.2f}")
    print(f"  Steps: {steps}")
    print(f"  Final PV: {trajectory['pv'][-1]:.2f}")
    print(f"  Final Error: {abs(trajectory['error'][-1]):.3f}")
    
    return trajectory

# Ejecutar evaluaci√≥n
eval_trajectory = evaluate_episode(
    trainer.env,
    trainer.agent_ctrl,
    setpoint=5.0,
    render=False
)


üìä Episodio completado:
  Total reward: -240.32
  Steps: 1
  Final PV: 8.52
  Final Error: 0.000


In [None]:
# Graficar trayectoria
plotter.plot_episode_trajectory(
    trajectory=eval_trajectory,
    setpoint=5.0,
    title="Agente Entrenado - Control de Nivel"
)

## 8. Comparar: Agente vs Random

In [None]:
# Evaluar agente random
def random_policy_episode(env, setpoint=5.0):
    """Pol√≠tica aleatoria para comparaci√≥n."""
    state, info = env.reset()
    
    trajectory = {
        'pv': [],
        'sp': [],
        'error': [],
        'control': []
    }
    
    done = False
    total_reward = 0
    steps = 0
    
    while not done and steps < 200:
        # Acci√≥n aleatoria
        action = env.action_space.sample()
        
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        
        trajectory['pv'].append(state[0])
        trajectory['sp'].append(state[1])
        trajectory['error'].append(state[2])
        trajectory['control'].append(0)
        
        total_reward += reward
        state = next_state
        steps += 1
    
    print(f"Random policy - Reward: {total_reward:.2f}, Steps: {steps}")
    return trajectory

random_trajectory = random_policy_episode(trainer.env)

# Comparar visualmente
fig, axes = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

# Agente entrenado
axes[0].plot(eval_trajectory['pv'], label='Agente DQN', color='blue', linewidth=2)
axes[0].axhline(y=5.0, color='red', linestyle='--', label='Setpoint')
axes[0].set_ylabel('Height [m]')
axes[0].set_title('Agente Entrenado')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Agente random
axes[1].plot(random_trajectory['pv'], label='Random Policy', color='orange', linewidth=2)
axes[1].axhline(y=5.0, color='red', linestyle='--', label='Setpoint')
axes[1].set_xlabel('Step')
axes[1].set_ylabel('Height [m]')
axes[1].set_title('Pol√≠tica Aleatoria')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n‚úÖ La diferencia es clara: el agente aprendi√≥ a controlar el tanque!")

## 9. Guardar Modelo (Opcional)

In [None]:
# Descomentar para guardar
# save_path = 'models/tank_dqn_final.pt'
# Path(save_path).parent.mkdir(exist_ok=True, parents=True)
# trainer.agent_ctrl.save(save_path)
# print(f"‚úÖ Modelo guardado en: {save_path}")

## 10. Cargar Modelo (Opcional)

In [None]:
# Descomentar para cargar modelo guardado
# from agents.algorithm_DQN import DQNAgent
#
# loaded_agent = DQNAgent(
#     state_dim=5,
#     action_dim=7,
#     agent_role='ctrl',
#     device='cpu'
# )
# loaded_agent.load('models/tank_dqn_final.pt')
# print("‚úÖ Modelo cargado")

# Notebook-only helper: asegurar shapes correctas para acciones en batches
# Coloca esta celda justo antes de iniciar el entrenamiento (antes de trainer.train())
def normalize_batch_shapes_for_dqn(batch):
    """Ajusta shapes comunes devueltas por los buffers para que sean compatibles con gather.

    - actions: LongTensor con shape [batch]
    - rewards, dones: FloatTensor/BoolTensor con shape [batch]
    - states/next_states: FloatTensor con shape [batch, state_dim]
    """
    if 'actions' in batch:
        actions = batch['actions']
        # Si actions viene con shape [batch, n_vars] y n_vars>1, esto indica acciones m√∫ltiples por variable.
        # DQN actual espera un action por muestra (discrete scalar). Convertimos a shape [batch] si es necesario.
        if actions.dim() > 1:
            # Si la segunda dim es 1, squeeze
            if actions.shape[1] == 1:
                actions = actions.squeeze(1)
            else:
                # Si hay m√∫ltiples acciones por muestra (p.ej. por variable), colapsar tomando la primera
                # (usuario puede adaptar: por ejemplo, convertir a una acci√≥n compuesta)
                actions = actions[:, 0]
        # Asegurar tipo LongTensor si corresponde
        if not torch.is_tensor(actions):
            actions = torch.tensor(actions, dtype=torch.long, device=batch['states'].device)
        elif not actions.dtype in (torch.int64,):
            actions = actions.long()

        batch['actions'] = actions

    # Rewards
    if 'rewards' in batch and not torch.is_tensor(batch['rewards']):
        batch['rewards'] = torch.tensor(batch['rewards'], dtype=torch.float32, device=batch['states'].device)

    # Dones
    if 'dones' in batch and not torch.is_tensor(batch['dones']):
        batch['dones'] = torch.tensor(batch['dones'], dtype=torch.bool, device=batch['states'].device)

    return batch


# Uso sugerido:
# batch = trainer.agent_ctrl.memory.sample(batch_size)
# batch = normalize_batch_shapes_for_dqn(batch)
# Luego pasar batch['states'], batch['actions'], ... a update() si se quisiera llamar manualmente.

# Notebook-only: monkey-patch para normalizar batches devueltos por memory.sample
# Ejecuta esta celda antes de llamar a trainer.train()
import types


def _normalize_batch(batch):
    # Reusar la funci√≥n ya definida en la notebook si existe
    try:
        normalized = normalize_batch_shapes_for_dqn(batch)
    except NameError:
        # Fallback local
        import torch
        actions = batch.get('actions', None)
        if actions is not None:
            if hasattr(actions, 'dim') and actions.dim() > 1:
                if actions.shape[1] == 1:
                    actions = actions.squeeze(1)
                else:
                    actions = actions[:, 0]
            if not torch.is_tensor(actions):
                actions = torch.tensor(actions, dtype=torch.long, device=batch['states'].device)
            elif not actions.dtype in (torch.int64,):
                actions = actions.long()
            batch['actions'] = actions
        if 'rewards' in batch and not torch.is_tensor(batch['rewards']):
            batch['rewards'] = torch.tensor(batch['rewards'], dtype=torch.float32, device=batch['states'].device)
        if 'dones' in batch and not torch.is_tensor(batch['dones']):
            batch['dones'] = torch.tensor(batch['dones'], dtype=torch.bool, device=batch['states'].device)
        normalized = batch
    return normalized


# Wrap sample method of a buffer instance
def patch_buffer_sample(buffer):
    if hasattr(buffer, '_sample_patched') and buffer._sample_patched:
        return
    original_sample = buffer.sample

    def wrapped_sample(batch_size):
        batch = original_sample(batch_size)
        return _normalize_batch(batch)

    buffer.sample = types.MethodType(wrapped_sample, buffer)
    buffer._sample_patched = True
    print(f'Patched sample() on buffer: {type(buffer).__name__}')


# Aplicar patch al buffer del trainer
try:
    patch_buffer_sample(trainer.agent_ctrl.memory)
    if getattr(trainer, 'agent_orch', None) is not None:
        patch_buffer_sample(trainer.agent_orch.memory)
except Exception as e:
    print('No se pudo aplicar monkey-patch en buffers:', e)