# W&B Sweep — DQN
Búsqueda de hiperparámetros para el agente DQN en el ambiente Simple (CSTR).
- Método: Random Search
- Proyecto W&B: `Tesis_DQN`
- Arquitectura: Simple (CTRL únicamente)

## 1. Instalación e Imports

In [1]:
import os
import random
import numpy as np
import torch
import wandb
import sys

# Agregar path del proyecto
sys.path.append('../../')

from Environment.Simulation_Env.Reactor_CSTR import CSTRSimulator
from Environment.PIDControlEnv_simple import PIDControlEnv_Simple
from Agente.DQN.train_DQN import DQNTrainer
from Aux.Plots import SimplePlotter, print_summary

print('Imports completados')
print(f'PyTorch: {torch.__version__}')
print(f'Device: {"CUDA" if torch.cuda.is_available() else "CPU"}')

Imports completados
PyTorch: 2.10.0+cu128
Device: CUDA


## 2. Login W&B

In [2]:
!pip install wandb --quiet

In [3]:
wandb.login()

[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mve326684[0m ([33mve326684-universidad-ort-uruguay[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## 3. Configuración del Sweep

In [4]:
WANDB_TEAM    = 've326684-universidad-ort-uruguay'
WANDB_PROJECT = 'Tesis_DQN'

sweep_config = {
    'name':   'dqn_cstr_random_search',
    'method': 'random',

    'metric': {
        'name': 'eval_reward',
        'goal': 'maximize'
    },

    'parameters': {

        # ============ AMBIENTE ============
        'max_time_detector': {'values': [15, 30, 60]},
        'max_steps':         {'values': [20, 50, 100]},
        'reward_dead_band':  {'values': [0.01, 0.02, 0.05]},
        'delta_percent_ctrl':{'values': [0.1, 0.2, 0.3]},

        # Reward weights — combinaciones predefinidas
        'reward_weights_idx': {
            'values': [0, 1, 2, 3]  # índice a la lista definida en sweep_run()
        },

        # ============ CRITERIOS DE ESTABILIDAD ============
        'error_increase_tolerance': {'values': [1.2, 1.5, 2.0]},
        'max_sign_changes_ratio':   {'values': [0.1, 0.2, 0.3]},
        'max_abrupt_change_ratio':  {'values': [0.03, 0.05, 0.1]},
        'abrupt_change_threshold':  {'values': [0.2, 0.3, 0.5]},

        # ============ AGENTE DQN ============
        'hidden_dims_idx':    {'values': [0, 1, 2, 3]},  # índice a lista en sweep_run()
        'lr':                 {'values': [0.0001, 0.001, 0.01]},
        'epsilon_decay':      {'values': [0.99, 0.995, 0.999]},
        'target_update_freq': {'values': [50, 100, 200]},
        'batch_size':         {'values': [32, 64, 128]},
        'buffer_type':        {'values': ['simple', 'priority']},
        'buffer_size':        {'values': [5000, 10000, 50000]},
    }
}

sweep_id = wandb.sweep(sweep_config, project=WANDB_PROJECT, entity=WANDB_TEAM)
print(f'Sweep creado: {sweep_id}')

Create sweep with ID: m3p9a8rm
Sweep URL: https://wandb.ai/ve326684-universidad-ort-uruguay/Tesis_DQN/sweeps/m3p9a8rm
Sweep creado: m3p9a8rm


## 4. Función de Entrenamiento

In [5]:
# ============ LISTAS DE OPCIONES PREDEFINIDAS ============

REWARD_WEIGHTS_OPTIONS = [
    {'error': 1.0, 'tiempo': 0.3, 'overshoot': 0.2, 'energy': 0.1},   # 0: balanceado (default)
    {'error': 2.0, 'tiempo': 0.1, 'overshoot': 0.5, 'energy': 0.1},   # 1: foco en error y overshoot
    {'error': 1.0, 'tiempo': 0.5, 'overshoot': 0.1, 'energy': 0.5},   # 2: foco en tiempo y energía
    {'error': 3.0, 'tiempo': 0.1, 'overshoot': 0.1, 'energy': 0.05},  # 3: solo error importa
]

HIDDEN_DIMS_OPTIONS = [
    (64, 32),
    (128, 64),
    (128, 128, 64),
    (256, 128, 64),
]

# ============ FIJOS PARA TODOS LOS RUNS ============
SEED             = 42
N_EPISODES       = 1000
EVAL_FREQUENCY   = 50
EARLY_STOPPING_PATIENCE   = 10
EARLY_STOPPING_MIN_DELTA_PCT = 0.01
N_MANIPULABLE_VARS = 2
MANIPULABLE_RANGES = [(300, 420), (99.5, 104)]
VAR_NAMES          = ['T (K)', 'V (m³)']
DT                 = 1.0
DEVICE             = 'cuda' if torch.cuda.is_available() else 'cpu'


def sweep_run():
    # -------- Inicializar run --------
    wandb.init()
    cfg = wandb.config

    # -------- Reproducibilidad --------
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark     = False
    wandb.config.update({'seed': SEED}, allow_val_change=True)

    # -------- Resolver índices --------
    reward_weights = REWARD_WEIGHTS_OPTIONS[cfg.reward_weights_idx]
    hidden_dims    = HIDDEN_DIMS_OPTIONS[cfg.hidden_dims_idx]

    # Loggear valores reales (no índices) para legibilidad en W&B
    wandb.config.update({
        'reward_weights': str(reward_weights),
        'hidden_dims':    str(hidden_dims),
    }, allow_val_change=True)

    # -------- Configurar CSTR --------
    cstr = CSTRSimulator(
        dt=DT,
        control_limits=(MANIPULABLE_RANGES[0], MANIPULABLE_RANGES[1])
    )

    # -------- Construir config del trainer --------
    trainer_config = {
        # === AMBIENTE ===
        'env_config': {
            'architecture':    'simple',
            'env_type':        'simulation',
            'n_manipulable_vars': N_MANIPULABLE_VARS,
            'manipulable_ranges': MANIPULABLE_RANGES,
            'manipulable_setpoints': None,  # random en cada episodio
            'dt_usuario':      DT,
            'max_steps':       cfg.max_steps,
            'max_time_detector': cfg.max_time_detector,
            'reward_dead_band':  cfg.reward_dead_band,
            'delta_percent_ctrl': cfg.delta_percent_ctrl,
            'reward_weights':  reward_weights,
            'pid_limits': [
                (0.01, 50.0),
                (0.001, 1.0),
                (0.0,   1.0)
            ],
            'agent_controller_config': {'agent_type': 'discrete'},
            'env_type_config': {
                'dt': DT,
                'control_limits': (MANIPULABLE_RANGES[0], MANIPULABLE_RANGES[1])
            },
            # Criterios de estabilidad
            'stability_config': {
                'error_increase_tolerance': cfg.error_increase_tolerance,
                'max_sign_changes_ratio':   cfg.max_sign_changes_ratio,
                'max_abrupt_change_ratio':  cfg.max_abrupt_change_ratio,
                'abrupt_change_threshold':  cfg.abrupt_change_threshold,
            },
        },

        # === AGENTE DQN ===
        'agent_ctrl_config': {
            'state_dim':          N_MANIPULABLE_VARS * 5,  # 5 features por variable
            'action_dim':         7,
            'n_vars':             N_MANIPULABLE_VARS,
            'hidden_dims':        hidden_dims,
            'lr':                 cfg.lr,
            'gamma':              0.99,
            'epsilon_start':      1.0,
            'epsilon_min':        0.01,
            'epsilon_decay':      cfg.epsilon_decay,
            'batch_size':         cfg.batch_size,
            'target_update_freq': cfg.target_update_freq,
            'buffer_type':        cfg.buffer_type,
            'buffer_size':        cfg.buffer_size,
            'device':             DEVICE,
            'seed':               SEED,
        },

        # === ENTRENAMIENTO ===
        'n_episodes':          N_EPISODES,
        'eval_frequency':      EVAL_FREQUENCY,
        'save_frequency':      9999,  # no guardar checkpoints en sweep
        'log_frequency':       50,
        'checkpoint_dir':      f'checkpoints/dqn_{wandb.run.name}',
        'early_stopping_patience':      EARLY_STOPPING_PATIENCE,
        'early_stopping_min_delta_pct': EARLY_STOPPING_MIN_DELTA_PCT,
        'use_wandb': True,
    }

    # -------- Conectar CSTR al ambiente --------
    # Se hace después de crear el trainer para acceder al proceso
    trainer = DQNTrainer(trainer_config)
    trainer.env.proceso.connect_external_process(cstr)

    # -------- Entrenar --------
    trainer.train()

    # -------- Métricas finales del run --------
    wandb.log({
        'final_eval_reward':  trainer.best_reward,
        'total_episodes':     len(trainer.episode_rewards),
        'final_epsilon':      trainer.epsilons[-1] if trainer.epsilons else 0,
        'final_reward_mean10': np.mean(trainer.episode_rewards[-10:]),
        'final_energy_mean10': np.mean(trainer.episode_energies[-10:]),
        'final_overshoot_mean10': np.mean(trainer.episode_max_overshoots[-10:]),
    })

    print(f'Run completado: {wandb.run.name}')
    wandb.finish()


## 5. Lanzar Sweep

In [None]:
wandb.agent(sweep_id, function=sweep_run, count=30)

[34m[1mwandb[0m: Agent Starting Run: 563ca4uo with config:
[34m[1mwandb[0m: 	abrupt_change_threshold: 0.3
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 5000
[34m[1mwandb[0m: 	buffer_type: priority
[34m[1mwandb[0m: 	delta_percent_ctrl: 0.2
[34m[1mwandb[0m: 	epsilon_decay: 0.999
[34m[1mwandb[0m: 	error_increase_tolerance: 1.2
[34m[1mwandb[0m: 	hidden_dims_idx: 2
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	max_abrupt_change_ratio: 0.1
[34m[1mwandb[0m: 	max_sign_changes_ratio: 0.1
[34m[1mwandb[0m: 	max_steps: 50
[34m[1mwandb[0m: 	max_time_detector: 30
[34m[1mwandb[0m: 	reward_dead_band: 0.01
[34m[1mwandb[0m: 	reward_weights_idx: 2
[34m[1mwandb[0m: 	target_update_freq: 200
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


  next_states = torch.FloatTensor([e.next_state for e in batch]).to(self.device)



Episodio 0/1000
  Reward: -36.01
  Length: 50
  CTRL Loss: 147084.3216
  CTRL Epsilon: 0.9812

Episodio 50/1000
  Reward: -1.50
  Length: 50
  CTRL Loss: 6741.7268
  CTRL Epsilon: 0.0804
Evaluación: Reward promedio = -119.53
Agente guardado en: checkpoints/dqn_exalted-sweep-1/agent_ctrl_best.pt
Checkpoint guardado: best





Episodio 100/1000
  Reward: -15.32
  Length: 50
  CTRL Loss: 21535.7710
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -35.77
Agente guardado en: checkpoints/dqn_exalted-sweep-1/agent_ctrl_best.pt
Checkpoint guardado: best





Episodio 150/1000
  Reward: -52.85
  Length: 50
  CTRL Loss: 37824.6468
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -53.00
  Sin mejora: 1/10





Episodio 200/1000
  Reward: -142.00
  Length: 50
  CTRL Loss: 39262.9959
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -156.13
  Sin mejora: 2/10





Episodio 250/1000
  Reward: -77.12
  Length: 50
  CTRL Loss: 84200.4486
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -112.71
  Sin mejora: 3/10





Episodio 300/1000
  Reward: -187.21
  Length: 50
  CTRL Loss: 46576.9602
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -150.88
  Sin mejora: 4/10





Episodio 350/1000
  Reward: -136.47
  Length: 50
  CTRL Loss: 65459.1659
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -136.29
  Sin mejora: 5/10





Episodio 400/1000
  Reward: -2.50
  Length: 50
  CTRL Loss: 156367.0036
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -136.01
  Sin mejora: 6/10





Episodio 450/1000
  Reward: -67.42
  Length: 50
  CTRL Loss: 182735.9682
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -114.81
  Sin mejora: 7/10





Episodio 500/1000
  Reward: -79.37
  Length: 50
  CTRL Loss: 141367.1048
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -73.36
  Sin mejora: 8/10





Episodio 550/1000
  Reward: -71.61
  Length: 50
  CTRL Loss: 503483.8845
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -161.28
  Sin mejora: 9/10





Episodio 600/1000
  Reward: -37.76
  Length: 50
  CTRL Loss: 446648.9527
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -41.65
  Sin mejora: 10/10
Early stopping en episodio 600


0,1
energy,▁▁▁▁▁▃█▇▁▁████████████████▁▁▆██▁███▁▁▁▁▁
epsilon,█▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval_reward,▃█▇▁▄▂▂▂▄▆▁█
final_energy_mean10,▁
final_epsilon,▁
final_eval_reward,▁
final_overshoot_mean10,▁
final_reward_mean10,▁
kd_var0,▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
kd_var1,▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
energy,15343271.56193
epsilon,0.01
eval_reward,-41.64907
final_energy_mean10,15322114.15934
final_epsilon,0.01
final_eval_reward,-35.76563
final_overshoot_mean10,2.48983
final_reward_mean10,-22.24951
kd_var0,0.0
kd_var1,0.0


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/wandb/agents/pyagent.py", line 304, in _run_job
    self._function()
  File "/tmp/ipython-input-8490/3235770438.py", line 144, in sweep_run
    print(f'Run completado: {wandb.run.name}')
                             ^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'name'

[34m[1mwandb[0m: [32m[41mERROR[0m Run 563ca4uo errored: 'NoneType' object has no attribute 'name'
[34m[1mwandb[0m: Agent Starting Run: fw299qso with config:
[34m[1mwandb[0m: 	abrupt_change_threshold: 0.5
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 50000
[34m[1mwandb[0m: 	buffer_type: priority
[34m[1mwandb[0m: 	delta_percent_ctrl: 0.2
[34m[1mwandb[0m: 	epsilon_decay: 0.99
[34m[1mwandb[0m: 	error_increase_tolerance: 1.2
[34m[1mwandb[0m: 	hidden_dims_idx: 0
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	max_abrupt_change_ratio: 0.05
[34m[1mwandb[0m: 	max_sign_chan


Episodio 0/1000
  Reward: -88.13
  Length: 100
  CTRL Loss: 64329.5985
  CTRL Epsilon: 0.6894

Episodio 50/1000
  Reward: -2.82
  Length: 100
  CTRL Loss: 304127.7835
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -41.08
Agente guardado en: checkpoints/dqn_denim-sweep-2/agent_ctrl_best.pt
Checkpoint guardado: best

Episodio 100/1000
  Reward: -221.74
  Length: 100
  CTRL Loss: 4616912.0956
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -293.10
  Sin mejora: 1/10

Episodio 150/1000
  Reward: -374.16
  Length: 100
  CTRL Loss: 28516929.8350
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -324.63
  Sin mejora: 2/10

Episodio 200/1000
  Reward: -169.05
  Length: 100
  CTRL Loss: 180774340.2000
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -147.23
  Sin mejora: 3/10

Episodio 250/1000
  Reward: -135.21
  Length: 100
  CTRL Loss: 427356543.9600
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -131.86
  Sin mejora: 4/10

Episodio 300/1000
  Reward: -406.59
  Lengt

0,1
energy,▂▁▁██▄█▄█▁▁▂████▇▄▇▁▇█▇██▇█▁▁████▄▁▄████
epsilon,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval_reward,█▂▁▅▆▁▆▄▆▆▇
final_energy_mean10,▁
final_epsilon,▁
final_eval_reward,▁
final_overshoot_mean10,▁
final_reward_mean10,▁
kd_var0,█▄█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▂█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
kd_var1,▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▄▃▁▁▁▁▁▁

0,1
energy,60992866.25233
epsilon,0.01
eval_reward,-67.44721
final_energy_mean10,64735560.18882
final_epsilon,0.01
final_eval_reward,-41.08065
final_overshoot_mean10,7.64246
final_reward_mean10,-159.05013
kd_var0,0.0
kd_var1,0.0


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/wandb/agents/pyagent.py", line 304, in _run_job
    self._function()
  File "/tmp/ipython-input-8490/3235770438.py", line 144, in sweep_run
    print(f'Run completado: {wandb.run.name}')
                             ^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'name'

[34m[1mwandb[0m: [32m[41mERROR[0m Run fw299qso errored: 'NoneType' object has no attribute 'name'
[34m[1mwandb[0m: Agent Starting Run: baeyl2g2 with config:
[34m[1mwandb[0m: 	abrupt_change_threshold: 0.3
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	buffer_size: 10000
[34m[1mwandb[0m: 	buffer_type: simple
[34m[1mwandb[0m: 	delta_percent_ctrl: 0.3
[34m[1mwandb[0m: 	epsilon_decay: 0.999
[34m[1mwandb[0m: 	error_increase_tolerance: 1.2
[34m[1mwandb[0m: 	hidden_dims_idx: 0
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	max_abrupt_change_ratio: 0.05
[34m[1mwandb[0m: 	max_sign_chang


Episodio 0/1000
  Reward: -30.88
  Length: 20
  CTRL Loss: 0.0000
  CTRL Epsilon: 1.0000

Episodio 50/1000
  Reward: -4.00
  Length: 20
  CTRL Loss: 937.0982
  CTRL Epsilon: 0.3718
Evaluación: Reward promedio = -6.85
Agente guardado en: checkpoints/dqn_whole-sweep-3/agent_ctrl_best.pt
Checkpoint guardado: best

Episodio 100/1000
  Reward: -16.36
  Length: 20
  CTRL Loss: 14894.3767
  CTRL Epsilon: 0.1367
Evaluación: Reward promedio = -17.33
  Sin mejora: 1/10

Episodio 150/1000
  Reward: -4.55
  Length: 20
  CTRL Loss: 21182.6911
  CTRL Epsilon: 0.0503
Evaluación: Reward promedio = -8.58
  Sin mejora: 2/10

Episodio 200/1000
  Reward: -11.10
  Length: 20
  CTRL Loss: 94371.7623
  CTRL Epsilon: 0.0185
Evaluación: Reward promedio = -12.59
  Sin mejora: 3/10

Episodio 250/1000
  Reward: -3.75
  Length: 20
  CTRL Loss: 198505.6420
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -10.24
  Sin mejora: 4/10

Episodio 300/1000
  Reward: -27.32
  Length: 20
  CTRL Loss: 252624.9389
  CTRL 

0,1
energy,▇▇▇▇▇▇▇▇▇█▇▇▇▇▇█▁▇▇▇▇▇█▇▇▇█▇▇▇██▇▇▇▇▇▇▇█
epsilon,█▆▄▄▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval_reward,█▅▇▆▇▁▃▆▂▇▇
final_energy_mean10,▁
final_epsilon,▁
final_eval_reward,▁
final_overshoot_mean10,▁
final_reward_mean10,▁
kd_var0,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
kd_var1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▂▁███▆█▁▁▁

0,1
energy,1293442.71691
epsilon,0.01
eval_reward,-10.53435
final_energy_mean10,1385539.28348
final_epsilon,0.01
final_eval_reward,-6.8479
final_overshoot_mean10,8.02044
final_reward_mean10,-21.54996
kd_var0,0.0
kd_var1,6e-05


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/wandb/agents/pyagent.py", line 304, in _run_job
    self._function()
  File "/tmp/ipython-input-8490/3235770438.py", line 144, in sweep_run
    print(f'Run completado: {wandb.run.name}')
                             ^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'name'

[34m[1mwandb[0m: [32m[41mERROR[0m Run baeyl2g2 errored: 'NoneType' object has no attribute 'name'
[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: tn8pb4ae with config:
[34m[1mwandb[0m: 	abrupt_change_threshold: 0.2
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	buffer_size: 5000
[34m[1mwandb[0m: 	buffer_type: simple
[34m[1mwandb[0m: 	delta_percent_ctrl: 0.3
[34m[1mwandb[0m: 	epsilon_decay: 0.999
[34m[1mwandb[0m: 	error_increase_tolerance: 1.2
[34m[1mwandb[0m: 	hidden_dims_idx: 2
[34m[1mwandb[0m: 	lr: 0.0001



Episodio 0/1000
  Reward: -41.48
  Length: 100
  CTRL Loss: 0.0000
  CTRL Epsilon: 1.0000

Episodio 50/1000
  Reward: -26.62
  Length: 100
  CTRL Loss: 3046.6725
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -53.49
Agente guardado en: checkpoints/dqn_glorious-sweep-4/agent_ctrl_best.pt
Checkpoint guardado: best

Episodio 100/1000
  Reward: -223.26
  Length: 100
  CTRL Loss: 59075.3368
  CTRL Epsilon: 0.0100
Evaluación: Reward promedio = -236.27
  Sin mejora: 1/10


## 6. Visualización local del mejor run
Ejecutar DESPUÉS de que termine el sweep.

In [None]:
from Aux.Plots import SimplePlotter, print_summary

# Recuperar el mejor run del sweep
api = wandb.Api()
sweep = api.sweep(f'{WANDB_TEAM}/{WANDB_PROJECT}/{sweep_id}')
best_run = sorted(sweep.runs, key=lambda r: r.summary.get('eval_reward', -float('inf')), reverse=True)[0]

print(f'Mejor run: {best_run.name}')
print(f'eval_reward: {best_run.summary.get("eval_reward"):.4f}')
print(f'Hiperparámetros:')
for k, v in best_run.config.items():
    print(f'  {k}: {v}')