In [2]:
from rl_lander_dpg_td3 import (
    train_model,
    rollout_and_bias,
    ENV_ID,
    EVAL_EPISODES,
    TIMESTEPS,
)

import os
import time

import gymnasium as gym
from stable_baselines3.common.evaluation import evaluate_policy

In [3]:
# Train DDPG and TD3 on ENV_ID, evaluate both with deterministic rollouts, and
# print a summary of critic over-estimation bias. Each run logs to a fresh
# timestamped directory, so re-running the cell never overwrites earlier logs.


def _evaluate_and_close(model):
    """Evaluate ``model`` on a fresh ``ENV_ID`` environment and release it.

    Args:
        model: A trained model accepted by ``evaluate_policy``.

    Returns:
        The ``(mean_reward, std_reward)`` pair returned by ``evaluate_policy``
        over ``EVAL_EPISODES`` deterministic, non-rendered episodes.
    """
    eval_env = gym.make(ENV_ID)
    try:
        return evaluate_policy(
            model,
            eval_env,
            n_eval_episodes=EVAL_EPISODES,
            deterministic=True,
            render=False,
        )
    finally:
        # Close the environment even when evaluation fails or is interrupted,
        # so repeated runs of this cell do not accumulate open environments.
        eval_env.close()


logdir = f"runs/lunar_{int(time.time())}"
os.makedirs(logdir, exist_ok=True)

# train_model returns a pair; only the first element (the trained model) is used here.
print("Training DDPG…")
ddpg, _ = train_model("DDPG", logdir, TIMESTEPS)
print("Training TD3…")
td3, _ = train_model("TD3", logdir, TIMESTEPS)

print("\nEvaluating (mean reward over deterministic rollouts)…")
ddpg_mean, ddpg_std = _evaluate_and_close(ddpg)
td3_mean, td3_std = _evaluate_and_close(td3)

print(f"DDPG: mean={ddpg_mean:.1f} ± {ddpg_std:.1f}")
print(f"TD3 : mean={td3_mean:.1f} ± {td3_std:.1f}")

print("\nEstimating over-estimation bias on held-out rollouts…")
ddpg_bias = rollout_and_bias(ddpg, episodes=EVAL_EPISODES)
td3_bias = rollout_and_bias(td3, episodes=EVAL_EPISODES)

print("\n--- Over-estimation summary (critic estimate minus MC return-to-go) ---")
if "bias" in ddpg_bias:
    print(
        f"DDPG: bias={ddpg_bias['bias']:+.2f}, eval return={ddpg_bias['mean_return']:.1f} ± {ddpg_bias['std_return']:.1f}"
    )
else:
    print(f"DDPG (q1/q2/min): {ddpg_bias}")  # should be single 'bias' for DDPG

# TD3 is expected to report q1, q2, and min biases. Guard the lookup the same
# way as the DDPG branch so an unexpected result shape is printed instead of
# raising KeyError after the whole training run has completed.
td3_bias_keys = ("bias_q1", "bias_q2", "bias_qmin")
if all(key in td3_bias for key in td3_bias_keys):
    print(
        f"TD3 : bias_q1={td3_bias['bias_q1']:+.2f}, bias_q2={td3_bias['bias_q2']:+.2f}, bias_qmin={td3_bias['bias_qmin']:+.2f}, "
        f"eval return={td3_bias['mean_return']:.1f} ± {td3_bias['std_return']:.1f}"
    )
else:
    print(f"TD3 (unexpected keys): {td3_bias}")

print("\nDone. You can inspect TensorBoard logs with:")
print(f"tensorboard --logdir {logdir}")

Training DDPG…


  from pkg_resources import resource_stream, resource_exists


Using cpu device
Logging to runs/lunar_1760689999/DDPG_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 104      |
|    ep_rew_mean     | -472     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 243      |
|    time_elapsed    | 1        |
|    total_timesteps | 414      |
| train/             |          |
|    actor_loss      | 5.89     |
|    critic_loss     | 44.5     |
|    learning_rate   | 0.0003   |
|    n_updates       | 313      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 114      |
|    ep_rew_mean     | -348     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 211      |
|    time_elapsed    | 4        |
|    total_timesteps | 914      |
| train/             |          |
|    actor_loss      | 7.71     |
|    critic_loss     | 14.2     |
|    learning_rate   | 0.

KeyboardInterrupt: 