In [1]:
import json
import time
import gymnasium as gym
from pathlib import Path
from stable_baselines3.common.evaluation import evaluate_policy

from rl_lander_dpg_td3 import (
    train_model,
    rollout_and_bias,
    ENV_ID,
    EVAL_EPISODES,
)

ALGOS = ["DDPG", "TD3"]
SEEDS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # 10 seeds
TIMESTEPS = 10_000  # use less for speed while iterating

In [2]:
stamp = int(time.time())
root = Path(f"runs/study_{stamp}")
root.mkdir(parents=True, exist_ok=True)

summary = []
for alg in ALGOS:
    for i, s in enumerate(SEEDS):
        print(f"\n=== {alg} | seed={s} ===")
        logdir = root / f"{alg}_seed{s}"
        logdir.mkdir(exist_ok=True, parents=True)

        # train (reusing your function; set global SEED-like behavior)
        model, path = train_model(alg, str(logdir), TIMESTEPS)

        # eval
        mean, std = evaluate_policy(
            model,
            gym.make(ENV_ID),
            n_eval_episodes=EVAL_EPISODES,
            deterministic=True,
        )
        # bias (held-out rollouts)
        bias = rollout_and_bias(model, episodes=EVAL_EPISODES)

        rec = {
            "algo": alg,
            "seed": s,
            "timesteps": TIMESTEPS,
            "eval_mean": float(mean),
            "eval_std": float(std),
            "bias_metrics": bias,  # contains mean_return, std_return, and bias stats
            "model_path": path,
            "logdir": str(logdir),
        }
        summary.append(rec)

        with open(logdir / "result.json", "w") as f:
            json.dump(rec, f, indent=2)

with open(root / "summary.json", "w") as f:
    json.dump(summary, f, indent=2)

print(f"\nSaved summary to: {root/'summary.json'}")


=== DDPG | seed=0 ===
Using cpu device


  from pkg_resources import resource_stream, resource_exists


Logging to runs/study_1760693288/DDPG_seed0/DDPG_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 104      |
|    ep_rew_mean     | -472     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 261      |
|    time_elapsed    | 1        |
|    total_timesteps | 414      |
| train/             |          |
|    actor_loss      | 5.89     |
|    critic_loss     | 44.5     |
|    learning_rate   | 0.0003   |
|    n_updates       | 313      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 114      |
|    ep_rew_mean     | -348     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 236      |
|    time_elapsed    | 3        |
|    total_timesteps | 914      |
| train/             |          |
|    actor_loss      | 7.71     |
|    critic_loss     | 14.2     |
|    learning_rate   | 0.0003  




=== DDPG | seed=1 ===
Using cpu device
Logging to runs/study_1760693288/DDPG_seed1/DDPG_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 104      |
|    ep_rew_mean     | -472     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 271      |
|    time_elapsed    | 1        |
|    total_timesteps | 414      |
| train/             |          |
|    actor_loss      | 5.89     |
|    critic_loss     | 44.5     |
|    learning_rate   | 0.0003   |
|    n_updates       | 313      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 114      |
|    ep_rew_mean     | -348     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 230      |
|    time_elapsed    | 3        |
|    total_timesteps | 914      |
| train/             |          |
|    actor_loss      | 7.71     |
|    critic_loss     | 14