# Stable-Baselines3: Plotting Rewards per Episode

In [3]:
import os
import gymnasium as gym
import numpy as np
import torch
from stable_baselines3 import DDPG
from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

# Custom callback to log reward per episode
class EpisodeRewardLogger(BaseCallback):
    """Callback that appends one ``episode,reward`` CSV row per finished rollout.

    The reward is read from the most recent entry of ``self.model.ep_info_buffer``
    (key ``"r"``), so the environment is expected to be wrapped so that this
    buffer gets populated (the surrounding script uses ``Monitor`` for this).

    Args:
        log_file_path: Destination CSV path. Its parent directory is created
            if it does not exist; a bare file name (no directory part) is
            written to the current working directory.
        verbose: When greater than 0, each logged episode is also printed.
    """

    def __init__(self, log_file_path, verbose=0):
        super().__init__(verbose)
        # os.path.dirname() returns "" for a bare file name, and
        # os.makedirs("") raises FileNotFoundError, so only create the
        # directory when there actually is one.
        log_dir = os.path.dirname(log_file_path)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        self.log_file = open(log_file_path, "w")
        # CSV header: episode,reward
        self.log_file.write("episode,reward\n")
        self.episode_num = 0

    def _on_step(self) -> bool:
        """No per-step work; returning True lets training continue."""
        return True

    def _on_rollout_end(self) -> None:
        """Log the reward of the latest entry in ``ep_info_buffer``.

        Called at the end of a rollout (here, an episode because the script
        uses ``train_freq=(1, "episode")``). Does nothing when the buffer is
        empty or when the log file has already been closed.
        """
        if self.log_file is None:
            # Already closed by _on_training_end(); nothing to write to.
            return
        if len(self.model.ep_info_buffer) > 0:
            ep_info = self.model.ep_info_buffer[-1]
            reward = ep_info.get("r", 0)
            self.episode_num += 1
            self.log_file.write(f"{self.episode_num},{reward}\n")
            # Flush every row so the CSV stays usable if training is interrupted.
            self.log_file.flush()
            if self.verbose > 0:
                print(f"Episode {self.episode_num}: Reward = {reward}")

    def _on_training_end(self) -> None:
        """Close the CSV file; safe to call more than once."""
        if self.log_file is not None:
            self.log_file.close()
            # Drop the handle so repeated calls and late rollouts are no-ops.
            self.log_file = None

# -------------------------
# Environment and Model Setup
# -------------------------
# Create a function that builds the environment.
# We pass render_mode=None to avoid opening a GUI window.
def make_env():
    """Return a HalfCheetah-v5 environment (no rendering) wrapped in ``Monitor``."""
    return Monitor(gym.make("HalfCheetah-v5", render_mode=None))

# Wrap the environment in DummyVecEnv to use vectorized environments.
env = DummyVecEnv([make_env])

# Gaussian exploration noise with one independent component per action dimension.
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Use CUDA if available
device = "cuda" if torch.cuda.is_available() else "cpu"

model = DDPG(
    "MlpPolicy",
    env,
    action_noise=action_noise,
    learning_rate=1e-3,
    buffer_size=700000,
    learning_starts=10000,
    batch_size=256,
    tau=0.005,
    gamma=0.99,
    train_freq=(1, "episode"),  # Training per episode
    gradient_steps=-1,
    verbose=1,
    device=device
)

# Directory for the periodic model checkpoints.
os.makedirs("./DDPGlogs/", exist_ok=True)

# Initialize the episode reward logger callback
episode_logger = EpisodeRewardLogger(log_file_path="./logs/ddpg_episode_log.csv", verbose=1)

# Create a callback to periodically save the model (every 100,000 timesteps)
checkpoint_callback = CheckpointCallback(
    save_freq=100000,
    save_path='./DDPGlogs/',
    name_prefix='ddpg_halfcheetah'
)

# Combine callbacks
callbacks = [checkpoint_callback, episode_logger]

# -------------------------
# Training
# -------------------------
try:
    model.learn(total_timesteps=700000, callback=callbacks)

    # Save the final model after training
    model.save("ddpg_halfcheetah_final")
finally:
    # Close the reward log even when training is interrupted (e.g. by a
    # KeyboardInterrupt from stopping the cell); otherwise the file handle
    # would stay open for the lifetime of the kernel.
    episode_logger._on_training_end()


Using cuda device
Episode 1: Reward = -157.582862
Episode 2: Reward = -182.907936
Episode 3: Reward = -275.733653
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -192     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 2760     |
|    time_elapsed    | 1        |
|    total_timesteps | 4000     |
---------------------------------
Episode 4: Reward = -149.852464
Episode 5: Reward = -195.678011
Episode 6: Reward = -415.289285
Episode 7: Reward = -274.705571
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -246     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 3058     |
|    time_elapsed    | 2        |
|    total_timesteps | 8000     |
---------------------------------
Episode 8: Reward = -317.924181
Episode 9: Reward = -346.319564
Episode 10: Re

KeyboardInterrupt: 

In [1]:
!pip install "stable_baselines3"
!pip install "gymnasium[mujoco]"

Collecting stable_baselines3
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (