In [11]:
import gym
import numpy as np

from stable_baselines3 import SAC
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

In [5]:
# Create the Pendulum-v0 environment that the training cells below receive as `env`.
env = gym.make("Pendulum-v0")

In [6]:
# Train a Soft Actor-Critic agent on the shared `env` for 10k timesteps,
# emitting a log table every 4 episodes.
model = SAC(policy="MlpPolicy", env=env, verbose=1)
model.learn(total_timesteps=10_000, log_interval=4)

# Write the trained agent to "sac_pendulum", then drop the in-memory object so a
# later cell has to go through SAC.load(...) to get it back.
model.save(path="sac_pendulum")
del model

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.39e+03 |
| time/              |           |
|    episodes        | 4         |
|    fps             | 89        |
|    time_elapsed    | 8         |
|    total timesteps | 800       |
| train/             |           |
|    actor_loss      | 23        |
|    critic_loss     | 0.289     |
|    ent_coef        | 0.813     |
|    ent_coef_loss   | -0.334    |
|    learning_rate   | 0.0003    |
|    n_updates       | 699       |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.46e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 85        |
|    time_elapsed    | 18        |
|    total timesteps | 1600   

In [12]:
# Exploration noise for DDPG: zero-mean Gaussian with sigma 0.1, one entry per
# action dimension of the environment.
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.1),
)

# Train for 10k timesteps (log table every 10 episodes) and save to "ddpg_pendulum".
model = DDPG(policy="MlpPolicy", env=env, action_noise=action_noise, verbose=1)
model.learn(total_timesteps=10_000, log_interval=10)
model.save(path="ddpg_pendulum")

# Rebind the notebook-level `env` to the environment object held by the model;
# cells that run after this one see that object, not the raw gym.make() result.
env = model.get_env()

# Drop the in-memory agent so it must be reloaded from disk to be used again.
del model

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.38e+03 |
| time/              |           |
|    episodes        | 10        |
|    fps             | 156       |
|    time_elapsed    | 12        |
|    total timesteps | 2000      |
| train/             |           |
|    actor_loss      | 55.4      |
|    critic_loss     | 0.077     |
|    learning_rate   | 0.001     |
|    n_updates       | 1800      |
----------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -1.1e+03 |
| time/              |          |
|    episodes        | 20       |
|    fps             | 150      |
|    time_elapsed    | 26       |
|    total timesteps | 4000     |
| train/             |          |
|    actor_loss      | 84.8     |
|    c

In [16]:
# Exploration noise for TD3: zero-mean Gaussian with sigma 0.1, one entry per
# action dimension of the environment.
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.1),
)

# Train for 10k timesteps (log table every 10 episodes) and save to "td3_pendulum".
model = TD3(policy="MlpPolicy", env=env, action_noise=action_noise, verbose=1)
model.learn(total_timesteps=10_000, log_interval=10)
model.save(path="td3_pendulum")

# Rebind the notebook-level `env` to the environment object held by the model,
# so the cells that follow operate on that object.
env = model.get_env()

# Drop the in-memory agent so it must be reloaded from disk to be used again.
del model

Using cuda device
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.49e+03 |
| time/              |           |
|    episodes        | 10        |
|    fps             | 153       |
|    time_elapsed    | 13        |
|    total timesteps | 2000      |
| train/             |           |
|    actor_loss      | 31.5      |
|    critic_loss     | 0.0825    |
|    learning_rate   | 0.001     |
|    n_updates       | 1800      |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.32e+03 |
| time/              |           |
|    episodes        | 20        |
|    fps             | 149       |
|    time_elapsed    | 26        |
|    total timesteps | 4000      |
| train/             |           |
|    actor_loss      | 57.9      |
|    critic_loss     | 0.374     |
|    learning_rate   | 0.001     |
| 

In [18]:
# Network sizes: the actor (pi) gets two 64-unit layers, the critic (qf) gets a
# 400-unit layer followed by a 300-unit layer.
policy_kwargs = {"net_arch": {"pi": [64, 64], "qf": [400, 300]}}

# Build the agent directly from the environment id and train it for 5000 timesteps.
# The `learn` call is deliberately the cell's last expression, so its return value
# is what the cell displays.
model = SAC(policy="MlpPolicy", env="Pendulum-v0", policy_kwargs=policy_kwargs, verbose=1)
model.learn(total_timesteps=5000)

Using cuda device
Creating environment from the given name 'Pendulum-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -1.5e+03 |
| time/              |          |
|    episodes        | 4        |
|    fps             | 86       |
|    time_elapsed    | 9        |
|    total timesteps | 800      |
| train/             |          |
|    actor_loss      | 24.7     |
|    critic_loss     | 0.258    |
|    ent_coef        | 0.813    |
|    ent_coef_loss   | -0.336   |
|    learning_rate   | 0.0003   |
|    n_updates       | 699      |
---------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.47e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 84        |
|    time_elapsed    | 19  

<stable_baselines3.sac.sac.SAC at 0x7f9192935940>

In [20]:
# Reload the SAC agent saved to "sac_pendulum" and roll it out for five rendered
# episodes on the current `env`, printing each episode's accumulated reward.
model = SAC.load("sac_pendulum")

try:
    for episode in range(5):
        # Every episode begins with its own reset, so no extra reset is needed
        # ahead of the loop (its observation would be overwritten right here).
        obs = env.reset()
        tt_reward = 0
        done = False
        while not done:
            # Ask the policy for its deterministic action for this observation.
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            tt_reward += reward
        print(tt_reward)
finally:
    # Always release the environment, even if a prediction, step or render call
    # raises part-way through a rollout.
    env.close()

[-333.6367]
[-113.9706]
[-116.76126]
[-117.73746]
[-114.0326]


In [21]:
# Display the action space of the current `env` as this cell's output.
env.action_space

Box(-2.0, 2.0, (1,), float32)

In [22]:
# Display the observation space of the current `env` as this cell's output.
env.observation_space

Box(-8.0, 8.0, (3,), float32)