In [1]:
from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.sac.policies import MlpPolicy

# Create the model and the training environment
model = SAC("MlpPolicy", "Pendulum-v1", verbose=1,
            learning_rate=1e-3)

# train the model
model.learn(total_timesteps=6000)

# save the model
model.save("sac_pendulum")


Using cuda device
Creating environment from the given name 'Pendulum-v1'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.29e+03 |
| time/              |           |
|    episodes        | 4         |
|    fps             | 118       |
|    time_elapsed    | 6         |
|    total_timesteps | 800       |
| train/             |           |
|    actor_loss      | 21.4      |
|    critic_loss     | 0.0677    |
|    ent_coef        | 0.507     |
|    ent_coef_loss   | -1        |
|    learning_rate   | 0.001     |
|    n_updates       | 699       |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.36e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 113       |
|    time_

In [2]:
# the saved model does not contain the replay buffer
loaded_model = SAC.load("sac_pendulum")
print(f"The loaded_model has {loaded_model.replay_buffer.size()} transitions in its buffer")

The loaded_model has 0 transitions in its buffer


In [4]:
# now save the replay buffer too
model.save_replay_buffer("sac_replay_buffer")

# load it into the loaded_model
loaded_model.load_replay_buffer("sac_replay_buffer")

# now the loaded replay is not empty anymore
print(f"The loaded_model has {loaded_model.replay_buffer.size()} transitions in its buffer")

The loaded_model has 6000 transitions in its buffer


In [5]:
# Save the policy independently from the model
# Note: if you don't save the complete model with `model.save()`
# you cannot continue training afterward
policy = model.policy
policy.save("sac_policy_pendulum")

In [96]:
policy

SACPolicy(
  (actor): Actor(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (latent_pi): Sequential(
      (0): Linear(in_features=3, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=256, bias=True)
      (3): ReLU()
    )
    (mu): Linear(in_features=256, out_features=1, bias=True)
    (log_std): Linear(in_features=256, out_features=1, bias=True)
  )
  (critic): ContinuousCritic(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (qf0): Sequential(
      (0): Linear(in_features=4, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=256, bias=True)
      (3): ReLU()
      (4): Linear(in_features=256, out_features=1, bias=True)
    )
    (qf1): Sequential(
      (0): Linear(in_features=4, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=256, bias

In [65]:
# Retrieve the environment
env = model.get_env()


array([[-0.38035402, -0.924841  , -0.88970673]], dtype=float32)

In [77]:
env.seed = 2
obs = env.reset()


(array([[-1.7900466]], dtype=float32), None)

In [95]:
policy.predict(obs, deterministic=True)

(array([[-1.5022256]], dtype=float32), None)

In [61]:
model.seed

2

In [27]:
# Retrieve the environment
env = model.get_env()

# Evaluate the policy
mean_reward, std_reward = evaluate_policy(policy, env, n_eval_episodes=10, deterministic=True)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

# Load the policy independently from the model
saved_policy = MlpPolicy.load("sac_policy_pendulum")

# Evaluate the loaded policy
mean_reward, std_reward = evaluate_policy(saved_policy, env, n_eval_episodes=10, deterministic=True)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=-130.04 +/- 91.93990258445707
mean_reward=-132.14 +/- 109.35243555659224


In [33]:
# Evaluate the loaded policy
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, deterministic=True)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=-129.45 +/- 78.78029922519339


In [97]:
import gymnasium as gym
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

env_id = "Reacher-v2"
video_folder = "logs/videos/"
video_length = 1000

vec_env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")])

obs = vec_env.reset()

# Record the video starting at the first step
vec_env = VecVideoRecorder(vec_env, video_folder,
                       record_video_trigger=lambda x: x == 0, video_length=video_length,
                       name_prefix=f"random-agent-{env_id}")

vec_env.reset()
for _ in range(video_length + 1):
  action = [vec_env.action_space.sample()]
  obs, _, _, _ = vec_env.step(action)
# Save the video
vec_env.close()

  logger.deprecation(


Compiling /home/neaf2080/.pyenv/versions/3.8.18/envs/general_env/lib/python3.8/site-packages/mujoco_py/cymj.pyx because it changed.
[1/1] Cythonizing /home/neaf2080/.pyenv/versions/3.8.18/envs/general_env/lib/python3.8/site-packages/mujoco_py/cymj.pyx


Possible solutions:
	1. Declare the function as 'noexcept' if you control the definition and you're sure you don't want the function to raise exceptions.
	2. Use an 'int' return type on the function to allow an error code to be returned.
performance hint: /home/neaf2080/.pyenv/versions/3.8.18/envs/general_env/lib/python3.8/site-packages/mujoco_py/cymj.pyx:104:5: Exception check on 'c_error_callback' will always require the GIL to be acquired.
Possible solutions:
	1. Declare the function as 'noexcept' if you control the definition and you're sure you don't want the function to raise exceptions.
	2. Use an 'int' return type on the function to allow an error code to be returned.

Error compiling Cython file:
------------------------------------------------------------
...
    '''
                       ^
------------------------------------------------------------

/home/neaf2080/.pyenv/versions/3.8.18/envs/general_env/lib/python3.8/site-packages/mujoco_py/cymj.pyx:92:23: Cannot assign ty

CompileError: /home/neaf2080/.pyenv/versions/3.8.18/envs/general_env/lib/python3.8/site-packages/mujoco_py/cymj.pyx