## Part 0: Install packages



In [1]:
import wandb

WANDB_NAME="SB3 funbling"
WANDB_NOTEBOOK_NAME = "SB3 test"

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myuxiliu1995[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
import gym
import ipdb
%pdb on
from stable_baselines3 import SAC

Automatic pdb calling has been turned ON


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
wandb.init(project="sb3-test")
env = gym.make("Pendulum-v1")
model = SAC("MlpPolicy", env).learn(total_timesteps=10000)
# Save the model
model.save("sac_pendulum")
# Load the trained model
model = SAC.load("sac_pendulum")
# Start a new episode
obs = env.reset()
# What action to take in state `obs`?
action, _ = model.predict(obs, deterministic=True)

wandb.finish()

print(model.policy_class)

stable_baselines3.sac.policies.SACPolicy

## Part 1: Getting environments

We need 4 kinds of environments to try out on: a discrete one and a continuous one; a simple one and a complex one.

|            | simple                      | complex                |
|------------|-----------------------------|------------------------|
| discrete   | toy text, minigrid          | minigrid, crafter      |
| continuous | mountain car cont | half cheetah, humanoid |


In [22]:
# discrete, simple: toy text environments from OpenAI Gym itself.

env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True)

obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)

In [None]:
# discrete, both: minigrid

# ?? https://github.com/Farama-Foundation/MiniGrid

In [8]:
# discrete, complex: crafter
# https://github.com/danijar/crafter

import crafter

env = gym.make('CrafterReward-v1')  # Or CrafterNoReward-v1
env = crafter.Recorder(
  env, './data/crafter',
  save_stats=True,
  save_video=False,
  save_episode=False,
)

obs = env.reset()
done = False
while not done:
  action = env.action_space.sample()
  obs, reward, done, info = env.step(action)

In [4]:
# continuous, simple: mountain car

# continuous, complex: half cheetah

## Part 2: Getting baseline training algorithms

We consider the following baselines:
* deep Q network, as baseline for discrete environments...?
* model-free SAC, as the baseline for continuous environment.

In [None]:
# PPO

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy


# Create environment
env = gym.make("LunarLander-v2")

# Instantiate the agent
model = PPO("MlpPolicy", env, verbose=1)
# Train the agent and display a progress bar
model.learn(total_timesteps=int(2e5), progress_bar=True)
# Save the agent
model.save("ppo_lunar")

# load agent
del model
# NOTE: if you have loading issue, you can pass `print_system_info=True`
# to compare the system on which the model was trained vs the current one
# model = DQN.load("dqn_lunar", env=env, print_system_info=True)
model = DQN.load("ppo_lunar", env=env)

# Evaluate the agent
# NOTE: If you use wrappers with your environment that modify rewards,
#       this will be reflected here. To evaluate with original rewards,
#       wrap environment in a "Monitor" wrapper before other wrappers.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

# Enjoy trained agent
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = env.step(action)
    # env.render()

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 90.2     |
|    ep_rew_mean     | -178     |
| time/              |          |
|    fps             | 610      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 91.8         |
|    ep_rew_mean          | -195         |
| time/                   |              |
|    fps                  | 494          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0043952307 |
|    clip_fraction        | 0.00449      |
|    clip_range           | 0.2          |
|    e

In [5]:
# advantage actor-critic

from stable_baselines3 import A2C
import imageio
import numpy as np

env = gym.make("CartPole-v1")

model = A2C("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10_000)
        
images = []
obs = model.env.reset()
img = model.env.render(mode="rgb_array")
for i in range(350):
    images.append(img)
    action, _ = model.predict(obs)
    obs, _, _ ,_ = model.env.step(action)
    img = model.env.render(mode="rgb_array")

imageio.mimsave("cartpole_a2c.gif", [np.array(img) for i, img in enumerate(images) if i%2 == 0], fps=29)


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 15       |
|    ep_rew_mean        | 15       |
| time/                 |          |
|    fps                | 403      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.603   |
|    explained_variance | 0.221    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.29     |
|    value_loss         | 6.62     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 16.5     |
|    ep_rew_mean        | 16.5     |
| time/                 |          |
|    fps                | 416      |
|    iterations         | 200      |
|    time_elapsed



In [12]:
# SAC 
from stable_baselines3 import SAC

wandb.init(project="sb3-test")
env = gym.make("MountainCarContinuous-v0")
model = SAC("MlpPolicy", env).learn(total_timesteps=40000)
# Save the model
model.save("sac_mtcar_cont")
# Load the trained model
model = SAC.load("sac_mtcar_cont")
# Start a new episode
obs = env.reset()
# What action to take in state `obs`?
action, _ = model.predict(obs, deterministic=True)

wandb.finish()

print(model.policy_class)

<class 'stable_baselines3.sac.policies.SACPolicy'>


In [13]:
images = []
model = SAC.load("sac_mtcar_cont")
env = gym.make("MountainCarContinuous-v0")
obs = env.reset()
img = env.render(mode="rgb_array")
for i in range(350):
    images.append(img)
    action, _ = model.predict(obs)
    obs, _, _ ,_ = env.step(action)
    img = env.render(mode="rgb_array")

imageio.mimsave("sac_mtcar_cont.gif", [np.array(img) for i, img in enumerate(images) if i%2 == 0], fps=29)


Exception ignored in: <function Viewer.__del__ at 0x000002590CC1C550>
Traceback (most recent call last):
  File "C:\Users\DeadScholar\miniconda3\envs\sb3\lib\site-packages\gym\envs\classic_control\rendering.py", line 185, in __del__
    self.close()
  File "C:\Users\DeadScholar\miniconda3\envs\sb3\lib\site-packages\gym\envs\classic_control\rendering.py", line 101, in close
    self.window.close()
  File "C:\Users\DeadScholar\miniconda3\envs\sb3\lib\site-packages\pyglet\window\win32\__init__.py", line 332, in close
    super(Win32Window, self).close()
  File "C:\Users\DeadScholar\miniconda3\envs\sb3\lib\site-packages\pyglet\window\__init__.py", line 858, in close
    app.windows.remove(self)
  File "C:\Users\DeadScholar\miniconda3\envs\sb3\lib\_weakrefset.py", line 114, in remove
    self.data.remove(ref(item))
KeyError: <weakref at 0x0000025979197E20; to 'Win32Window' at 0x000002595156D5A0>


## Part 3: Getting Dreamer working.

## Part 4: Getting something else working

