**I recommend you run the first code cell of this notebook immediately, to start provisioning drake on the cloud machine, then you can leave this window open as you [read the textbook](manipulation.csail.mit.edu/rl.html).**

# Notebook setup

The following cell will:
- on Colab (only), install Drake to `/opt/drake`, install Drake's prerequisites via `apt`, and add pydrake to `sys.path`.  This will take approximately two minutes on the first time it runs (to provision the machine), but should only need to reinstall once every 12 hours.  If you navigate between notebooks using Colab's "File->Open" menu, then you can avoid provisioning a separate machine for each notebook.
- launch a server for our 3D visualizer (MeshCat) that will be used for the remainder of this notebook.

You will need to rerun this cell if you restart the kernel, but it should be fast because the machine will already have drake installed.

In [1]:
import importlib
import sys
from urllib.request import urlretrieve

if 'google.colab' in sys.modules and importlib.util.find_spec('pydrake') is None:
  version='20200918'
  build='nightly'
  urlretrieve(f"https://drake-packages.csail.mit.edu/drake/{build}/drake-{version}/setup_drake_colab.py",
              "setup_drake_colab.py")
  from setup_drake_colab import setup_drake
  setup_drake(version=version, build=build)
  !pip install pyngrok==4.2.2
  !pip install gym

# Install pyngrok.
server_args = []
if 'google.colab' in sys.modules:
  server_args = ['--ngrok_http_tunnel']

# Start a single meshcat server instance to use for the remainder of this notebook.
from meshcat.servers.zmqserver import start_zmq_server_as_subprocess
#proc, zmq_url, web_url = start_zmq_server_as_subprocess(server_args=server_args)

# Determine if this notebook is currently running as a notebook or a unit test.
from IPython import get_ipython
running_as_notebook = get_ipython() and hasattr(get_ipython(), 'kernel')

# Imports
import numpy as np

import pydrake.all

ModuleNotFoundError: No module named 'tornado.web'

In [2]:

K, S = pydrake.systems.controllers.DiscreteTimeLinearQuadraticRegulator(A,B,Q,R)

NameError: name 'A' is not defined

In [1]:
import numpy as np

import gym
from gym import spaces, logger
from gym.utils import seeding

class DoubleIntegrator(gym.Env):
    metadata = {'render.modes': []}

    def __init__(self):
        # Discrete-time double "integrator"
        self.A = np.array([[1, 1], [0, 1]])
        self.B = np.array([0, 1])  # yuck.  

        self.Q = np.identity(2)
        self.R = np.identity(1)

        self.state = None
        self.action_space = spaces.Box(-np.inf, np.inf, shape=(1,))
        self.observation_space = spaces.Box(-np.inf, np.inf, shape=(2,))
        self.seed()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        u = action[0]
        x = self.A.dot(self.state) + self.B*u
        reward = -(x.dot(self.Q.dot(x)) + self.R*u*u)[0][0]
        self.state = x
        return np.array(x), reward, False, {}

    def reset(self):
        self.state = self.np_random.uniform(low=-5, high=5, size=(2,))
        return np.array(self.state)

    def render(self, mode='human'):
        # intentionally blank
        print(self.state)

env = DoubleIntegrator()
env.reset()
for _ in range(10):
    env.render()
    env.step(env.action_space.sample()) # take a random action
env.close()

[-3.78872546 -4.05613563]
[-7.84486109 -1.16686604]
[-9.01172713  0.15287092]
[-8.85885621  0.10632339]
[-8.75253282  1.33560422]
[-7.41692859  2.93569866]
[-4.48122993  3.25659222]
[-1.22463771  3.41293831]
[2.1883006  4.21202942]
[6.40033002 3.19797154]


In [19]:
import random
import gym
import numpy as np

from itertools import count

import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

import cherry as ch
import cherry.envs as envs

SEED = 567
GAMMA = 0.99
RENDER = False

random.seed(SEED)
np.random.seed(SEED)
th.manual_seed(SEED)


class PolicyNet(nn.Module):
    def __init__(self):
        super(PolicyNet, self).__init__()
        self.affine1 = nn.Linear(2, 128)
        self.affine2 = nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.affine1(x))
        return F.relu(self.affine2(x))


def update(replay):
    policy_loss = []

    # Discount and normalize rewards
    rewards = ch.discount(GAMMA, replay.reward(), replay.done())
    rewards = ch.normalize(rewards)

    # Compute loss
    for sars, reward in zip(replay, rewards):
        log_prob = sars.log_prob
        policy_loss.append(-log_prob * reward)

    # Take optimization step
    optimizer.zero_grad()
    policy_loss = th.stack(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()


if __name__ == '__main__':
    env = DoubleIntegrator() #gym.make('CartPole-v0')
    env = envs.Logger(env, interval=1000)
    env = envs.Torch(env)
    env.seed(SEED)

    policy = PolicyNet()
    optimizer = optim.Adam(policy.parameters(), lr=1e-2)
    running_reward = 10.0
    replay = ch.ExperienceReplay()

    for i_episode in count(1):
        state = env.reset()
        for t in range(10000):  # Don't infinite loop while learning
            density = Normal(policy(state), 0.1)
            action = density.sample()
            old_state = state
            state, reward, done, _ = env.step(action)
            replay.append(old_state,
                          action,
                          reward,
                          state,
                          done,
                          # Cache log_prob for later
                          log_prob=density.log_prob(action))
            if RENDER:
                env.render()
            if done:
                break

        #  Compute termination criterion
        running_reward = running_reward * 0.99 + t * 0.01
#        if running_reward > env.spec.reward_threshold:
#            print('Solved! Running reward is now {} and '
#                  'the last episode runs to {} time steps!'.format(running_reward, t))
#            break

        # Update policy
        update(replay)
        replay.empty()

- 0.00

--------------------  Log 16 --------------------
Overall:
- Steps: 16000
- Episodes: 0
Last 10 Episodes:
- Mean episode length: 0.00 +/- 0.00
- Mean episode reward: 0.00 +/- 0.00
Last 1000 Steps:
- Episodes: 1
- Mean episode length: 1000.00 +/- 0.00
- Mean episode reward: 335395.65 +/- 0.00

--------------------  Log 17 --------------------
Overall:
- Steps: 17000
- Episodes: 0
Last 10 Episodes:
- Mean episode length: 0.00 +/- 0.00
- Mean episode reward: 0.00 +/- 0.00
Last 1000 Steps:
- Episodes: 1
- Mean episode length: 1000.00 +/- 0.00
- Mean episode reward: 348986.32 +/- 0.00

--------------------  Log 18 --------------------
Overall:
- Steps: 18000
- Episodes: 0
Last 10 Episodes:
- Mean episode length: 0.00 +/- 0.00
- Mean episode reward: 0.00 +/- 0.00
Last 1000 Steps:
- Episodes: 1
- Mean episode length: 1000.00 +/- 0.00
- Mean episode reward: 344701.76 +/- 0.00

--------------------  Log 19 --------------------
Overall:
- Steps: 19000
- Episodes: 0
Last 10 Episodes:
- Me

Garage pip version is 2020.06.3; i can browse that source [here](https://github.com/rlworkgroup/garage/blob/e67db24f1a744049d452a123dfaaf719fa62ab46).

In [14]:
import torch

from garage import wrap_experiment
from garage.envs import GarageEnv
from garage.experiment import LocalRunner
from garage.experiment.deterministic import set_seed
from garage.torch.algos import PPO
from garage.torch.policies import GaussianMLPPolicy
from garage.torch.value_functions import GaussianMLPValueFunction

@wrap_experiment
def ppo_cartpole(ctxt=None, seed=1):
    """Train PPO with CartPole-v0 environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
    """
    set_seed(seed)
    env = GarageEnv(env_name='CartPole-v0')

    runner = LocalRunner(snapshot_config=ctxt)

    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               max_path_length=100,
               discount=0.99,
               center_adv=False)

    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=10000)


ppo_cartpole(ctxt=ctxt, seed=1)

2020-11-11 22:30:46 | [ppo_pendulum] [ppo_pendulum] [ppo_pendulum] [cma_es_cartpole] [ppo_cartpole] [cma_es_cartpole] [ppo_cartpole] [ppo_cartpole] [ppo_cartpole] Logging to /home/russt/manipulation/data/local/experiment/ppo_cartpole_3
2020-11-11 22:30:46 | [ppo_pendulum] [ppo_pendulum] [ppo_pendulum] [cma_es_cartpole] [ppo_cartpole] [cma_es_cartpole] [ppo_cartpole] [ppo_cartpole] [ppo_cartpole] Logging to /home/russt/manipulation/data/local/experiment/ppo_cartpole_3
2020-11-11 22:30:46 | [ppo_pendulum] [ppo_pendulum] [ppo_pendulum] [cma_es_cartpole] [ppo_cartpole] [cma_es_cartpole] [ppo_cartpole] [ppo_cartpole] [ppo_cartpole] Logging to /home/russt/manipulation/data/local/experiment/ppo_cartpole_3
2020-11-11 22:30:46 | [ppo_pendulum] [ppo_pendulum] [ppo_pendulum] [cma_es_cartpole] [ppo_cartpole] [cma_es_cartpole] [ppo_cartpole] [ppo_cartpole] [ppo_cartpole] Logging to /home/russt/manipulation/data/local/experiment/ppo_cartpole_3
2020-11-11 22:30:46 | [ppo_pendulum] [ppo_pendulum] [ppo

TypeError: ppo_cartpole() got multiple values for argument 'ctxt'

In [10]:
#!/usr/bin/env python3
"""This is an example to train a task with CMA-ES.
Here it runs CartPole-v1 environment with 100 epoches.
Results:
    AverageReturn: 100
    RiseTime: epoch 38 (itr 760),
              but regression is observed in the course of training.
"""
from garage import wrap_experiment
from garage.envs import GarageEnv
from garage.experiment import LocalTFRunner
from garage.experiment.deterministic import set_seed
from garage.np.algos import CMAES
from garage.np.baselines import LinearFeatureBaseline
from garage.sampler import OnPolicyVectorizedSampler
from garage.tf.policies import CategoricalMLPPolicy


@wrap_experiment
def cma_es_cartpole(ctxt=None, seed=1):
    """Train CMA_ES with Cartpole-v1 environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        n_samples = 20

        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     max_path_length=100,
                     n_samples=n_samples)

        runner.setup(algo, env)#, sampler_cls=OnPolicyVectorizedSampler)
        runner.train(n_epochs=100, batch_size=1000)


cma_es_cartpole()

2020-11-11 22:21:09 | [ppo_pendulum] [ppo_pendulum] [ppo_pendulum] [cma_es_cartpole] [ppo_cartpole] [cma_es_cartpole] Logging to /home/russt/manipulation/data/local/experiment/cma_es_cartpole_1
2020-11-11 22:21:09 | [ppo_pendulum] [ppo_pendulum] [ppo_pendulum] [cma_es_cartpole] [ppo_cartpole] [cma_es_cartpole] Logging to /home/russt/manipulation/data/local/experiment/cma_es_cartpole_1
2020-11-11 22:21:09 | [ppo_pendulum] [ppo_pendulum] [ppo_pendulum] [cma_es_cartpole] [ppo_cartpole] [cma_es_cartpole] Logging to /home/russt/manipulation/data/local/experiment/cma_es_cartpole_1
2020-11-11 22:21:09 | [ppo_pendulum] [ppo_pendulum] [ppo_pendulum] [cma_es_cartpole] [ppo_cartpole] [cma_es_cartpole] Logging to /home/russt/manipulation/data/local/experiment/cma_es_cartpole_1
2020-11-11 22:21:09 | [ppo_pendulum] [ppo_pendulum] [ppo_pendulum] [cma_es_cartpole] [ppo_cartpole] [cma_es_cartpole] Logging to /home/russt/manipulation/data/local/experiment/cma_es_cartpole_1
2020-11-11 22:21:09 | [ppo_pen

ValueError: Variable policy/CategoricalMLPModel/mlp/hidden_0/kernel already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope?