## Vanilla Policy Optimisation

<a href="https://colab.research.google.com/github/EffiSciencesResearch/ML4G/blob/main/days/w1d5/vanilla_policy_gradient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Preliminary questions:
- Run the script with the default parameters in a terminal.
- Explain the import `from torch.distributions.categorical import Categorical`: what is `Categorical` and what is it used for here?
- Search online for the `gym` Python package: why is it useful?
- Is policy gradient model-based or model-free?
- Is policy gradient on-policy or off-policy?

Read all the code, then:
- Use https://github.com/patrick-kidger/torchtyping to type the functions get_policy, get_action and compute_loss
- Add type annotations to the rest of the code.
- Use `from typeguard import typechecked` and the `@typechecked` decorator to check at runtime the annotations you wrote for the previous questions.
- Answer the questions


In [3]:
!pip install torchtyping

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
from torch.optim import Adam
import numpy as np
import gym
from gym.spaces import Discrete, Box

from torchtyping import TensorType, patch_typeguard
from typeguard import typechecked

patch_typeguard()  # use before @typechecked


def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
    """Build a feedforward neural network as an ``nn.Sequential``.

    Args:
        sizes: Sequence of layer widths; consecutive pairs
            ``(sizes[k], sizes[k+1])`` define one ``nn.Linear`` each.
        activation: Module class instantiated after every linear layer
            except the last one.
        output_activation: Module class instantiated after the last
            linear layer.

    Returns:
        An ``nn.Sequential`` alternating linear layers and activations
        (empty when ``sizes`` has fewer than two entries).
    """
    final_idx = len(sizes) - 2  # index of the last linear layer
    modules = []
    for idx, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
        nonlinearity = output_activation if idx >= final_idx else activation
        modules.append(nn.Linear(n_in, n_out))
        modules.append(nonlinearity())
    # What does * mean here? Search for unpacking in python
    return nn.Sequential(*modules)

def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2,
          epochs=50, batch_size=5000, render=False):
    """Train a categorical MLP policy with a simple policy gradient.

    Every epoch collects whole episodes with the current policy until more
    than ``batch_size`` steps have been gathered, then takes one Adam step
    on ``-(log pi(a|s) * R(tau)).mean()`` and prints one line of statistics.

    Args:
        env_name: Id passed to ``gym.make``. The environment must have a
            ``Box`` observation space and a ``Discrete`` action space.
        hidden_sizes: Widths of the hidden layers of the policy network.
            The default list is never mutated (it is copied before use).
        lr: Learning rate of the Adam optimizer.
        epochs: Number of collect-then-update iterations.
        batch_size: Number of environment steps that must be exceeded
            before an epoch stops collecting (checked at episode ends only).
        render: If true, render the first episode of each epoch.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        AssertionError: If the observation space is not ``Box`` or the
            action space is not ``Discrete``.
    """

    # make environment, check spaces, get obs / act dims
    env = gym.make(env_name)
    assert isinstance(env.observation_space, Box), \
        "This example only works for envs with continuous state spaces."
    assert isinstance(env.action_space, Discrete), \
        "This example only works for envs with discrete action spaces."

    obs_dim = env.observation_space.shape[0]
    n_acts = env.action_space.n

    # Core of policy network
    # What should be the sizes of the layers of the policy network?
    # list(...) copies the argument, so a tuple of sizes is accepted as well.
    logits_net = mlp(sizes=[obs_dim]+list(hidden_sizes)+[n_acts])

    # make function to compute action distribution
    # What is the shape of obs?
    @typechecked
    def get_policy(obs: TensorType[..., obs_dim]) -> Categorical:
        """Return the action distribution for one or several observations."""
        # Warning: obs has not always the same shape.
        logits = logits_net(obs)
        # The function returns a distribution object, not a tensor, so the
        # return annotation must be Categorical for @typechecked to pass.
        return Categorical(logits=logits)

    # make action selection function (outputs int actions, sampled from policy)
    # What is the shape of obs?
    @typechecked
    def get_action(obs: TensorType[..., obs_dim]) -> int:
        """Sample one action; .item() requires a single (unbatched) sample."""
        return get_policy(obs).sample().item()

    # make loss function whose gradient, for the right data, is policy gradient
    # What does the weights parameter represents here?
    # What is the shape of obs?
    @typechecked
    def compute_loss(obs: TensorType["batch", obs_dim],
                     act: TensorType["batch"],
                     weights: TensorType["batch"]) -> torch.Tensor:
        """Return the scalar surrogate loss -(log pi(act|obs) * weights).mean()."""
        logp = get_policy(obs).log_prob(act)
        return -(logp * weights).mean()

    def reset_env():
        """Reset the env and return only the observation.

        Newer gym versions return ``(obs, info)`` from ``reset``; older ones
        return the observation alone. Observations of a Box space are arrays,
        so a tuple can only be the newer form.
        """
        out = env.reset()
        return out[0] if isinstance(out, tuple) else out

    def step_env(action):
        """Step the env and return ``(obs, reward, done)`` for either step API."""
        out = env.step(action)
        if len(out) == 5:
            # new API: (obs, reward, terminated, truncated, info)
            next_obs, rew, terminated, truncated, _ = out
            return next_obs, rew, (terminated or truncated)
        # old API: (obs, reward, done, info)
        next_obs, rew, done, _ = out
        return next_obs, rew, done

    # make optimizer
    optimizer = Adam(logits_net.parameters(), lr=lr)

    # for training policy
    def train_one_epoch():
        """Collect one batch of episodes, do one update, return statistics.

        Returns:
            (loss as a Python float, list of episode returns,
            list of episode lengths).
        """
        # make some empty lists for logging.
        batch_obs = []          # for observations
        batch_acts = []         # for actions
        batch_weights = []      # for R(tau) weighting in policy gradient
        batch_rets = []         # for measuring episode returns # What is the return?
        batch_lens = []         # for measuring episode lengths

        # reset episode-specific variables
        obs = reset_env()       # first obs comes from starting distribution
        ep_rews = []            # list for rewards accrued throughout ep

        # render first episode of each epoch
        finished_rendering_this_epoch = False

        # collect experience by acting in the environment with current policy
        while True:

            # rendering
            if (not finished_rendering_this_epoch) and render:
                env.render()

            # save obs
            batch_obs.append(obs.copy())

            # act in the environment
            act = get_action(torch.as_tensor(obs, dtype=torch.float32))
            obs, rew, done = step_env(act)

            # save action, reward
            batch_acts.append(act)
            ep_rews.append(rew)

            if done:
                # if episode is over, record info about episode
                # Is the reward discounted?
                ep_ret, ep_len = sum(ep_rews), len(ep_rews)
                batch_rets.append(ep_ret)
                batch_lens.append(ep_len)

                # the weight for each logprob(a|s) is R(tau)
                # Why do we use a constant vector here?
                batch_weights += [ep_ret] * ep_len

                # reset episode-specific variables
                obs, ep_rews = reset_env(), []

                # won't render again this epoch
                finished_rendering_this_epoch = True

                # end experience loop if we have enough of it
                if len(batch_obs) > batch_size:
                    break

        # take a single policy gradient update step
        optimizer.zero_grad()
        # Stack the observations into one ndarray first: building a tensor
        # from a Python list of ndarrays is slow and triggers a torch warning.
        batch_loss = compute_loss(obs=torch.as_tensor(np.array(batch_obs), dtype=torch.float32),
                                  act=torch.as_tensor(batch_acts, dtype=torch.int32),
                                  weights=torch.as_tensor(batch_weights, dtype=torch.float32)
                                  )
        batch_loss.backward()
        optimizer.step()
        # Return a plain float so the caller does not keep the autograd graph.
        return batch_loss.item(), batch_rets, batch_lens

    # training loop
    try:
        for i in range(epochs):
            batch_loss, batch_rets, batch_lens = train_one_epoch()
            print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%(i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
    finally:
        # Release the environment (and any render window) even on error.
        env.close()


In [9]:
# Quick demo run: batch_size=50 is far below the default of 5000, so each
# epoch contains only a few episodes.
train(
    env_name='CartPole-v0',
    hidden_sizes=[32],
    lr=1e-2,
    epochs=50,
    batch_size=50,
    render=False,
)

  f"The environment {id} is out of date. You should consider "
  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  warn('no type annotations present -- not typechecking {}'.format(function_name(func)))


[ 0.04258865 -0.00495045 -0.02861075  0.00633207]
epoch:   0 	 loss: 19.131 	 return: 18.000 	 ep_len: 18.000
[-0.03541778 -0.04465922 -0.02410383 -0.01964713]
epoch:   1 	 loss: 19.934 	 return: 25.000 	 ep_len: 25.000
[ 0.04988953 -0.0241212  -0.00348926 -0.02824957]
epoch:   2 	 loss: 11.667 	 return: 16.250 	 ep_len: 16.250
[ 0.02765383 -0.01083768  0.01397943  0.0404648 ]
epoch:   3 	 loss: 10.526 	 return: 15.000 	 ep_len: 15.000
[ 0.04717234 -0.02667601  0.01827999 -0.04870812]
epoch:   4 	 loss: 10.019 	 return: 14.250 	 ep_len: 14.250
[ 0.03199461 -0.00194721 -0.04808279 -0.03889005]
epoch:   5 	 loss: 16.431 	 return: 21.667 	 ep_len: 21.667
[-0.00515325 -0.03407977  0.03670634  0.04267851]
epoch:   6 	 loss: 21.823 	 return: 31.500 	 ep_len: 31.500
[-0.00410539  0.02881444  0.00626996 -0.04292123]
epoch:   7 	 loss: 20.437 	 return: 28.333 	 ep_len: 28.333
[-0.0382729   0.00152748  0.01897391  0.02371486]
epoch:   8 	 loss: 9.230 	 return: 13.000 	 ep_len: 13.000
[-0.0064231

In [None]:
# Original algorithm here: https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/vpg/vpg.py