In [None]:
!pip install jdc

import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

import matplotlib
import matplotlib.pyplot as plt
import pyglet
import ipywidgets
from IPython import display

%matplotlib inline
import sys

import jdc

In [None]:
# Use the first CUDA GPU when PyTorch reports one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Actor and critic networks

We will first implement the actor (policy) and critic (value) networks. Some of the results found lead to the following implementation choices:
- Separate actor and critic networks often result in better performance than networks with shared weights
- The width of the actor networks depends on the complexity of the task, if it is not wide enough (number of channels in intermediate layers) or too wide a performance drop occurs
- The critic network can be wider and there does not seem to be a performance penalty if it is too wide.
- Two hidden layers work well for most tasks
- Tanh works best, ReLU works worst as activation function
- An orthogonal initialization might improve performance, but seems not too important. However, the last layer should be initialized with lower values such as to cause action preference values (logits) of 0 in the beginning.

We divide the methods into the ones used by rollouts and evaluation (get_action_and_value, get_value, get_action), which all need results in numpy, and the one used for the training step (get_probs_and_value), which returns torch tensors and also returns the entropy.

In [None]:
class AgentNetwork(nn.Module):
    """
    Actor and critic networks for a discrete-action agent.

    Both networks are independent two-hidden-layer Tanh MLPs: the critic uses
    256 hidden units and outputs one value per observation, the actor uses 64
    hidden units and outputs one logit per action.
    """
    def __init__(self, n_obs, n_action):
        super().__init__()
        # The critic is created before the actor so that parameters are
        # created and registered in a fixed order.
        self.critic = self._mlp(n_obs, 256, 1, head_std=1.0)
        # A small gain on the actor head keeps the initial logits close to 0.
        self.actor = self._mlp(n_obs, 64, n_action, head_std=0.01)

    @staticmethod
    def init_linear(layer, std=np.sqrt(2), bias_const=0.0):
        """
        Initialize a linear layer in place and return it.

        Args:
            layer: the linear layer to initialize.
            std: gain passed to the orthogonal weight initialization.
            bias_const: constant value written to every bias entry.
        Returns: the same layer, so that the call can be nested in a constructor.
        """
        nn.init.orthogonal_(layer.weight, gain=std)
        nn.init.constant_(layer.bias, val=bias_const)
        return layer

    @classmethod
    def _mlp(cls, n_in, width, n_out, head_std):
        """Build Linear-Tanh-Linear-Tanh-Linear with the output layer initialized with gain head_std."""
        stack = [
            cls.init_linear(nn.Linear(n_in, width)),
            nn.Tanh(),
            cls.init_linear(nn.Linear(width, width)),
            nn.Tanh(),
            cls.init_linear(nn.Linear(width, n_out), std=head_std),
        ]
        return nn.Sequential(*stack)

    def _policy(self, x):
        """Return the categorical action distribution for the observations x."""
        return Categorical(logits=self.actor(x))

    def get_value(self, x):
        """Return the critic output for x as a numpy array."""
        state_value = self.critic(x)
        return state_value.numpy(force=True)

    def get_action(self, x):
        """Sample an action for x from the policy and return it as a numpy array."""
        sampled = self._policy(x).sample()
        return sampled.numpy(force=True)

    def get_action_and_value(self, x):
        """
        Sample an action for x and evaluate it.

        Returns: tuple of numpy arrays (action, log probability of the action, critic value).
        """
        policy = self._policy(x)
        sampled = policy.sample()
        sampled_log_prob = policy.log_prob(sampled)
        state_value = self.critic(x)
        return (sampled.numpy(force=True),
                sampled_log_prob.numpy(force=True),
                state_value.numpy(force=True))

    def get_probs_and_value(self, x, action):
        """
        Evaluate given actions under the current policy; results stay torch tensors.

        Returns: tuple (log probability of action, entropy of the policy, critic value).
        """
        policy = self._policy(x)
        return policy.log_prob(action), policy.entropy(), self.critic(x)

In [None]:
# Quick construction check: networks for 4 observation features and 2 discrete actions.
agent = AgentNetwork(4,2)

## Vectorized environments
We will use vectorized environments for the rollouts. With vectorized environments, multiple copies of an environment can be run in parallel. We will first use the same environment as in the last exercise for trying our algorithm and then switch to a new one for the final evaluation.

Optimally the different environments should be distributed to different CPUs, decreasing the wall-clock time to gather experience in comparison to using a single environment.

In [None]:
environment_name = 'CartPole-v1'
# single environment, used here to show the shape of one observation
env = gym.make(environment_name)
num_envs = 10
# num_envs copies of the same environment, stepped synchronously (SYNC vectorization mode)
envs = gym.make_vec(environment_name, num_envs=num_envs, vectorization_mode=gym.VectorizeMode.SYNC)

obs, _ = env.reset()
print(f'single env: {obs.shape}')

# the vectorized reset returns one observation per environment
obss, _ = envs.reset()
print(f'vec envs: {obss.shape}')

In [None]:
# observation shape and number of discrete actions of the single environment
print(env.observation_space.shape, env.action_space.n)

We can apply both observations directly to the agent network.

In [None]:
# size the networks from the spaces of the single environment
agent_network = AgentNetwork(env.observation_space.shape[0], env.action_space.n)
# sample an action for the single observation ...
print(agent_network.get_action(torch.tensor(obs)))
# ... and for the observations of the vectorized environments
print(agent_network.get_action(torch.tensor(obss)))

The vectorized actions can be used as input to the envs step function to get all results in parallel:


In [None]:
# step all environments at once with randomly sampled actions; note that this rebinds `obs`
obs, rewards, done, truncated, info = envs.step(envs.action_space.sample())
print(obs, rewards, done, truncated)

Note that some of the environments might be done earlier than others. Dealing with this is facilitated by an auto_reset parameter in the setup which is set to Next-Step Mode by default. So any env that is done, will automatically reset and return the observation of the reset in the next step:

![Auto Reset Modes](autoreset-modes.svg)

In [None]:
# two synchronously stepped environments, used only to print what the step results look like
envs_test = gym.make_vec(environment_name, num_envs=2, vectorization_mode=gym.VectorizeMode.SYNC)
envs_test.reset()
# take 5 steps with randomly sampled actions and print the results of each step
for step in range(5):
    obs, rewards, done, truncated, info = envs_test.step(envs_test.action_space.sample())
    print(obs, rewards, done, truncated)


## Agent class

We will organize the agent class slightly differently to deal with the vectorized environments, but start with the constructor as always. We add some parameters in the constructor for training.

### Optimizer
Adam is recommended as optimizer, while RMSProp actually performs similarly. The most important parameter here is the learning rate for the optimizer. Decaying the learning rate might increase the performance slightly but is of secondary importance, so we will leave it out.




In [None]:
class PPOAgent:
    """
    PPO training agent for vectorized environments with a discrete action space.

    The constructor stores the hyperparameters, builds the network and its optimizer and
    allocates the rollout buffers.
    """
    def __init__(self, observation_space, action_space,
                 num_envs: int,
                 agent_network_cls,
                 device,
                 gamma: float,
                 learning_rate: float,
                 rollout_length: int,
                 nr_epochs: int,
                 batch_size: int,
                 use_gae: bool,
                 gae_lambda: float,
                 clip_coef: float,
                 value_loss_coef: float,
                 entropy_loss_coef: float,):
        """
        Set up the PPO agent.
        Args:
            observation_space: the (single) observation space of the environment.
            action_space: the (single) action space of the environment.
            num_envs: the number of (vectorized) environments.
            agent_network_cls: the class that implements the actor and critic networks; it is
            called with the first observation dimension and the number of actions.
            device: the device (cpu, cuda or mps) to use for training
            gamma: The discount factor.
            learning_rate: The learning rate.
            rollout_length: The lengths of the rollouts
            nr_epochs: The number of epochs to train for after each rollout.
            batch_size: The (mini-batch) size for training.
            use_gae: Use generalized advantage estimation (true) or n-step returns
            gae_lambda: The lambda parameter for generalized advantage estimation.
            clip_coef: The clipping coefficient of the PPO algorithm.
            value_loss_coef: The scaling of the value loss in the loss function.
            entropy_loss_coef: The scaling of the entropy loss in the loss function.
        """
        # environment description and runtime setup
        self.observation_space = observation_space
        self.action_space = action_space
        self.num_envs = num_envs
        self.device = device
        self.agent_network_cls = agent_network_cls

        # training hyperparameters
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.use_gae = use_gae
        self.learning_rate = learning_rate
        self.rollout_length = rollout_length
        self.nr_epochs = nr_epochs
        self.batch_size = batch_size
        self.clip_coef = clip_coef
        self.value_loss_coef = value_loss_coef
        self.entropy_loss_coef = entropy_loss_coef

        # network on the target device, then the optimizer over its parameters
        network = agent_network_cls(observation_space.shape[0], action_space.n)
        network.to(device)
        self.agent_network = network
        self.optimizer = optim.Adam(network.parameters(), lr=learning_rate,
                                    betas=(0.9, 0.999), eps=1e-5)

        # Rollout storage lives in numpy, indexed by (step, environment); only the
        # mini batches are converted to tensors on the device later on.
        steps_by_env = (rollout_length, num_envs)
        self.obs = np.zeros(steps_by_env + observation_space.shape)
        self.actions = np.zeros(steps_by_env + action_space.shape)
        self.log_probs = np.zeros(steps_by_env)
        self.rewards = np.zeros(steps_by_env)
        self.dones = np.zeros(steps_by_env)
        self.values = np.zeros(steps_by_env)

        # filled in after each rollout
        self.returns = np.zeros(steps_by_env)
        self.advantages = np.zeros(steps_by_env)

        # total number of environment steps taken so far
        self.global_step = 0


## Rollouts

We first want to calculate the rollouts. We will give the next observation and dones as input to the rollout and also return them at the end of the rollouts. These make it easier to start the rollouts (with resetting the environments outside of the rollout) and to continue the next rollouts from where the last ones left off.
So the given next_obs and next_dones should be saved for step 0, together with the action, log probability and value obtained from this observation and then the reward from the environment step. The observation and dones returned from the environment step are for step 1 and so on.
The last observation and dones should be returned.

In [None]:
%%add_to PPOAgent
def rollout(self, envs, next_obs, next_dones):
    """
    Collect one rollout of `rollout_length` steps from the vectorized environments.

    For every step the current observation and done flags are stored together with the
    sampled action, its log probability, the critic value and the reward returned by the
    environment step.

    An environment counts as done when its episode was terminated OR truncated. Both cases
    lead to a reset of that environment, so both have to be recorded as an episode boundary;
    otherwise the returns of a time-limited episode would continue into the next episode.

    Args:
        envs: The environments for the rollout. The number of envs must correspond to the
        number given in the constructor
        next_obs: The next observations for the rollout, i.e. the observations for the first step.
        next_dones: The next dones for the rollout.
    Returns: the observations and dones to be used to start the next rollout.

    """
    for step in range(self.rollout_length):
        # every vectorized step advances all environments by one
        self.global_step += self.num_envs

        self.obs[step] = next_obs
        self.dones[step] = next_dones

        # the networks use float32 weights, so convert explicitly instead of relying on
        # the dtype delivered by the environment
        obs_tensor = torch.tensor(next_obs, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            action, log_prob, value = self.agent_network.get_action_and_value(obs_tensor)
        self.values[step] = value.flatten()
        self.actions[step] = action
        self.log_probs[step] = log_prob

        next_obs, reward, terminated, truncated, _ = envs.step(action)
        self.rewards[step] = reward
        # an episode boundary is a termination or a truncation
        next_dones = np.logical_or(terminated, truncated)

    # return the next obs and dones as they will be used for the next rollout
    return next_obs, next_dones

In [None]:

# Try the rollout once: agent with undiscounted n-step returns (gamma=1.0, use_gae=False).
agent = PPOAgent(env.observation_space, env.action_space,
                     num_envs=num_envs,
                     agent_network_cls=AgentNetwork,
                     device=device,
                     gamma=1.0,
                     learning_rate=0.002,
                     rollout_length=256,
                     nr_epochs=5,
                     batch_size=64,
                     use_gae=False,
                     gae_lambda=0.95,
                     clip_coef=0.2,
                     value_loss_coef=0.5,
                     entropy_loss_coef=0.01)
# start from freshly reset environments with no environment marked as done
next_obs, _ = envs.reset()
next_dones = np.zeros(num_envs)
# keep the returned observations/dones: the return calculations below bootstrap from them
next_obs, next_dones = agent.rollout(envs, next_obs, next_dones)

## Calculate returns
We need to calculate the returns (for the value approximation) and the advantage function (for the policy loss). We will first implement the n-step returns.

When using rollouts the n actually depends on the position of the sample in the rollout. I.e. we just calculate backwards from the end of the rollout and bootstrap the calculation with the value function computed from the next observation. There can be resets within the rollouts, which are marked with the dones=True for these positions that have to be taken into account.

In [None]:
%%add_to PPOAgent
def calculate_returns(self, next_obs, next_dones):
    """
    Fill self.returns and self.advantages from the stored rollout using discounted returns.

    The returns are accumulated backwards from the last step, bootstrapped with the critic
    value of the observation that follows the rollout. A done flag for the following step
    cuts the accumulation, so returns never continue across an episode boundary.
    Args:
        next_obs: the next observation at the end of the rollouts
        next_dones: the next dones at the end of the rollouts
    """
    final_step = self.rollout_length - 1

    # critic estimate for the state after the last stored step, as a row of shape (1, n)
    with torch.no_grad():
        bootstrap_obs = torch.tensor(next_obs).to(self.device)
        bootstrap_value = self.agent_network.get_value(bootstrap_obs).reshape(1, -1)

    for t in range(final_step, -1, -1):
        if t == final_step:
            continuing, future_return = 1.0 - next_dones, bootstrap_value
        else:
            continuing, future_return = 1.0 - self.dones[t + 1], self.returns[t + 1]

        # return for step t, then the advantage as return minus the stored value
        discounted_future = self.gamma * continuing * future_return
        self.returns[t] = self.rewards[t] + discounted_future
        self.advantages[t] = self.returns[t] - self.values[t]

In [None]:
# compute returns and advantages for the rollout collected above and inspect the advantages
agent.calculate_returns(next_obs, next_dones)
print(agent.advantages)

### Calculate Generalized Advantage Estimates

The other, and better, possibility is to calculate the advantages using generalized advantage estimates (GAE). They can be calculated in reverse order of the rollout by scaling the last GAE value and adding the TD error for the time step.

In [None]:
%%add_to PPOAgent

def calculate_gae(self, next_obs, next_dones):
    """
    Fill self.advantages and self.returns from the stored rollout using generalized
    advantage estimation (GAE).

    Working backwards from the last step, each advantage is the TD error of the step plus
    the advantage of the following step scaled by gamma * gae_lambda. A done flag for the
    following step removes both the bootstrap value and the carried-over advantage.

    The returns are the advantages plus the stored values.
    Args:
        next_obs: the next observation at the end of the rollouts
        next_dones: the next dones at the end of the rollouts
    """
    final_step = self.rollout_length - 1
    trace_decay = self.gamma * self.gae_lambda

    with torch.no_grad():
        bootstrap_obs = torch.tensor(next_obs).to(self.device)
        running_gae = 0
        for t in range(final_step, -1, -1):
            if t == final_step:
                # the step after the rollout: value from the critic, done flags as given
                continuing = 1.0 - next_dones
                value_after = self.agent_network.get_value(bootstrap_obs).reshape(1, -1)
            else:
                continuing = 1.0 - self.dones[t + 1]
                value_after = self.values[t + 1]

            # one-step TD error of step t
            td_error = self.rewards[t] + self.gamma * continuing * value_after - self.values[t]
            # recursive GAE update, carried to the previous step in the next iteration
            running_gae = td_error + trace_decay * continuing * running_gae

            self.advantages[t] = running_gae
            self.returns[t] = running_gae + self.values[t]

In [None]:
# recompute advantages and returns for the same rollout, this time with GAE
agent.calculate_gae(next_obs, next_dones)

## Train one epoch

Now we want to train one epoch using the rollouts and the already calculated advantages and returns. The following steps are necessary:
- Flatten the data from the vectorized environment so that they look like from one environment
- Shuffle the indices
- Calculate start and end of a minibatch and select the indices for the minibatch from this range in the shuffled indices
- Calculate the tensors of the necessary values
- Calculate the log probabilities, entropy and value function from the observation and actions in the minibatch
- Calculate the ratio between the new and old policy (use the difference between the log probabilities and apply the exponential function to the result)
- Calculate the PPO policy loss as follows:
    - Calculate -advantage * ratio
    - Calculate -advantage * clamp (ratio, 1-clip_coef, 1+clip_coef)
    - Calculate the max of those two values
    - Calculate the mean of the result
- Calculate the value loss
- Calculate the entropy loss
- Calculate the total loss as sum of all losses
- Perform gradient descent

In [None]:
%%add_to PPOAgent
def train_epoch(self, verbose=False):
    """
    Train one epoch on the collected rollout using the already calculated advantages and
    returns. Looping over epochs needs to be done in the main training loop.

    All rollout_length * num_envs samples are shuffled and visited exactly once, in mini
    batches of at most batch_size samples (the last batch may be smaller).

    Args:
        verbose: print the policy, scaled value and scaled entropy loss of each mini batch.
    """

    # the buffers are indexed by (step, environment); flatten them to one sample axis
    obs = self.obs.reshape((-1,) + self.observation_space.shape)
    actions = self.actions.reshape((-1,) + self.action_space.shape)
    log_probs = self.log_probs.reshape(-1)
    returns = self.returns.reshape(-1)
    advantages = self.advantages.reshape(-1)

    # shuffle over ALL flattened samples, not only over the number of rollout steps
    n_samples = log_probs.shape[0]
    indices = np.arange(n_samples)
    np.random.shuffle(indices)

    for start in range(0, n_samples, self.batch_size):
        # slicing past the end is safe, so the last batch is simply shorter
        mini_batch_indices = indices[start:start + self.batch_size]

        # Convert the mini batch to tensors. The buffers are float64 numpy arrays, so the
        # dtypes are set explicitly: float32 to match the network, long for action indices.
        obs_tensor = torch.tensor(obs[mini_batch_indices], dtype=torch.float32).to(self.device)
        actions_tensor = torch.tensor(actions[mini_batch_indices], dtype=torch.long).to(self.device)
        advantages_tensor = torch.tensor(advantages[mini_batch_indices], dtype=torch.float32).to(self.device)
        returns_tensor = torch.tensor(returns[mini_batch_indices], dtype=torch.float32).to(self.device)
        log_probs_tensor = torch.tensor(log_probs[mini_batch_indices], dtype=torch.float32).to(self.device)

        # calculate the log probs and values using the current weights
        new_log_probs, entropy, new_value = self.agent_network.get_probs_and_value(obs_tensor, actions_tensor)

        # probability ratio between the current policy and the policy used for the rollout
        log_ratio = new_log_probs - log_probs_tensor
        ratio = torch.exp(log_ratio)

        # clipped PPO loss (negated, as we minimize): the max of the two negated terms
        # corresponds to the min of the clipped and unclipped objective
        policy_loss_unclipped = -advantages_tensor * ratio
        policy_loss_clipped = -advantages_tensor * torch.clamp(ratio, 1 - self.clip_coef, 1 + self.clip_coef)
        policy_loss = torch.max(policy_loss_unclipped, policy_loss_clipped).mean()

        # mean squared error between critic value and return; reshape(-1) keeps a 1D shape
        # even for a mini batch with a single sample
        v_loss = ((new_value.reshape(-1) - returns_tensor) ** 2).mean()
        v_loss_scaled = self.value_loss_coef * v_loss

        # negative entropy, so that minimizing the loss favors higher entropy
        entropy_loss = -entropy.mean()
        entropy_loss_scaled = self.entropy_loss_coef * entropy_loss

        # use combined loss for gradient calculation
        loss = policy_loss + v_loss_scaled + entropy_loss_scaled

        self.optimizer.zero_grad()
        loss.backward()

        # clipping is recommended but the actual clipping value is not so important
        nn.utils.clip_grad_norm_(self.agent_network.parameters(), 0.5)
        self.optimizer.step()

        # display some values if verbose; an experiment logger would be the better choice
        if verbose:
            print(f'p: {policy_loss:7.4} v: {v_loss_scaled:7.4} e:{entropy_loss_scaled:7.4}', end='\r')




In [None]:
# run a single epoch on the rollout collected above and print the losses of each mini batch
agent.train_epoch(verbose=True)

## Train loop and evaluation

As the last task we now have to put it all together in the main training loop. For each train step
- calculate the rollouts
- calculate the advantages and returns (either by gae or returns)
- train for the specified number of epochs
- (Variation: Calculate advantages between epochs)

In [None]:
%%add_to PPOAgent
def train(self, envs: gym.Env, nr_steps: int, eval_env: gym.Env, eval_frequency: int,
          eval_episodes: int, verbose: bool = False):
    """
    Run the main training loop. Each step collects one rollout, calculates advantages and
    returns for it and then trains on it for the configured number of epochs.
    Args:
        envs: The environments to train on. The number of environments must match the number
        given in the constructor parameter.
        nr_steps: The number of steps to train for.
        eval_env: The environment to evaluate on.
        eval_frequency: How often to run the evaluation (every eval_frequency-th step,
        starting with step 0).
        eval_episodes: How many episodes to run the evaluation.
        verbose: display training stats

    """
    # start from freshly reset environments with no environment marked as done
    next_obs, _ = envs.reset()
    next_dones = np.zeros(self.num_envs)

    for iteration in range(nr_steps):
        # gather experience, continuing where the previous rollout stopped
        next_obs, next_dones = self.rollout(envs, next_obs, next_dones)

        # advantages and returns: GAE or plain discounted returns
        estimate = self.calculate_gae if self.use_gae else self.calculate_returns
        estimate(next_obs, next_dones)

        # several passes over the same rollout
        for _ in range(self.nr_epochs):
            self.train_epoch(verbose=verbose)

        if iteration % eval_frequency != 0:
            continue

        evaluated_returns = self.evaluate(eval_env, eval_episodes)
        if verbose:
            # move past the loss line, which train_epoch prints without a newline
            print('')
        print(f'Step {iteration} : Return {np.mean(evaluated_returns)}')

def evaluate(self, env, nr_episodes: int):
    """
    Evaluate the agent on the given environment for the given number of episodes.

    Actions are sampled from the policy. Each episode runs until the environment reports
    termination or truncation. No gradients are tracked and the policy is only queried for
    observations that are actually acted on.
    Args:
        env: the (single, non-vectorized) environment to evaluate on.
        nr_episodes: the number of episodes to run the evaluation.

    Returns:
        list with the undiscounted return of each episode
    """
    rewards = []
    for _ in range(nr_episodes):
        obs, _ = env.reset()
        done = False
        truncated = False
        episode_reward = 0
        while not (done or truncated):
            obs_tensor = torch.tensor(obs, dtype=torch.float32).to(self.device)
            # evaluation never backpropagates, so skip building the autograd graph
            with torch.no_grad():
                action = self.agent_network.get_action(obs_tensor)
            obs, reward, done, truncated, _ = env.step(action)
            episode_reward += reward
        rewards.append(episode_reward)
    return rewards

In [None]:
# smoke test: a single training step, evaluated on `env` for one episode
agent.train(envs,1,env,1,1)

## Test the full implementation

In [None]:
# Full CartPole run: everything is set up again so that this cell does not depend on the
# objects created in the cells above.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
num_envs = 10

environment_name = 'CartPole-v1'
# `env` is the single environment used for evaluation, `envs` the vectorized ones for training
env = gym.make(environment_name)
envs = gym.make_vec(environment_name, num_envs=num_envs, vectorization_mode=gym.VectorizeMode.SYNC)

# NOTE(review): `network`, `next_obs` and `next_dones` are not used in this cell; the agent
# builds its own network and train() resets the environments itself.
network = AgentNetwork(env.observation_space.shape[0], env.action_space.n)
next_obs, _ = envs.reset()
next_dones = np.zeros(num_envs)

agent = PPOAgent(env.observation_space, env.action_space,
                 num_envs=num_envs,
                 agent_network_cls=AgentNetwork,
                 device=device,
                 gamma=0.99,
                 learning_rate=0.003,
                 rollout_length=256,
                 nr_epochs=5,
                 batch_size=64,
                 use_gae=True,
                 gae_lambda=0.95,
                 clip_coef=0.2,
                 value_loss_coef=0.5,
                 entropy_loss_coef=0.01)

training_steps = 300
# evaluate every 10 training steps on 10 episodes, without printing the per-batch losses
agent.train(envs, training_steps, env, 10, 10, False)

In [None]:
def display_environment(env):
    """
    Show the current frame of the environment in the notebook output.

    The image returned by env.render() is drawn without axes; the output area is cleared
    only when the next output arrives (wait=True) and the figure is closed afterwards.
    """
    fig = plt.figure(figsize=(6, 4))
    frame = env.render()
    plt.imshow(frame)
    plt.axis('off')
    display.display(fig)
    display.clear_output(wait=True)
    plt.close(fig)

def play_env(env, agent_network, device, max_steps=501):
    """
    Play one episode with actions sampled from the agent and display every frame.

    The episode ends when the environment reports termination or truncation, or after
    max_steps steps, whichever comes first. Stopping on truncation avoids stepping an
    environment whose episode has already ended through its time limit.
    Args:
        env: the (single) environment; its render() result is shown with plt.imshow, so
        it has to return an image.
        agent_network: network providing get_action for an observation tensor.
        device: the device the observation tensors are moved to.
        max_steps: upper bound on the number of steps played.
    """
    obs, _ = env.reset()
    for _ in range(max_steps):
        display_environment(env)

        obs_tensor = torch.tensor(obs, dtype=torch.float32).to(device)
        # playing never backpropagates, so skip building the autograd graph
        with torch.no_grad():
            action = agent_network.get_action(obs_tensor)

        obs, _, terminated, truncated, _ = env.step(action)
        if terminated or truncated:
            break

In [None]:
# separate environment with rgb_array rendering, since play_env displays the rendered frames
env_play = gym.make(environment_name, render_mode='rgb_array')
play_env(env_play, agent.agent_network, device)

## Lunar Lander

 Next we want to try a new and slightly more complicated environment: Lunar Lander

 Do the default parameters work? Can we solve the environment with PPO?

 The goal of lunar lander is to get a return of >200

In [None]:
num_envs = 10

environment_name = 'LunarLander-v3'
# `env` is the single environment used for evaluation, `envs` the vectorized ones for training
env = gym.make(environment_name)
envs = gym.make_vec(environment_name, num_envs=num_envs, vectorization_mode=gym.VectorizeMode.SYNC)

# NOTE(review): `network`, `next_obs` and `next_dones` are not used in this cell; the agent
# builds its own network and train() resets the environments itself.
network = AgentNetwork(env.observation_space.shape[0], env.action_space.n)
next_obs, _ = envs.reset()
next_dones = np.zeros(num_envs)

# Compared with the CartPole run: lower learning rate, longer rollouts, fewer epochs,
# larger mini batches, higher GAE lambda and a smaller value loss weight.
agent = PPOAgent(env.observation_space, env.action_space,
                 num_envs=num_envs,
                 agent_network_cls=AgentNetwork,
                 device=device,
                 gamma=0.99,
                 learning_rate=0.0003,
                 rollout_length=512,
                 nr_epochs=2,
                 batch_size=128,
                 use_gae=True,
                 gae_lambda=0.97,
                 clip_coef=0.2,
                 value_loss_coef=0.1,
                 entropy_loss_coef=0.01)

In [None]:
training_steps = 1000
# evaluate every 50 training steps on 2 episodes, without printing the per-batch losses
agent.train(envs, training_steps, env, 50, 2, False)
# This saves the whole module object rather than only its state_dict, so loading the file
# needs the AgentNetwork class to be available.
# NOTE(review): consider saving agent.agent_network.state_dict() instead.
torch.save(agent.agent_network, 'lunar_lander.pt')

In [None]:
# separate environment with rgb_array rendering, since play_env displays the rendered frames
env_play = gym.make(environment_name, render_mode='rgb_array')
play_env(env_play, agent.agent_network, device)

In [None]:
# weights_only=False is needed because the file holds a whole module object. It unpickles
# arbitrary objects, so only load files from a trusted source.
loaded_network = torch.load('lunar_lander.pt', weights_only=False)

In [None]:
# play one episode with the network loaded from disk instead of the one held by the agent
play_env(env_play, loaded_network, device)