# **PROXIMAL POLICY OPTIMIZATION**

**Lab exercise created by [Juan José Nieto](https://www.linkedin.com/in/juan-jose-nieto-salas/) for the [Postgraduate course in Artificial Intelligence with Deep Learning](https://www.talent.upc.edu/ing/estudis/formacio/curs/310400/postgrau-artificial-intelligence-deep-learning/) at [UPC School](https://www.talent.upc.edu/ing/) (2021).**

(This version is adapted for a short lab)


# Installing dependencies

In [None]:
!pip install swig --quiet
!pip install gym[box2d]==0.17.3 --quiet
!pip install Box2D wandb --quiet

# install utilities for rendering OpenAI Gym videos in Colab
!apt-get -qq install -y xvfb x11-utils
!sudo apt -qq install -y python3-opengl
!pip install pyvirtualdisplay==0.2.* \
             PyOpenGL==3.1.* \
             PyOpenGL-accelerate==3.1.* \
             box2d-kengz \
             --quiet


# Setting up the environment

In [None]:
import base64
import glob
import io
import os
import math
import timeit
import warnings

from IPython.display import HTML
from IPython.display import display

In [None]:
import gymnasium as gym
import wandb
import random

import numpy as np
from random import randint
from collections import namedtuple

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.distributions import Categorical
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

In [None]:
# Launch a virtual X server (Xvfb) on display :1 in the background, then point
# DISPLAY at it so that frames can be rendered for the recorded videos.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
os.environ.update(DISPLAY=":1")

# utility to get video file from directory
def get_video_filename(dir="video"):
    """Return the path of the most recently modified ``.mp4`` file in *dir*.

    ``glob.glob`` yields matches in arbitrary order, so the newest recording is
    chosen explicitly by modification time rather than by list position.

    Args:
        dir: Directory searched (non-recursively) for ``*.mp4`` files.

    Returns:
        The path (``dir`` joined with the file name) of the newest video.

    Raises:
        AssertionError: If *dir* contains no ``.mp4`` file.
    """
    mp4list = glob.glob(os.path.join(dir, "*.mp4"))
    assert len(mp4list) > 0, "couldnt find video files"
    return max(mp4list, key=os.path.getmtime)

# Random Agent

In [None]:
# Build the environment and wrap it so that the episode is saved under ./video
env = gym.wrappers.RecordVideo(
    gym.make("LunarLander-v3", render_mode="rgb_array"),
    "./video",
)

ob, _ = env.reset()
total_rew = 0
done = truncated = False

# Play a single episode, drawing every action uniformly from the action space
while True:
    env.render()
    ac = env.action_space.sample()
    ob, rew, done, truncated, info = env.step(ac)
    total_rew += rew
    if done or truncated:
        break

print('Cumulative reward:', total_rew)

env.close()

In [None]:
# Log in to Weights & Biases; wandb.init() is called in the cells that follow
wandb.login()

# Visualize random policy in Wandb

In [None]:
# W&B project name passed to every wandb.init() call in this notebook
PROJECT = "AIDL-DRL"

In [None]:
# Open a dedicated W&B run, attach the recorded random-agent episode, close the run
wandb.init(project=PROJECT)
wandb.run.name = 'lunarlander_random_agent'

mp4 = get_video_filename()
random_agent_video = wandb.Video(mp4, format="mp4")
wandb.log({"Video eval": random_agent_video})

wandb.finish()

# Create the model
PPO is a policy-optimization algorithm that, like the method in the previous lab, uses the actor-critic framework. For this reason the network has two heads on top of a shared body: one outputs the action probabilities (the actor) and the other estimates the state value (the critic).

In [None]:
class Agent(nn.Module):
    """Actor-critic network: a shared MLP body followed by two linear heads.

    The actor head produces one score per action (turned into probabilities
    with a softmax) and the critic head produces a single state-value estimate.

    Args:
        obs_len: Size of the observation vector fed to the network.
        act_len: Number of discrete actions.
    """

    def __init__(self, obs_len, act_len):
        super().__init__()

        self.obs_len = obs_len
        self.act_len = act_len

        # Shared feature extractor used by both heads
        self.mlp = nn.Sequential(
            nn.Linear(obs_len, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
        )

        self.actor = nn.Linear(128, act_len)
        self.critic = nn.Linear(128, 1)

    def forward(self, state):
        """Return ``(action_probs, state_value)`` for *state*.

        Args:
            state: Float tensor whose last dimension has size ``obs_len``;
                a leading batch dimension is optional.

        Returns:
            A tuple of the action probabilities (last dimension ``act_len``,
            summing to 1) and the state value (last dimension 1).
        """
        features = self.mlp(state)
        action_scores = self.actor(features)
        state_value = self.critic(features)
        # Normalise over the last (action) axis so that both batched and
        # unbatched inputs are handled; for 2-D input this equals dim=1.
        return F.softmax(action_scores, dim=-1), state_value

    def compute_action(self, state):
        """Sample an action for a single observation.

        Args:
            state: One observation (array-like of length ``obs_len``).

        Returns:
            Tuple ``(action, log_prob, state_value)`` as plain Python numbers:
            the sampled action index, its log-probability under the current
            policy, and the critic's value estimate.
        """
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)

        # Only Python scalars leave this method, so no autograd graph is needed.
        with torch.no_grad():
            probs, state_value = self(state)

        m = torch.distributions.Categorical(probs)
        action = m.sample()

        return action.item(), m.log_prob(action).item(), state_value.item()


# Replay memory
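As in DQN, transitions are stored in a buffer so that the same data can be reused for several gradient updates. Here the buffer is filled with transitions collected by the current policy, and a training phase runs every time the buffer becomes full.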

In [None]:
# Record layout of one stored step: state, action, log-probability of that
# action when it was taken, reward and next state. States are 8 float64 values.
transition = np.dtype([
    ('s', np.float64, (8,)),
    ('a', np.float64),
    ('a_logp', np.float64),
    ('r', np.float64),
    ('s_', np.float64, (8,)),
])


class ReplayMemory():
    """Fixed-capacity buffer of ``transition`` records written in a ring."""

    def __init__(self, capacity):
        self.buffer_capacity = capacity
        self.buffer = np.empty(capacity, dtype=transition)
        self.counter = 0  # index of the slot the next transition is written to

    def store(self, transition):
        """Write *transition* into the next slot.

        Returns True when this write filled the last slot (the write index
        then wraps back to 0), and False otherwise.
        """
        self.buffer[self.counter] = transition
        self.counter = (self.counter + 1) % self.buffer_capacity
        return self.counter == 0

In [None]:
def compute_returns_and_advantages(rewards, values, gamma):
    """Compute standardized discounted returns and advantages.

    Walking backwards over the sequence, the return is accumulated as
    ``G_t = R_t + gamma * G_{t+1}`` (with ``G`` after the last step taken as 0)
    and the advantage as ``G_t - values[t]``. Both results are then shifted to
    zero mean and scaled to unit standard deviation.

    Args:
        rewards: Tensor of rewards indexed by time step along dim 0.
        values: Tensor of value estimates, indexed like ``rewards``.
        gamma: Discount factor.

    Returns:
        Tuple ``(returns, advantages)``, each shaped like ``rewards``.
    """
    def _standardize(x):
        # The small constant guards against division by a zero std.
        return (x - x.mean()) / (x.std() + 1e-10)

    returns = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)

    discounted_tail = 0  # return of the step that follows t
    for t in range(len(rewards) - 1, -1, -1):
        returns[t] = rewards[t] + gamma * discounted_tail
        discounted_tail = returns[t]
        advantages[t] = returns[t] - values[t]

    return _standardize(returns), _standardize(advantages)

**Exercise 1: Compute the ratio between the new and the old policy.**

**Exercise 2: Modify the surrogate objective by clipping the probability ratio.**

**Exercise 3: Compute the loss function weighting each term correspondingly.**


In [None]:
def train(policy, optimizer, memory, hparams):
    """Run ``ppo_epoch`` full-batch PPO (clipped surrogate) updates on *memory*.

    Returns and advantages are computed once, without gradients, from the
    stored rewards and the policy's value predictions; the same targets are
    then reused in every epoch.

    Args:
        policy: Module whose forward pass maps a batch of states to
            ``(action_probs, state_values)``.
        optimizer: Optimizer over the parameters of ``policy``.
        memory: Object exposing a structured array ``buffer`` with the fields
            ``'s'``, ``'a'``, ``'a_logp'`` and ``'r'``.
        hparams: Dict providing ``gamma``, ``ppo_epoch`` (must be at least 1,
            since the returned statistics come from the last epoch),
            ``clip_param``, ``c1`` (value-loss weight) and ``c2``
            (entropy-bonus weight).

    Returns:
        Tuple of floats from the last epoch: the negated surrogate objective,
        the value loss, the mean entropy and the mean probability ratio.
    """
    gamma = hparams['gamma']
    ppo_epoch = hparams['ppo_epoch']
    clip_param = hparams['clip_param']
    c1 = hparams['c1']
    c2 = hparams['c2']

    s = torch.tensor(memory.buffer['s'], dtype=torch.float)
    # Actions are stored as floats in the buffer; they are category indices.
    a = torch.tensor(memory.buffer['a'], dtype=torch.long)
    r = torch.tensor(memory.buffer['r'], dtype=torch.float).view(-1, 1)
    old_a_logp = torch.tensor(memory.buffer['a_logp'], dtype=torch.float).view(-1, 1)

    with torch.no_grad():
        value_pred = policy(s)[1]
        returns, advantages = compute_returns_and_advantages(r, value_pred, gamma)

    for _ in range(ppo_epoch):
        # A single forward pass provides both the policy and the value output.
        probs, values = policy(s)
        dist = Categorical(probs)
        entropy = dist.entropy().mean()

        a_logp = dist.log_prob(a).unsqueeze(dim=1)

        # Exercise 1: pi / pi_old = exp(ln pi - ln pi_old)
        ratio = torch.exp(a_logp - old_a_logp)

        surr1 = ratio * advantages

        # Exercise 2: keep the ratio inside [1 - clip_param, 1 + clip_param]
        surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages

        # Surrogate objective (to be maximised): pessimistic bound of the two
        policy_loss = torch.min(surr1, surr2).mean()
        value_loss = F.smooth_l1_loss(values, returns)

        # Exercise 3: the optimizer minimises, so the surrogate objective and
        # the entropy bonus enter with a minus sign and the value loss with a
        # plus sign.
        loss = -policy_loss + c1 * value_loss - c2 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return -policy_loss.item(), value_loss.item(), entropy.item(), ratio.mean().item()

def test(env, policy, render=False):
    """Play one evaluation episode, log its video to W&B and return the reward.

    Args:
        env: Environment to evaluate in. It is closed before returning. The
            video is expected to be written by a recording wrapper around it
            (callers in this file pass ``gym.wrappers.RecordVideo``).
        policy: Agent exposing ``compute_action(state)``, whose first returned
            element is the action to take.
        render: When True, additionally call ``env.render()`` on every step.

    Returns:
        The sum of the rewards collected during the episode.
    """
    state, _ = env.reset()
    ep_reward = 0
    done = truncated = False

    # Evaluation only: no gradients are needed for action selection.
    with torch.no_grad():
        while not (done or truncated):
            if render:
                env.render()
            action, _, _ = policy.compute_action(state)
            state, reward, done, truncated, _ = env.step(action)
            ep_reward += reward

    # Closing the environment before looking up the video file on disk.
    env.close()
    mp4 = get_video_filename()
    wandb.log({"Video eval": wandb.Video(mp4, format="mp4")})
    return ep_reward

## Hyperparameters

These values were found by running a sweep over some of the hyperparameters.

In this [Colab](https://colab.research.google.com/drive/1A3yA_jAiPKi3H7YDSYJekl64Vz7JxhDi?usp=sharing) you will find the code used to execute it. And in this [report](https://wandb.ai/juanjo3ns/lunar-lander/reports/Hyperparameter-Sweeping-in-PPO-LunarLander--Vmlldzo3NjA0MDY?accessToken=ibdz6huvu28hl5wv53szeurltq481riu5wexihxqek645b1ymr27jilpo5xili4y) you can see the results of these runs.

In [None]:
hparams = {
    'gamma': 0.99,         # discount factor used when computing returns
    'log_interval': 100,   # episodes between progress prints / evaluation videos
    'num_episodes': 2000,  # maximum number of training episodes
    'lr': 1e-3,            # learning rate of the Adam optimizer
    'clip_param': 0.1,     # clipping range for the PPO probability ratio
    'ppo_epoch': 10,       # optimisation passes per training phase
    'replay_size': 930,    # buffer capacity; training starts each time it fills
    'batch_size': 128,     # read by train()
    # Loss-term weights read by train(); in the PPO paper c1 scales the value
    # loss and c2 the entropy bonus.
    'c1': 1.0,
    'c2': 0.021,
}


In [None]:
# Create environment
# (render_mode="rgb_array" so that the RecordVideo wrapper applied later can
#  capture frames for the evaluation videos)
env = gym.make("LunarLander-v3", render_mode="rgb_array")

In [None]:
# Network sizes taken from the environment: length of the observation vector
# and number of discrete actions
n_inputs = env.observation_space.shape[0]
n_actions = env.action_space.n

In [None]:
# Fix random seeds (for reproducibility): Python, NumPy, PyTorch and, further
# below, the environment itself
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Initialize wandb run
wandb.finish()  # close any run that is still open so that runs do not overlap (advice: later remove duplicates in wandb)
wandb.init(project=PROJECT, config=hparams)
wandb.run.name = 'ppo_lunarlander_train_0'


# Create policy and optimizer
policy = Agent(n_inputs, n_actions)
optimizer = torch.optim.Adam(policy.parameters(), lr=hparams['lr'])

eps = np.finfo(np.float32).eps.item()
memory = ReplayMemory(hparams['replay_size'])

# Seed the environment's random generator once; the unseeded reset() calls in
# the loop keep using that generator, which makes the episodes repeatable.
env.reset(seed=seed)

# Training loop
print(f"Target reward: {env.spec.reward_threshold}")
running_reward = -100
ep_rew_history_reinforce = []
for i_episode in range(hparams['num_episodes']):
    # Collect experience
    state, _ = env.reset()
    ep_reward = 0
    done, truncated = False, False

    while not (done or truncated):
        action, a_logp, state_value = policy.compute_action(state)
        next_state, reward, done, truncated, _ = env.step(action)

        # store() returns True when the buffer has just been filled: that is
        # the moment to run a training phase on the collected transitions.
        if memory.store((state, action, a_logp, reward, next_state)):
            policy_loss, value_loss, avg_entropy, ratio = train(policy, optimizer, memory, hparams)
            wandb.log(
                {
                    'policy_loss': policy_loss,
                    'value_loss': value_loss,
                    'running_reward': running_reward,
                    'mean_entropy': avg_entropy,
                    'ratio': ratio,
                })

        state = next_state
        ep_reward += reward

    # Update running reward (exponential moving average of the episode reward)
    running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

    ep_rew_history_reinforce.append((i_episode, ep_reward))
    if i_episode % hparams['log_interval'] == 0:
        print(f'Episode {i_episode}\tLast reward: {ep_reward:.2f}\tAverage reward: {running_reward:.2f}')
        # Record one evaluation episode and upload its video
        test_env = gym.wrappers.RecordVideo(env, "./video")
        ep_reward = test(test_env, policy)

    if running_reward > env.spec.reward_threshold:
        print("Solved!")
        break

print(f"Finished training! Running reward is now {running_reward}")
test_env = gym.wrappers.RecordVideo(env, "./video")
ep_reward = test(test_env, policy)
wandb.finish()