Notebook created by [Víctor Campos](https://imatge.upc.edu/web/people/victor-campos)

Updated by [Juan José Nieto](https://www.linkedin.com/in/juan-jose-nieto-salas/) - UPC School - AIDL Spring 2021

# Basic Policy Gradients
# **REINFORCE**

This notebook is adapted from the [official REINFORCE tutorial](https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py).

## Installing dependencies


In [None]:
!pip install swig --quiet
!pip install gymnasium wandb pygame --quiet

# install utilities for rendering OpenAI Gym videos in Colab
!apt-get -qq install -y xvfb x11-utils
!pip install pyvirtualdisplay==0.2.* \
             PyOpenGL==3.1.* \
             PyOpenGL-accelerate==3.1.* \
             --quiet


## Setting up the environment

In [None]:
import base64
import glob
import io
import os
import math
import timeit
import warnings

from IPython.display import HTML
from IPython.display import display

In [None]:
import gymnasium as gym
import wandb
import random

import numpy as np
from datetime import datetime
from random import randint
from collections import namedtuple

import matplotlib.pyplot as plt

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Start a fake (virtual) screen in the background so that videos can be
# rendered on a machine without a physical display.
# The trailing "&" runs Xvfb as a background process on display :1.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point this process at the same display number that Xvfb was started on.
os.environ["DISPLAY"] = ":1"

# utility to get video file from directory
def get_video_filename(dir="video"):
  """Return the path of the most recently modified ``.mp4`` file in ``dir``.

  ``glob`` returns matches in arbitrary order, so the newest recording is
  selected explicitly by modification time (ties are broken by path name)
  instead of relying on the position in the glob result.

  Args:
    dir: directory that is searched (non-recursively) for ``*.mp4`` files.

  Returns:
    Path (``dir`` joined with the file name) of the newest video.

  Raises:
    AssertionError: if ``dir`` contains no ``.mp4`` file.
  """
  mp4list = glob.glob(os.path.join(dir, "*.mp4"))
  assert len(mp4list) > 0, "couldnt find video files"
  return max(mp4list, key=lambda path: (os.path.getmtime(path), path))

## Visualize a random policy in the environment

Our goal is to train an agent that is capable of solving the CartPole problem, where a pole is attached to a cart moving along a horizontal track. The agent can interact with the environment by applying a force (+1/-1) to the cart. The episode terminates whenever the pole tilts more than 12 degrees from vertical or the cart goes out of bounds on the horizontal axis; in `CartPole-v1` the episode is additionally truncated after 500 timesteps. The agent receives +1 reward for each timestep under the desired conditions.

We can visualize what a random policy would do in this environment:

In [None]:
env = gym.make("CartPole-v1", render_mode="rgb_array")

# Wrap the environment so that the episode is recorded as a video under ./video
env = gym.wrappers.RecordVideo(env, "./video")

ob, _ = env.reset()
done, truncated, total_rew = False, False, 0

# step() reports two separate end-of-episode flags (terminated / truncated);
# the rollout must stop on either one, otherwise a finished environment
# would keep being stepped.
while not (done or truncated):
  env.render()

  # Random policy: sample an action uniformly from the action space
  ac = env.action_space.sample()

  ob, rew, done, truncated, info = env.step(ac)

  total_rew += rew

print('Cumulative reward:', total_rew)

env.close()

# Log in to your Wandb account

In [None]:
wandb.login()

# Visualize random policy in Wandb

In [None]:
# Name of the W&B project that every wandb.init() call in this notebook logs to
PROJECT = "AIDL-DRL"

In [None]:
# Open a W&B run in PROJECT and give it a readable name
wandb.init(project=PROJECT)
wandb.run.name = 'cartpole_random_agent'
# Pick up a video from the default "video" directory and upload it
mp4 = get_video_filename()
wandb.log({"Video eval": wandb.Video(mp4, format="mp4")})
# Close the run so that later cells can start a new one
wandb.finish()

## Create the model

Now we will define our policy, parameterized by a feedforward neural network.

**Exercise #1.** Implement the policy as an MLP with one hidden layer of 128 neurons and a ReLU activation. Do not add a final activation layer: we will use the logits directly.

In [None]:
class Policy(nn.Module):
    """MLP policy that maps an observation to unnormalized action logits.

    Architecture: Linear(inputs, 128) -> ReLU -> Linear(128, outputs).
    No final activation is applied; the raw logits are meant to be fed to
    ``torch.distributions.Categorical(logits=...)``.

    Besides the network, the module carries three per-episode buffers that
    are filled while collecting experience and emptied after each update:
    ``saved_log_probs``, ``rewards`` and ``entropy``.

    Args:
        inputs: size of the observation vector.
        outputs: number of discrete actions.
    """

    def __init__(self, inputs, outputs):
        super(Policy, self).__init__()
        # Hidden layer with 128 units followed by the logits layer
        self.affine1 = nn.Linear(inputs, 128)
        self.affine2 = nn.Linear(128, outputs)

        # Per-episode buffers
        self.saved_log_probs = []
        self.rewards = []
        self.entropy = []

    def forward(self, x):
        """Return the action logits for a batch of observations ``x``."""
        hidden = F.relu(self.affine1(x))
        return self.affine2(hidden)


## Functions for collecting experience and updating the policy

**Exercise #2.** Forward the state through the policy to get the logits that will parametrize the Categorical distribution.

**Exercise #3.** Compute the return from the rewards collected by the policy.

**Exercise #4.** Complete the loss computation using the returns and the log probs.

In [None]:
def select_action(policy, state):
    """Sample an action from the policy for a single observation.

    The log-probability of the sampled action and the entropy of the action
    distribution are appended to ``policy.saved_log_probs`` and
    ``policy.entropy`` respectively.

    Args:
        policy: callable module returning action logits for a batch of
            observations; must expose the ``saved_log_probs`` and
            ``entropy`` lists.
        state: NumPy array holding one observation.

    Returns:
        The sampled action as a Python int.
    """
    # Convert state into a float PyTorch tensor with a leading batch dimension
    state = torch.from_numpy(state).float().unsqueeze(0)

    # Logits for each action; Categorical turns them into probabilities
    logits = policy(state)

    # Sample action from a categorical distribution.
    m = torch.distributions.Categorical(logits=logits)
    action = m.sample()
    # Bookkeeping: the log-prob keeps its autograd graph (needed for the loss),
    # the entropy is stored as a plain float for logging only.
    policy.saved_log_probs.append(m.log_prob(action))
    policy.entropy.append(m.entropy().item())

    return action.item()


def train(policy, optimizer, gamma):
    """Run one REINFORCE update from the episode stored in ``policy``.

    Computes the discounted return-to-go for every step, normalizes the
    returns, builds the surrogate loss ``sum_t -log_prob_t * G_t`` and takes
    one optimizer step. The ``rewards``, ``saved_log_probs`` and ``entropy``
    buffers of ``policy`` are emptied afterwards.

    Args:
        policy: module exposing the ``rewards``, ``saved_log_probs`` and
            ``entropy`` lists filled during the rollout.
        optimizer: optimizer over the policy parameters.
        gamma: discount factor.

    Returns:
        The value of the surrogate policy loss as a Python float.
    """
    # Discounted returns, accumulated back to front: G_t = r_t + gamma * G_{t+1}
    G = 0.0
    returns = []
    for r in reversed(policy.rewards):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()  # back to chronological order (linear instead of insert(0, ...))
    returns = torch.tensor(returns, dtype=torch.float32)

    # Normalize returns (this usually accelerates convergence).
    # Skipped for a single-step episode: the sample std of one element is NaN
    # and would poison the parameters.
    if returns.numel() > 1:
        eps = torch.finfo(torch.float32).eps  # guards against division by zero
        returns = (returns - returns.mean()) / (returns.std() + eps)

    # Minimizing -log_prob * G performs gradient ascent on the expected return
    policy_loss = [-log_prob * G for log_prob, G in zip(policy.saved_log_probs, returns)]

    # Update policy:
    #  (1) reset optimizer grads
    optimizer.zero_grad()
    #  (2) compute surrogate policy gradients loss
    policy_loss = torch.cat(policy_loss).sum()
    #  (3) SGD step
    policy_loss.backward()
    optimizer.step()

    # Empty the episode buffers in place
    del policy.rewards[:]
    del policy.saved_log_probs[:]
    del policy.entropy[:]

    return policy_loss.item()


def test(env, policy, video_path='./video', render=False):
    """Run one evaluation episode and upload its video to W&B.

    Actions are chosen with ``select_action``, which appends to the policy's
    ``saved_log_probs`` and ``entropy`` buffers. Those entries are removed
    again here, and the rollout runs without autograd, so evaluation does not
    leak into the data used by the next training update.

    Args:
        env: environment to evaluate on; it is closed at the end of the
            episode, before the video file is looked up.
        policy: policy passed to ``select_action``.
        video_path: directory searched for the video to upload.
        render: currently unused; kept for interface compatibility.

    Returns:
        The cumulative reward of the evaluation episode.
    """
    # Remember the buffer sizes so only the entries added here are dropped
    n_log_probs = len(policy.saved_log_probs)
    n_entropy = len(policy.entropy)

    state, _ = env.reset()
    ep_reward, done, truncated = 0, False, False
    with torch.no_grad():
        while not (done or truncated):
            action = select_action(policy, state)
            state, reward, done, truncated, info = env.step(action)
            ep_reward += reward

    del policy.saved_log_probs[n_log_probs:]
    del policy.entropy[n_entropy:]

    env.close()
    mp4 = get_video_filename(video_path)
    wandb.log({"Video eval": wandb.Video(mp4, format="mp4")})

    return ep_reward

## Training the agent

In [None]:
# Hyperparameters (also passed to wandb.init as the run config)
hparams = {
    'gamma' : 0.99,             # discount factor
    'log_interval' : 25,        # controls how often we log progress, in episodes
    'num_episodes': 1500,       # maximum number of episodes to train on
    'lr' : 1e-2,                # learning rate
    'max_ep_len': 1000,         # maximum number of steps collected per episode
}

In [None]:
# Create environment
# (rgb_array rendering; this env is wrapped in RecordVideo after training)
env_name = 'CartPole-v1'
env = gym.make(env_name, render_mode="rgb_array")

In [None]:
# Get the observation size (policy input) and the number of discrete
# actions (policy output) from the gym observation / action spaces
n_inputs = env.observation_space.shape[0]
n_actions = env.action_space.n

In [None]:
# Fix random seed (for reproducibility)
seed = 543
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Initialize wandb run
wandb.finish() # close any run that is still open so runs do not overlap (advice: later remove duplicates in wandb)
wandb.init(project=PROJECT, config=hparams)
wandb.run.name = 'reinforce_cartpole_train_0'


# Initialize the policy network and its optimizer
policy = Policy(n_inputs, n_actions)
optimizer = torch.optim.Adam(policy.parameters(), lr=hparams['lr'])
# float32 machine epsilon: small constant that keeps the return normalization
# from dividing by zero
eps = np.finfo(np.float32).eps.item()


# Training loop
print(f"Target reward: {env.spec.reward_threshold}")
running_reward = 10
ep_rew_history_reinforce = []
for i_episode in range(hparams['num_episodes']):
    # Collect experience
    # The environment is seeded once, on the first reset, so that the sequence
    # of initial states is reproducible as well (not only the torch sampling).
    state, _ = env.reset(seed=seed if i_episode == 0 else None)
    ep_reward = 0
    done, truncated = False, False
    for t in range(hparams['max_ep_len']):  # Don't infinite loop while learning

        action = select_action(policy, state)
        state, reward, done, truncated, _ = env.step(action)
        policy.rewards.append(reward)
        ep_reward += reward
        # The episode is over on termination OR truncation; stepping the
        # environment past either flag is not valid.
        if done or truncated:
            break

    # Update running reward (exponential moving average of the episode reward)
    running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
    # Must be read before train(), which empties the entropy buffer
    mean_entropy = np.mean(policy.entropy)

    # Perform training step
    p_loss = train(policy, optimizer, hparams['gamma'])


    wandb.log(
        {
        'running_reward': running_reward,
        'ep_reward': ep_reward,
        'mean_entropy': mean_entropy,
        'policy_loss': p_loss,
        }
    )

    ep_rew_history_reinforce.append(ep_reward)
    if i_episode % hparams['log_interval'] == 0:
        print(f'Episode {i_episode}\tLast reward: {ep_reward:.2f}\tAverage reward: {running_reward:.2f}')

    if running_reward > env.spec.reward_threshold:
        print("Solved!")
        break


# t is the 0-based index of the last step, so the episode lasted t + 1 steps
print("Finished training! Running reward is now {:.2f} and "
      "the last episode runs to {} time steps!".format(running_reward, t + 1))
# Record one evaluation episode with the trained policy and upload it
test_env = gym.wrappers.RecordVideo(env, "./video")
test(test_env, policy)
wandb.finish()

In [None]:
# Learning curve of the REINFORCE agent: reward obtained in each training episode
episode_idx = np.arange(len(ep_rew_history_reinforce))
plt.plot(episode_idx, ep_rew_history_reinforce)
plt.xlabel('Episode')
plt.ylabel('Reward')


# **REINFORCE w/ Baseline Version**

## The new Policy module encodes both the Actor and the Critic networks.
**Exercise #5.** Complete the forward pass using the corresponding actor and critic's heads.


In [None]:
class Policy(nn.Module):
    """
    Implements both actor and critic in one model.

    A shared hidden layer (128 units, ReLU) feeds two linear heads: the actor
    head outputs unnormalized action logits and the critic head outputs the
    state-value estimate used as baseline.

    Args:
        inputs: size of the observation vector.
        actor_output: number of discrete actions (size of the logits).
        critic_output: size of the value output.
    """
    def __init__(self, inputs, actor_output, critic_output):
        super(Policy, self).__init__()
        # shared feature layer
        self.affine1 = nn.Linear(inputs, 128)

        # actor's layer
        self.actor_head = nn.Linear(128, actor_output)

        # critic's layer
        self.critic_head = nn.Linear(128, critic_output)

        # action & reward buffer
        self.saved_log_probs = []
        self.rewards = []
        self.entropy = []

    def forward(self, x):
        """
        Forward of both actor and critic.

        Returns a tuple ``(action_logits, state_values)`` computed from the
        same hidden representation of ``x``.
        """
        x = F.relu(self.affine1(x))

        # Logits of each action (no activation: consumed as Categorical logits)
        action_logits = self.actor_head(x)

        # State values for state s_t
        state_values = self.critic_head(x)

        return action_logits, state_values

## Now we also want to store the state value for each step

In [None]:
def select_action(policy, state):
    """Sample an action and record the data needed for the baseline update.

    For the given observation, the policy returns action logits and a state
    value. An action is sampled from the categorical distribution defined by
    the logits; its log-probability is stored together with the state value
    as a ``SavedAction`` in ``policy.saved_log_probs``, and the distribution's
    entropy is appended to ``policy.entropy``.

    Returns the sampled action as a Python number.
    """
    # NumPy observation -> float tensor with a leading batch dimension
    obs = torch.from_numpy(state).float().unsqueeze(0)

    logits, value = policy(obs)
    dist = torch.distributions.Categorical(logits=logits)
    sampled = dist.sample()

    # Bookkeeping for the update step
    record = SavedAction(dist.log_prob(sampled), value)
    policy.saved_log_probs.append(record)
    policy.entropy.append(dist.entropy().item())

    return sampled.item()


## The gradient computation
$$ g = \mathbb{E}\left[{\sum_{t=0}^{\infty} \Psi_t \nabla_{\theta} \log \pi_{\theta}(a_t |s_t)}\right]$$
## with REINFORCE, $\Psi_t $ was defined as 
$$ \Psi_t =  \sum_{t'=t}^\infty r_{t'} $$
## but now we will subtract a baseline learned with the Critic Network
$$ \Psi_t =  \sum_{t'=t}^\infty r_{t'} - b(s_t) $$

**Exercise #6.** Compute the advantages by subtracting the baselines from the returns.

In [None]:
def train(model, optimizer, gamma):
    """Run one REINFORCE-with-baseline update from the stored episode.

    The actor is trained with ``-log_prob * advantage`` where
    ``advantage = G - baseline``; the critic is trained to regress the
    (normalized) return ``G`` with a smooth L1 loss. Both losses are summed
    and optimized in a single step. The ``rewards``, ``saved_log_probs`` and
    ``entropy`` buffers of ``model`` are emptied afterwards.

    Args:
        model: module exposing ``rewards``, ``entropy`` and
            ``saved_log_probs``; the latter holds ``(log_prob, value)`` pairs.
        optimizer: optimizer over the model parameters.
        gamma: discount factor.

    Returns:
        Tuple ``(policy_loss, value_loss)`` of Python floats.
    """
    # Discounted returns, accumulated back to front: G_t = r_t + gamma * G_{t+1}
    G = 0.0
    returns = []
    for r in reversed(model.rewards):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()  # back to chronological order (linear instead of insert(0, ...))
    returns = torch.tensor(returns, dtype=torch.float32)

    # Normalize returns (this usually accelerates convergence).
    # Skipped for a single-step episode: the sample std of one element is NaN
    # and would poison the parameters.
    if returns.numel() > 1:
        eps = torch.finfo(torch.float32).eps  # guards against division by zero
        returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_loss = []
    value_loss = []
    for (log_prob, baseline), G in zip(model.saved_log_probs, returns):
        baseline = baseline.squeeze()

        # Advantage = return - baseline. The baseline is detached so the
        # policy-gradient term does not back-propagate into the critic.
        advantage = G - baseline.detach()

        # calculate actor (policy) loss
        policy_loss.append(-log_prob * advantage)

        # calculate critic (value) loss using L1 smooth loss
        value_loss.append(F.smooth_l1_loss(baseline, G))

    optimizer.zero_grad()

    p_loss = torch.stack(policy_loss).sum()
    v_loss = torch.stack(value_loss).sum()

    total_loss = p_loss + v_loss

    total_loss.backward()

    optimizer.step()

    # Empty the episode buffers in place
    del model.rewards[:]
    del model.saved_log_probs[:]
    del model.entropy[:]

    return p_loss.item(), v_loss.item()

## Train the agent again

In [None]:
# Hyperparameters (also passed to wandb.init as the run config)
hparams = {
    'gamma' : 0.99,             # discount factor
    'log_interval' : 25,        # controls how often we log progress, in episodes
    'num_episodes': 1500,       # maximum number of episodes to train on
    'lr' : 1e-2,                # learning rate
    'max_ep_len': 1000,         # maximum number of steps collected per episode
}

In [None]:
# Create environment
# (rgb_array rendering; this env is wrapped in RecordVideo after training)
env_name = 'CartPole-v1'
env = gym.make(env_name, render_mode="rgb_array")

# Record stored by select_action for every step: the log-probability of the
# sampled action together with the critic's value for that state
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

In [None]:
# Get the observation size (policy input) and the number of discrete
# actions (actor output) from the gym observation / action spaces
n_inputs = env.observation_space.shape[0]
n_actions = env.action_space.n

In [None]:
# Fix random seed (for reproducibility)
seed = 543
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Initialize wandb run
wandb.finish() # close any run that is still open so runs do not overlap (advice: later remove duplicates in wandb)
wandb.init(project=PROJECT, config=hparams)
wandb.run.name = 'reinforce_wbaseline_cartpole_train_0'


# Initialize the actor-critic network (one value output) and its optimizer
policy = Policy(n_inputs, n_actions, 1)
optimizer = torch.optim.Adam(policy.parameters(), lr=hparams['lr'])
# float32 machine epsilon: small constant that keeps the return normalization
# from dividing by zero
eps = np.finfo(np.float32).eps.item()


# Training loop
print(f"Target reward: {env.spec.reward_threshold}")
running_reward = 10
ep_rew_history_baseline = []
for i_episode in range(hparams['num_episodes']):
    # Collect experience
    # The environment is seeded once, on the first reset, so that the sequence
    # of initial states is reproducible as well (not only the torch sampling).
    state, _ = env.reset(seed=seed if i_episode == 0 else None)
    ep_reward = 0
    done, truncated = False, False
    for t in range(hparams['max_ep_len']):  # Don't infinite loop while learning

        action = select_action(policy, state)
        state, reward, done, truncated, _ = env.step(action)
        policy.rewards.append(reward)
        ep_reward += reward
        # The episode is over on termination OR truncation; stepping the
        # environment past either flag is not valid.
        if done or truncated:
            break

    # Update running reward (exponential moving average of the episode reward)
    running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
    # Must be read before train(), which empties the entropy buffer
    mean_entropy = np.mean(policy.entropy)

    # Perform training step
    p_loss, v_loss = train(policy, optimizer, hparams['gamma'])

    wandb.log(
        {
        'running_reward': running_reward,
        'ep_reward': ep_reward,
        'mean_entropy': mean_entropy,
        'policy_loss': p_loss,
        'value_loss': v_loss,
        'total_loss': p_loss + v_loss,
        }
    )

    ep_rew_history_baseline.append(ep_reward)
    if i_episode % hparams['log_interval'] == 0:
        print(f'Episode {i_episode}\tLast reward: {ep_reward:.2f}\tAverage reward: {running_reward:.2f}')


    if running_reward > env.spec.reward_threshold:
        print("Solved!")
        break


# t is the 0-based index of the last step, so the episode lasted t + 1 steps
print("Finished training! Running reward is now {:.2f} and "
      "the last episode runs to {} time steps!".format(running_reward, t + 1))
# Record one evaluation episode with the trained policy and upload it
test_env = gym.wrappers.RecordVideo(env, "./video")
test(test_env, policy)
wandb.finish()

In [None]:
# Compare the learning curves of both agents on one large figure
fig, ax = plt.subplots(figsize=(16,10))

# Curves are drawn in the same order as the legend entries below
for reward_history in (ep_rew_history_reinforce, ep_rew_history_baseline):
    plt.plot(np.arange(len(reward_history)), reward_history)

plt.legend(['REINFORCE', 'REINFORCE w/ baseline'])
plt.xlabel('Episode')
plt.ylabel('Reward')