## HumanoidPyBulletEnv-v0

In this notebook, you will train a PPO (Proximal Policy Optimization) agent on the `HumanoidPyBulletEnv-v0` environment, which is provided by `pybulletgym` and accessed through the OpenAI Gym API.

In [None]:
import math
import random
import sys
import pathlib

import gym
import pybullet
import pybulletgym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

In [None]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

<h2>Use CUDA</h2>

In [None]:
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda" if use_cuda else "cpu")

<h2>Create Environments</h2>

In [None]:
from multiprocessing_env import SubprocVecEnv
import time

# --- Run switches -----------------------------------------------------------
# LOAD_CHECKPOINT: restore actor/critic weights from the checkpoint files below.
# DO_TRAINING: run the training loop; when False the notebook only replays.
LOAD_CHECKPOINT = True
DO_TRAINING = False

# --- Environment ------------------------------------------------------------
num_envs = 8
env_name = "HumanoidPyBulletEnv-v0"

# --- Network ----------------------------------------------------------------
# Passed to both Actor and Critic as the hidden-layer size.
hidden_size = 64

# --- Optimisation -----------------------------------------------------------
# Learning rate and early-stopping threshold for the policy network.
policy_optimizer_lr = 5e-5
policy_stopping_kl = 0.02

# Learning rate and early-stopping threshold for the value network.
value_optimizer_lr = 1.5e-4
value_stopping_mse = 25

# Weight of the entropy term added to the policy loss.
entropy_loss_weight = 0.01

# --- PPO schedule -----------------------------------------------------------
num_steps = 1024          # environment steps collected per rollout
mini_batch_size = 64      # samples per gradient step
ppo_epochs = 15           # passes over each rollout
threshold_reward = 6000   # mean test reward at which training stops

# --- Checkpoint locations ---------------------------------------------------
_pretrained_dir = pathlib.Path("./pretrained/")

ACTOR_CHECKPOINT_PATH = _pretrained_dir / ("actor_" + env_name + "_checkpoint.pt")
ACTOR_FINAL_PATH = _pretrained_dir / ("actor_" + env_name + "_final.pt")

CRITIC_CHECKPOINT_PATH = _pretrained_dir / ("critic_" + env_name + "_checkpoint.pt")
CRITIC_FINAL_PATH = _pretrained_dir / ("critic_" + env_name + "_final.pt")

def make_env():
    """Return a zero-argument factory that builds one `env_name` environment.

    The environment is not created here; construction is deferred until the
    returned callable is invoked.
    """
    def _thunk():
        # Deferred construction: gym.make runs only when the thunk is called.
        return gym.make(env_name)

    return _thunk

# Vectorised environments used to collect training rollouts: one factory per
# environment, handed to SubprocVecEnv (from the local multiprocessing_env
# module; presumably one worker process per factory - confirm).
envs = SubprocVecEnv([make_env() for _ in range(num_envs)])

# Separate single environment used for evaluation episodes.
env = gym.make(env_name)

<h2>Neural Network</h2>

In [None]:
class BaseModule(nn.Module):
    """Common base for the actor and critic; knows how to build a ReLU MLP."""

    def __init__(self):
        super().__init__()

    def _build_network(self, num_inputs, num_outputs, hidden_size):
        """Build `num_inputs -> hidden... -> num_outputs` as an nn.Sequential.

        `hidden_size` is either a single int (one hidden layer) or an
        indexable sequence of ints (one hidden layer per entry). Every hidden
        layer is followed by ReLU; the output layer has no activation.
        """
        # A bare int is just the one-hidden-layer special case.
        widths = [hidden_size] if isinstance(hidden_size, int) else hidden_size

        layers = [
            nn.Linear(num_inputs, widths[0]),
            nn.ReLU(),
            *self._build_hidden(widths),
            nn.Linear(widths[-1], num_outputs),
        ]
        return nn.Sequential(*layers)

    def _build_hidden(self, hidden_size):
        """Return the Linear+ReLU pairs connecting consecutive hidden widths."""
        stack = []
        for fan_in, fan_out in zip(hidden_size[:-1], hidden_size[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        return stack
        
class Actor(BaseModule):
    """Gaussian policy: an MLP produces the mean, a learned parameter the log-std."""

    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super().__init__()
        self.model = self._build_network(num_inputs, num_outputs, hidden_size)
        # NOTE: despite its name, `std` initialises the *log* standard
        # deviation (forward() exponentiates it), so the default 0.0 means an
        # initial standard deviation of 1.
        initial_log_std = torch.ones(1, num_outputs) * std
        self.log_std = nn.Parameter(initial_log_std, requires_grad=True)

    def forward(self, x):
        """Return the Normal action distribution for the given observations."""
        mean = self.model(x)
        # The log-std does not depend on the input; broadcast it to the batch.
        scale = torch.exp(self.log_std).expand_as(mean)
        return Normal(mean, scale)
          
class Critic(BaseModule):
    """State-value estimator: an MLP with a single scalar output per input row."""

    def __init__(self, num_inputs, hidden_size):
        super().__init__()
        value_outputs = 1  # one value estimate per observation
        self.model = self._build_network(num_inputs, value_outputs, hidden_size)

    def forward(self, x):
        """Return the value estimate produced by the underlying network."""
        state_value = self.model(x)
        return state_value

In [None]:
def plot(frame_idx, rewards):
    """Redraw the reward curve together with its running mean.

    Args:
        frame_idx: Counter shown in the plot title.
        rewards: Non-empty sequence of test rewards, oldest first.
    """
    clear_output(True)
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))

    # Running mean *including* the current point: mean[i] = mean(rewards[:i+1]).
    # (Averaging rewards[:i] would make the first entry the mean of an empty
    # slice, i.e. NaN with a RuntimeWarning, and lag every other entry by one.)
    reward_values = np.asarray(rewards, dtype=float)
    mean = np.cumsum(reward_values) / np.arange(1, len(reward_values) + 1)

    plt.plot(rewards, label="Reward")
    plt.plot(mean, label="mean")
    plt.legend()
    plt.show()
    
def test_env(vis=False):
    """Run one episode in the evaluation `env` and return its summed reward.

    Actions are sampled from the current `policy_model` distribution.
    When `vis` is true the environment is rendered after reset and after
    every step.
    """
    observation = env.reset()
    if vis:
        env.render()

    episode_return = 0
    finished = False
    while not finished:
        # Add a batch dimension of one before feeding the policy.
        obs_batch = torch.FloatTensor(observation).unsqueeze(0).to(device)
        action = policy_model(obs_batch).sample().cpu().numpy()[0]
        observation, reward, finished, _ = env.step(action)
        if vis:
            env.render()
        episode_return += reward
    return episode_return

In [None]:
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.97):
    """Compute returns using Generalized Advantage Estimation.

    Args:
        next_value: Value estimate for the state following the last step.
        rewards: Per-step rewards, oldest first.
        masks: Per-step continuation masks; a 0 entry stops both the
            bootstrap from the next value and the propagation of the
            accumulated advantage across that step.
        values: Per-step value estimates, same length as `rewards`.
        gamma: Discount factor.
        tau: GAE smoothing factor.

    Returns:
        A list with one entry per step (oldest first), each being the GAE
        advantage plus the value estimate of that step.
    """
    # Copy into a list so tuples (or other sequences) are accepted too.
    values = list(values) + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        # Append and reverse once at the end: O(n) instead of the O(n^2)
        # cost of inserting at the front on every iteration.
        returns.append(gae + values[step])
    returns.reverse()
    return returns

In [None]:
def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    """Yield random mini-batches drawn from one rollout.

    Produces `batch_size // mini_batch_size` tuples of
    `(states, actions, log_probs, returns, advantage)`. Row indices are drawn
    independently with `np.random.randint`, i.e. with replacement, so a row
    may appear more than once and some rows may not appear at all.
    """
    total_rows = states.size(0)
    rollout = (states, actions, log_probs, returns, advantage)
    num_batches = total_rows // mini_batch_size
    for _ in range(num_batches):
        picks = np.random.randint(0, total_rows, mini_batch_size)
        # The same row selection is applied to every tensor of the rollout.
        yield tuple(tensor[picks, :] for tensor in rollout)


def _ppo_policy_phase(ppo_epochs, mini_batch_size, states, actions, log_probs,
                      returns, advantages, clip_param, max_grad_norm):
    """Run clipped-surrogate policy updates; stop for good once KL is too large."""
    for _ in range(ppo_epochs):
        for state, action, old_log_probs, _, advantage in ppo_iter(
                mini_batch_size, states, actions, log_probs, returns, advantages):
            dist = policy_model(state)

            entropy = dist.entropy().mean()
            new_log_probs = dist.log_prob(action)

            # Probability ratio between the current policy and the policy
            # that collected the rollout.
            ratio = (new_log_probs - old_log_probs).exp()
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

            policy_loss  = -torch.min(surr1, surr2).mean()
            entropy_loss = -entropy * entropy_loss_weight

            policy_optimizer.zero_grad()
            (policy_loss + entropy_loss).backward()

            # Clip AFTER backward(): before it the gradients are zero/None
            # and clipping has no effect.
            if max_grad_norm:
                torch.nn.utils.clip_grad_norm_(policy_model.model.parameters(), max_grad_norm)

            policy_optimizer.step()

            with torch.no_grad():
                updated_log_probs = policy_model(state).log_prob(action)
                # Approximate KL between the rollout policy and the updated
                # policy, estimated on this mini-batch.
                kl = (old_log_probs - updated_log_probs).mean()
            if kl.item() > policy_stopping_kl:
                # Leave BOTH loops: the policy has moved far enough from the
                # data-collecting policy for this rollout.
                return


def _ppo_value_phase(ppo_epochs, mini_batch_size, states, actions, log_probs,
                     returns, advantages, max_grad_norm):
    """Regress the critic onto the returns; stop for good once it moves too far."""
    for _ in range(ppo_epochs):
        for state, _, _, return_, _ in ppo_iter(
                mini_batch_size, states, actions, log_probs, returns, advantages):
            value = value_model.model(state)

            value_loss = 0.5 * (return_ - value).pow(2).mean()

            value_optimizer.zero_grad()
            value_loss.backward()

            # Clip AFTER backward(), for the same reason as in the policy phase.
            if max_grad_norm:
                torch.nn.utils.clip_grad_norm_(value_model.model.parameters(), max_grad_norm)

            value_optimizer.step()

            with torch.no_grad():
                values_pred_all = value_model.model(state)
                # Change of the predictions on this mini-batch caused by the
                # step that was just taken.
                mse = 0.5 * (value - values_pred_all).pow(2).mean()
            if mse.item() > value_stopping_mse:
                return


def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages,
               clip_param=0.1, value_loss_coef=0.5, entropy_coef=0.01, max_grad_norm=0.5):
    """Update the global policy and value networks from one rollout.

    The policy is trained first with the PPO clipped surrogate objective plus
    an entropy bonus weighted by the global `entropy_loss_weight`; the critic
    is trained afterwards with a squared error against `returns`. Each phase
    runs up to `ppo_epochs` passes of mini-batches from `ppo_iter` and stops
    early when its threshold (`policy_stopping_kl` / `value_stopping_mse`)
    is exceeded.

    Args:
        ppo_epochs: Maximum number of passes per phase.
        mini_batch_size: Rows per mini-batch.
        states, actions, log_probs, returns, advantages: Rollout tensors
            with matching first dimension (assumed 2-D, as `ppo_iter`
            indexes them with `[ids, :]`).
        clip_param: Clipping range of the probability ratio.
        value_loss_coef: Accepted for interface compatibility; not used.
        entropy_coef: Accepted for interface compatibility; not used (the
            entropy weight comes from the global `entropy_loss_weight`).
        max_grad_norm: Gradient-norm clip applied to each network's `model`
            parameters; falsy disables clipping.
    """
    _ppo_policy_phase(ppo_epochs, mini_batch_size, states, actions, log_probs,
                      returns, advantages, clip_param, max_grad_norm)
    _ppo_value_phase(ppo_epochs, mini_batch_size, states, actions, log_probs,
                     returns, advantages, max_grad_norm)

In [None]:
def loadCheckpoint(filename, model, optimizer=None):
    """Restore `model` (and optionally `optimizer`) from a checkpoint file.

    Args:
        filename: Path of a checkpoint written by `saveCheckpoint`.
        model: Module whose weights are replaced in place; it is moved to
            the global `device` and put into eval mode.
        optimizer: Optional optimizer; when given and the checkpoint holds an
            'optimizer_state_dict' entry, its state is restored as well.
    """
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
    # machine (and vice versa) instead of failing during deserialisation.
    checkpoint = torch.load(filename, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()

    if optimizer is not None and 'optimizer_state_dict' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
def saveCheckpoint(filename, epoch, model, optimizer):
    """Write the epoch counter, model weights and optimizer state to `filename`.

    Args:
        filename: Destination path (str or pathlib path). Missing parent
            directories are created. Other objects accepted by `torch.save`
            are passed through untouched.
        epoch: Value stored under the 'epoch' key.
        model: Module whose `state_dict()` is stored.
        optimizer: Optimizer whose `state_dict()` is stored.
    """
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }

    # torch.save does not create directories; make sure the target exists so
    # a long training run does not fail at its first checkpoint.
    if isinstance(filename, (str, pathlib.PurePath)):
        pathlib.Path(filename).parent.mkdir(parents=True, exist_ok=True)

    torch.save(checkpoint, filename)

In [None]:
# Network input/output sizes are taken from the vectorised environment's
# observation and action spaces.
num_inputs = envs.observation_space.shape[0]
num_outputs = envs.action_space.shape[0]

# Networks (actor first, then critic).
policy_model = Actor(num_inputs, num_outputs, hidden_size).to(device)
value_model = Critic(num_inputs, hidden_size).to(device)

# One Adam optimizer per network, each with its own learning rate.
policy_optimizer = optim.Adam(policy_model.parameters(), lr=policy_optimizer_lr)
value_optimizer = optim.Adam(value_model.parameters(), lr=value_optimizer_lr)

if LOAD_CHECKPOINT:
    for checkpoint_path, network in (
        (ACTOR_CHECKPOINT_PATH, policy_model),
        (CRITIC_CHECKPOINT_PATH, value_model),
    ):
        loadCheckpoint(checkpoint_path, network)

for network in (policy_model, value_model):
    print(network)

In [None]:
def train():
    """Train the global policy/value networks with PPO until the reward threshold.

    Each iteration resets the vectorised environments, collects `num_steps`
    steps from every environment, computes GAE returns and calls
    `ppo_update`. Every 10 iterations the policy is evaluated over 10 test
    episodes and the curve is plotted; an evaluation that beats the best so
    far (other than the very first one, which only sets the baseline) saves
    the "final" checkpoints. Every 100 iterations the regular checkpoints are
    saved. Training ends when a mean test reward exceeds `threshold_reward`.
    """
    frame_idx  = 0
    train_epoch = 0

    test_rewards = []
    best_reward = None

    early_stop = False

    while not early_stop:
        state = envs.reset()

        log_probs = []
        values    = []
        states    = []
        actions   = []
        rewards   = []
        masks     = []

        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)

            # Rollout data is only ever used detached, so skip building
            # autograd graphs for every one of the collected steps.
            with torch.no_grad():
                dist = policy_model(state)
                value = value_model(state)
                action = dist.sample()
                log_prob = dist.log_prob(action)

            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            # Mask is 0 where an episode ended, 1 otherwise.
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            states.append(state)
            actions.append(action)

            state = next_state
            frame_idx += 1

        # Bootstrap value for the state following the last collected step.
        next_state = torch.FloatTensor(next_state).to(device)
        with torch.no_grad():
            next_value = value_model(next_state)
        returns = compute_gae(next_value, rewards, masks, values)

        returns   = torch.cat(returns).detach()
        log_probs = torch.cat(log_probs).detach()
        values    = torch.cat(values).detach()
        states    = torch.cat(states)
        actions   = torch.cat(actions)
        advantage = returns - values

        ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)
        train_epoch += 1

        if train_epoch % 10 == 0:
            test_reward = np.mean([test_env() for _ in range(10)])
            test_rewards.append(test_reward)
            plot(train_epoch, test_rewards)

            if best_reward is None or best_reward < test_reward:
                # The first evaluation only establishes the baseline; later
                # improvements are written to the "final" checkpoint files.
                if best_reward is not None:
                    saveCheckpoint(ACTOR_FINAL_PATH, train_epoch, policy_model, policy_optimizer)
                    saveCheckpoint(CRITIC_FINAL_PATH, train_epoch, value_model, value_optimizer)

                best_reward = test_reward

            if test_reward > threshold_reward:
                early_stop = True

        if train_epoch % 100 == 0:
            saveCheckpoint(ACTOR_CHECKPOINT_PATH, train_epoch, policy_model, policy_optimizer)
            saveCheckpoint(CRITIC_CHECKPOINT_PATH, train_epoch, value_model, value_optimizer)

if DO_TRAINING:
    train()

### Replay

In [None]:
# Replay five episodes with the current policy in a rendered environment.
env = gym.make(env_name)
# render() is called before the first reset(); presumably this is what makes
# pybulletgym open its GUI window - confirm against the pybulletgym docs.
env.render(mode="human")

# Camera settings for the debug visualizer (constant across episodes).
distance = 3
yaw = 0

try:
    for i_episode in range(5):

        state = env.reset()
        done = False
        total_reward = 0

        frame_idx = 0

        # NOTE(review): body id 1 is assumed to be the humanoid - confirm.
        humanPos, humanOrn = pybullet.getBasePositionAndOrientation(1)
        pybullet.resetDebugVisualizerCamera(distance, yaw, -20, humanPos)

        while not done:
            frame_idx += 1

            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                dist = policy_model(state)
                action = dist.sample().cpu().numpy()[0]
            next_state, reward, done, _ = env.step(action)

            state = next_state
            total_reward += reward

            # Slow playback down to roughly 30 steps per second.
            time.sleep(1/30)

            # Re-centre the camera on the body every 150 steps.
            if frame_idx % 150 == 0:
                humanPos, humanOrn = pybullet.getBasePositionAndOrientation(1)
                pybullet.resetDebugVisualizerCamera(distance, yaw, -20, humanPos)

        print("episode:", i_episode, "reward:", total_reward, "frames", frame_idx)
finally:
    # Always release the environment, even if a replay is interrupted or
    # an exception is raised.
    env.close()