<a href="https://colab.research.google.com/github/respect5716/Deep-Learning-Paper-Implementation/blob/master/04_RL/Proximal%20Policy%20Optimization%20Algorithms%20(PPO).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Proximal Policy Optimization Algorithms (PPO)

## 0. Paper

### Info
* TItle : Proximal Policy Optimization Algorithms (PPO)
* Author : John Schulman et al.
* Publication : [link](https://arxiv.org/pdf/1707.06347.pdf)

### Summary
* Clipped surrogate objective function을 통해 데이터를 여러번 학습할 수 있음

### Differences
* Environment : Atari -> Cartpole

## 1. Setting

In [0]:
# Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Libraries
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import gym

import tensorflow as tf

In [3]:
# GPU Setting
!nvidia-smi

print(f'tensorflow version : {tf.__version__}')
print(f'available GPU list : {tf.config.list_physical_devices("GPU")}')

Tue Jun  2 06:22:08 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
# Hyperparameters
CONFIG = {
    'base_dir' : '/content/drive/Shared drives/Yoon/Project/Doing/Deep Learning Paper Implementation',
    'gamma' : 0.99, # discount rate
    'lambda' : 0.95, # gae parameter
    'epsilon' : 0.2, # clipping parameter
    'c1' : 1, # critic loss coefficient
    'c2' : 0.01, # entropy bonus coefficient,
    'len_memory' : 128,
    'learning_rate' : 1e-3,
    'batch_size' : 32,
    'epoch_size' : 3,
    'episode_size' : 3000
}

## 2. Data

In [0]:
env = gym.make('CartPole-v1')

In [0]:
state = env.reset()

## 3. Model

In [0]:
class Memory(object):
    def __init__(self):
        self.reset()

    def __len__(self):
        return len(self.states)
    
    def store(self, state, action, reward, next_state, done):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_states.append(next_state)
        self.dones.append(done)
        
    def reset(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []

In [0]:
class Agent(object):
    def __init__(self, state_space, action_space):
        self.state_space = state_space
        self.action_space = action_space
        self.network = self.build_network()
        self.network_old = tf.keras.models.clone_model(self.network)
        self.optimizer = tf.keras.optimizers.Adam(CONFIG['learning_rate'])
        self.step_size = 0

    def build_network(self):
        inputs = tf.keras.layers.Input(self.state_space)
        x = tf.keras.layers.Dense(100, activation='relu')(inputs)
        x = tf.keras.layers.Dense(100, activation='relu')(x)
        actor_outputs = tf.keras.layers.Dense(self.action_space, activation='softmax')(x)
        critic_outputs = tf.keras.layers.Dense(1, activation='linear')(x)
        
        network = tf.keras.Model(inputs, [actor_outputs, critic_outputs])
        return network
    
    def act(self, state):
        policy, _ = self.network(state[None,:])
        action = tf.random.categorical(policy, 1)[0][0].numpy()
        return action
    
    def get_gae_and_td(self, rewards, dones, values, next_values):
        gae = np.zeros(len(rewards))
        td_targets = np.zeros(len(rewards))
        gae_cumulative = 0

        for i in reversed(range(len(rewards))):
            if dones[i]:
                gae_cumulative = 0
            td_targets[i] = rewards[i] + (1-dones[i])*CONFIG['gamma']*next_values[i]
            delta = td_targets[i] - values[i]    
            gae_cumulative = delta + CONFIG['gamma']*CONFIG['lambda']*gae_cumulative
            gae[i] = gae_cumulative
        return gae, td_targets

    def train(self, memory):
        states = np.stack(memory.states)
        actions = np.array(memory.actions)
        rewards = np.array(memory.rewards)
        next_states = np.array(memory.next_states)
        dones = np.array(memory.dones)

        _, values = self.network(states)
        _, next_values = self.network(next_states)

        values = values[:,0]
        next_values = next_values[:,0]
        gae, td_targets = self.get_gae_and_td(rewards, dones, values, next_values)

        for i in range(CONFIG['epoch_size']):
            batch_step_size = CONFIG['len_memory'] // CONFIG['batch_size']
            for _ in range(batch_step_size):
                idx = np.random.randint(0, CONFIG['len_memory'], CONFIG['batch_size'])
                batch_states = states[idx]
                batch_actions = actions[idx]
                batch_gae = gae[idx]
                batch_td_targets = td_targets[idx]

                self.train_step(batch_states, batch_actions, batch_gae, batch_td_targets)
                self.step_size += 1
        self.network_old.set_weights(self.network.get_weights())

    def train_step(self, states, actions, gae_targets, td_targets):
        with tf.GradientTape() as g:
            prob, values = self.network(states)
            prob_old, _ = self.network_old(states)
            policy = tf.reduce_sum(prob * tf.one_hot(actions, self.action_space), axis=1)
            policy_old = tf.reduce_sum(prob_old * tf.one_hot(actions, self.action_space), axis=1)

            log_policy = tf.math.log(policy + 1e-20)
            log_policy_old = tf.math.log(policy_old + 1e-20)

            rt = tf.math.exp(log_policy - log_policy_old)
            clipped_rt = tf.clip_by_value(rt, 1-CONFIG['epsilon'], 1+CONFIG['epsilon'])
            
            actor_loss = tf.reduce_mean(tf.minimum(rt*gae_targets, clipped_rt*gae_targets))
            critic_loss = tf.reduce_mean(tf.square(td_targets - values[:,0]))
            entropy = tf.reduce_mean(-tf.reduce_sum(prob * tf.math.log(prob+1e-5), axis=1))
            loss = tf.reduce_mean(actor_loss - CONFIG['c1']*critic_loss + CONFIG['c2']*entropy)
            loss = -loss
        
        gradients = g.gradient(loss, self.network.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.network.trainable_variables))

## 4. Train

In [0]:
agent = Agent((4,), 2)
memory = Memory()
score = 0

In [104]:
for ep in range(1, CONFIG['episode_size']+1):
    state = env.reset()
    done = False
    episode_score = 0

    while not done:
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        episode_score += reward

        memory.store(state, action, reward, next_state, done)
        if len(memory) >= CONFIG['len_memory']:
            agent.train(memory)
            memory.reset()
        state = next_state

    score = 0.9*score + 0.1*episode_score
    if (ep % 100 == 0) or (ep == CONFIG['episode_size']):
        print(f'EP : {str(ep).zfill(4)} | Score : {str(int(score)).zfill(3)} | Step size : {str(agent.step_size).zfill(5)}')

EP : 0100 | Score : 056 | Step size : 00348
EP : 0200 | Score : 086 | Step size : 01128
EP : 0300 | Score : 142 | Step size : 02292
EP : 0400 | Score : 127 | Step size : 03516
EP : 0500 | Score : 109 | Step size : 04620
EP : 0600 | Score : 106 | Step size : 05472
EP : 0700 | Score : 157 | Step size : 06924
EP : 0800 | Score : 126 | Step size : 08196
EP : 0900 | Score : 156 | Step size : 09792
EP : 1000 | Score : 108 | Step size : 11064
EP : 1100 | Score : 145 | Step size : 12384
EP : 1200 | Score : 104 | Step size : 13596
EP : 1300 | Score : 125 | Step size : 14808
EP : 1400 | Score : 119 | Step size : 16032
EP : 1500 | Score : 105 | Step size : 17340
EP : 1600 | Score : 131 | Step size : 18708
EP : 1700 | Score : 155 | Step size : 20052
EP : 1800 | Score : 132 | Step size : 21204
EP : 1900 | Score : 160 | Step size : 22476
EP : 2000 | Score : 140 | Step size : 23772
EP : 2100 | Score : 178 | Step size : 25128
EP : 2200 | Score : 136 | Step size : 26484
EP : 2300 | Score : 119 | Step s

## 5. Test

In [105]:
for i in range(10):
    state = env.reset()
    done = False
    score = 0

    while not done:
        policy, _ = agent.network(state[None,:])
        action = tf.argmax(policy[0]).numpy()
        next_state, reward, done, info = env.step(action)
        score += reward
        state = next_state
    
    print(f'EP : {str(i)} | Score : {str(int(score)).zfill(3)}')

EP : 0 | Score : 383
EP : 1 | Score : 420
EP : 2 | Score : 344
EP : 3 | Score : 431
EP : 4 | Score : 384
EP : 5 | Score : 367
EP : 6 | Score : 375
EP : 7 | Score : 421
EP : 8 | Score : 386
EP : 9 | Score : 420
