<a href="https://colab.research.google.com/github/respect5716/Deep-Learning-Paper-Implementation/blob/master/04_RL/Asynchronous%20Methods%20for%20Deep%20Reinforcement%20Learning%20(A3C).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Asynchronous Methods for Deep Reinforcement Learning

## 0. Paper

### Info
* Title : Asynchronous Methods for Deep Reinforcement Learning
* Author : Volodymyr Mnih et al.
* Publication : ICML 2016

### Summary
* asynchronous execution을 통해 데이터의 correlation 감소
* GPU 없이 단일 멀티코어 CPU만으로 빠른 학습 가능

### Differences
* Environment : Atari -> CartPole (`CartPole-v1`)

## 1. Setting

In [0]:
# Google Drive
# Mount Google Drive at /content/drive (only works inside Google Colab).
# CONFIG['base_dir'] points to a folder below this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Libraries
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import gym
import threading
import multiprocessing

import tensorflow as tf

In [0]:
# GPU Setting
!nvidia-smi

# Report the installed TensorFlow version and the GPUs TensorFlow can see.
print(f'tensorflow version : {tf.__version__}')
print(f'available GPU list : {tf.config.list_physical_devices("GPU")}')

In [0]:
# Hyperparameters
CONFIG = {
    # Project folder on the mounted Google Drive (not referenced by the code
    # visible in this notebook).
    'base_dir' : '/content/drive/Shared drives/Yoon/Project/Doing/Deep Learning Paper Implementation',
    # Number of WorkerAgent threads started by GlobalAgent.train.
    'num_workers' : 16,
    # Number of transitions a worker collects before it runs a training step
    # (a step is also run when the episode ends).
    'n_step' : 5,
    # Discount factor of the n-step TD targets. Tracker.update also uses this
    # value as the smoothing factor of the running score.
    'gamma' : 0.95,
    # Weight of the entropy term in the actor loss.
    'beta' : 0.01,
    # Learning rate of each worker's Adam optimizer for the actor network.
    'actor_lr' : 0.0001,
    # Learning rate of each worker's Adam optimizer for the critic network.
    'critic_lr' : 0.0001,
    # Workers stop starting new episodes once the tracker has counted this
    # many finished episodes.
    'episode_size' : 5000
}

## 2. Env

In [0]:
# Create a CartPole-v1 environment to inspect its observations.
env = gym.make('CartPole-v1')

In [0]:
# Initial observation. NOTE(review): this assumes a gym version whose reset()
# returns the observation alone (not an (obs, info) tuple) - confirm.
state = env.reset()

In [5]:
# Observation shape; the same (4,) is passed to GlobalAgent as state_space.
state.shape

(4,)

## 3. Model

In [0]:
class Tracker(object):
    """Training statistics shared by all worker threads.

    GlobalAgent creates one instance and hands it to every WorkerAgent, and
    each worker calls ``update`` when it finishes an episode. The update is a
    read-modify-write of two attributes, so it is guarded by a lock to avoid
    lost updates when several workers finish at the same time.

    Attributes:
        episode_count: Number of episodes reported through ``update``.
        score: Exponential moving average of the reported episode scores.
        smoothing: Weight kept from the previous average on each update, or
            ``None`` to use ``CONFIG['gamma']`` (the original behaviour).
    """

    def __init__(self, smoothing=None):
        self.episode_count = 0
        self.score = 0
        self.smoothing = smoothing
        self._lock = threading.Lock()

    def update(self, score):
        """Count one finished episode and fold its score into the average.

        Args:
            score: Total reward of the episode that just finished.
        """
        # Resolved at call time so CONFIG can be edited between runs.
        decay = CONFIG['gamma'] if self.smoothing is None else self.smoothing
        with self._lock:
            self.episode_count += 1
            self.score = decay * self.score + (1 - decay) * score


class Memory(object):
    """Rollout buffer holding parallel lists of states, actions and rewards."""

    def __init__(self):
        # Start from the same empty state that clear() produces.
        self.clear()

    def __len__(self):
        """Return the number of stored transitions."""
        return len(self.states)

    def store(self, state, action, reward):
        """Append one transition to the end of the buffer."""
        pairs = (
            (self.states, state),
            (self.actions, action),
            (self.rewards, reward),
        )
        for buffer, item in pairs:
            buffer.append(item)

    def clear(self):
        """Drop all transitions by rebinding the three lists to new ones."""
        self.states, self.actions, self.rewards = [], [], []

def clip_gradient(g):
    """Clip a list of gradients so that their global norm is at most 40.

    Only the clipped gradients are returned; the global norm that
    ``tf.clip_by_global_norm`` also returns is discarded.
    """
    clipped = tf.clip_by_global_norm(g, 40)[0]
    return clipped

In [0]:
class GlobalAgent(object):
    """Owner of the shared actor/critic networks and of the worker threads."""

    def __init__(self, state_space, action_space):
        self.state_space = state_space
        self.action_space = action_space
        networks = self.build_network()
        self.actor_network, self.critic_network = networks
        self.tracker = Tracker()

    def build_network(self):
        """Build the actor and critic models on top of a shared hidden trunk.

        Returns:
            ``(actor_network, critic_network)``. Both models take the same
            input and share two 100-unit ReLU layers; the actor ends in a
            softmax over ``action_space`` units, the critic in one linear unit.
        """
        layers = tf.keras.layers
        state_in = layers.Input(self.state_space)
        hidden = layers.Dense(100, activation='relu')(state_in)
        hidden = layers.Dense(100, activation='relu')(hidden)
        policy_head = layers.Dense(self.action_space, activation='softmax')(hidden)
        value_head = layers.Dense(1, activation='linear')(hidden)

        return tf.keras.Model(state_in, policy_head), tf.keras.Model(state_in, value_head)

    def train(self):
        """Start ``CONFIG['num_workers']`` workers and wait for all of them."""
        pool = []
        for _ in range(CONFIG['num_workers']):
            pool.append(
                WorkerAgent(self.action_space, self.actor_network, self.critic_network, self.tracker)
            )

        for member in pool:
            member.start()

        for member in pool:
            member.join()

In [0]:
class WorkerAgent(threading.Thread):
    def __init__(self, action_space, actor_network, critic_network, tracker):
        super(WorkerAgent, self).__init__()
        self.env = env = gym.make('CartPole-v1')
        self.action_space = action_space
        self.actor_network = actor_network
        self.critic_network = critic_network
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=CONFIG['actor_lr'])
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=CONFIG['critic_lr'])
        self.worker_actor_network = tf.keras.models.clone_model(self.actor_network)
        self.worker_critic_network = tf.keras.models.clone_model(self.critic_network)
        self.tracker = tracker
    
    def act(self, state):
        logits = self.worker_actor_network(state[None,:])
        action = tf.random.categorical(logits, 1)[0][0].numpy()
        return action
    
    def get_n_step_td_targets(self, rewards, next_v_value, done):
        td_targets = np.zeros(len(rewards))
        if done:
            cumulative = 0
        else:
            cumulative = next_v_value

        for i in reversed(range(len(rewards))):
            cumulative = rewards[i] + CONFIG['gamma'] * cumulative
            td_targets[i] = cumulative
        return td_targets

    def train_actor_network(self, states, actions, advantages):
        with tf.GradientTape() as g:
            policy = self.actor_network(states)
            policy = tf.reduce_sum(policy * tf.one_hot(actions, self.action_space), axis=-1)
            log_policy = tf.math.log(policy + 1e-20)
            entropy = -tf.reduce_sum(policy * log_policy)
            loss = -(tf.reduce_sum(log_policy * advantages) + CONFIG['beta'] * entropy)

        
        gradients = g.gradient(loss, self.actor_network.trainable_variables)
        gradients = clip_gradient(gradients)
        self.actor_optimizer.apply_gradients(zip(gradients, self.actor_network.trainable_variables))
    
    def train_critic_network(self, states, td_targets):
        with tf.GradientTape() as g:
            v_values = tf.squeeze(self.critic_network(states))
            loss = tf.reduce_mean(tf.square(td_targets - v_values))
        
        gradients = g.gradient(loss, self.critic_network.trainable_variables)
        gradients = clip_gradient(gradients)
        self.critic_optimizer.apply_gradients(zip(gradients, self.critic_network.trainable_variables))

    def train(self, memory, next_state, done):
        states = np.stack(memory.states)
        actions = np.array(memory.actions)
        rewards = np.array(memory.rewards)

        v_values = self.worker_critic_network.predict(states)
        next_v_value = self.worker_critic_network.predict(next_state[None,:])
        n_step_td_targets = self.get_n_step_td_targets(rewards, next_v_value, done)
        advantages = n_step_td_targets - v_values

        self.train_actor_network(states, actions, advantages)
        self.train_critic_network(states, n_step_td_targets)
        self.worker_actor_network.set_weights(self.actor_network.get_weights())
        self.worker_critic_network.set_weights(self.critic_network.get_weights())
    
    def run(self):
        memory = Memory()
        while self.tracker.episode_count < CONFIG['episode_size']:
            state = self.env.reset()
            done = False
            score = 0
            while not done:
                action = self.act(state)
                next_state, reward, done, info = self.env.step(action)
                score += reward
                if done:
                    reward = -1
                memory.store(state, action, reward)

                if len(memory) == CONFIG['n_step'] or done:
                    self.train(memory, next_state, done)
                    memory.clear()
                
                state = next_state
            
            if self.tracker.episode_count % 100 == 0:
                ep = self.tracker.episode_count
                sc = self.tracker.score
                print(f'EP : {str(int(ep)).zfill(5)} | Score : {int(sc)}')

            self.tracker.update(score)

## 4. Train

In [0]:
# state_space=(4,) matches the observation shape printed earlier;
# action_space=2 sets the number of units in the actor's softmax output.
agent = GlobalAgent((4,), 2)

In [65]:
# Start CONFIG['num_workers'] worker threads and block until all have finished.
agent.train()

EP : 00000 | Score : 0
EP : 00100 | Score : 23
EP : 00200 | Score : 32
EP : 00300 | Score : 38
EP : 00400 | Score : 32
EP : 00500 | Score : 40
EP : 00600 | Score : 25
EP : 00700 | Score : 23
EP : 00800 | Score : 42
EP : 00900 | Score : 54
EP : 01000 | Score : 59
EP : 01100 | Score : 67
EP : 01200 | Score : 76
EP : 01300 | Score : 75
EP : 01400 | Score : 89
EP : 01500 | Score : 92
EP : 01600 | Score : 112
EP : 01700 | Score : 86
EP : 01800 | Score : 92
EP : 01900 | Score : 115
EP : 02000 | Score : 107
EP : 02100 | Score : 97
EP : 02200 | Score : 101
EP : 02300 | Score : 122
EP : 02400 | Score : 128
EP : 02500 | Score : 132
EP : 02600 | Score : 119
EP : 02700 | Score : 125
EP : 02800 | Score : 161
EP : 02900 | Score : 129
EP : 03000 | Score : 149
EP : 03100 | Score : 89
EP : 03200 | Score : 100
EP : 03300 | Score : 107
EP : 03400 | Score : 120
EP : 03500 | Score : 153
EP : 03600 | Score : 131
EP : 03700 | Score : 117
EP : 03800 | Score : 119
EP : 03900 | Score : 133
EP : 04000 | Score : 

## 5. Test

In [99]:
# Evaluate the trained global actor greedily (argmax action) for 5 episodes.
env = gym.make('CartPole-v1')
for i in range(5):
    state, done, score = env.reset(), False, 0

    while not done:
        policy = agent.actor_network(state[None, :])
        action = np.argmax(policy[0])
        # Step the environment and move straight on to the new observation.
        next_state, reward, done, info = env.step(action)
        state = next_state
        score += reward

    print(f'EP : {i+1} | Score : {int(score)}')

EP : 1 | Score : 313
EP : 2 | Score : 262
EP : 3 | Score : 337
EP : 4 | Score : 275
EP : 5 | Score : 349
