# A3C, Tensorflow 로 구현

Reinforcement learning의 학습 알고리즘 중 하나인 A3C를 Tensorflow로 구현하도록 하겠습니다. 

<img src="https://www.dropbox.com/s/i18k6m0rrtp7wem/Screenshot%202018-07-04%2000.44.32.png?raw=1">

In [1]:
import gym
import os
import multiprocessing
import shutil
import threading
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

In [2]:
# Number of worker agents: one per CPU reported by the system.
no_of_workers = multiprocessing.cpu_count()

print(no_of_workers)

# Maximum number of time steps in one episode.
no_of_ep_steps = 200

# Total number of episodes to run, counted across all workers.
no_of_episodes = 2000

# Variable-scope name that identifies the global network.
global_net_scope = 'Global_Net'

# Number of worker steps between two updates of the global network.
update_global = 100

# Discount factor.
gamma = 0.99

# Weight of the entropy term in the actor loss.
entropy_beta = 0.01

# Learning rate of the actor.
lr_a = 0.0001

# Learning rate of the critic.
lr_c = 0.001

# Whether the first worker renders its environment.
render = True

# Directory the graph summary is written to (no trailing whitespace, so the
# path is usable on every platform and from the TensorBoard command line).
log_dir = 'logs'



8


# 2. 환경 import 

![download](https://user-images.githubusercontent.com/11300712/42167947-891206e2-7e4a-11e8-8918-4c09db5d6c1d.png)

openai gym에서 MountainCar를 불러온다. 


In [3]:
# Create the MountainCarContinuous environment from OpenAI Gym.
env = gym.make('MountainCarContinuous-v0')

# Reset the environment; the call returns the initial observation.
env.reset()



[2018-07-09 08:57:19,691] Making new env: MountainCarContinuous-v0


array([-0.56173612,  0.        ])

In [4]:

# Read the size of the observation and action vectors from the environment.
no_of_states, no_of_actions = (
    env.observation_space.shape[0],
    env.action_space.shape[0],
)

# Lower and upper limits of the action space, in that order.
action_bound = [env.action_space.low, env.action_space.high]

In [5]:


# Size of the observation vector.
number_state = env.observation_space.shape[0]

# Size of the action vector.
number_actions = env.action_space.shape[0]

# Lower and upper limits of the action space, in that order.
action_bound = [env.action_space.low, env.action_space.high]

# Show the three values, one per line.
print(number_state, number_actions, action_bound, sep='\n')


2
1
[array([-1.]), array([ 1.])]


# 3. Actor Critic network 생성

<img src="https://www.dropbox.com/s/pwed476ggbzwqxz/Screenshot%202018-07-03%2022.32.59.png?raw=1">

In [6]:
class ActorCritic(object):
    """Actor and critic networks built inside the variable scope ``scope``.

    The global instance (``scope == global_net_scope``) only creates the state
    placeholder and the network parameters. Every other instance is a local
    network that additionally builds the losses, the gradients, and the ops
    that pull global parameters and push local gradients to the global net.
    """

    def __init__(self, scope, sess, globalAC=None):
        """Build the graph for one network.

        Args:
            scope: Variable-scope name of this network.
            sess: TensorFlow session used to run this network's ops.
            globalAC: The global ``ActorCritic``. Required for local networks,
                ignored for the global one.

        Raises:
            ValueError: If a local network is requested without ``globalAC``.
        """
        if scope != global_net_scope and globalAC is None:
            # Fail early with a clear message instead of an AttributeError on
            # None while the sync ops are being built.
            raise ValueError('globalAC is required when building a local network')

        self.sess = sess

        # Separate RMSProp optimizers for the actor and the critic.
        self.actor_optimizer = tf.train.RMSPropOptimizer(lr_a, name='RMSPropA')
        self.critic_optimizer = tf.train.RMSPropOptimizer(lr_c, name='RMSPropC')

        if scope == global_net_scope:
            # Global network: only the state placeholder and the parameters.
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, no_of_states], 'S')

                # Keep just the actor and critic parameter lists.
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:
            # Local network: placeholders, losses, gradients and sync ops.
            with tf.variable_scope(scope):
                # State, taken-action history and target value placeholders.
                self.s = tf.placeholder(tf.float32, [None, no_of_states], 'S')
                self.a_his = tf.placeholder(tf.float32, [None, no_of_actions], 'A')
                self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')

                # The action space is continuous, so the actor outputs the
                # parameters of a distribution rather than a discrete choice.
                mean, var, self.v, self.a_params, self.c_params = self._build_net(scope)

                # TD error: target value minus the critic's estimate.
                td = tf.subtract(self.v_target, self.v, name='TD_error')

                # The critic minimizes the mean squared TD error.
                with tf.name_scope('critic_loss'):
                    self.critic_loss = tf.reduce_mean(tf.square(td))

                # Scale the mean by the upper action bound and add a small
                # constant to var so it stays away from zero.
                with tf.name_scope('wrap_action'):
                    mean, var = mean * action_bound[1], var + 1e-4

                # Action distribution built from the adjusted mean and var.
                # NOTE(review): `var` is passed as the second positional
                # argument of Normal - confirm whether that parameter is the
                # standard deviation rather than the variance.
                normal_dist = tf.contrib.distributions.Normal(mean, var)

                with tf.name_scope('actor_loss'):
                    # Log-probability of the taken actions, weighted by TD error.
                    log_prob = normal_dist.log_prob(self.a_his)
                    exp_v = log_prob * td

                    # Entropy of the action distribution, added for exploration.
                    entropy = normal_dist.entropy()

                    self.exp_v = exp_v + entropy_beta * entropy

                    # Negated so that minimizing the loss maximizes exp_v.
                    self.actor_loss = tf.reduce_mean(-self.exp_v)

                # Sample one action and clip it to the action bounds.
                with tf.name_scope('choose_action'):
                    self.A = tf.clip_by_value(
                        tf.squeeze(normal_dist.sample(1), axis=0),
                        action_bound[0],
                        action_bound[1],
                    )

                # Gradients of each loss with respect to its own parameters.
                with tf.name_scope('local_grad'):
                    self.a_grads = tf.gradients(self.actor_loss, self.a_params)
                    self.c_grads = tf.gradients(self.critic_loss, self.c_params)

            with tf.name_scope('sync'):
                # Pull: copy the global parameters into this local network.
                with tf.name_scope('pull'):
                    self.pull_a_params_op = [
                        l_p.assign(g_p)
                        for l_p, g_p in zip(self.a_params, globalAC.a_params)
                    ]
                    self.pull_c_params_op = [
                        l_p.assign(g_p)
                        for l_p, g_p in zip(self.c_params, globalAC.c_params)
                    ]

                # Push: apply the local gradients to the global parameters.
                with tf.name_scope('push'):
                    self.update_a_op = self.actor_optimizer.apply_gradients(
                        zip(self.a_grads, globalAC.a_params))
                    self.update_c_op = self.critic_optimizer.apply_gradients(
                        zip(self.c_grads, globalAC.c_params))

    def _build_net(self, scope):
        """Create the actor and critic layers on top of ``self.s``.

        Args:
            scope: Variable-scope name used to collect the trainable variables.

        Returns:
            A tuple ``(mean, var, v, a_params, c_params)``: the actor's two
            output tensors, the critic's value tensor, and the trainable
            variables found under ``scope/actor`` and ``scope/critic``.
        """
        w_init = tf.random_normal_initializer(0., .1)

        with tf.variable_scope('actor'):
            l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la')
            mean = tf.layers.dense(l_a, no_of_actions, tf.nn.tanh, kernel_initializer=w_init, name='mean')
            var = tf.layers.dense(l_a, no_of_actions, tf.nn.softplus, kernel_initializer=w_init, name='var')

        with tf.variable_scope('critic'):
            l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc')
            v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v')

        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')

        return mean, var, v, a_params, c_params

    def update_global(self, feed_dict):
        """Run the ops that apply this network's gradients to the global net.

        Args:
            feed_dict: Values for the placeholders the gradient ops depend on.
        """
        self.sess.run([self.update_a_op, self.update_c_op], feed_dict)

    def pull_global(self):
        """Copy the global actor and critic parameters into this network."""
        self.sess.run([self.pull_a_params_op, self.pull_c_params_op])

    def choose_action(self, s):
        """Sample an action for the single state ``s``.

        Args:
            s: One state; a leading batch axis is added before it is fed.

        Returns:
            The first row of the sampled, clipped action tensor.
        """
        s = s[np.newaxis, :]
        return self.sess.run(self.A, {self.s: s})[0]

# Worker 클래스

In [7]:
class Worker(object):
    """One training worker with its own environment and local ActorCritic."""

    def __init__(self, name, globalAC, sess):
        """Create the worker's environment and local network.

        Args:
            name: Worker name; also used as the local network's scope.
            globalAC: The global ``ActorCritic`` the local network syncs with.
            sess: TensorFlow session shared with the local network.
        """
        # Each worker gets its own unwrapped environment instance.
        self.env = gym.make('MountainCarContinuous-v0').unwrapped
        self.name = name

        # Local actor-critic agent for this worker.
        self.AC = ActorCritic(name, sess, globalAC)
        self.sess = sess

    def work(self):
        """Run episodes and periodically update the global network.

        Loops until the module-level coordinator ``coord`` requests a stop or
        ``global_episodes`` reaches ``no_of_episodes``. Appends one entry per
        finished episode to ``global_rewards`` and increments
        ``global_episodes``.
        """
        global global_rewards, global_episodes

        total_step = 1

        # Buffers for the states, actions and rewards since the last update.
        buffer_s, buffer_a, buffer_r = [], [], []

        while not coord.should_stop() and global_episodes < no_of_episodes:
            # Start a new episode from a freshly reset environment.
            s = self.env.reset()

            # Accumulated (unscaled) reward of the current episode.
            ep_r = 0

            for ep_t in range(no_of_ep_steps):
                # Only the first worker renders its environment.
                if self.name == 'W_0' and render:
                    self.env.render()

                # Sample an action from the local policy and take one step.
                a = self.AC.choose_action(s)
                s_, r, done, _ = self.env.step(a)

                # Keep the environment's own terminal flag and additionally
                # end the episode when the step limit is reached.
                # NOTE(review): the flag used to be overwritten by the
                # step-limit test alone - confirm honouring it is intended.
                done = done or ep_t == no_of_ep_steps - 1

                ep_r += r

                buffer_s.append(s)
                buffer_a.append(a)

                # Scale the reward before storing it.
                # NOTE(review): the (r + 8) / 8 scaling is kept as-is; confirm
                # it suits this environment's reward range.
                buffer_r.append((r + 8) / 8)

                # Update the global network every `update_global` steps and at
                # the end of each episode.
                if total_step % update_global == 0 or done:
                    # Bootstrap from the critic unless the episode has ended.
                    if done:
                        v_s_ = 0
                    else:
                        v_s_ = self.sess.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]

                    # Discounted value targets, computed from the last
                    # buffered reward backwards and then put in step order.
                    buffer_v_target = []
                    for reward in reversed(buffer_r):
                        v_s_ = reward + gamma * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    feed_dict = {
                        self.AC.s: np.vstack(buffer_s),
                        self.AC.a_his: np.vstack(buffer_a),
                        self.AC.v_target: np.vstack(buffer_v_target),
                    }

                    # Push the local gradients to the global network.
                    self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []

                    # Pull the updated global parameters into the local net.
                    self.AC.pull_global()

                # Advance to the next state.
                s = s_
                total_step += 1

                if done:
                    # Record the episode reward; once more than five entries
                    # exist, store the mean of the last five instead.
                    global_rewards.append(ep_r)
                    if len(global_rewards) > 5:
                        global_rewards[-1] = np.mean(global_rewards[-5:])

                    global_episodes += 1
                    break
                        
          
        
        
        

# Run the model

In [None]:
# Lists/counters shared by all workers for storing the per-episode rewards
# and the number of finished episodes.
global_rewards = []
global_episodes = 0

# Start the TensorFlow session.
sess = tf.Session()

with tf.device("/cpu:0"):
    # Build the global network first; every worker syncs against it.
    global_ac = ActorCritic(global_net_scope, sess)

    workers = []

    # Create one worker per configured agent, named W_0, W_1, ...
    for i in range(no_of_workers):
        i_name = 'W_%i' % i
        workers.append(Worker(i_name, global_ac, sess))

coord = tf.train.Coordinator()
sess.run(tf.global_variables_initializer())

# Write the graph so it can be visualized in TensorBoard, removing any
# previous log directory first.
if os.path.exists(log_dir):
    shutil.rmtree(log_dir)

tf.summary.FileWriter(log_dir, sess.graph)

worker_threads = []

for worker in workers:
    # Pass the bound method directly; a lambda would look up `worker` only
    # when the thread runs, after the loop variable may have changed.
    t = threading.Thread(target=worker.work)
    t.start()
    worker_threads.append(t)
coord.join(worker_threads)
