In [1]:
import os
import tensorflow as tf
import numpy as np
from collections import deque
os.environ['CUDA_VISIBLE_DEVICES'] = "-1"

In [2]:
from unityagents import UnityEnvironment
print("Start to load unity ENV")
env = UnityEnvironment(file_name='../Reacher_One_Linux_NoVis/Reacher_One_Linux_NoVis.x86_64')

Start to load unity ENV


INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound, sess):
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1 + 1), dtype=np.float32) # state, next_state, action, reward, done
        self.pointer = 0
        self.sess = sess

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
        self.S = tf.placeholder(tf.float32, [None, s_dim], 'state_input')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 'next_state_input')
        self.R = tf.placeholder(tf.float32, [None, 1], 'reward')
        self.done = tf.placeholder(tf.float32, [None, 1], 'done')

        with tf.variable_scope('Actor'):
            self.action = self.create_actor(self.S, scope='eval', trainable=True)
            action_ = self.create_actor(self.S_, scope='target', trainable=False)
            
        with tf.variable_scope('Critic'):
            # assign self.a = a in memory when calculating q for td_error,
            # otherwise the self.a is from Actor when updating Actor
            self.v = self.create_critic(self.S, self.action, scope='eval', trainable=True)
            self.v_ = self.create_critic(self.S_, action_, scope='target', trainable=False)

        # networks parameters
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # target net replacement
        self.soft_replace = [[tf.assign(ta, (1 - TAU) * ta + TAU * ea), 
                              tf.assign(tc, (1 - TAU) * tc + TAU * ec)]
                             for ta, ea, tc, ec in zip(self.at_params, self.ae_params, 
                                                       self.ct_params, self.ce_params)]

        self.v_target = (self.R) + GAMMA * self.v_ * (1.-self.done)
        
        # in the feed dict for the td_error, the self.action should change to actions in memory
        #self.td_error = tf.losses.mean_squared_error(labels=v_target, predictions=v)
        self.td_error = tf.reduce_mean(tf.square(self.v_target - self.v))
        self.critic_train = tf.train.AdamOptimizer(LR_C).minimize(self.td_error, 
                                                                  var_list=self.ce_params)

        self.exp_v = -tf.reduce_mean(self.v)    # maximize the q
        self.actor_train = tf.train.AdamOptimizer(LR_A).minimize(self.exp_v, 
                                                                 var_list=self.ae_params)

        self.sess.run(tf.global_variables_initializer())
        
        
    def choose_action(self, s):
        act = self.sess.run(self.action, {self.S: s[np.newaxis, :]})[0]
        return act

    def learn(self):
        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]
        ba = bt[:, (self.s_dim):(self.s_dim + self.a_dim)]
        br = bt[:, (self.s_dim + self.a_dim):(self.s_dim + self.a_dim + 1)]
        bs_ = bt[:, (self.s_dim + self.a_dim + 1):(self.s_dim + self.a_dim + 1 + self.s_dim)]
        bd = bt[:, -1::]
        
        _, td_error, ve, vt = self.sess.run([self.critic_train, self.td_error, self.v, self.v_target], 
                                    {self.S: bs, 
                                     self.action: ba, 
                                     self.R: br, 
                                     self.S_: bs_,
                                     self.done: bd
                                    })
        _, q_value = self.sess.run([self.actor_train, self.exp_v], {self.S: bs})

        # soft target replacement
        self.sess.run(self.soft_replace)
        return td_error, q_value, ve, vt

    def store_transition(self, s, a, r, s_, t):
        transition = np.hstack((s, a, [r], s_, [t]))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1
        
    def create_actor(self, s, scope, trainable):
        with tf.variable_scope(scope):
            net = tf.layers.dense(s, 64, activation=tf.nn.leaky_relu, name='l1', trainable=trainable)
            net = tf.layers.dense(net, 32, activation=tf.nn.leaky_relu, name='l2', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='action', trainable=trainable)
            
            return tf.multiply(a, self.a_bound, name='policy_action')

    def create_critic(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            n_l1 = 64
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable, initializer=tf.initializers.random_normal())
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable, initializer=tf.initializers.random_normal())
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.leaky_relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            
            net = tf.layers.dense(net, 32, activation = tf.nn.leaky_relu, trainable = trainable)
            
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)

In [4]:
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]
# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 1
Size of each action: 4
There are 1 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726671e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [5]:
def train(sess, env, ddpg, actor_noise):
    time_steps = 20
    num_update = 10
    t_max = 800
    
    sess.run(tf.global_variables_initializer())    
    avg_score = []
    scores_deque = deque(maxlen = 100)
    scores = np.zeros(num_agents)
    len_agents = len(str(num_agents))
    
    env_info  = env.reset(train_mode=True)[brain_name]
    
    for i_episode in range(1, MAX_EPISODES+1):
        env_info  = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        
        for counter in range(t_max):
            
            # Generate action by Actor's local_network
            action = ddpg.choose_action(state) + actor_noise()
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]   # get the next state
            reward = float(env_info.rewards[0])                   # get the reward
            done = env_info.local_done[0]                  # see if episode has finished

            ddpg.store_transition(np.reshape(state, (ddpg.s_dim,)), 
                                  np.reshape(action, (ddpg.a_dim,)),
                                  reward,
                                  np.reshape(next_state, (ddpg.s_dim,)),
                                  done * 1.
                                 )

            if (counter % time_steps == 0):
                for _ in range(num_update):
                    ddpg.learn()
            
            state = next_state
            scores += reward
            
            if np.any(done):
                break

        score = np.mean(scores)
        avg_score.append(score)
        scores_deque.append(score)
        
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)), end="")
        
    return avg_score

In [6]:
MAX_EPISODES = 100
LR_A = 1e-3 # learning rate for actor
LR_C = 1e-3 # learning rate for critic
GAMMA = 0.99 # reward discount factor
TAU = 0.01 # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 64
NUM_UPDATES_PER_EPOCH = 5
ACTION_BOUND = 1

from agent import OrnsteinUhlenbeckActionNoise
from tqdm import tqdm

In [7]:
tf.reset_default_graph()
with tf.Session() as sess:
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    action_bound = 1
    
    ddpg = DDPG(s_dim=state_size, a_dim=action_size, a_bound=1, sess=sess)
    
    actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_size))
    
    score = train(sess, env, ddpg, actor_noise)

Episode 1000	Average Score: 1526.13