In [1]:
import os
import tensorflow as tf
import numpy as np
from collections import deque
os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
from tensorflow import keras as tfk

In [2]:
from unityagents import UnityEnvironment
print("Start to load unity ENV")
env = UnityEnvironment(file_name='../Reacher_One_Linux_NoVis/Reacher_One_Linux_NoVis.x86_64')

Start to load unity ENV


INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
class Actor(object):
    def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter):
        self.sess = sess
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.lr = learning_rate
        self.t_replace_iter = t_replace_iter
        self.t_replace_counter = 0

        with tf.variable_scope('Actor'):
            # input s, output a
            self.a = self._build_net(S, scope='eval_net', trainable=True)

            # input s_, output a, get a_ for critic
            self.a_ = self._build_net(S_, scope='target_net', trainable=False)

        self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
        self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')
        
        self.soft_update = [tf.assign(t, (1-TAU) * t + TAU * e) for t, e in zip(self.t_params, self.e_params)]

    def _build_net(self, s, scope, trainable):
        with tf.variable_scope(scope):
            init_w = tf.contrib.layers.xavier_initializer()
            #init_w = tf.initializers.random_normal()
            init_b = tf.constant_initializer(0.001)
            
            net = tf.layers.dense(s, 256, activation=tf.nn.relu6,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l1',
                                  trainable=trainable)
            
            net = tf.layers.dense(net, 256, activation=tf.nn.relu6,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l2',
                                  trainable=trainable)
            
            net = tf.layers.dense(net, 16, activation=tf.nn.relu,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l3',
                                  trainable=trainable)
            
            with tf.variable_scope('a'):
                actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
                                          name='a', trainable=trainable)
                scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a')  # Scale output to -action_bound to action_bound
        return scaled_a

    def learn(self, s):   # batch update
        self.sess.run(self.train_op, feed_dict={S: s})
        if self.t_replace_counter % self.t_replace_iter == 0:
            self.sess.run(self.soft_update)
        self.t_replace_counter += 1

    def act(self, s):
        s = s[np.newaxis, :]    # single state
        return self.sess.run(self.a, feed_dict={S: s})[0]  # single action

    def add_grad_to_graph(self, a_grads):
        with tf.variable_scope('policy_grads'):
            self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)

        with tf.variable_scope('A_train'):
            opt = tf.train.AdamOptimizer(-self.lr)  # (- learning rate) for ascent policy
            self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))
            
            
class Critic(object):
    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a, a_):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr = learning_rate
        self.gamma = gamma
        self.t_replace_iter = t_replace_iter
        self.t_replace_counter = 0
        global_step = tf.Variable(0, trainable = False)

        with tf.variable_scope('Critic'):
            # Input (s, a), output q
            self.a = a
            self.q = self._build_net(S, self.a, 'eval_net', trainable=True)

            # Input (s_, a_), output q_ for q_target
            self.q_ = self._build_net(S_, a_, 'target_net', trainable=False)    # target_q is based on a_ from Actor's target_net

            self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
            self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')

        with tf.variable_scope('target_q'):
            self.target_q = R + self.gamma * self.q_

        with tf.variable_scope('TD_error'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))
            
        lr_c = tf.train.exponential_decay(self.lr, global_step, 100000, 0.999)
        with tf.variable_scope('C_train'):
            self.opt = tf.train.AdamOptimizer(lr_c)
            c_grads = self.opt.compute_gradients(self.loss)
            capped_gvs = [(tf.clip_by_value(grad, -3., 3.), var) for grad, var in c_grads]
            self.train_op = self.opt.apply_gradients(capped_gvs)
            #.minimize(self.loss)

        with tf.variable_scope('a_grad'):
            self.a_grads = tf.gradients(self.q, a)[0]   # tensor of gradients of each sample (None, a_dim)
            
        self.soft_update = [tf.assign(t, (1-TAU) * t + TAU * e) for t, e in zip(self.t_params, self.e_params)]

    def _build_net(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            init_w = tf.contrib.layers.xavier_initializer()
            #init_w = tf.initializers.random_normal()
            init_b = tf.constant_initializer(0.01)

            with tf.variable_scope('l1'):
                n_l1 = 256
                w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
                w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
                net = tf.nn.relu6(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
                
            net = tf.layers.dense(net, 256, activation=tf.nn.relu6,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l2',
                                  trainable=trainable)
            net = tf.layers.dense(net, 16, activation=tf.nn.relu,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l3',
                                  trainable=trainable)
            with tf.variable_scope('q'):
                q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable)   # Q(s,a)
        return q

    def learn(self, s, a, r, s_):
        self.sess.run(self.train_op, feed_dict={S: s, self.a: a, R: r, S_: s_})
        if self.t_replace_counter % self.t_replace_iter == 0:
            self.sess.run(self.soft_update)
        self.t_replace_counter += 1
        
        
class Memory(object):
    def __init__(self, capacity, dims):
        self.capacity = capacity
        self.data = np.zeros((capacity, dims))
        self.pointer = 0

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % self.capacity  # replace the old memory with new memory
        self.data[index, :] = transition
        self.pointer += 1

    def sample(self, n):
        assert self.pointer >= self.capacity, 'Memory has not been fulfilled'
        indices = np.random.choice(self.capacity, size=n)
        return self.data[indices, :]

In [4]:
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]
# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 1
Size of each action: 4
There are 1 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726671e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [5]:
def train(sess, env, actor, critic, actor_noise, M):
    time_steps = 20
    t_max = 2000
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())    
    avg_score = []
    scores_deque = deque(maxlen = 100)
    len_agents = len(str(num_agents))
    
    for i_episode in range(1, MAX_EPISODES+1):
        scores = np.zeros(num_agents)
        env_info  = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations[0]
        actor_noise.reset()
        
        for counter in range(t_max):       
            # Generate action by Actor's local_network
            actions = np.clip(actor.act(states) + actor_noise(), -1, 1) #
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations[0]   # get the next state
            rewards = env_info.rewards[0]                   # get the reward
            dones = env_info.local_done[0]                  # see if episode has finished

            M.store_transition(states, actions, rewards, next_states)
            if (counter % UPDATE_PER_ITER == 0) & (M.pointer > MEMORY_CAPACITY):
                for _ in range(10):
                    b_M = M.sample(BATCH_SIZE)
                    b_s = b_M[:, :STATE_DIM]
                    b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
                    b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]
                    b_s_ = b_M[:, -STATE_DIM:]

                    critic.learn(b_s, b_a, b_r, b_s_)
                    actor.learn(b_s)
            
            states = next_states
            scores += rewards
            
            if np.any(dones):
                break
        score = np.mean(scores)
        avg_score.append(score)
        scores_deque.append(score)
        
        print('\rEpisode {}\tEpisode Score: {:.2f}\tAverage Score: {:.2f}\tMax Score: {:.2f}'.format(i_episode, score, np.mean(scores_deque), np.max(avg_score)), end="")
        
        if np.mean(scores_deque) >= 30.:
            print("Game solved")
            saver.save(sess, "./ddpg.ckpt", write_meta_graph = False)
            break
    return avg_score

In [6]:
MAX_EPISODES = 1200
UPDATE_PER_ITER = 20
LR_A = 1e-3  # learning rate for actor
LR_C = 1e-3  # learning rate for critic
GAMMA = 0.9  # reward discount
TAU = 1e-2
REPLACE_ITER_A = 10 #1100
REPLACE_ITER_C = 10 #1000
MEMORY_CAPACITY = 10000
BATCH_SIZE = 64

from agent import OrnsteinUhlenbeckActionNoise
from tqdm import tqdm

In [7]:
tf.reset_default_graph()
with tf.Session() as sess:
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = ACTION_DIM = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = STATE_DIM = states.shape[1]
    action_bound = ACTION_BOUND = 1
    print("State size: %i, Action size: %i, Action bound: %.2f" % (STATE_DIM, ACTION_DIM, ACTION_BOUND) )
    """
    Set model
    """
    with tf.name_scope('S'):
        S = tf.placeholder(tf.float32, shape=[None, state_size], name='s')
    
    with tf.name_scope('R'):
        R = tf.placeholder(tf.float32, [None, 1], name='r')
        
    with tf.name_scope('S_'):
        S_ = tf.placeholder(tf.float32, shape=[None, state_size], name='s_')
    
    actor = Actor(sess, action_size, action_bound, learning_rate=LR_A, t_replace_iter=REPLACE_ITER_A)
    critic = Critic(sess, state_size, action_size, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
    actor.add_grad_to_graph(critic.a_grads)
    M = Memory(MEMORY_CAPACITY, dims=2 * state_size + action_size + 1)
    
    actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_size))
    
    score = train(sess, env, actor, critic, actor_noise, M)

State size: 33, Action size: 4, Action bound: 1.00
Episode 644	Episode Score: 6.45	Average Score: 16.32	Max Score: 38.266

KeyboardInterrupt: 

In [None]:
import pandas as pd
df_score = pd.DataFrame({'score':score})
df_score.to_csv("result.csv")

In [None]:
import matplotlib.pyplot as plt
plt.plot(range(len(score)), score)
plt.show()