In [1]:
import os
import tensorflow as tf
import numpy as np
from collections import deque
os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
from tensorflow import keras as tfk

In [2]:
# Launch the Unity Reacher environment used by every later cell (`env` is a notebook global).
from unityagents import UnityEnvironment
print("Start to load unity ENV")
# The executable path is relative to the notebook's working directory.
# NOTE(review): the "NoVis" / "One" parts of the file name suggest a no-visualisation,
# single-agent Linux build -- confirm against the downloaded binaries.
env = UnityEnvironment(file_name='../Reacher_One_Linux_NoVis/Reacher_One_Linux_NoVis.x86_64')

Start to load unity ENV


INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
class DDPG(object):
    """Deep Deterministic Policy Gradient agent implemented as a TF1 graph.

    The constructor builds eval/target actor and critic networks, their
    training ops and the target-update ops in the default graph, then
    initialises every variable in ``sess``.  Transitions are stored in a
    pre-allocated circular NumPy buffer.

    Relies on the module-level hyper-parameters MEMORY_CAPACITY, BATCH_SIZE,
    NUM_UPDATES_PER_EPOCH, GAMMA, TAU, LR_A and LR_C.
    """

    # Learning is attempted only on steps whose index is a multiple of this value.
    UPDATE_EVERY = 20

    def __init__(self, a_dim, s_dim, a_bound, sess):
        """Build the graph and initialise all variables.

        Args:
            a_dim: size of the action vector.
            s_dim: size of the state vector.
            a_bound: factor the actor's tanh output is multiplied by.
            sess: TensorFlow session used for every ``run`` call of this agent.
        """
        # Circular replay buffer; each row is [state | action | reward | next_state | done].
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1 + 1),
                               dtype=np.float32)
        self.pointer = 0  # total number of transitions written so far
        self.sess = sess
        global_step = tf.Variable(0, trainable=False)

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 'state_input')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 'next_state_input')
        self.R = tf.placeholder(tf.float32, [None, 1], 'reward')
        self.done = tf.placeholder(tf.float32, [None, 1], 'done')

        # Only consumed by the (currently commented-out) batch-normalisation layers.
        self.actor_phase = tf.placeholder(tf.bool, shape = [], name = 'is_training')
        with tf.variable_scope('Actor'):
            self.action = self.create_actor(self.S, scope='eval', trainable=True)
            action_ = self.create_actor(self.S_, scope='target', trainable=False)

        # Only consumed by the (currently commented-out) batch-normalisation layers.
        self.critic_phase = tf.placeholder(tf.bool, shape = [], name = 'is_training')
        with tf.variable_scope('Critic'):
            # self.action comes from the actor by default; when training the
            # critic it is overridden through the feed dict with the actions
            # stored in the replay buffer.
            self.v = self.create_critic(self.S, self.action, scope='eval', trainable=True)
            self.v_ = self.create_critic(self.S_, action_, scope='target', trainable=False)

        # networks parameters (collected before the optimizers exist, so the
        # lists contain no optimizer slot variables)
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # Target-network updates.  Actor and critic pairs are concatenated
        # instead of zipped four-wide, so no pair is silently dropped if the
        # two networks ever have a different number of variables.
        eval_params = self.ae_params + self.ce_params
        target_params = self.at_params + self.ct_params
        self.soft_replace = [tf.assign(target, (1 - TAU) * target + TAU * online)
                             for target, online in zip(target_params, eval_params)]
        self.hard_replace = [tf.assign(target, online)
                             for target, online in zip(target_params, eval_params)]

        # One-step TD target; (1 - done) removes the bootstrap term on terminal transitions.
        self.v_target = (self.R) + GAMMA * self.v_ * (1.-self.done)

        # in the feed dict for the td_error, the self.action should change to actions in memory
        self.td_error = tf.reduce_mean(tf.reduce_sum(tf.square(self.v_target - self.v), axis = 1))

        lr_c = tf.train.exponential_decay(LR_C, global_step, 10000, 0.999)
        optimizer_critic = tf.train.AdamOptimizer(lr_c)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            c_grads = optimizer_critic.compute_gradients(self.td_error, var_list = self.ce_params)
            capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in c_grads]
            # global_step is advanced here, once per learning iteration; without
            # this the two exponential_decay schedules would never decay.
            self.critic_train = optimizer_critic.apply_gradients(capped_gvs,
                                                                 global_step=global_step)

        self.exp_v = -tf.reduce_mean(self.v)    # maximize the q

        lr_a = tf.train.exponential_decay(LR_A, global_step, 10000, 0.999)
        optimizer_actor = tf.train.AdamOptimizer(lr_a)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            self.actor_train = optimizer_actor.minimize(self.exp_v, var_list = self.ae_params)

        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        # Start the target networks as exact copies of the eval networks.
        self.sess.run(self.hard_replace)

    def act(self, s):
        """Return the actor's action for a single state.

        Args:
            s: 1-D state array; a batch axis is prepended before it is fed.

        Returns:
            Array with a leading batch axis of size 1.
        """
        act = self.sess.run(self.action, {self.S: s[np.newaxis, :],
                                          self.actor_phase: False})
        return act

    def step(self, s, a, r, s_, t, timestep):
        """Store one transition and, when due, run a round of learning updates.

        Learning runs only when more than BATCH_SIZE transitions have been
        stored and ``timestep`` is a multiple of ``UPDATE_EVERY``.

        Args:
            s: state, 1-D of length s_dim.
            a: action, 1-D of length a_dim.
            r: scalar reward.
            s_: next state, 1-D of length s_dim.
            t: scalar done flag (multiplies out the bootstrap term when 1).
            timestep: index of the current step, used for update scheduling.
        """
        transition = np.hstack((s, a, [r], s_, [t]))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

        # The buffer is pre-allocated, so len(self.memory) is always the
        # capacity; the number of rows actually written is what matters.
        stored = min(self.pointer, MEMORY_CAPACITY)
        if stored <= BATCH_SIZE or timestep % self.UPDATE_EVERY != 0:
            return

        state_end = self.s_dim
        action_end = state_end + self.a_dim
        reward_end = action_end + 1
        next_state_end = reward_end + self.s_dim

        for _ in range(NUM_UPDATES_PER_EPOCH):
            # Sample (with replacement) only from rows that hold real transitions.
            indices = np.random.choice(stored, size=BATCH_SIZE)
            bt = self.memory[indices, :]

            bs = bt[:, :state_end]
            ba = bt[:, state_end:action_end]
            br = bt[:, action_end:reward_end]
            bs_ = bt[:, reward_end:next_state_end]
            bd = bt[:, -1:]

            # Critic update: override the actor output with the stored actions.
            self.sess.run(self.critic_train,
                          {self.S: bs,
                           self.action: ba,
                           self.R: br,
                           self.S_: bs_,
                           self.done: bd,
                           self.critic_phase: True,
                           self.actor_phase: False
                          })
            # Actor update: self.action is computed by the actor inside the graph.
            self.sess.run(self.actor_train, {self.S: bs})

            # soft target replacement
            self.sess.run(self.soft_replace)

    def create_actor(self, s, scope, trainable):
        """Build an actor network: two 128-unit ReLU layers and a tanh output.

        Args:
            s: state tensor.
            scope: variable-scope name for this copy of the network.
            trainable: whether the layers' variables are trainable.

        Returns:
            Action tensor: the tanh output multiplied by ``self.a_bound``.
        """
        init_w = tf.contrib.layers.xavier_initializer()
        init_b = tf.constant_initializer(0.001)

        with tf.variable_scope(scope):
            x = tf.layers.dense(s, 128,
                                  kernel_initializer=init_w, bias_initializer=init_b,
                                  name='l1', trainable=trainable)
            #x = tf.layers.batch_normalization(x, trainable = trainable, training=self.actor_phase)
            x = tf.nn.relu(x)

            x = tf.layers.dense(x, 128,
                                kernel_initializer=init_w, bias_initializer=init_b,
                                name='l2', trainable=trainable)
            #x = tf.layers.batch_normalization(x, trainable = trainable, training=self.actor_phase)
            x = tf.nn.relu(x)

            a = tf.layers.dense(x, self.a_dim, activation=tf.nn.tanh,
                                name='action', trainable=trainable)

            return tf.multiply(a, self.a_bound, name='policy_action')

    def create_critic(self, s, a, scope, trainable):
        """Build a critic network estimating Q(s, a).

        The state passes through one 128-unit ReLU layer, is concatenated with
        the action, and goes through a second 128-unit ReLU layer followed by
        a single linear output unit.

        Args:
            s: state tensor.
            a: action tensor, concatenated along axis 1.
            scope: variable-scope name for this copy of the network.
            trainable: whether the layers' variables are trainable.

        Returns:
            Tensor with one Q-value per batch row.
        """
        init_w = tf.contrib.layers.xavier_initializer()
        init_b = tf.constant_initializer(0.01)

        with tf.variable_scope(scope):
            x = tf.layers.dense(s, 128,
                                kernel_initializer=init_w, bias_initializer=init_b,
                                trainable = trainable)
            #x = tf.layers.batch_normalization(x, trainable= trainable, training=self.critic_phase)
            x = tf.nn.relu(x)

            x = tf.concat([x, a], axis = 1)

            x = tf.layers.dense(x, 128,
                                kernel_initializer=init_w, bias_initializer=init_b,
                                trainable = trainable)
            #x = tf.layers.batch_normalization(x, trainable= trainable, training=self.critic_phase)
            x = tf.nn.relu(x)

            return tf.layers.dense(x, 1, trainable=trainable)  # Q(s,a)

    def save_model(self, model_name = None):
        """Save all variables to ``model_name`` ('ddpg.ckpt' when None)."""
        self.saver.save(self.sess, 'ddpg.ckpt' if model_name is None else model_name)

    def load_model(self, model_name = None):
        """Restore all variables from ``model_name`` ('ddpg.ckpt' when None)."""
        self.saver.restore(self.sess, 'ddpg.ckpt' if model_name is None else model_name)

In [4]:
# Inspect the environment and define the notebook globals used by later cells
# (brain_name, brain, num_agents, action_size, state_size).
# Use the first brain exposed by the environment.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset the environment in training mode and keep the info for this brain.
env_info = env.reset(train_mode=True)[brain_name]
# Number of agents reported by the environment (read later by train()).
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# Size of each action vector, as declared by the brain.
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# Examine the state space: one row of observations per agent,
# so shape[1] is the per-agent state length.
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 1
Size of each action: 4
There are 1 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726671e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [5]:
def train(sess, env, ddpg, actor_noise):
    """Run training episodes against ``env`` and return the per-episode scores.

    Only the first agent's observation, reward and done flag are used.  Reads
    the notebook globals ``MAX_EPISODES``, ``num_agents`` and ``brain_name``.
    Training stops early once the mean score over the last 100 episodes
    reaches 30.

    Args:
        sess: TensorFlow session.  Not used here: the agent initialises its own
            variables when it is constructed, and re-initialising them in this
            function would throw away weights restored via ``load_model``.
            Kept so existing call sites keep working.
        env: environment whose ``reset(train_mode=...)`` and ``step(actions)``
            return a mapping indexed by brain name.
        ddpg: agent exposing ``act``, ``step``, ``s_dim`` and ``a_dim``.
        actor_noise: exploration-noise object.  Currently unused because the
            noise term below is disabled.

    Returns:
        List with one score per completed episode.
    """
    t_max = 2000  # hard cap on the number of steps in one episode

    avg_score = []
    scores_deque = deque(maxlen = 100)  # window for the "solved" criterion

    for i_episode in range(1, MAX_EPISODES+1):
        scores = np.zeros(num_agents)
        env_info  = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations[0]

        for counter in range(t_max):
            # Generate action by Actor's local_network
            # NOTE(review): exploration noise is disabled, so the policy is
            # purely deterministic during training -- confirm this is intended.
            actions = ddpg.act(states) #+ actor_noise()
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations[0]   # get the next state
            rewards = env_info.rewards[0]                   # get the reward
            dones = env_info.local_done[0]                  # see if episode has finished

            ddpg.step(np.reshape(states, (ddpg.s_dim,)),
                      np.reshape(actions, (ddpg.a_dim,)),
                      rewards,
                      np.reshape(next_states, (ddpg.s_dim,)),
                      dones * 1.,   # done flag as a float
                      counter
                     )
            states = next_states
            scores += rewards
            if np.any(dones):
                break
        score = np.mean(scores)
        avg_score.append(score)
        scores_deque.append(score)

        print('\rEpisode {}\tEpisode Score: {:.2f}\tAverage Score: {:.2f}\tMax Score: {:.2f}\tMin Score: {:.2f}'.format(i_episode, score, np.mean(scores_deque), np.max(avg_score), np.min(avg_score)), end="")

        if np.mean(scores_deque) >= 30.:
            print("Game solved")
            #ddpg.save_model()
            break
    return avg_score

In [6]:
# Hyper-parameters read as globals by DDPG and train().
MAX_EPISODES = 500            # upper bound on the number of training episodes
LR_A = 0.001                  # base learning rate of the actor optimizer
LR_C = 0.001                  # base learning rate of the critic optimizer
GAMMA = 0.99                  # reward discount factor in the TD target
TAU = 0.001                   # mixing factor of the soft target update
MEMORY_CAPACITY = 10 ** 6     # number of rows in the replay buffer
BATCH_SIZE = 256              # transitions sampled per gradient update
NUM_UPDATES_PER_EPOCH = 10    # gradient updates per learning round
ACTION_BOUND = 1              # magnitude bound intended for the actions
NN = 1                        # NOTE(review): not referenced in the visible cells -- confirm it is still needed

from agent import OrnsteinUhlenbeckActionNoise
from tqdm import tqdm

In [7]:
# Build a fresh graph, create the agent and run training inside one session.
tf.reset_default_graph()
with tf.Session() as sess:
    # Re-read the dimensions from the environment so this cell does not depend
    # on stale values left over from an earlier inspection.
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    # Take the bound from the hyper-parameter constant instead of repeating
    # the literal, so changing ACTION_BOUND actually affects the agent.
    action_bound = ACTION_BOUND

    ddpg = DDPG(s_dim=state_size, a_dim=action_size, a_bound=action_bound, sess=sess)

    actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_size))

    # `score` holds the per-episode scores and is consumed by the cells below;
    # it is only assigned if train() returns normally.
    score = train(sess, env, ddpg, actor_noise)

Episode 75	Episode Score: 0.64	Average Score: 0.54	Max Score: 1.56	Min Scroe: 0.00

KeyboardInterrupt: 

In [None]:
# Persist the per-episode scores returned by train() so they survive the kernel.
import pandas as pd
# Single-column frame; the row index is the position in the `score` list.
df_score = pd.DataFrame({'score':score})
# Written relative to the notebook's working directory (index column included).
df_score.to_csv("result.csv")

In [None]:
# Plot the learning curve: one point per completed training episode.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(range(len(score)), score)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel("Episode index")
ax.set_ylabel("Score")
ax.set_title("DDPG training score per episode")
plt.show()