In [1]:
import tensorflow as tf
import tensorflow.contrib as tc
import numpy as np
import gym
import matplotlib.pyplot as plt
import seaborn as sns

from replay_buffer import ReplayBuffer

In [2]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

In [3]:
class Actor(object):
    def __init__(self, sess, nObs, nAct, actionBound, lr, tau, \
                 nodes, batchSize, trainable, layer_norm):
        self.sess = sess
        self.nObs = nObs
        self.nAct = nAct
        self.actionBound = actionBound
        self.lr = lr
        self.tau = tau
        self.nodes = nodes
        self.batchSize = batchSize
        self.trainable = trainable
        self.layer_norm = layer_norm
        
        with tf.variable_scope('actor'):
            self.obs, self.action, self.scaledAction = self.buildNet()
            
        self.netParams = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'actor')
        
        with tf.variable_scope('target_actor'):
            self.target_obs, self.target_action, self.target_scaledAction = self.buildNet()
        
        self.target_netParams = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'target_actor')
        self.update_target = \
            [self.target_netParams[i].assign(tf.multiply(self.netParams[i], self.tau) + \
                                             tf.multiply(self.target_netParams[i], 1. - self.tau))
                        for i in range(len(self.target_netParams))]
        
        # from critic with action taken
        self.actionGrads = tf.placeholder(tf.float32, [None, self.nAct], 'gradient')
        
        # actor gradients
        self.grads_ = tf.gradients(self.scaledAction, self.netParams, \
                                 -self.actionGrads)
        # normalized actor gradients by batchSize
        self.grads = list(map(lambda x: tf.div(x, self.batchSize), \
                             self.grads_))
        
        self.optimize = tf.train.AdamOptimizer(self.lr).apply_gradients(\
                        zip(self.grads, self.netParams))
        
    def buildNet(self):
        obs = tf.placeholder(tf.float32, [None, self.nObs], 'observation')

        
        W1 = tf.get_variable("W1", [self.nObs, self.nodes[0]],
                       initializer=tf.contrib.layers.xavier_initializer())
        b1 = tf.Variable(tf.random_normal([self.nodes[0]]))
        if self.layer_norm:
            L1_norm = tc.layers.layer_norm(tf.add(tf.matmul(obs, W1), b1), \
                                          center=True, scale=True)
            L1 = tf.nn.relu(L1_norm)
        else:
            L1 = tf.nn.relu(tf.add(tf.matmul(obs, W1), b1))

        W2 = tf.get_variable("W2", [self.nodes[0], self.nodes[1]],
                       initializer=tf.contrib.layers.xavier_initializer())
        b2 = tf.Variable(tf.random_normal([self.nodes[1]]))
        if self.layer_norm:
            L2_norm = tc.layers.layer_norm(tf.add(tf.matmul(L1, W2), b2), \
                                      center=True, scale=True)
            L2 = tf.nn.relu(L2_norm)
        else:
            L2 = tf.nn.relu(tf.add(tf.matmul(L1, W2), b2))


        
        W3 = tf.get_variable("W3", [self.nodes[1], self.nAct],
                       initializer=tf.contrib.layers.xavier_initializer())
        b3 = tf.Variable(tf.random_normal([self.nAct]))

        action = tf.nn.tanh(tf.add(tf.matmul(L2, W3), b3))    
        scaledAction = tf.multiply(action, self.actionBound)
    
        return obs, action, scaledAction
   
    def gen_action(self, s):
        return self.sess.run(self.scaledAction, {self.obs: s})
        
    def target_gen_action(self, s):
        return self.sess.run(self.target_scaledAction, {self.target_obs: s})
        
    def train(self, s, grads):
        self.sess.run(self.optimize, {self.obs: s, self.actionGrads: grads})
        
    def update_target_net(self):
        self.sess.run(self.update_target)
    
    def get_num_net_params(self):
        return len(self.netParams) + len(self.target_netParams)
    

In [22]:
class Critic(object):
    def __init__(self, sess, nObs, nAct, lr, tau, gamma, nodes,\
                 trainable, layer_norm):
        self.sess = sess
        self.nObs = nObs
        self.nAct = nAct
        self.lr = lr
        self.tau = tau
        self.gamma = gamma
        self.nodes = nodes
        self.trainable = trainable 
        self.layer_norm = layer_norm
        
        with tf.variable_scope('critic'):
            self.obs, self.action, self.v = self.buildNet()
            
        self.netParams = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
    
        with tf.variable_scope('target_critic'):
            self.target_obs, self.target_action, self.target_v = self.buildNet()
        
        self.target_netParams = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_critic')
    
        self.update_target = \
            [self.target_netParams[i].assign(tf.multiply(self.netParams[i], self.tau) + \
                                             tf.multiply(self.target_netParams[i], 1. - self.tau))
                        for i in range(len(self.target_netParams))]
        
        self.actionGrads = tf.gradients(self.v, self.action)
        
        # from target
        self.predictedQ = tf.placeholder(tf.float32, [None, 1])
        
        self.loss = tf.reduce_sum(tf.square(self.predictedQ - self.v))
        self.optimize = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
                
        
    def buildNet(self):
        obs = tf.placeholder(tf.float32, [None, self.nObs], 'observation')
        a = tf.placeholder(tf.float32, [None, self.nAct], 'action')
                         
        W1 = tf.get_variable("W1", [self.nObs, self.nodes[0]], \
                initializer=tf.contrib.layers.xavier_initializer())
        b1 = tf.Variable(tf.random_normal([self.nodes[0]]), 'b1')
        cell = tf.nn.rnn_cell.BasicRNNCell(self.nodes[0])
        
        outputs, states = tf.nn.dynamic_rnn(cell, obs, dtype=tf.float32)
        
        outputs = tf.transpose(outputs, [1, 0, 2])
        outputs = outputs[-1]
        
        if self.layer_norm:
            L1_norm = tc.layers.layer_norm(tf.add(tf.matmul(outputs, W1), b1), \
                                      center=True, scale=True)
            L1 = tf.nn.relu(L1_norm)
        else:
            L1 = tf.nn.relu(tf.add(tf.matmul(outputs, w1), b1))
        
        W2 = tf.get_variable('W2', [self.nodes[0], self.nodes[1]], \
                initializer=tf.contrib.layers.xavier_initializer())
        b2 = tf.Variable(tf.random_normal([self.nodes[1]]))
       
        W2_ = tf.get_variable('W2_', [self.nAct, self.nodes[1]], \
                initializer=tf.contrib.layers.xavier_initializer())
        b2_ = tf.Variable(tf.random_normal([self.nodes[1]]))
        if self.layer_norm:
            L2_norm = tc.layers.layer_norm(
                tf.matmul(L1, W2) + b2 + tf.matmul(a, W2_) + b2_, \
                                      center=True, scale=True)
            L2 = tf.nn.relu(L2_norm)
        else:
            L2 = tf.nn.relu(
                tf.matmul(L1, W2) + b2 + tf.matmul(a, W2_) + b2_)

        
        W3 = tf.get_variable('W3', [self.nodes[1], 1], \
                initializer=tf.contrib.layers.xavier_initializer())
        b3 = tf.Variable(tf.random_normal([1]))
        v = tf.matmul(L2, W3) + b3

        return obs, a, v
    
    def gen_value(self, s, a):
        return self.sess.run(self.v, {self.obs: s, self.action: a})
        
    def target_gen_value(self, s, a):
        return self.sess.run(self.target_v, {self.target_obs: s, 
                                             self.target_action: a})
        
    def train(self, s, a, q):
        return self.sess.run([self.v, self.optimize], \
                                  {self.obs: s, self.action: a,\
                                      self.predictedQ: q})
        
    def get_action_grads(self, s, a):
        return self.sess.run(self.actionGrads, {self.obs: s, self.action: a})
        
    def update_target_net(self):
        self.sess.run(self.update_target)   

In [23]:
class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.3, theta=.15, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
                self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)

In [24]:
def normalize(x, stats):
    if stats is None:
        return x
    
    return (x - stats.mean)/stats.std

def denormalize(x, stats):
    if stats is None:
        return x
    
    return x*stats.std + stats.mean

In [25]:
def build_summaries():
    ep_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", ep_reward)
    ep_Qmax = tf.Variable(0.)
    tf.summary.scalar("Qmax", ep_Qmax)

    summary_vars = [ep_reward, ep_Qmax]
    summary_ops = tf.summary.merge_all()

    return summary_ops, summary_vars

In [26]:
random_seed = '1234'
envName = 'Pendulum-v0'
env = gym.make(envName)

nObs = env.observation_space.shape[0]
nAct = env.action_space.shape[0]
actionBound = env.action_space.high
print("*********** {} ************".format(envName))
print('  observation: {}   |   action: {}'.format(nObs, nAct))
print("************************************")


actorLr = 1e-3
actorNodes = [16, 64]
criticNodes = [16, 64]
criticLr = 1e-2
tau = 1e-3
batchSize = 64
gamma = 0.99
bufferSize = 1e6
layer_norm = True

episode = 200
step = 200
render = False

summaryFile = './log/summary_dir'

*********** Pendulum-v0 ************
  observation: 3   |   action: 1
************************************


In [27]:
summary_ops, summary_vars = build_summaries()

tf.reset_default_graph()
sess = tf.Session(config=config)
writer = tf.summary.FileWriter(summaryFile, sess.graph)

actor = Actor(sess, nObs=nObs, nAct=nAct, lr=actorLr, tau=tau, nodes=actorNodes, \
    batchSize=batchSize, actionBound=actionBound, trainable=True, layer_norm=layer_norm)

critic = Critic(sess, nObs=nObs, nAct=nAct, lr=criticLr, tau=tau, nodes=criticNodes, \
            gamma=gamma, trainable=True, layer_norm=layer_norm)

actorNoise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nAct))

sess.run(tf.global_variables_initializer())

actor.update_target_net()
critic.update_target_net()

replayBuffer = ReplayBuffer(bufferSize)

R, Qmax = [], []

for nEp in range(episode):
    s = env.reset()
    
    ep_reward = 0
    ep_Qmax = 0
    for nStep in range(step):
        a = actor.gen_action(np.reshape(s, (1, actor.nObs))) + actorNoise()
        
        s2 , r, done, info = env.step(a[0])
            
        replayBuffer.add(np.reshape(s, (actor.nObs,)), np.reshape(a, (actor.nAct,)),\
                         r, done, np.reshape(s2, (actor.nObs, )))
        
        if replayBuffer.size() > batchSize:
            sBatch, aBatch, rBatch, doneBatch, s2Batch = \
                            replayBuffer.sample_batch(batchSize)
            
            targetQ = critic.target_gen_value(s2Batch, \
                        actor.gen_action(s2Batch))
            
            for i in range(batchSize):
                if not doneBatch[i]:
                    rBatch[i] = rBatch[i] + critic.gamma*targetQ[i]
                    
            predictedQ, _ = critic.train(sBatch, aBatch, np.reshape(rBatch, (batchSize, 1)))
            
            ep_Qmax += np.amax(predictedQ)
            
            grads = critic.get_action_grads(sBatch, actor.gen_action(sBatch))
            actor.train(sBatch, grads[0])
            
            actor.update_target_net()
            critic.update_target_net()
            
        s = s2
        ep_reward += r
        
        print('\rReward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \
                    nEp, (ep_Qmax / (float(nStep)+0.1))), end='')
            
    if done:
        print('\n')
        Qmax.append(ep_Qmax / (float(nStep)+0.1))
        R.append(ep_reward)
        
#         summary_str = sess.run(summary_ops, feed_dict=\
#                               {summary_vars[0]: 1,
#                                summary_vars[1]: 1})
#         writer.add_summary(summary_str, nEp)
#         writer.flush()
        
        if nEp != 0 and nEp % 20 == 0:
            fig = plt.figure(figsize=(20, 10))
            plt.style.use('seaborn-darkgrid')
            plt.subplot(211)
            plt.plot(range(len(R)), R)
            plt.title("reward per episode")
            plt.subplot(212)
            plt.plot(range(len(Qmax)), Qmax)
            plt.title("Qmax per episode")
            plt.savefig("./figs/results/results_" + str(nEp) + ".png")
            plt.show()

ValueError: Shape (3, ?) must have rank at least 3

## Issues when building actor & critic net.

* <U>**Assigning weight and bias variables is better as the above than the below**</U>

x = tf.layers.dense(obs, units=self.nodes[0], activation=tf.nn.relu,    trainable=self.trainable, name='p_fc0') 
                         
x = tf.layers.dense(x, units=self.nodes[1], activation=tf.nn.relu, trainable=self.trainable, name='p_fc1')     
                         
action = tf.layers.dense(x, units=self.nAct, activation=tf.nn.relu,
trainable=self.trainable, name='p_fc2')
                         
scaledAction = tf.multiply(action, self.actionBound)

-------------------------------------------------------------------------


* <U>**The below "batch normalization" does not work well**</U> 

L1 = tf.nn.relu(tf.contrib.layers.batch_norm(tf.add(tf.matmul(obs, W1), b1)))

L2 = tf.nn.relu(tf.contrib.layers.batch_norm(tf.add(tf.matmul(L1, W2), b2)))

-------------------------------------------------------------------------


* <U>**Getting trainable variables of networks:**</U>

self.netParams = tf.trainable_variables()

self.target_netParams = tf.trainable_variables()[len(self.netParams):]
        
self.netParams = tf.trainable_variables()[numActorParams:]

self.target_netParams = tf.trainable_variables()[len(self.netParams) + numActorParams:]

> **The above method is not good, b/c it requires the number of trainable variables in each networks(actor, target_actor, critic, target_critic). Thus, the below method is better !!! 
In this way, it just requires the scope of each network to classify.**

self.netParams = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'actor')

self.target_netParams = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'target_actor')

self.netParams = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
  
self.target_netParams = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_critic')
  

-------------------------------------------------------------------------
*