In [1]:
import argparse
import os
import pprint as pp

import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tflearn
from gym import wrappers

from replay_buffer import ReplayBuffer

In [8]:
# Session options: turn on the GPU `allow_growth` flag, then open a session
# that uses those options.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)

In [30]:
class ActorNetwork(object):
    """
    Input to the network is the state, output is the action
    under a deterministic policy.
    The output layer activation is a tanh to keep the action
    between -action_bound and action_bound

    Two copies of the network are built: the online network (variable scope
    'actor'), which is trained, and a target network (variable scope
    'target'), whose weights follow the online weights through a
    tau-weighted update.
    """

    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau, batch_size):
        """
        Build the online/target actor graphs and the training ops.

        Args:
            sess: TensorFlow session used by every run call of this object.
            state_dim: size of the state vector fed to the network.
            action_dim: size of the action vector produced by the network.
            action_bound: factor the tanh output is multiplied by.
            learning_rate: learning rate handed to the Adam optimizer.
            tau: mixing weight of the online parameters in the target update.
            batch_size: divisor applied to the summed policy gradients.
        """
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau
        self.batch_size = batch_size

        # Actor Network
        with tf.variable_scope('actor'):
            self.inputs, self.out, self.scaled_out = self.create_actor_network()

        # NOTE: this takes *every* trainable variable currently in the graph,
        # so the actor has to be the first network built in a fresh graph.
        self.network_params = tf.trainable_variables()

        # Target Network
        with tf.variable_scope('target'):
            self.target_inputs, self.target_out, self.target_scaled_out = self.create_actor_network()

        # Everything created after the online parameters belongs to the target.
        self.target_network_params = tf.trainable_variables()[
            len(self.network_params):]

        # Op for periodically updating target network with online network
        # weights: target <- tau * online + (1 - tau) * target
        self.update_target_network_params = [
            target_var.assign(tf.multiply(online_var, self.tau) +
                              tf.multiply(target_var, 1. - self.tau))
            for online_var, target_var in zip(self.network_params,
                                              self.target_network_params)]

        # This gradient will be provided by the critic network
        self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim])

        # Combine the gradients here: back-propagate the negated critic
        # gradient through the actor, then divide by the batch size because
        # tf.gradients sums over the batch.
        self.unnormalized_actor_gradients = tf.gradients(
            self.scaled_out, self.network_params, -self.action_gradient)
        self.actor_gradients = [tf.div(grad, self.batch_size)
                                for grad in self.unnormalized_actor_gradients]

        # Optimization Op
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).\
            apply_gradients(zip(self.actor_gradients, self.network_params))

        # Callers use this count as an offset into tf.trainable_variables().
        self.num_trainable_vars = len(
            self.network_params) + len(self.target_network_params)

    def create_actor_network(self):
        """
        Build one state -> 16 -> 64 -> action_dim network.

        Only layers that feed the returned tensors are created, so every
        trainable variable made here receives a gradient from `scaled_out`.

        Returns:
            (inputs, out, scaled_out): the state placeholder, the tanh output
            and the tanh output multiplied by `action_bound`.
        """
        inputs = tf.placeholder(tf.float32, [None, self.s_dim])

        # Hidden layer 1: state -> 16, ReLU
        W1 = tf.get_variable("W1", [self.s_dim, 16],
                             initializer=tf.contrib.layers.xavier_initializer())
        b1 = tf.Variable(tf.random_normal([16]), name="b1")
        L1 = tf.nn.relu(tf.add(tf.matmul(inputs, W1), b1))

        # Hidden layer 2: 16 -> 64, ReLU
        W2 = tf.get_variable("W2", [16, 64],
                             initializer=tf.contrib.layers.xavier_initializer())
        b2 = tf.Variable(tf.random_normal([64]), name="b2")
        L2 = tf.nn.relu(tf.add(tf.matmul(L1, W2), b2))

        # Output layer: 64 -> action_dim, tanh
        W3 = tf.get_variable("W3", [64, self.a_dim],
                             initializer=tf.contrib.layers.xavier_initializer())
        b3 = tf.Variable(tf.random_normal([self.a_dim]), name="b3")
        out = tf.nn.tanh(tf.add(tf.matmul(L2, W3), b3))

        scaled_out = tf.multiply(out, self.action_bound)
        return inputs, out, scaled_out

    def train(self, inputs, a_gradient):
        """Run one optimizer step for `inputs` using the critic's action gradient."""
        self.sess.run(self.optimize, feed_dict={
            self.inputs: inputs,
            self.action_gradient: a_gradient
        })

    def predict(self, inputs):
        """Return the scaled action of the online network for `inputs`."""
        return self.sess.run(self.scaled_out, feed_dict={
            self.inputs: inputs
        })

    def predict_target(self, inputs):
        """Return the scaled action of the target network for `inputs`."""
        return self.sess.run(self.target_scaled_out, feed_dict={
            self.target_inputs: inputs
        })

    def update_target_network(self):
        """Apply the tau-weighted update to all target parameters."""
        self.sess.run(self.update_target_network_params)

    def get_num_trainable_vars(self):
        """Return the number of online plus target trainable variables."""
        return self.num_trainable_vars

In [31]:
class CriticNetwork(object):
    """
    Input to the network is the state and action, output is Q(s,a).
    The action must be obtained from the output of the Actor network.

    An online network (variable scope 'critic') and a target network
    (variable scope 'critic_target') are built; the target follows the
    online weights through a tau-weighted update.
    """

    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma, num_actor_vars):
        """
        Build the online/target critic graphs, the loss and the training ops.

        Args:
            sess: TensorFlow session used by every run call of this object.
            state_dim: size of the state vector.
            action_dim: size of the action vector.
            learning_rate: learning rate handed to the Adam optimizer.
            tau: mixing weight of the online parameters in the target update.
            gamma: stored on the instance; not used inside this class.
            num_actor_vars: number of trainable variables that precede the
                critic's in tf.trainable_variables() (used as a slice offset).
        """
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau
        self.gamma = gamma

        # Create the critic network
        with tf.variable_scope('critic'):
            self.inputs, self.action, self.out = self.create_critic_network()

        self.network_params = tf.trainable_variables()[num_actor_vars:]

        # Target Network, in its own scope like the other three networks
        with tf.variable_scope('critic_target'):
            self.target_inputs, self.target_action, self.target_out = self.create_critic_network()

        self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

        # Op for periodically updating target network with online network
        # weights: target <- tau * online + (1 - tau) * target
        self.update_target_network_params = [
            target_var.assign(tf.multiply(online_var, self.tau)
                              + tf.multiply(target_var, 1. - self.tau))
            for online_var, target_var in zip(self.network_params,
                                              self.target_network_params)]

        # Network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss and optimization Op
        self.loss = tflearn.mean_square(self.predicted_q_value, self.out)
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).minimize(self.loss)

        # Get the gradient of the net w.r.t. the action.
        # For each action in the minibatch (i.e., for each x in xs),
        # this will sum up the gradients of each critic output in the minibatch
        # w.r.t. that action. Each output is independent of all
        # actions except for one.
        self.action_grads = tf.gradients(self.out, self.action)

    def create_critic_network(self):
        """
        Build one (state, action) -> Q network.

        The state goes through a 16-unit ReLU layer; the action joins in the
        64-unit second hidden layer; a linear layer maps that to one output.

        Returns:
            (inputs, action, out): the state placeholder, the action
            placeholder and the Q-value tensor.
        """
        inputs = tf.placeholder(tf.float32, [None, self.s_dim])
        action = tf.placeholder(tf.float32, [None, self.a_dim])

        # Hidden layer 1: state -> 16, ReLU.
        # `name` must be a keyword: the second positional parameter of
        # tf.Variable is `trainable`, not the variable name.
        w1 = tf.get_variable("w1", [self.s_dim, 16],
                             initializer=tf.contrib.layers.xavier_initializer())
        b1 = tf.Variable(tf.random_normal([16]), name='b1')
        l1 = tf.nn.relu(tf.add(tf.matmul(inputs, w1), b1))

        # Add the action tensor in the 2nd hidden layer.
        # The state path and the action path each have their own weights and
        # bias; both biases are summed into the same pre-activation.
        w2 = tf.get_variable('w2', [16, 64],
                             initializer=tf.contrib.layers.xavier_initializer())
        b2 = tf.Variable(tf.random_normal([64]), name='b2')

        w2_ = tf.get_variable('w2_', [self.a_dim, 64],
                              initializer=tf.contrib.layers.xavier_initializer())
        b2_ = tf.Variable(tf.random_normal([64]), name='b2_')

        l2 = tf.nn.relu(
            tf.matmul(l1, w2) + b2 + tf.matmul(action, w2_) + b2_)

        # linear layer connected to 1 output representing Q(s,a)
        w3 = tf.get_variable('w3', [64, 1],
                             initializer=tf.contrib.layers.xavier_initializer())
        b3 = tf.Variable(tf.random_normal([1]), name='b3')
        out = tf.matmul(l2, w3) + b3
        return inputs, action, out

    def train(self, inputs, action, predicted_q_value):
        """Run one optimizer step; returns [Q output, optimizer op result]."""
        return self.sess.run([self.out, self.optimize], feed_dict={
            self.inputs: inputs,
            self.action: action,
            self.predicted_q_value: predicted_q_value
        })

    def predict(self, inputs, action):
        """Return the online network's Q output for the given states and actions."""
        return self.sess.run(self.out, feed_dict={
            self.inputs: inputs,
            self.action: action
        })

    def predict_target(self, inputs, action):
        """Return the target network's Q output for the given states and actions."""
        return self.sess.run(self.target_out, feed_dict={
            self.target_inputs: inputs,
            self.target_action: action
        })

    def action_gradients(self, inputs, actions):
        """Return the list produced by tf.gradients of Q w.r.t. the action input."""
        return self.sess.run(self.action_grads, feed_dict={
            self.inputs: inputs,
            self.action: actions
        })

    def update_target_network(self):
        """Apply the tau-weighted update to all target parameters."""
        self.sess.run(self.update_target_network_params)


In [32]:
class OrnsteinUhlenbeckActionNoise:
    """
    Stateful noise source whose samples depend on the previous sample.

    Each call advances the state by
        x <- x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
    with the normal draw taken from NumPy's global random generator and
    shaped like `mu`.
    """

    def __init__(self, mu, sigma=0.3, theta=.15, dt=1e-2, x0=None):
        """
        Args:
            mu: array the state is pulled towards; its shape sets the sample shape.
            sigma: scale of the random term.
            theta: strength of the pull towards `mu`.
            dt: step size used in both terms of the update.
            x0: state to start from (and return to on reset); zeros if None.
        """
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        """Advance the state one step and return the new sample."""
        previous = self.x_prev
        pull = self.theta * (self.mu - previous) * self.dt
        kick = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = previous + pull + kick
        return self.x_prev

    def reset(self):
        """Put the state back to `x0`, or to zeros shaped like `mu` when `x0` is None."""
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mu)
        else:
            self.x_prev = self.x0

    def __repr__(self):
        template = 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'
        return template.format(self.mu, self.sigma)

In [33]:
def build_summaries():
    """
    Create the scalar summaries for the per-episode statistics.

    One variable initialised to 0. is created per statistic ("Reward", then
    "Qmax Value") and registered as a scalar summary under that tag.

    Returns:
        (summary_ops, summary_vars): the result of tf.summary.merge_all() and
        the list [reward variable, Qmax variable].
    """
    tracked = []
    for tag in ("Reward", "Qmax Value"):
        holder = tf.Variable(0.)
        tf.summary.scalar(tag, holder)
        tracked.append(holder)

    merged = tf.summary.merge_all()
    return merged, tracked

In [34]:
# --- Experiment configuration ------------------------------------------------
# The seed is an integer so it can be handed to the seeding APIs below.
random_seed = 1234
env = gym.make('Pendulum-v0')

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high
# The actor multiplies a tanh output by action_bound, so the action space has
# to be symmetric around zero. array_equal keeps the check valid for action
# spaces with more than one dimension.
assert np.array_equal(env.action_space.high, -env.action_space.low)

# Hyper-parameters
actor_lr = 1e-3
critic_lr = 1e-2
tau = 1e-3
minibatch_size = 64
gamma = 0.99
buffer_size = 1000000

# Run length
max_episodes = 200
max_episodes_len = 1000
render_env = False

# --- Seeding -----------------------------------------------------------------
# NumPy's global generator drives the exploration noise; the environment has
# its own generator.
np.random.seed(random_seed)
env.seed(random_seed)

# --- Graph and session -------------------------------------------------------
tf.reset_default_graph()
# The graph-level seed must be set after the reset and before any op is built.
tf.set_random_seed(random_seed)

# Open the session with the GPU allow_growth option enabled.
session_config = tf.ConfigProto()
session_config.gpu_options.allow_growth = True
sess = tf.Session(config=session_config)

# The actor must be built first: the critic locates its own variables by
# skipping the actor's variable count in tf.trainable_variables().
actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                     actor_lr, tau, minibatch_size)

critic = CriticNetwork(sess, state_dim, action_dim,
                       critic_lr, tau, gamma, actor.get_num_trainable_vars())

actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

In [35]:


# Train the agent on `env`, then plot and save the per-episode statistics.
# TensorBoard summaries and gym Monitor recording are not wired up here.

sess.run(tf.global_variables_initializer())

# Initialize target network weights.
# NOTE: this runs the same tau-weighted update used during training, so the
# targets move towards the online weights rather than being copied from them.
actor.update_target_network()
critic.update_target_network()

# Initialize replay memory
replay_buffer = ReplayBuffer(buffer_size)

# Per-episode total reward and per-episode mean of the batch-max Q value.
R, Qmax = [], []
for i in range(max_episodes):

    s = env.reset()

    ep_reward = 0
    ep_ave_max_q = 0
    steps = 0

    for j in range(max_episodes_len):

        if render_env:
            env.render()

        # Added exploration noise
        a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

        s2, r, terminal, info = env.step(a[0])

        replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                          terminal, np.reshape(s2, (actor.s_dim,)))

        # Keep adding experience to the memory until
        # there are at least minibatch size samples
        if replay_buffer.size() > minibatch_size:
            s_batch, a_batch, r_batch, t_batch, s2_batch = \
                replay_buffer.sample_batch(minibatch_size)

            # Calculate targets: y = r for terminal transitions and
            # y = r + gamma * Q'(s2, mu'(s2)) otherwise. A new array is built
            # so the sampled rewards are not modified in place.
            # Assumes sample_batch returns array-likes with one entry per
            # transition - TODO confirm against replay_buffer.
            target_q = critic.predict_target(
                s2_batch, actor.predict_target(s2_batch))
            rewards = np.asarray(r_batch, dtype=np.float64).reshape(-1)
            not_terminal = 1.0 - np.asarray(t_batch, dtype=np.float64).reshape(-1)
            y_i = rewards + critic.gamma * not_terminal * np.reshape(target_q, -1)

            # Update the critic given the targets
            predicted_q_value, _ = critic.train(
                s_batch, a_batch, np.reshape(y_i, (-1, 1)))

            ep_ave_max_q += np.amax(predicted_q_value)

            # Update the actor policy using the sampled gradient
            a_outs = actor.predict(s_batch)
            grads = critic.action_gradients(s_batch, a_outs)
            actor.train(s_batch, grads[0])

            # Update target networks
            actor.update_target_network()
            critic.update_target_network()

        s = s2
        ep_reward += r
        # j is zero-based, so the number of steps taken so far is j + 1.
        steps = j + 1

        print('\rReward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \
                    i, (ep_ave_max_q / steps)), end='')
        if terminal:
            break

    # Record the episode whether it ended on `terminal` or on the step limit.
    print('\n')
    Qmax.append(ep_ave_max_q / steps if steps else 0.0)
    R.append(ep_reward)

# --- Results -------------------------------------------------------------
fig, (ax_reward, ax_qmax) = plt.subplots(2, 1)
ax_reward.plot(range(len(R)), R)
ax_reward.set_title("reward per episode")
ax_qmax.plot(range(len(Qmax)), Qmax)
ax_qmax.set_title("Qmax per episode")
fig.tight_layout()

# Save before showing so the written file contains the drawn figure, and make
# sure the output directory exists.
os.makedirs("./figs", exist_ok=True)
fig.savefig("./figs/results.png")
plt.show()



Reward: -1111 | Episode: 0 | Qmax: -1.8065

Reward: -1614 | Episode: 1 | Qmax: -0.6349

Reward: -1585 | Episode: 2 | Qmax: -0.7878

Reward: -838 | Episode: 3 | Qmax: -0.7609

KeyboardInterrupt: 