import inspect
import os
import time
from multiprocessing import Process
import gym
import numpy as np
import tensorflow as tf
import logz
#============================================================================================#
# Utilities
#============================================================================================#
def build_mlp(
input_placeholder,
output_size,
scope,
n_layers=2,
size=64,
activation=tf.tanh,
output_activation=None
):
#========================================================================================#
# ----------SECTION 3----------
# Network building
#
# Your code should make a feedforward neural network (also called a multilayer perceptron)
# with 'n_layers' hidden layers of size 'size' units.
#
# The output layer should have size 'output_size' and activation 'output_activation'.
#
# Hint: use tf.layers.dense
#========================================================================================#
with tf.variable_scope(scope):
# YOUR_CODE_HERE
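# Stack 'n_layers' fully connected hidden layers of 'size' units each,
# followed by a linear output layer of 'output_size' units with 'output_activation'.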
hidden_layer = input_placeholder
for i in range(n_layers):
hidden_layer = tf.layers.dense(hidden_layer, size, activation)
return tf.layers.dense(hidden_layer, output_size, output_activation)
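# Illustrative usage (shapes only, not executed here; names are examples):
#   obs_ph = tf.placeholder(tf.float32, [None, 4])
#   logits = build_mlp(obs_ph, 2, "example_policy")   # -> Tensor of shape [None, 2]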
def pathlength(path):
return len(path["reward"])
class MultiEnv:
def __init__(self, env_name: str, n_job: int):
"""
Create multiple environments.
:param env_name: environment name
:param n_job: the number of environments
"""
assert n_job > 0
self.envs = []
for i in range(n_job):
self.envs.append(gym.make(env_name))
def reset(self):
"""
Reset all environments.
:return: initial observations
"""
obs = []
for env in self.envs:
obs.append(env.reset())
return np.asarray(obs)
def step(self, actions):
"""
Step all environments.
:param actions: actions for environments
:return: (observations, rewards, dones)
"""
observations, rewards, dones = [], [], []
for i in range(len(self.envs)):
observation, reward, done, _ = self.envs[i].step(actions[i])
observations.append(observation)
rewards.append(reward)
dones.append(done)
return np.asarray(observations), np.asarray(rewards), np.asarray(dones)
def render(self):
"""
Render the first environment.
"""
self.envs[0].render()
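# Illustrative usage (not executed here):
#   envs = MultiEnv('CartPole-v0', 4)
#   obs = envs.reset()                                  # obs.shape == (4, 4) for CartPole-v0
#   obs, rew, done = envs.step(np.zeros(4, dtype=int))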
#============================================================================================#
# Policy Gradient
#============================================================================================#
def train_PG(exp_name='',
env_name='CartPole-v0',
n_iter=100,
gamma=1.0,
min_timesteps_per_batch=1000,
max_path_length=None,
learning_rate=5e-3,
reward_to_go=True,
animate=True,
logdir=None,
normalize_advantages=True,
nn_baseline=False,
seed=0,
n_job=1,
epoch=1,
gae_lambda=None,
# network arguments
n_layers=1,
size=32
):
start = time.time()
# Configure output directory for logging
logz.configure_output_dir(logdir)
# Log experimental parameters
args = inspect.getfullargspec(train_PG)[0]
locals_ = locals()
params = {k: locals_[k] if k in locals_ else None for k in args}
logz.save_params(params)
# Set random seeds
tf.set_random_seed(seed)
np.random.seed(seed)
# Make the gym environment
env = gym.make(env_name)
# Is this env continuous, or discrete?
discrete = isinstance(env.action_space, gym.spaces.Discrete)
# Maximum length for episodes
max_path_length = max_path_length or env.spec.max_episode_steps
#========================================================================================#
# Notes on notation:
#
# Symbolic variables have the prefix sy_, to distinguish them from the numerical values
# that are computed later in the function
#
# Prefixes and suffixes:
# ob - observation
# ac - action
# _no - this tensor should have shape (batch size /n/, observation dim)
# _na - this tensor should have shape (batch size /n/, action dim)
# _n - this tensor should have shape (batch size /n/)
#
# Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
# is None
#========================================================================================#
# Observation and action sizes
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
#========================================================================================#
# ----------SECTION 4----------
# Placeholders
#
# Need these for batch observations / actions / advantages in the policy gradient loss function.
#========================================================================================#
sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
if discrete:
sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
else:
sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)
# Define a placeholder for advantages
sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
#========================================================================================#
# ----------SECTION 4----------
# Networks
#
# Make symbolic operations for
# 1. Policy network outputs which describe the policy distribution.
# a. For the discrete case, just logits for each action.
#
# b. For the continuous case, the mean / log std of a Gaussian distribution over
# actions.
#
# Hint: use the 'build_mlp' function you defined in utilities.
#
# Note: these ops should be functions of the placeholder 'sy_ob_no'
#
# 2. Producing samples stochastically from the policy distribution.
# a. For the discrete case, an op that takes in logits and produces actions.
#
# Should have shape [None]
#
# b. For the continuous case, use the reparameterization trick:
# The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
#
# mu + sigma * z, z ~ N(0, I)
#
# This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
#
# Should have shape [None, ac_dim]
#
# Note: these ops should be functions of the policy network output ops.
#
# 3. Computing the log probability of a set of actions that were actually taken,
# according to the policy.
#
# Note: these ops should be functions of the placeholder 'sy_ac_na', and the
# policy network output ops.
#
#========================================================================================#
if discrete:
# YOUR_CODE_HERE
sy_logits_na = build_mlp(sy_ob_no, ac_dim, "nn_policy", n_layers, size)
# Hint: Use the tf.multinomial op
sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), 1)
sy_logprob_na = tf.nn.log_softmax(sy_logits_na)
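# Pair each row index with the action taken in that row so tf.gather_nd can pick out
# the log-probability of the chosen action for every timestep in the batch.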
sy_index_n = tf.stack([tf.range(tf.shape(sy_logits_na)[0]), sy_ac_na], 1)
sy_logprob_n = tf.gather_nd(sy_logprob_na, sy_index_n)
else:
# YOUR_CODE_HERE
sy_mu_na = build_mlp(sy_ob_no, ac_dim, "nn_mu", n_layers, size)
# logstd should just be a trainable variable, not a network output.
sy_logstd = tf.get_variable("nn_logstd", shape=[1,ac_dim], initializer=tf.zeros_initializer())
norm_dist = tf.distributions.Normal(sy_mu_na, tf.exp(sy_logstd))
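# Normal.sample() draws mu + sigma * z with z ~ N(0, I), i.e. the reparameterization
# trick described in the hint above.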
sy_sampled_ac = norm_dist.sample()
# Hint: Use the log probability under a multivariate gaussian.
sy_logprob_n = tf.reduce_sum(norm_dist.log_prob(sy_ac_na), 1)
#========================================================================================#
# ----------SECTION 4----------
# Loss Function and Training Operation
#========================================================================================#
# Loss function that we'll differentiate to get the policy gradient.
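# The negative sign turns Adam's minimization into gradient ascent on E[log pi(a_t|s_t) * adv_t].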
loss = -tf.reduce_sum(sy_logprob_n * sy_adv_n)
update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
#========================================================================================#
# ----------SECTION 5----------
# Optional Baseline
#========================================================================================#
if nn_baseline or gae_lambda:
baseline_prediction = tf.squeeze(build_mlp(
sy_ob_no,
1,
"nn_baseline",
n_layers=n_layers,
size=size))
# Define a placeholder for targets, a loss function, and an update op for fitting the
# neural network baseline to the Q-values computed later in the training loop.
# YOUR_CODE_HERE
baseline_target = tf.placeholder(tf.float32, [None])
baseline_loss = tf.losses.mean_squared_error(baseline_target, baseline_prediction)
baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)
#========================================================================================#
# Tensorflow Engineering: Config, Session, Variable initialization
#========================================================================================#
tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
sess = tf.Session(config=tf_config)
sess.__enter__() # equivalent to `with sess:`
tf.global_variables_initializer().run() #pylint: disable=E1101
#========================================================================================#
# Training Loop
#========================================================================================#
total_timesteps = 0
# Create environments
envs = MultiEnv(env_name, n_job)
for itr in range(n_iter):
print("********** Iteration %i ************"%itr)
# Start timer for sample timing
time_start = time.time()
# Collect paths until we have enough timesteps
timesteps_this_batch = 0
paths = []
while True:
animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
observations = envs.reset()
paths_done = [False] * n_job
paths_observations = [[] for _ in range(n_job)]
paths_actions = [[] for _ in range(n_job)]
paths_rewards = [[] for _ in range(n_job)]
steps = 0
while True:
if animate_this_episode:
envs.render()
time.sleep(0.05)
# Append observations
for i in range(n_job):
if not paths_done[i]:
paths_observations[i].append(observations[i])
# Get actions from current policy
actions = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: observations})
for i in range(n_job):
if not paths_done[i]:
paths_actions[i].append(actions[i])
# Run step
observations, rewards, path_done_next = envs.step(actions)
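# Environments that have already finished are still stepped (to keep the batch shape
# fixed), but their observations, actions, and rewards are masked out via paths_done.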
# Append rewards
for i in range(n_job):
if not paths_done[i]:
paths_rewards[i].append(rewards[i])
steps += 1
paths_done = path_done_next
if np.all(paths_done) or steps > max_path_length:
break
# Append paths
for i in range(n_job):
path = {"observation": np.array(paths_observations[i]),
"reward": np.array(paths_rewards[i]),
"action": np.array(paths_actions[i])}
paths.append(path)
timesteps_this_batch += pathlength(path)
if timesteps_this_batch > min_timesteps_per_batch:
break
total_timesteps += timesteps_this_batch
# Get sample time
time_used = time.time() - time_start
# Build arrays for observation, action for the policy gradient update by concatenating
# across paths
ob_no = np.concatenate([path["observation"] for path in paths])
ac_na = np.concatenate([path["action"] for path in paths])
#====================================================================================#
# ----------SECTION 4----------
# Computing Q-values
#
# Your code should construct numpy arrays for Q-values which will be used to compute
# advantages (which will in turn be fed to the placeholder you defined above).
#
# Recall that the expression for the policy gradient PG is
#
# PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
#
# where
#
# tau=(s_0, a_0, ...) is a trajectory,
# Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
# and b_t is a baseline which may depend on s_t.
#
# You will write code for two cases, controlled by the flag 'reward_to_go':
#
# Case 1: trajectory-based PG
#
# (reward_to_go = False)
#
# Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
# the entire trajectory (regardless of which time step the Q-value is for).
#
# For this case, the policy gradient estimator is
#
# E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
#
# where
#
# Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
#
# Thus, you should compute
#
# Q_t = Ret(tau)
#
# Case 2: reward-to-go PG
#
# (reward_to_go = True)
#
# Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
# from time step t. Thus, you should compute
#
# Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
#
#
# Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
# like the 'ob_no' and 'ac_na' above.
#
#====================================================================================#
# YOUR_CODE_HERE
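# For reward-to-go, Q_t = sum_{t'=t}^T gamma^(t'-t) r_{t'} is built below by summing
# left-shifted copies of the reward vector, each scaled by an extra factor of gamma.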
q_n = np.asanyarray([])
for path in paths:
reward = path['reward']
length = len(reward)
if not reward_to_go:
q_path = np.sum(reward * np.logspace(0, length-1, length, base=gamma))
q_n = np.append(q_n, np.ones_like(reward) * q_path)
else:
q_path = np.zeros_like(reward)
# Accumulate reward from right to left
temp = reward.copy()
for t in range(length):
q_path += np.pad(temp[t:], (0,t), 'constant')
temp *= gamma
q_n = np.append(q_n, q_path)
#====================================================================================#
# ----------SECTION 5----------
# Computing Baselines
#====================================================================================#
if nn_baseline or gae_lambda:
# If nn_baseline is True, use your neural network to predict reward-to-go
# at each timestep for each trajectory, and save the result in a variable 'b_n'
# like 'ob_no', 'ac_na', and 'q_n'.
#
# Hint #bl1: rescale the output from the nn_baseline to match the statistics
# (mean and std) of the current or previous batch of Q-values. (Goes with Hint
# #bl2 below.)
b_n = sess.run(baseline_prediction, {sy_ob_no: ob_no})
# Rescale to normal distribution
b_std = np.std(b_n)
b_mean = np.mean(b_n)
b_n = (b_n - b_mean) / b_std
# Rescale to Q-value distribution
q_std = np.std(q_n)
q_mean = np.mean(q_n)
b_n = q_mean + b_n * q_std
if gae_lambda: # Generalized advantage estimator
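# GAE: A_t = sum_{l=0}^{T-t} (gamma * lambda)^l * delta_{t+l},
# where delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and V(s_{T+1}) is taken to be 0.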
adv_n = np.zeros_like(q_n)
index_start = 0
for path in paths:
reward = path['reward']
length = len(reward)
index_end = index_start + length
path_v = b_n[index_start:index_end]
path_v_next = b_n[index_start+1:index_end]
path_v_next = np.append(path_v_next, 0)
delta = reward + gamma * path_v_next - path_v
# Accumulate critic from right to left
temp = delta.copy()
for t in range(length):
adv_n[index_start:index_end] += np.pad(temp[t:], (0, t), 'constant')
temp *= gamma * gae_lambda
index_start = index_end
else: # Baseline estimator
adv_n = q_n - b_n
else:
adv_n = q_n.copy()
#====================================================================================#
# ----------SECTION 4----------
# Advantage Normalization
#====================================================================================#
if normalize_advantages:
# On the next line, implement a trick which is known empirically to reduce variance
# in policy gradient methods: normalize adv_n to have mean zero and std=1.
# YOUR_CODE_HERE
adv_std = np.std(adv_n) + 1e-5
adv_mean = np.mean(adv_n)
adv_n = (adv_n - adv_mean) / adv_std
#====================================================================================#
# ----------SECTION 5----------
# Optimizing Neural Network Baseline
#====================================================================================#
if nn_baseline or gae_lambda:
# ----------SECTION 5----------
# If a neural network baseline is used, set up the targets and the inputs for the
# baseline.
#
# Fit it to the current batch in order to use for the next iteration. Use the
# baseline_update_op you defined earlier.
#
# Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
# targets to have mean zero and std=1. (Goes with Hint #bl1 above.)
# YOUR_CODE_HERE
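# Standardize the Q-value targets with the batch statistics (q_mean, q_std) computed
# in the baseline section above, per Hint #bl2.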
target_n = (q_n - q_mean) / q_std
feed_dict = {sy_ob_no: ob_no, baseline_target: target_n}
for i in range(epoch):
sess.run(baseline_update_op, feed_dict)
#====================================================================================#
# ----------SECTION 4----------
# Performing the Policy Update
#====================================================================================#
# Call the update operation necessary to perform the policy gradient update based on
# the current batch of rollouts.
#
# For debug purposes, you may wish to save the value of the loss function before
# and after an update, and then log them below.
# YOUR_CODE_HERE
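# adv_n is divided by the number of trajectories so that, with the reduce_sum loss above,
# the gradient is an average over paths rather than a sum.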
feed_dict = {
sy_ob_no: ob_no,
sy_ac_na: ac_na,
sy_adv_n: adv_n / len(paths)
}
# Save the loss function before the update
loss_before = sess.run(loss, feed_dict)
# Train
for i in range(epoch):
sess.run(update_op, feed_dict)
# Save the loss function after the update
loss_after = sess.run(loss, feed_dict)
# Log diagnostics
returns = [path["reward"].sum() for path in paths]
ep_lengths = [pathlength(path) for path in paths]
logz.log_tabular("Time", time.time() - start)
logz.log_tabular("Iteration", itr)
logz.log_tabular("AverageReturn", np.mean(returns))
logz.log_tabular("StdReturn", np.std(returns))
logz.log_tabular("MaxReturn", np.max(returns))
logz.log_tabular("MinReturn", np.min(returns))
logz.log_tabular("EpLenMean", np.mean(ep_lengths))
logz.log_tabular("EpLenStd", np.std(ep_lengths))
logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
logz.log_tabular("TimestepsSoFar", total_timesteps)
logz.log_tabular("LossBeforeUpdate", loss_before)
logz.log_tabular("LossAfterUpdate", loss_after)
logz.log_tabular("LossUpdated", loss_after - loss_before)
logz.log_tabular("SampleTime", time_used)
logz.dump_tabular()
logz.pickle_tf_vars()
def main():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('env_name', type=str)
parser.add_argument('--exp_name', type=str, default='vpg')
parser.add_argument('--render', action='store_true')
parser.add_argument('--discount', type=float, default=1.0)
parser.add_argument('--n_iter', '-n', type=int, default=100)
parser.add_argument('--batch_size', '-b', type=int, default=1000)
parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
parser.add_argument('--reward_to_go', '-rtg', action='store_true')
parser.add_argument('--dont_normalize_advantages', '-dna', action='store_true')
parser.add_argument('--nn_baseline', '-bl', action='store_true')
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--n_experiments', '-e', type=int, default=1)
parser.add_argument('--n_layers', '-l', type=int, default=1)
parser.add_argument('--size', '-s', type=int, default=32)
parser.add_argument('--n_job', '-j', type=int, default=1, help='Number of environments used to collect samples')
parser.add_argument('--gae_lambda', '-g', type=float, default=None, help='λ for generalized advantage estimation (default off)')
parser.add_argument('--epoch', type=int, default=1, help='Number of gradient steps per update for the policy and baseline networks (default 1)')
args = parser.parse_args()
if not(os.path.exists('data')):
os.makedirs('data')
logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join('data', logdir)
if not(os.path.exists(logdir)):
os.makedirs(logdir)
max_path_length = args.ep_len if args.ep_len > 0 else None
for e in range(args.n_experiments):
seed = args.seed + 10*e
print('Running experiment with seed %d'%seed)
def train_func():
train_PG(
exp_name=args.exp_name,
env_name=args.env_name,
n_iter=args.n_iter,
gamma=args.discount,
min_timesteps_per_batch=args.batch_size,
max_path_length=max_path_length,
learning_rate=args.learning_rate,
reward_to_go=args.reward_to_go,
animate=args.render,
logdir=os.path.join(logdir,'%d'%seed),
normalize_advantages=not(args.dont_normalize_advantages),
nn_baseline=args.nn_baseline,
seed=seed,
n_job=args.n_job,
epoch=args.epoch,
gae_lambda=args.gae_lambda,
n_layers=args.n_layers,
size=args.size
)
# Run each experiment in its own process, because TensorFlow does not handle
# repeated calls to train_PG in the same process (the default graph would be reused).
p = Process(target=train_func, args=tuple())
p.start()
p.join()
if __name__ == "__main__":
main()