In [None]:
import gym
import matplotlib.pyplot as plt
import numpy as np
import sys
import tensorflow as tf

In [None]:
class FfAgentContinuous(object):
    '''
    Feed-forward actor-critic agent with a diagonal Gaussian policy over
    continuous actions, built as a TensorFlow 1.x graph.

    Policy network: one 128-unit ReLU hidden layer feeding two linear heads.
        means  = 2*tanh(.)                          -> values in [-2, 2]
        stdevs = min(softplus(. + 7), 10) + 0.01    -> values in [0.01, 10.01]
    Critic network: one 128-unit ReLU hidden layer feeding a scalar linear
    output that regresses onto the discounted returns.

    Each instance creates its variables under its own uniquified variable
    scope, so several agents can be built in the same graph.
    '''
    def __init__(self, session, input_size, output_size, gamma=0.99):
        '''
        Builds the placeholders, both networks, the losses and the optimizers.

        session:     tf.Session used by trainSarBatches() and predict().
        input_size:  number of features in one observation.
        output_size: number of action dimensions.
        gamma:       discount factor; stored on the instance but not used by
                     any op built here.
        '''
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.gamma = gamma

        # tf.get_variable raises ValueError when a variable name already exists
        # in the current scope, so a second agent in the same graph (e.g. after
        # re-running the notebook cell) would fail with fixed root-level names.
        # default_name makes TensorFlow append _1, _2, ... per instance.
        with tf.variable_scope(None, default_name="ff_agent_continuous"):
            self.observations_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.input_size])
            # expected sum of discounted rewards
            self.esdr_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])
            self.v_s_ph  = tf.placeholder(dtype=tf.float32, shape=[None, 1]) # V(s)
            self.v_sp_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1]) # V(s'), not consumed by any op below
            self.r_ph    = tf.placeholder(dtype=tf.float32, shape=[None, 1]) # r_t+1, not consumed by any op below
            self.actions_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.output_size])

            # V(s) arrives through a placeholder, so the actor loss cannot
            # back-propagate into the critic.
            advantage = self.esdr_ph - self.v_s_ph
            mean_adv = tf.reduce_mean(advantage)
            # E[x^2] - E[x]^2 can come out slightly negative through rounding;
            # clamp it so the square root never yields NaN.
            variance_adv = tf.reduce_mean(advantage*advantage) - mean_adv*mean_adv
            stddev_adv = tf.sqrt(tf.maximum(variance_adv, 0.0))
            # Exposed for inspection only; the losses below use the raw advantage.
            self.adv_normalized = (advantage - mean_adv)/(stddev_adv + 1e-8)

            # Policy network
            W1p = tf.get_variable("w1p", [self.input_size, 128], initializer=tf.initializers.random_normal(stddev=0.01))
            b1p = tf.get_variable("b1p", [128], initializer=tf.initializers.random_normal(stddev=0.01))
            W2p_means = tf.get_variable("w2pmeans", [128, self.output_size], initializer=tf.initializers.random_normal(stddev=0.01)) # policy
            W2p_stdevs = tf.get_variable("w2pstdevs", [128, self.output_size], initializer=tf.initializers.random_normal(stddev=0.01)) # policy
            b2p_means = tf.get_variable("b2pmeans", [self.output_size], initializer=tf.initializers.random_normal(stddev=0.01))
            b2p_stdevs = tf.get_variable("b2pstdevs", [self.output_size], initializer=tf.initializers.random_normal(stddev=0.01))

            l1p = tf.nn.relu(tf.matmul(self.observations_ph, W1p) + b1p)
            # this will need to be changed to accommodate the range and character of action values
            l2p_means = tf.matmul(l1p, W2p_means) + b2p_means
            # Trying to start with a large standard deviation to encourage exploration early on.
            l2p_stdevs = tf.matmul(l1p, W2p_stdevs) + b2p_stdevs + 7
            l2_policy_means = 2*tf.nn.tanh(l2p_means)
            l2_policy_stdevs = tf.math.minimum(tf.nn.softplus(l2p_stdevs), 10) + 0.01

            # Critic network
            W1v = tf.get_variable("w1v", [self.input_size, 128], initializer=tf.initializers.random_normal(stddev=0.01))
            b1v = tf.get_variable("b1v", [128], initializer=tf.initializers.random_normal(stddev=0.01))
            W2v = tf.get_variable("w2v", [128, 1], initializer=tf.initializers.random_normal(stddev=0.01)) # value
            b2v = tf.get_variable("b2v", [1], initializer=tf.initializers.random_normal(stddev=0.01))

            l1v = tf.nn.relu(tf.matmul(self.observations_ph, W1v) + b1v)
            l2v = tf.matmul(l1v, W2v) + b2v

            # Squared distance of the taken action from the policy mean, in
            # units of the policy standard deviation.
            squared_z = tf.square((self.actions_ph - l2_policy_means)/(l2_policy_stdevs + 1e-8))
            # Small bonus that rewards wider policies.
            stdev_bonus = 0.01*tf.log(l2_policy_stdevs)

            # NOTE(review): both policy losses weight squared_z by the return /
            # advantage but leave out the log(stdev) term that a Gaussian
            # log-likelihood would contribute to that weighted part. Confirm
            # this surrogate is intentional before relying on the stdev head.
            self.reinforce_loss = tf.reduce_mean(self.esdr_ph*squared_z - stdev_bonus)
            self.reinforce_optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(self.reinforce_loss)

            self.actor_loss = tf.reduce_mean(advantage*squared_z - stdev_bonus)
            optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
            grads_and_vars = optimizer.compute_gradients(self.actor_loss)
            # Clip each gradient tensor to an L2 norm of 2.0. Variables the
            # actor loss does not depend on come back with a None gradient and
            # are passed through untouched.
            capped_grads = [(grad if grad is None else tf.clip_by_norm(grad, 2.0), var) for grad, var in grads_and_vars]
            self.actor_optimizer = optimizer.apply_gradients(capped_grads)

            # Mean squared error between the critic output and the discounted returns.
            self.critic_loss = tf.reduce_mean(
                tf.square(l2v - self.esdr_ph)
            )
            self.critic_optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(self.critic_loss)

            self.action_prediction_means = l2_policy_means
            self.action_prediction_stdevs = l2_policy_stdevs
            self.esdr_predictions = l2v

    def trainSarBatches(self, states, actions, discounted_rewards):
        '''
        Runs one actor update and one critic update on a batch.

        Expects inputs to be numpy arrays of shape:
            states = [batch_size, num_state_features]
            actions = [batch_size, num_available_actions]
            discounted_rewards = [batch_size, 1]

        The idea is that all episodes have been parsed through and shuffled into
        one big batch of training data.

        Returns (actor_loss, action_means, action_stdevs, value_predictions),
        all fetched in the same session.run call that applies both updates.
        '''
        # First pass: evaluate the critic so V(s) can be fed back in as a
        # constant baseline for the advantage.
        v_predictions = self.session.run(
            self.esdr_predictions,
            feed_dict={self.observations_ph: states}
        )

        optimize_feeds = {
            self.observations_ph: states,
            self.esdr_ph: discounted_rewards,
            self.v_s_ph: v_predictions,
            self.actions_ph: actions
        }

        optimize_fetches = [
            self.actor_loss,
            self.action_prediction_means,
            self.action_prediction_stdevs,
            self.esdr_predictions,
            self.actor_optimizer,
            self.critic_optimizer
        ]

        # The last two fetches are the training ops; their results carry no data.
        loss, action_prediction_means, action_prediction_stdevs, esdr_predictions, _1, _2 = self.session.run(optimize_fetches, feed_dict=optimize_feeds)
        return loss, action_prediction_means, action_prediction_stdevs, esdr_predictions

    def predict(self, state):
        '''
        Evaluates the policy and the critic for a single observation.

        Expects state to have the shape [num_state_features]

        Returns (action_means, action_stdevs, value_prediction). The state is
        wrapped into a batch of one before being fed, so each result keeps a
        leading batch axis of length 1.
        '''
        feeds = {
            self.observations_ph: np.array([state])
        }
        fetches = [
            self.action_prediction_means,
            self.action_prediction_stdevs,
            self.esdr_predictions
        ]
        action_prediction_means, action_prediction_stdevs, esdr_predictions = self.session.run(fetches, feed_dict=feeds)
        return action_prediction_means, action_prediction_stdevs, esdr_predictions

In [None]:
def prepSarData(states, actions, rewards, gamma=0.99):
    '''
    Turns one episode's time-aligned states, actions and rewards into
    row-shuffled numpy arrays for training.

    Each reward is replaced by the discounted return from its time step
    onwards: G_t = r_t + gamma * G_{t+1}, with the return after the last
    step taken as 0.

    Note the return order: (actions, states, discounted_returns). All three
    are indexed with the same random permutation of range(len(actions)), so
    row k of each output belongs to the same time step.
    '''
    # Walk the episode backwards so each return reuses the one after it.
    running_return = 0
    returns_backwards = []
    for reward in reversed(rewards):
        running_return = gamma*running_return + reward
        returns_backwards.append(running_return)
    # Restore chronological order and add a trailing axis of length 1.
    returns_column = np.array(returns_backwards[::-1])[:, np.newaxis]

    action_array = np.array(actions)
    state_array = np.array(states)

    order = list(range(len(action_array)))
    np.random.shuffle(order)

    return action_array[order], state_array[order], returns_column[order]

In [None]:
def accumulateData(env, agent, max_steps=1000, max_rollouts=200):
    '''
    Collects rollouts by sampling actions from the agent's Gaussian policy.

    env:          environment exposing reset() and step(action), where step
                  returns (next_state, reward, done, info).
    agent:        object whose predict(state) returns
                  (action_means, action_stdevs, value), each with a leading
                  batch axis of length 1.
    max_steps:    maximum number of steps per rollout.
    max_rollouts: number of rollouts to collect.

    Returns (states, actions, rewards): three lists with one entry per
    rollout. Within a rollout the three inner lists always have the same
    length; index t holds the state the action was chosen in, the action
    taken and the reward received for it. This holds whether the rollout
    ended on `done` or by reaching max_steps.
    '''
    states = []
    actions = []
    rewards = []
    for rollout_count in range(max_rollouts):
        ep_states = []
        ep_actions = []
        ep_rewards = []
        ep_state_t = env.reset()
        for t in range(max_steps):
            # One forward pass per step supplies both the mean and the stdev.
            action_means, action_stdevs, _value = agent.predict(ep_state_t)
            sampled_action = np.random.normal(loc=action_means[0], scale=action_stdevs[0])
            # Element-wise clamp to the [-2, 2] action range; works for any
            # number of action dimensions and always yields an ndarray.
            ep_action_t = np.clip(sampled_action, -2.0, 2.0)
            ep_state_tp1, ep_reward_tp1, done, _info = env.step(ep_action_t)

            # Record the state the action was taken in, so no trailing state
            # without a matching action is ever stored.
            ep_states.append(ep_state_t)
            ep_actions.append(ep_action_t)
            ep_rewards.append(ep_reward_tp1)
            if done:
                break
            ep_state_t = ep_state_tp1
        states.append(ep_states)
        actions.append(ep_actions)
        rewards.append(ep_rewards)
    return states, actions, rewards

In [None]:
def renderAgent(env, agent, max_steps=1000):
    '''
    Runs one rendered episode with actions sampled from the agent's policy.

    env:       environment exposing reset(), step(action) and render(), where
               step returns (next_state, reward, done, info).
    agent:     object whose predict(state) returns
               (action_means, action_stdevs, value), each with a leading
               batch axis of length 1.
    max_steps: upper bound on the number of steps taken if the environment
               never reports done.

    Prints the summed reward when the environment reports done, and returns
    the list of actions taken (one ndarray per step).
    '''
    state_t = env.reset()
    rewards = 0
    actions = []
    # Bounded by a local counter so the function neither depends on a
    # notebook-level variable nor can loop forever.
    for _step in range(max_steps):
        # One forward pass per step supplies both the mean and the stdev.
        action_means, action_stdevs, _value = agent.predict(state_t)
        sampled_action = np.random.normal(loc=action_means[0], scale=action_stdevs[0])
        # Element-wise clamp to the [-2, 2] action range.
        action_t = np.clip(sampled_action, -2.0, 2.0)
        actions.append(action_t)
        state_tp1, reward_tp1, done, _info = env.step(action_t)
        rewards += reward_tp1
        env.render()
        state_t = state_tp1
        if done:
            print("Rewards from rendering:", rewards)
            break
    return actions

In [None]:
# List every environment id in the installed gym registry, in sorted order.
env_ids = []
for spec in gym.envs.registry.all():
    env_ids.append(spec.id)
for env_id in sorted(env_ids):
    print(env_id)


In [None]:
# Create the Pendulum environment and a TF session, then size the agent from
# the environment's observation and action spaces.
pendulum = gym.make("Pendulum-v0")
session = tf.Session()

observation_space = pendulum.observation_space
action_space = pendulum.action_space
print(observation_space.shape)
print(action_space)

# One policy output per action dimension.
num_actions = len(action_space.high)
agent = FfAgentContinuous(
    session,
    observation_space.shape[0],
    num_actions,
)

# The agent's constructor creates the graph variables, so initialise them afterwards.
session.run(tf.global_variables_initializer())

In [None]:
# Training loop: collect rollouts with the current policy, train on randomly
# chosen episodes, and record the average episode reward and policy stdev.
average_rewards = []
average_stdevs = []
for i in range(10000):
    states, actions, rewards = accumulateData(pendulum, agent)
    states_pro = []
    actions_pro = []
    rewards_pro = []
    if i % 10 == 0 and i > 0:
        # Every 10th iteration: plot progress so far and render one episode.
        plt.figure()
        plt.plot(average_stdevs)
        plt.title("Average stdevs so far")
        plt.figure()
        plt.plot(average_rewards)
        plt.title("average rewards so far")

        plottable_actions = renderAgent(pendulum, agent)
        plt.figure()
        plt.scatter(range(len(plottable_actions)), plottable_actions)
        plt.title("Actions Taken in Rendered Environment")
        plt.show()
        # Three figures are opened per report; close all of them so they do
        # not pile up over thousands of iterations.
        plt.close("all")
    for j in range(len(actions)):
        # prepSarData takes (states, actions, rewards) and returns
        # (actions, states, discounted_returns), all shuffled together.
        ep_actions, ep_states, ep_returns = prepSarData(states[j], actions[j], rewards[j])
        states_pro.append(ep_states)
        actions_pro.append(ep_actions)
        rewards_pro.append(ep_returns)
    # Train on episodes drawn at random with replacement, five draws per
    # collected episode.
    for k in range(5*len(states_pro)):
        train_index = np.random.choice(len(states_pro))
        ret = agent.trainSarBatches(states_pro[train_index], actions_pro[train_index], rewards_pro[train_index])
        if np.isnan(ret[0]):
            print("Received nan loss, stopping training.")
            pendulum.close()
            sys.exit(-1)
    print(i)
    # ret holds the outputs of the last training batch of this iteration:
    # (actor loss, action means, action stdevs, value predictions).
    average_reward = np.average([sum(r) for r in rewards])
    print("average reward: ", average_reward, "stdevs:", np.average(np.squeeze(ret[2])), "losses:", np.average(np.squeeze(ret[0])))
    average_stdevs.append(np.average(np.squeeze(ret[2])))
    average_rewards.append(average_reward)

plt.figure()
plt.plot(average_rewards)
plt.show()
pendulum.close()

In [None]:
# Scratch lookup: print the documentation of the environment's close() method.
help(pendulum.close)

In [None]:
# Scratch lookup: print the documentation of tf.math.maximum.
help(tf.math.maximum)

In [None]:
# Scratch lookup: print the documentation of tf.sqrt.
help(tf.sqrt)