In [1]:
import datetime
import gym
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tensorflow as tf

In [2]:
class FfAgentContinuous(object):
    def __init__(self, session, input_size, output_size, gamma=0.99, ppo_epsilon=0.2):
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.gamma = gamma
        self.ppo_epsilon = ppo_epsilon
        
        self.observations_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.input_size])
        # esdr = expected sum of discounted rewards
        self.esdr_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])
        self.v_s_ph  = tf.placeholder(dtype=tf.float32, shape=[None, 1]) # V(s)
        self.v_sp_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1]) # V(s')
        self.r_ph    = tf.placeholder(dtype=tf.float32, shape=[None, 1]) # r_t+1; r'
        self.actions_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.output_size])
        self.advantages_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])
        
#         advantage = self.r_ph + self.gamma*self.v_sp_ph - self.v_s_ph

        l2_policy_means, l2_policy_stdevs = self.buildActor("policy_new")
        l2_policy_means_prev, l2_policy_stdevs_prev = self.buildActor("policy_old", trainable=False)
            
        W1v = tf.get_variable("w1v", [self.input_size, 128], initializer=tf.initializers.orthogonal)
        b1v = tf.get_variable("b1v", [128], initializer=tf.initializers.random_normal(stddev=0.01))
        W2v = tf.get_variable("w2v", [128, 1], initializer=tf.initializers.orthogonal) # value
        b2v = tf.get_variable("b2v", [1], initializer=tf.initializers.random_normal(stddev=0.01))
        
        l1v = tf.nn.relu(tf.matmul(self.observations_ph, W1v) + b1v)
        l2v = tf.matmul(l1v, W2v) + b2v

        actor_ratio_numerator_log = -tf.log(l2_policy_stdevs) + -0.5*tf.square(
                (l2_policy_means - self.actions_ph)/l2_policy_stdevs
        )
        actor_ratio_denominator_log = -tf.log(l2_policy_stdevs_prev) + -0.5*tf.square(
                (l2_policy_means_prev - self.actions_ph)/l2_policy_stdevs_prev
        )
        
        actor_ratio = tf.exp(actor_ratio_numerator_log - actor_ratio_denominator_log)
        
        self.actor_loss = -1.0*tf.reduce_mean(
            tf.minimum(
                self.advantages_ph*actor_ratio,
                self.advantages_ph*tf.clip_by_value(
                    actor_ratio,
                    1 - self.ppo_epsilon,
                    1 + self.ppo_epsilon
                )
            )
        )
        self.actor_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(self.actor_loss)
        #optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        #grads_and_vars = optimizer.compute_gradients(self.actor_loss)
        #capped_grads = [(grad if grad is None else tf.clip_by_norm(grad, 2.0), var) for grad, var in grads_and_vars]
        #self.actor_optimizer = optimizer.apply_gradients(capped_grads)
        
        #self.critic_loss = tf.reduce_mean(
        #    tf.square(l2v - self.esdr_ph)
        #)
        self.critic_loss = tf.reduce_mean(
            tf.square(l2v - self.r_ph - self.gamma*self.v_sp_ph)
        )
        self.critic_optimizer = tf.train.AdamOptimizer(learning_rate=5e-4).minimize(self.critic_loss)
        
        self.action_prediction_means = l2_policy_means
        self.action_prediction_stdevs = l2_policy_stdevs
        self.esdr_predictions = l2v
        
        old_params = [v for v in tf.global_variables() if "policy_old" in v.name]
        new_params = [v for v in tf.global_variables() if "policy_new" in v.name]
        
        self.assignments = [op.assign(np) for op, np in zip(old_params, new_params)]
        
    def updatePrevActor(self):

        self.session.run(self.assignments)
        
    def buildActor(self, scope_name, reuse_scope=False, trainable=True):
        with tf.variable_scope(scope_name, reuse=reuse_scope):
            W1p = tf.get_variable(
                "w1p",
                [self.input_size, 128],
                #initializer=tf.initializers.random_normal(stddev=0.01),
                initializer=tf.initializers.orthogonal,
                trainable=trainable
            )
            b1p = tf.get_variable(
                "b1p",
                [128],
                initializer=tf.initializers.random_normal(stddev=0.01),
                trainable=trainable
            )
            W2p_means = tf.get_variable(
                "w2pmeans",
                [128, self.output_size],
                initializer=tf.initializers.orthogonal,
                trainable=trainable                
            )
            W2p_stdevs = tf.get_variable(
                "w2pstdevs",
                [128, self.output_size],
                initializer=tf.initializers.orthogonal,
                trainable=trainable
            )
            b2p_means = tf.get_variable(
                "b2pmeans",
                [self.output_size],
                initializer=tf.initializers.random_normal(stddev=0.01),
                trainable=trainable
            )
            b2p_stdevs = tf.get_variable(
                "b2pstdevs",
                [self.output_size],
                initializer=tf.initializers.random_normal(stddev=0.01),
                trainable=trainable
            )

            l1p = tf.nn.relu(tf.matmul(self.observations_ph, W1p) + b1p)
            l2p_means = tf.matmul(l1p, W2p_means) + b2p_means
            l2p_stdevs = tf.matmul(l1p, W2p_stdevs) + b2p_stdevs
            l2_policy_means = 2*tf.nn.tanh(l2p_means)

            l2_policy_stdevs = tf.maximum(tf.nn.softplus(l2p_stdevs), 1e-5)
            
            return l2_policy_means, l2_policy_stdevs
        
    # For advantage:
    #    Add single timestep reward samples
    #    Add placeholders for estimated V(s) and V(s')
    def trainSarBatches(self, states, actions, discounted_rewards, rewards, next_states, advantages=None):
        '''
        Expects inputs to be numpy arrays of shape:
            states = [batch_size, num_state_features]
            actions = [batch_size, num_available_actions]
            discounted_rewards = [batch_size, 1]
            next_states = [batch_size, num_state_features]
        
        The idea is that all episodes have been parsed through and shuffled into
        one big batch of training data.
        '''

        advantage_feeds = {
            self.observations_ph: states
        }

        advantage_fetches = self.esdr_predictions

        v_predictions = self.session.run(advantage_fetches, feed_dict=advantage_feeds)

        esdr_estimate_feeds = {
            self.observations_ph: next_states
        }

        v_sp_predictions = self.session.run(advantage_fetches, feed_dict=esdr_estimate_feeds)

        if advantages is None:
            advantages = rewards + gamma * v_sp_predictions - v_predictions

        optimize_feeds = {
            self.observations_ph: states,
            self.esdr_ph: discounted_rewards,
            self.v_s_ph: v_predictions,
            self.actions_ph: actions,
            self.v_sp_ph: v_sp_predictions,
            self.r_ph: rewards,
            self.advantages_ph: advantages
        }
        
        optimize_fetches = [
            self.actor_loss,
            self.action_prediction_means,
            self.action_prediction_stdevs,
            self.esdr_predictions,
            self.actor_optimizer,
            self.critic_optimizer
        ]
        
        loss, action_prediction_means, action_prediction_stdevs, esdr_predictions, _1, _2 = \
            self.session.run(optimize_fetches, feed_dict=optimize_feeds)
        
        return loss, action_prediction_means, action_prediction_stdevs, esdr_predictions

    def predict(self, state):
        '''
        Expects state to have the shape [num_state_features]
        '''
        
        feeds = {
            self.observations_ph: np.array([state])
        }

        fetches = [
            self.action_prediction_means,
            self.action_prediction_stdevs,
            self.esdr_predictions
        ]
        action_prediction_means, action_prediction_stdevs, esdr_predictions = self.session.run(fetches, feed_dict=feeds)
        return action_prediction_means, action_prediction_stdevs, esdr_predictions


In [3]:
def prepSarData(states, actions, rewards, values, gamma=0.99):
    '''
    Converts temporally synced lists of states, actions, and rewards into shuffled
    numpy matrices for training.
    '''
    
    next_states = states[1:]
    states = states[:-1]
    actions = actions[:-1]
    rewards = rewards[:-1]
    discounted_sum_rewards = 0
    discounted_rewards = []
    for i in range(len(rewards) - 1, -1, -1):
        discounted_sum_rewards = gamma*discounted_sum_rewards + rewards[i]
        discounted_rewards.append(discounted_sum_rewards)
    discounted_rewards = np.expand_dims(np.array(discounted_rewards[::-1]), axis=1)
    
    actions = np.array(actions)
    states = np.array(states)
    rewards = np.expand_dims(np.array(rewards), axis=1)
    next_states = np.array(next_states)
    values = np.array(values)
    #print("size of states:", len(states))
    #print("size of actions:", len(actions))
    #print("size of rewards:", len(rewards))
    #print("size of next states:", len(next_states))
    #print("size of values:", len(values))
    
    advantages = rewards + gamma*values[1:] - values[:-1]
    indices = [i for i in range(len(actions))]
    np.random.shuffle(indices)
    
    #print("shape of next states:", next_states.shape)
    
    actions_shuffled = actions[indices]
    states_shuffled = states[indices]
    discounted_rewards_shuffled = discounted_rewards[indices]
    rewards_shuffled = rewards[indices]
    next_states_shuffled = next_states[indices]
    advantages_shuffled = advantages[indices]
    
    return states_shuffled, actions_shuffled, discounted_rewards_shuffled, rewards_shuffled, next_states_shuffled, advantages_shuffled

In [4]:
def accumulateData(env, agent, max_steps=1000, max_rollouts=40):
    states = []
    actions = []
    rewards = []
    values = []
    for rollout_count in range(max_rollouts):
        ep_states = []
        ep_actions = []
        ep_rewards = []
        ep_values = []
        ep_state_t = env.reset()
        ep_states.append(ep_state_t)
        for t in range(max_steps):
            ret_vals = agent.predict(ep_state_t)
            ep_action_t = np.random.normal(loc=ret_vals[0][0], scale=ret_vals[1][0])
            ep_action_t = np.clip(ep_action_t, -2.0, 2.0)
            ep_state_tp1, ep_reward_tp1, done, _ = env.step(ep_action_t)

            ep_values.append(ret_vals[2][0])
            ep_actions.append(ep_action_t)
            ep_states.append(ep_state_tp1)
            ep_rewards.append(ep_reward_tp1)
            if done:
                ep_states.pop(-1)
                #ep_rewards.pop(-1)
                break
            ep_state_t = ep_state_tp1
        #print("states len:", len(ep_states))
        #print("actions len:", len(ep_actions))
        #print("rewards len:", len(ep_rewards))
        #print("values len:", len(ep_values))
        states.append(ep_states)
        actions.append(ep_actions)
        rewards.append(ep_rewards)
        values.append(ep_values)
    
    return states, actions, rewards, values

In [5]:
def renderAgent(env, agent, debug=False):
    state_t = env.reset()
    rewards = 0
    actions = []
    means = []
    stdevs = []
    iterator_variable = 0
    while iterator_variable < 1000:
        ret_vals = agent.predict(state_t)
        action_t = np.random.normal(loc=ret_vals[0][0], scale=ret_vals[1][0])
        #print(ep_action_t)
        action_t = min(max(action_t, [-2.0]), [2.0])
        actions.append(action_t)
        means.append(ret_vals[0][0])
        stdevs.append(ret_vals[1][0])
        #print(ep_action_t)
        state_tp1, reward_tp1, done, _ = env.step(action_t)
        rewards += reward_tp1
        env.render()
        state_t = state_tp1
        if done:
            print("Rewards from rendering:", rewards)
            break
    return actions, stdevs, means

In [6]:
pendulum = gym.make("Pendulum-v0")
session = tf.Session()
print(pendulum.observation_space.shape)
print(pendulum.action_space)
num_actions = len(pendulum.action_space.high)
agent = FfAgentContinuous(session, pendulum.observation_space.shape[0], num_actions)

session.run(tf.global_variables_initializer())
saver = tf.train.Saver()



(3,)
Box(-2.0, 2.0, (1,), float32)
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [7]:
# Optionally load a checkpoint and continue training.
#saver.restore(session, "./checkpoints/periodic_-1159.562864019951_2020-01-05-04-22-44.191582")

In [None]:
average_rewards = []
average_stdevs = []
last_saved_at = 0
for i in range(300):
    states, actions, rewards, values = accumulateData(pendulum, agent)
    states_pro = []
    actions_pro = []
    rewards_pro = []
    discounted_rewards_pro = []
    next_states_pro = []
    advantages_pro = []
    last_10_average_rewards = np.average(average_rewards[-10:])
    if (len(average_rewards) > 20) and (last_10_average_rewards >= -900) and (i - last_saved_at > 50) or ((i > 0) and average_rewards[-1] >= -400):
        print("Saving the model after finding last 10 average rewards of:", last_10_average_rewards)
        save_name = "holyfuckingshit_" + str(last_10_average_rewards) + "_" + str(datetime.datetime.today()).replace(":", "-").replace(" ", "-")
        save_dir = os.path.join("checkpoints", save_name)
        saver.save(session, save_dir)
        last_saved_at = i
    elif i % 1000 == 0 and i > 0:
        save_name = "periodic_" + str(last_10_average_rewards) + "_" + str(datetime.datetime.today()).replace(":", "-").replace(" ", "-")
        save_dir = os.path.join("checkpoints", save_name)
        saver.save(session, save_dir)
    if i % 20 == 0 and i > 0:
        plt.figure()
        plt.plot(average_stdevs)
        plt.title("Average stdevs so far")
        plt.figure()
        plt.plot(average_rewards)
        plt.title("average rewards so far")

        plottable_actions, plottable_stdevs, plottable_means = renderAgent(pendulum, agent)
        plt.figure()
        max_stddev = np.max(plottable_stdevs)

        plottable_stdevs = np.squeeze(np.array(plottable_stdevs))
        plt.errorbar(range(len(plottable_means)), plottable_means, plottable_stdevs/max_stddev, linestyle='None')
        plt.scatter(range(len(plottable_actions)), plottable_actions, color='y')
        plt.scatter(range(len(plottable_means)), plottable_means, color='r')
        plt.title("Actions Taken in Rendered Environment")
        plt.xlabel("max stddev:" + str(max_stddev))
        plt.show()
        plt.close('all')

    for j in range(len(actions)):
        ret = prepSarData(states[j], actions[j], rewards[j], values[j])
        states_pro.append(ret[0])
        actions_pro.append(ret[1])
        discounted_rewards_pro.append(ret[2])
        rewards_pro.append(ret[3])
        next_states_pro.append(ret[4])
        advantages_pro.append(ret[5])
        
        mean_reward = np.average(ret[1])
        stdev_reward = np.std(ret[1])

    print("num sgd loops:", 5*len(states_pro))
    for k in range(5*len(states_pro)):
        train_index = np.random.choice(a=range(len(states_pro)))
        #print("Shape of selected next states:", next_states_pro[train_index].shape)
        #print("Shape of selected states:", states_pro[train_index].shape)
        ret = agent.trainSarBatches(
            states_pro[train_index],
            actions_pro[train_index],
            discounted_rewards_pro[train_index],
            rewards_pro[train_index],
            next_states_pro[train_index],
            advantages_pro[train_index]
        )
        if np.isnan(ret[0]):
            print("Received nan loss, stopping training.")
            pendulum.close()
            sys.exit(-1)
    agent.updatePrevActor()
    print(i)
    average_reward = np.average([sum(r) for r in rewards])
    print(
        "average reward: ", average_reward,
        "stdevs:", np.average(np.squeeze(ret[2])),
        "losses:", np.average(np.squeeze(ret[0])),
        "advantages:", np.average(np.squeeze(advantages_pro))
    )
    average_stdevs.append(np.average(np.squeeze(ret[2])))
    average_rewards.append(average_reward)

plt.figure()
plt.plot(average_rewards)
plt.show()
#pendulum.close()

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

num sgd loops: 200
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signature
using advantages from function signatu

In [None]:
def renderAndPlot():
    plottable_actions, plottable_stdevs, plottable_means = renderAgent(pendulum, agent)
    plt.figure()
    max_stddev = np.max(plottable_stdevs)
    plt.errorbar(range(len(plottable_means)), plottable_means, plottable_stdevs/max_stddev, linestyle='None')
    plt.scatter(range(len(plottable_actions)), plottable_actions, color='y')
    plt.scatter(range(len(plottable_means)), plottable_means, color='r')
    plt.title("Actions Taken in Rendered Environment")
    plt.xlabel("max stddev:" + str(max_stddev))
    plt.show()
    plt.close('all')

In [None]:
renderAndPlot()