In [0]:
!pip install numpy
!pip install tensorflow
!pip install matplotlib
!pip install gym

In [0]:
#Adapted from https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/contents/8_Actor_Critic_Advantage/AC_CartPole.py
import numpy as np
import tensorflow as tf
import gym
import matplotlib.pyplot as plt

In [0]:
# A2C (Advantage Actor-Critic) agent for the CartPole
class Actor(object):
    """Policy network of the actor-critic pair.

    Maps one state (a [1, n_features] row) to a softmax distribution over
    ``n_actions`` discrete actions and is trained on the TD error supplied
    by the caller.
    """

    def __init__(self, sess, n_features, n_actions, lr=0.001):
        """Build the policy graph and its training op.

        Args:
            sess: TensorFlow session used for every ``run`` call of this object.
            n_features: Length of the state vector; the state placeholder is [1, n_features].
            n_actions: Number of output units of the softmax layer.
            lr: Learning rate passed to the Adam optimizer.
        """
        self.sess = sess

        # Graph inputs: a single state row, the action taken, and the TD error
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")

        with tf.variable_scope('Actor'):
            # Hidden layer: 20 ReLU units
            hidden = tf.layers.dense(
                inputs=self.s,
                units=20,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='l1'
            )

            # Output layer: one softmax probability per action
            self.acts_prob = tf.layers.dense(
                inputs=hidden,
                units=n_actions,
                activation=tf.nn.softmax,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='acts_prob'
            )

        with tf.variable_scope('exp_v'):
            # Log-probability of the action that was fed in, weighted by the TD error
            chosen_prob = self.acts_prob[0, self.a]
            log_prob = tf.log(chosen_prob)
            self.exp_v = tf.reduce_mean(log_prob * self.td_error)

        with tf.variable_scope('train'):
            # Adam only minimizes, so the objective is negated to maximize exp_v
            optimizer = tf.train.AdamOptimizer(lr)
            self.train_op = optimizer.minimize(-self.exp_v)

    def learn(self, s, a, td):
        """Run one training step of the policy.

        Args:
            s: State array; a leading axis is added before it is fed to the graph.
            a: Action fed to the ``act`` placeholder.
            td: TD error fed to the ``td_error`` placeholder.

        Returns:
            The value of ``exp_v`` fetched in the same ``run`` call as the training op.
        """
        batched_state = s[np.newaxis, :]  # add the leading axis expected by the placeholder
        fetches = [self.train_op, self.exp_v]
        feeds = {self.s: batched_state, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run(fetches, feeds)
        return exp_v

    def get_action(self, s):
        """Sample a discrete action from the actor's current policy for state ``s``."""
        batched_state = s[np.newaxis, :]  # add the leading axis expected by the placeholder
        probs = self.sess.run(self.acts_prob, {self.s: batched_state})
        candidates = np.arange(probs.shape[1])  # one index per action
        weights = probs.ravel()
        return np.random.choice(candidates, p=weights)


class Critic(object):
    """State-value network of the actor-critic pair.

    Estimates V(s) for a single state row and is trained on the squared
    one-step TD error ``r + gamma * V(s') - V(s)``.
    """

    def __init__(self, sess, n_features, lr=0.01, gamma=0.99):
        """Build the value graph and its training op.

        Args:
            sess: TensorFlow session used for every ``run`` call of this object.
            n_features: Length of the state vector; the state placeholder is [1, n_features].
            lr: Learning rate passed to the Adam optimizer.
            gamma: Discount factor applied to the value of the next state.
        """
        self.sess = sess
        self.gamma = gamma

        # Graph inputs: a single state row, the value of the next state, and the reward
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
        self.r = tf.placeholder(tf.float32, None, 'r')

        with tf.variable_scope('Critic'):
            # Hidden layer: 20 ReLU units
            hidden = tf.layers.dense(
                inputs=self.s,
                units=20,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='l1'
            )

            # Single linear output unit: the value estimate is unbounded
            self.v = tf.layers.dense(
                inputs=hidden,
                units=1,
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='V'
            )

        with tf.variable_scope('squared_TD_error'):
            # TD error = (r + gamma * V_next) - V_eval, where V_eval is the network output
            # and V_next arrives through a placeholder
            td_target = self.r + self.gamma * self.v_
            self.td_error = td_target - self.v
            self.loss = tf.square(self.td_error)

        with tf.variable_scope('train'):
            optimizer = tf.train.AdamOptimizer(lr)
            self.train_op = optimizer.minimize(self.loss)

    def learn(self, s, r, s_):
        """Run one training step of the value network.

        Args:
            s: Current state array; a leading axis is added before feeding.
            r: Reward fed to the ``r`` placeholder.
            s_: Next state array; a leading axis is added before feeding.

        Returns:
            The TD error fetched in the same ``run`` call as the training op.
        """
        current = s[np.newaxis, :]
        following = s_[np.newaxis, :]

        # First pass: evaluate the next state with the current weights to get the bootstrap value
        v_ = self.sess.run(self.v, {self.s: following})

        # Second pass: compute the TD error and apply the gradient step
        feeds = {self.s: current, self.v_: v_, self.r: r}
        td_error, _ = self.sess.run([self.td_error, self.train_op], feeds)
        return td_error

In [0]:
# Load CartPole from gym
env = gym.make('CartPole-v1')
# IMPORTANT NOTE: The CartPole environment allows up to 500 time steps. That is the maximum time of play
# This limit could be increased with env._max_episode_steps, but we use 500 for demonstration
MAX_STEPS = 500  # Score of an episode that lasts until the time limit
FAILURE_PENALTY = -100  # Reward substituted when the episode ends before the time limit
SOLVED_WINDOW = 10  # Number of most recent episodes averaged for the stopping test
SOLVED_MEAN_SCORE = 490  # Training stops once the windowed mean score exceeds this value

# Get size of state and action
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

EPISODES = 1000  # Max number of episodes
gamma = 0.9  # Discount factor
sess = tf.Session()  # Kept open on purpose: the following cells still use the trained actor

actor = Actor(sess, n_features=state_size, n_actions=action_size, lr=0.001)
critic = Critic(sess, n_features=state_size, lr=0.1, gamma=gamma)  # Critic learns faster than the actor

sess.run(tf.global_variables_initializer())  # Initialize the TF variables

scores, episodes = [], []  # To store values for plotting
break_flag = False  # To stop training when the agent has successfully learned

for e in range(EPISODES):
    if break_flag:
        break
    done = False
    score = 0
    state = env.reset()  # Set the initial state

    while not done:  # Iterate while the game has not finished
        # Get action for the current state and go one step in environment
        action = actor.get_action(state)
        next_state, reward, done, info = env.step(action)

        # If the episode ends before the time limit (score has not reached MAX_STEPS - 1 yet),
        # replace the reward of this last step with the failure penalty
        if done and score != MAX_STEPS - 1:
            reward = FAILURE_PENALTY
        score += reward
        # Train
        td_error = critic.learn(state, reward, next_state)  # Train the critic and obtain the TD error (= Advantage)
        actor.learn(state, action, td_error)  # Train the actor
        # Update state
        state = next_state
        if done:
            # Remove the penalty again so the stored score only counts the per-step rewards
            if score != MAX_STEPS:
                score -= FAILURE_PENALTY
            scores.append(score)
            episodes.append(e)
            # Output the results of the episode
            print("episode:", e, "  score:", score)

            # Stop if the mean score of the last SOLVED_WINDOW episodes is higher than SOLVED_MEAN_SCORE
            # (a negative-start slice already copes with fewer than SOLVED_WINDOW stored scores)
            if np.mean(scores[-SOLVED_WINDOW:]) > SOLVED_MEAN_SCORE:
                break_flag = True
# Output whether the agent learnt or not
# IMPORTANT REMARK: A2C MAY NOT LEARN, DUE TO THE HIGH VARIANCE CAUSED BY THE ROLLOUTS USED
if break_flag:
    print("Training finished successfully")
else:
    print("Training finished unsuccessfully")
# Plot the scores vs the episodes
plt.plot(episodes, scores)
plt.xlabel("Episode")
plt.ylabel("Score")
plt.show()

In [0]:
# Run the game once with the trained network and store the values of the position and angle of the cartpole
print("Obtaining data from trained network")
pos_vector = []
angle_vector = []
# Keep the observation flat: actor.get_action() adds the leading axis itself, so reshaping it
# to [1, state_size] here would feed a rank-3 array into the [1, n_features] placeholder
state = env.reset()
done = False
while not done:
    # CartPole observation: index 0 is the cart position, index 2 is the pole angle
    pos_vector.append(state[0])
    angle_vector.append(state[2])
    action = actor.get_action(state)  # the trained policy object is named `actor`
    next_state, reward, done, _ = env.step(action)
    state = next_state
T = len(pos_vector)  # Number of recorded frames
print("Data obtained")
# Animate the data: to observe what our network has learned
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import animation, patches
from IPython.display import HTML
print("Creating animation")
fig = plt.figure()
ax = plt.axes(xlim=(-4, 4), ylim=(-1, 3), aspect=1)
ax.set_xticks([])
ax.set_yticks([])

mag1 = plt.Rectangle((0,0), width= 1, height= 0.5, angle=0, fc='black', alpha=0)
line1, = ax.plot([], [], 'b', zorder=1,linewidth=3.0)
line2, = ax.plot([], [], 'r', zorder=1,linewidth=2.0)

def init():
    line1.set_data([], [])
    line2.set_data([], [])
    ax.add_patch(mag1)
    return mag1, line1, line2,

def animate(i):
    mag1.set_alpha(1)
    mag1.xy = (pos_vector[i],0)
    x0=pos_vector[i]+0.5
    y0=0.5
    x1=x0+np.cos(np.pi/2-angle_vector[i])
    y1=y0+np.sin(np.pi/2-angle_vector[i])
    line1.set_data([x0, x1], [y0, y1])
    line2.set_data([-2.4, 2.4], [0,0])
    return mag1, line1, line2, 

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=T, 
                               interval=25, blit=True)

HTML(anim.to_jshtml())