# Lab 3-2: Actor-Critic
In this lab, you need to implement an Actor-Critic algorithm with TensorFlow and solve OpenAI Gym CartPole-v0.

In [1]:
from cartpole_env import *

import numpy as np
import tensorflow as tf

from collections import namedtuple, deque

# One environment transition, stored as an immutable record with the fields
# state, action, reward, next_state and done.
Experience = namedtuple(typename='Experience',
                        field_names='state action reward next_state done')

## Implement ```discount``` function to compute discounted reward

In [2]:
def discount(rewards, gamma):
    '''
    Compute the discounted return for every time step of a reward sequence.

    The value at step t is rewards[t] + gamma * (value at step t + 1), with
    the value past the final step taken as 0.

    param rewards: a 1-D rewards numpy array (a plain list is accepted too)
    param gamma: discount factor
    return a floating point numpy array with the same length as rewards
    '''
    rewards = np.asarray(rewards)

    # Always accumulate in floating point. np.zeros_like on an integer reward
    # array would silently truncate every discounted value to an integer.
    # Floating point inputs keep their own dtype.
    if np.issubdtype(rewards.dtype, np.floating):
        out_dtype = rewards.dtype
    else:
        out_dtype = np.float64
    discounted_rewards = np.zeros(rewards.shape, dtype=out_dtype)

    # Walk backwards so each step reuses the already discounted future sum.
    discounted_sum = 0.0
    for t in reversed(range(len(rewards))):
        discounted_sum = discounted_sum * gamma + rewards[t]
        discounted_rewards[t] = discounted_sum

    return discounted_rewards

## Implement ```do_step``` function to collect step results

In [3]:
def do_step(env, policy):
    '''
    Collect a single step from env with policy

    param env: RL Environment; must provide current_state() and step(action)
    param policy: a function parameterized by environment state, return a action
    return a list with length 1 holding one Experience
           (state, action, reward, next_state, done)
    '''
    state = env.current_state()
    action = policy(state)
    # The fourth element returned by step() is not needed here.
    next_state, reward, done, _ = env.step(action)

    return [Experience(state, action, reward, next_state, done)]

## Implement ```ActorCriticAgent``` following ```TODO```

In [4]:
class ActorCriticAgent(object):
    '''
    One-step Actor-Critic agent built with the TensorFlow 1.x graph API.

    The actor is a softmax policy network and the critic is a state-value
    network. Both take the environment state as input and each one is trained
    by its own Adam optimizer.
    '''

    def __init__(self, sess, n_states, n_actions, n_hiddens, lr_a, lr_c, gamma):
        '''
        param sess: tf session
        param n_states: dim of states
        param n_actions: dim of actions space
        param n_hiddens: dim of hidden state
        param lr_a: learning rate of actor
        param lr_c: learning rate of critic
        param gamma: discount factor
        '''
        self.sess = sess
        self.n_states = n_states
        self.n_actions = n_actions

        # Learning rate
        self.lr_a = lr_a
        self.lr_c = lr_c

        # Discount factor
        self.gamma = gamma

        self.state = tf.placeholder(shape=[None, n_states], dtype=tf.float32)
        # Not used by the graph built below; kept so existing code that
        # references agent.value keeps working.
        self.value = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)

        # For value loss
        self.td_target = tf.placeholder(shape=[None], dtype=tf.float32)
        # For policy loss
        self.td_error_in = tf.placeholder(shape=[None], dtype=tf.float32)

        # Actor (using policy function): action probabilities [batch, n_actions]
        self.policy = self.policy_function(self.state, n_hiddens, n_actions)

        # Critic (using state-value function): values [batch, 1]
        self.state_value = self.state_value_function(self.state, n_hiddens)

        # TD-error. The critic output is flattened to [batch] first, because
        # subtracting a [batch, 1] tensor from the [batch] target would
        # broadcast to a [batch, batch] matrix.
        state_value_flat = tf.reshape(self.state_value, [-1])
        self.td_error_out = self.td_target - state_value_flat

        # State Value loss: mean squared TD-error. The critic regresses a
        # scalar, so a softmax cross-entropy is not a meaningful loss here.
        self.value_loss = tf.reduce_mean(tf.square(self.td_error_out))
        self.train_op_critic = tf.train.AdamOptimizer(learning_rate=self.lr_c).minimize(self.value_loss)

        # Policy loss: -log pi(a|s) weighted by the TD-error.
        # self.policy already holds softmax probabilities, so the log is taken
        # directly; feeding it to a *_with_logits op would apply softmax twice.
        action_one_hot = tf.one_hot(self.action, n_actions, dtype=tf.float32)
        action_prob = tf.reduce_sum(self.policy * action_one_hot, axis=1)
        # Clip to keep log() finite when a probability underflows to 0.
        neg_log_policy = -tf.log(tf.clip_by_value(action_prob, 1e-8, 1.0))
        self.policy_loss = tf.reduce_mean(self.td_error_in * neg_log_policy)
        self.train_op_actor = tf.train.AdamOptimizer(learning_rate=self.lr_a).minimize(self.policy_loss)

    def policy_function(self, states, n_hiddens, n_actions):
        '''
        Define policy function using Neural Network to implement

        input:
            @param states : input state
            @param n_hiddens : num of hidden units in neural network
            @param n_actions : dim of action space

        output:
            @return policy : the probability of choosing each action,
                             shape [n_batches, n_actions]

        '''

        h = tf.layers.dense(
            inputs=states,
            units=n_hiddens,
            activation=tf.nn.relu,
            kernel_initializer=tf.random_normal_initializer(0.0, 0.1),
            bias_initializer=tf.constant_initializer(0.1))

        policy = tf.layers.dense(
            inputs=h,
            units=n_actions,
            activation=tf.nn.softmax,
            kernel_initializer=tf.random_normal_initializer(0.0, 0.1),
            bias_initializer=tf.constant_initializer(0.1),
            name='action')

        return policy

    def state_value_function(self, states, n_hiddens):
        '''
        Define state-value function using Neural Network to implement

        input:
            @param states : input state
            @param n_hiddens : num of hidden unit

        output:
            @return value : value computed by state-value function,
                            shape [n_batches, 1]

        '''

        h = tf.layers.dense(
            inputs=states,
            units=n_hiddens,
            activation=tf.nn.relu,
            kernel_initializer=tf.random_normal_initializer(0.0, 0.1),
            bias_initializer=tf.constant_initializer(0.1))

        # Linear output: a ReLU here would forbid negative values and stops
        # passing gradient whenever the pre-activation drops below zero.
        value = tf.layers.dense(
            inputs=h,
            units=1,
            activation=None,
            kernel_initializer=tf.random_normal_initializer(0.0, 0.1),
            bias_initializer=tf.constant_initializer(0.1))

        return value

    def act(self, s):
        '''
        Sample one action per state from the softmax policy.

        @param  s: a np.ndarray with shape [n_batches, n_states]
        @return    a batch of actions with shape [n_batches,]
        '''
        probs = self.sess.run(self.policy, feed_dict={self.state: s})
        # Work in float64 and renormalise each row so np.random.choice does
        # not reject float32 rows whose sum is slightly off 1.
        probs = np.asarray(probs, dtype=np.float64).reshape(-1, self.n_actions)
        probs = probs / probs.sum(axis=1, keepdims=True)

        # Sample row by row so batches larger than one are supported.
        action = np.array(
            [np.random.choice(self.n_actions, p=row) for row in probs],
            dtype=np.int64)

        return action

    def estimate(self, s):
        '''
        Evaluate the critic on a batch of states.

        param s: a np.ndarray with shape [n_batches, n_states]
        return a batch of state values with shape [n_batches, 1]
        '''
        value = self.sess.run(self.state_value, feed_dict={self.state: s})

        return value

    def train(self, rollout):
        '''
        Run one critic update followed by one actor update.

        param rollout: a list of experience; each item provides state,
                       action, reward, next_state and done
        '''
        n_steps = len(rollout)
        states = np.array([np.asarray(e.state) for e in rollout])
        actions = np.reshape(np.array([e.action for e in rollout]), [n_steps,])
        rewards = np.reshape(np.array([e.reward for e in rollout]), [n_steps,])
        next_states = np.array([np.asarray(e.next_state) for e in rollout])
        # 1.0 for ordinary steps, 0.0 for steps that ended the episode.
        not_done = 1.0 - np.array([float(e.done) for e in rollout])

        value_s_next = self.estimate(next_states)
        value_s_next = np.reshape(value_s_next, [n_steps,])

        # TD Target. There is no future return after a terminal step, so the
        # bootstrapped value is masked out there.
        td_target = rewards + self.gamma * value_s_next * not_done

        # The TD-error is fetched in the same run as the critic update and
        # then fed to the actor as a constant weight.
        td_error, _ = self.sess.run([self.td_error_out, self.train_op_critic],
                                    feed_dict={self.state: states,
                                               self.td_target: td_target})
        td_error = td_error.reshape(-1)

        self.sess.run(self.train_op_actor, feed_dict={self.state: states,
                                                      self.action: actions,
                                                      self.td_error_in: td_error})


In [5]:
# Hyperparameters
LR_A = 0.001   # learning rate of actor
LR_C = 0.01    # learning rate of critic
GAMMA = 0.99   # discount factor

# Create the session and the environment, then build the agent graph on them.
sess = tf.InteractiveSession()
env = CartpoleEnvironment()
agent = ActorCriticAgent(
    sess,
    env.observation_space.shape[0],  # n_states
    env.action_space.n,              # n_actions
    20,                              # n_hiddens
    LR_A,
    LR_C,
    GAMMA)

# Variables have to be initialised once the whole graph has been built.
init = tf.global_variables_initializer()
sess.run(init)

InternalError: Failed to create session.

In [None]:
def policy(s):
    '''
    Choose an action for a single state with the global agent.

    The state is wrapped into a batch of one for agent.act and the first
    entry of the returned batch is handed back.
    '''
    single_state_batch = [s]
    sampled_actions = agent.act(single_state_batch)
    return sampled_actions[0]

In [None]:
def eval_history_reward(history):
    '''
    Return the mean of the episode rewards stored in history.

    param history: an iterable of rewards that np.asarray can convert
    '''
    return np.mean(np.asarray(history))

In [None]:
MAX_ITERATIONS = 500000
# Mean episode reward that has to be exceeded to count the task as passed.
PASS_MEAN_REWARD = 195.0

episode_reward = 0.0
# Sliding window holding the rewards of the most recent episodes.
history_episode_rewards = deque(maxlen=100)
episode = 0

plot_history_episode_rewards = []

env.reset()
# One environment step and one agent update per iteration. The loop variable
# is not called `iter`, which would shadow the builtin for later cells.
for iteration in range(MAX_ITERATIONS):
    rollout = do_step(env=env, policy=policy)
    agent.train(rollout=rollout)

    episode_reward += rollout[0].reward
    if rollout[0].done:
        history_episode_rewards.append(episode_reward)
        plot_history_episode_rewards.append(episode_reward)
        mean_rewards = eval_history_reward(history_episode_rewards)
        print('Episode %d: Reward = %f, Mean reward (over %d episodes) = %f' % (episode,
                                                                                episode_reward,
                                                                                len(history_episode_rewards),
                                                                                mean_rewards))
        env.reset()
        episode += 1
        episode_reward = 0.0

        # Only judge the mean once the window is full. Otherwise a single
        # lucky early episode would stop training right away.
        window_full = len(history_episode_rewards) == history_episode_rewards.maxlen
        if window_full and mean_rewards > PASS_MEAN_REWARD:
            print('Pass')
            break

## Plot the learning curve

In [None]:
import matplotlib.pyplot as plt

def plot(x, y, name):
    '''
    Draw y against x as a line chart, save it as "<name>.png" and show it.

    param x: values for the horizontal axis (labelled 'Episode')
    param y: values for the vertical axis (labelled 'Reward')
    param name: chart title, also used as the file name without extension
    '''
    fig, ax = plt.subplots()
    ax.plot(x, y)

    ax.set_xlabel('Episode')
    ax.set_ylabel('Reward')
    ax.set_title(name)
    ax.grid()

    fig.savefig("%s.png" % name)
    plt.show()
    
# Learning curve: one point per finished episode.
plot(x=range(episode),
     y=plot_history_episode_rewards,
     name='Actor-Critic')

## Play the trained model

In [None]:
# Number of episodes to render. A bounded loop lets "Run All" finish instead
# of spinning until the kernel is interrupted.
N_PLAY_EPISODES = 5

env.reset()
episodes_played = 0
while episodes_played < N_PLAY_EPISODES:
    # Only act and render here. This cell shows the trained model, so there
    # is no agent.train() call that would keep changing the weights.
    rollout = do_step(env=env, policy=policy)
    env.render()
    if rollout[0].done:
        episodes_played += 1
        env.reset()