# Lab 3-1: REINFORCE
    In this lab, you need to implement a REINFORCE algorithm with Tensorflow and solve OpenAI Gym CartPole-v0

In [1]:
from cartpole_env import *

import numpy as np
import tensorflow as tf

from collections import namedtuple, deque

# Define the data structure of experience
Experience = namedtuple('Experience', 'state action reward next_state done')

## Implement ```discount``` function to compute discounted reward

In [2]:
def discount(rewards, gamma):
    '''
    param rewards: a rewards numpy array
    param gamma: discount factor
    '''
    discounted_rewards = np.zeros_like(rewards)
    
    # TODO: Calculate discounted rewards
    discounted_rewards = rewards
    s = 0
    for i in range(len(rewards)):
        s = s * gamma + rewards[i]
        discounted_rewards[i] = s

    return discounted_rewards

## Implement ```do_rollout``` function to collect rollout

In [3]:
def do_rollout(env, policy, render=False):
    '''
    Collect a rollout from env with policy
    
    param env: RL Environment
    param policy: a function parameterized by environment state, return a action
    return a list of (state, action, reward, next_state, done)
    '''
    # Initialize done as False
    done = False
    
    # Reset the environment and get the initial state
    state = env.reset()
    
    # Empty list
    rollout = []
    
    while not done:
        action = policy(state)
        next_state, reward, done, info = env.step(action)
        
        # Render the environment (slow)
        if render:
            env.render()
        
        rollout.append(Experience(state, action, reward, next_state, done))
        state = next_state
        
    return rollout

## Implement ```ReinforceAgent``` following ```TODO```

In [4]:
class ReinforceAgent(object):
    def __init__(self, sess, n_states, n_actions, n_hiddens, lr, gamma):
        '''
        param sess: tf session
        param n_states: dim of states
        param n_actions: dim of actions space
        param n_hiddens: dim of hidden state
        '''
        self.sess = sess
        self.n_states = n_states
        self.n_actions = n_actions
        self.n_hiddens = n_hiddens
        
        # Learning rate
        self.lr = lr
        
        # Discount factor
        self.gamma = gamma
       
        self.state = tf.placeholder(shape=[None, n_states], dtype=tf.float32)
        self.value = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        
        # TODO: Declare 1-st hidden layer
        # Define a fully-connected layer with:
        # input = self.state
        # n_units = self.n_hiddens
        # activation = relu
        # weight_initializer = random_normal(0.0, 0.1)
        # bias_initializer = constant (0.1)
        self.h = tf.layers.dense(
                inputs=self.state,
                units=n_hiddens,   
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1), 
                bias_initializer=tf.constant_initializer(0.1), 
                name='h')
        
        # Declare 1-st hidden layer
        # Define a fully-connected layer with:
        # input = 1-st hidden layer
        # n_units = self.n_actions
        # activation = relu
        # weight_initializer = random_normal(0.0, 0.1)
        # bias_initializer = constant (0.1)
        self.policy = tf.layers.dense(
                inputs=self.h,
                units=n_actions,    
                activation=tf.nn.softmax,  
                kernel_initializer=tf.random_normal_initializer(0., .1), 
                bias_initializer=tf.constant_initializer(0.1),
                name='policy')
        
        # TODO: negative log probability
        neg_log = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.action, logits=self.policy)
        
        # TODO: policy gradient loss function
        self.loss = tf.reduce_mean(neg_log * self.value)
        
        # Optimizer
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        self.train_op = self.optimizer.minimize(self.loss)
        
    def layer(self, inputs, n_input, n_units, activation=tf.nn.relu):
        weights = tf.Variable(
            tf.random_normal([n_input, n_units], mean=0.0, stddev=0.1), name="weights")
        bias = tf.Variable(tf.constant(0.1, shape=[n_units]))

        outputs = activation(tf.matmul(inputs, weights) + bias) 
        return outputs
    
    def act(self, s):
        '''
        param s: a np.ndarray with shape [n_batches, n_states]
        return a batch of actions with shape [n_batches,]
        '''
        # TODO: Softmax stochastic policy
        policy = sess.run(self.policy, feed_dict={self.state: s})
        action = np.random.choice(self.n_actions, 1, p=policy.reshape(-1))
        
        return action
    
    def train(self, rollout):
        '''
        param rollout: a list o
        '''
        states = np.array([np.asarray(e.state) for e in rollout ])
        actions = np.squeeze(np.array([ e.action for e in rollout ]))
        rewards = np.array([ e.reward for e in rollout ])
        discounted_rewards = discount(rewards, gamma=self.gamma)
        
        self.sess.run(self.train_op, feed_dict={self.state: states,
                                                self.action: actions,
                                                self.value: discounted_rewards})
   
        

In [5]:
LR = 0.001
GAMMA = 0.99

sess = tf.InteractiveSession()
env = CartpoleEnvironment()
agent = ReinforceAgent(sess=sess, 
                       n_states=env.observation_space.shape[0],
                       n_actions=env.action_space.n,
                       n_hiddens=20,
                       lr=LR,
                       gamma=GAMMA)
init = tf.global_variables_initializer()
sess.run(init)

[2017-10-17 01:22:09,693] Making new env: CartPole-v0


In [6]:
def policy(s):
    return agent.act([s])[0]

In [7]:
def calculate_episode_reward(rollout):
    rewards = [ e.reward for e in rollout ]
    return sum(rewards)

In [None]:
def eval_history_reward(history):
    arr = np.asarray(history)
    return arr.mean()

In [None]:
MAX_ITERATIONS = 100000

episode_reward = 0.0
history_episode_rewards = deque(maxlen=100)

plot_history_episode_rewards = []

for iter in range(MAX_ITERATIONS):
    rollout = do_rollout(env=env, policy=policy, render=False)
    agent.train(rollout=rollout)
    
    episode_reward = calculate_episode_reward(rollout)
    history_episode_rewards.append(episode_reward)
    plot_history_episode_rewards.append(episode_reward)
    mean_rewards = eval_history_reward(history_episode_rewards)
    
    print('Episode %d: Reward = %f, Mean reward (over %d episodes) = %f' % (iter, 
                                                                            episode_reward,
                                                                            len(history_episode_rewards),
                                                                            mean_rewards))
    if mean_rewards > 195.0:
        print('Pass')
        break

Episode 0: Reward = 13.000000, Mean reward (over 1 episodes) = 13.000000
Episode 1: Reward = -4.000000, Mean reward (over 2 episodes) = 4.500000
Episode 2: Reward = -4.000000, Mean reward (over 3 episodes) = 1.666667
Episode 3: Reward = -2.000000, Mean reward (over 4 episodes) = 0.750000
Episode 4: Reward = -10.000000, Mean reward (over 5 episodes) = -1.400000
Episode 5: Reward = 2.000000, Mean reward (over 6 episodes) = -0.833333
Episode 6: Reward = 53.000000, Mean reward (over 7 episodes) = 6.857143
Episode 7: Reward = -8.000000, Mean reward (over 8 episodes) = 5.000000
Episode 8: Reward = -2.000000, Mean reward (over 9 episodes) = 4.222222
Episode 9: Reward = -7.000000, Mean reward (over 10 episodes) = 3.100000
Episode 10: Reward = -2.000000, Mean reward (over 11 episodes) = 2.636364
Episode 11: Reward = 50.000000, Mean reward (over 12 episodes) = 6.583333
Episode 12: Reward = -5.000000, Mean reward (over 13 episodes) = 5.692308
Episode 13: Reward = -4.000000, Mean reward (over 14 e

Episode 114: Reward = 6.000000, Mean reward (over 100 episodes) = 0.200000
Episode 115: Reward = -5.000000, Mean reward (over 100 episodes) = 0.150000
Episode 116: Reward = 2.000000, Mean reward (over 100 episodes) = 0.080000
Episode 117: Reward = 26.000000, Mean reward (over 100 episodes) = 0.400000
Episode 118: Reward = 1.000000, Mean reward (over 100 episodes) = 0.500000
Episode 119: Reward = -4.000000, Mean reward (over 100 episodes) = 0.470000
Episode 120: Reward = -1.000000, Mean reward (over 100 episodes) = 0.390000
Episode 121: Reward = -4.000000, Mean reward (over 100 episodes) = 0.340000
Episode 122: Reward = -6.000000, Mean reward (over 100 episodes) = 0.210000
Episode 123: Reward = 2.000000, Mean reward (over 100 episodes) = 0.060000
Episode 124: Reward = -1.000000, Mean reward (over 100 episodes) = 0.080000
Episode 125: Reward = 5.000000, Mean reward (over 100 episodes) = 0.070000
Episode 126: Reward = -11.000000, Mean reward (over 100 episodes) = -0.030000
Episode 127: Re

Episode 233: Reward = -12.000000, Mean reward (over 100 episodes) = -2.430000
Episode 234: Reward = -6.000000, Mean reward (over 100 episodes) = -2.590000
Episode 235: Reward = 2.000000, Mean reward (over 100 episodes) = -2.610000
Episode 236: Reward = 1.000000, Mean reward (over 100 episodes) = -2.520000
Episode 237: Reward = -11.000000, Mean reward (over 100 episodes) = -2.560000
Episode 238: Reward = -2.000000, Mean reward (over 100 episodes) = -2.680000
Episode 239: Reward = -11.000000, Mean reward (over 100 episodes) = -2.710000
Episode 240: Reward = -10.000000, Mean reward (over 100 episodes) = -2.700000
Episode 241: Reward = -1.000000, Mean reward (over 100 episodes) = -2.690000
Episode 242: Reward = 6.000000, Mean reward (over 100 episodes) = -2.560000
Episode 243: Reward = -11.000000, Mean reward (over 100 episodes) = -2.620000
Episode 244: Reward = -6.000000, Mean reward (over 100 episodes) = -2.580000
Episode 245: Reward = -2.000000, Mean reward (over 100 episodes) = -2.7400

Episode 347: Reward = -1.000000, Mean reward (over 100 episodes) = -5.830000
Episode 348: Reward = 0.000000, Mean reward (over 100 episodes) = -5.940000
Episode 349: Reward = -10.000000, Mean reward (over 100 episodes) = -6.000000
Episode 350: Reward = 1.000000, Mean reward (over 100 episodes) = -5.980000
Episode 351: Reward = -7.000000, Mean reward (over 100 episodes) = -6.130000
Episode 352: Reward = 0.000000, Mean reward (over 100 episodes) = -6.130000
Episode 353: Reward = -11.000000, Mean reward (over 100 episodes) = -6.220000
Episode 354: Reward = -10.000000, Mean reward (over 100 episodes) = -6.270000
Episode 355: Reward = -11.000000, Mean reward (over 100 episodes) = -6.280000
Episode 356: Reward = -3.000000, Mean reward (over 100 episodes) = -6.250000
Episode 357: Reward = 0.000000, Mean reward (over 100 episodes) = -6.210000
Episode 358: Reward = -9.000000, Mean reward (over 100 episodes) = -6.290000
Episode 359: Reward = -11.000000, Mean reward (over 100 episodes) = -6.40000

Episode 453: Reward = -11.000000, Mean reward (over 100 episodes) = -10.070000
Episode 454: Reward = -8.000000, Mean reward (over 100 episodes) = -10.050000
Episode 455: Reward = -12.000000, Mean reward (over 100 episodes) = -10.060000
Episode 456: Reward = -13.000000, Mean reward (over 100 episodes) = -10.160000
Episode 457: Reward = -12.000000, Mean reward (over 100 episodes) = -10.280000
Episode 458: Reward = -12.000000, Mean reward (over 100 episodes) = -10.310000
Episode 459: Reward = -6.000000, Mean reward (over 100 episodes) = -10.260000
Episode 460: Reward = -11.000000, Mean reward (over 100 episodes) = -10.300000
Episode 461: Reward = -9.000000, Mean reward (over 100 episodes) = -10.290000
Episode 462: Reward = -11.000000, Mean reward (over 100 episodes) = -10.310000
Episode 463: Reward = 4.000000, Mean reward (over 100 episodes) = -10.170000
Episode 464: Reward = -12.000000, Mean reward (over 100 episodes) = -10.160000
Episode 465: Reward = -6.000000, Mean reward (over 100 ep

Episode 557: Reward = -9.000000, Mean reward (over 100 episodes) = -10.580000
Episode 558: Reward = -12.000000, Mean reward (over 100 episodes) = -10.580000
Episode 559: Reward = -10.000000, Mean reward (over 100 episodes) = -10.620000
Episode 560: Reward = -9.000000, Mean reward (over 100 episodes) = -10.600000
Episode 561: Reward = -11.000000, Mean reward (over 100 episodes) = -10.620000
Episode 562: Reward = -11.000000, Mean reward (over 100 episodes) = -10.620000
Episode 563: Reward = -9.000000, Mean reward (over 100 episodes) = -10.750000
Episode 564: Reward = -13.000000, Mean reward (over 100 episodes) = -10.760000
Episode 565: Reward = -10.000000, Mean reward (over 100 episodes) = -10.800000
Episode 566: Reward = -11.000000, Mean reward (over 100 episodes) = -10.780000
Episode 567: Reward = -5.000000, Mean reward (over 100 episodes) = -10.740000
Episode 568: Reward = -12.000000, Mean reward (over 100 episodes) = -10.740000
Episode 569: Reward = -11.000000, Mean reward (over 100 

Episode 691: Reward = -9.000000, Mean reward (over 100 episodes) = -11.330000
Episode 692: Reward = -10.000000, Mean reward (over 100 episodes) = -11.330000
Episode 693: Reward = -12.000000, Mean reward (over 100 episodes) = -11.340000
Episode 694: Reward = -11.000000, Mean reward (over 100 episodes) = -11.360000
Episode 695: Reward = -7.000000, Mean reward (over 100 episodes) = -11.310000
Episode 696: Reward = -12.000000, Mean reward (over 100 episodes) = -11.320000
Episode 697: Reward = -13.000000, Mean reward (over 100 episodes) = -11.320000
Episode 698: Reward = -11.000000, Mean reward (over 100 episodes) = -11.310000
Episode 699: Reward = -12.000000, Mean reward (over 100 episodes) = -11.310000
Episode 700: Reward = -12.000000, Mean reward (over 100 episodes) = -11.320000
Episode 701: Reward = -12.000000, Mean reward (over 100 episodes) = -11.320000
Episode 702: Reward = -12.000000, Mean reward (over 100 episodes) = -11.330000
Episode 703: Reward = -12.000000, Mean reward (over 10

Episode 831: Reward = -12.000000, Mean reward (over 100 episodes) = -11.100000
Episode 832: Reward = -11.000000, Mean reward (over 100 episodes) = -11.180000
Episode 833: Reward = -12.000000, Mean reward (over 100 episodes) = -11.190000
Episode 834: Reward = -12.000000, Mean reward (over 100 episodes) = -11.200000
Episode 835: Reward = -11.000000, Mean reward (over 100 episodes) = -11.190000
Episode 836: Reward = -12.000000, Mean reward (over 100 episodes) = -11.190000
Episode 837: Reward = -11.000000, Mean reward (over 100 episodes) = -11.180000
Episode 838: Reward = -12.000000, Mean reward (over 100 episodes) = -11.180000
Episode 839: Reward = -11.000000, Mean reward (over 100 episodes) = -11.200000
Episode 840: Reward = -12.000000, Mean reward (over 100 episodes) = -11.210000
Episode 841: Reward = -11.000000, Mean reward (over 100 episodes) = -11.200000
Episode 842: Reward = -12.000000, Mean reward (over 100 episodes) = -11.230000
Episode 843: Reward = -11.000000, Mean reward (over 

In [None]:
import matplotlib.pyplot as plt

def plot(x, y, name):
    fig, ax = plt.subplots()
    ax.plot(x, y)
    
    ax.set(xlabel='Episode', ylabel='Reward', title=name)
    ax.grid()

    fig.savefig("%s.png" % name)
    plt.show()
    
plot(range(episode), plot_history_episode_rewards, 'REINFORCE')

In [None]:
while True:
    do_rollout(env=env, policy=policy, render=True)