# Lab 3-1: REINFORCE
    In this lab, you need to implement a REINFORCE algorithm with Tensorflow and solve OpenAI Gym CartPole-v0

In [1]:
from cartpole_env import *

import numpy as np
import tensorflow as tf

from collections import namedtuple, deque

# Define the data structure of experience
Experience = namedtuple('Experience', 'state action reward next_state done')

## Implement ```discount``` function to compute discounted reward

In [2]:
def discount(rewards, gamma):
    '''
    param rewards: a rewards numpy array
    param gamma: discount factor
    '''
    discounted_rewards = np.zeros_like(rewards)
    
    # TODO: Calculate discounted rewards
    discounted_rewards = rewards
    s = 0
    for i in reversed(range(len(rewards))):
        #
        # reward = [1, 2, 3]
        # discounted_rewards = [
        #    1 + gamma * 2 + gamma * gamma * 3, 
        #    2 + gamma * 3,
        #    3 ]
        #
        
        s = s * gamma + rewards[i]
        discounted_rewards[i] = s

    return discounted_rewards

## Implement ```do_rollout``` function to collect rollout

In [3]:
def do_rollout(env, policy, render=False):
    '''
    Collect a rollout from env with policy
    
    param env: RL Environment
    param policy: a function parameterized by environment state, return a action
    return a list of (state, action, reward, next_state, done)
    '''
    # Initialize done as False
    done = False
    
    # Reset the environment and get the initial state
    state = env.reset()
    
    # Empty list
    rollout = []
    
    while not done:
        action = policy(state)
        next_state, reward, done, info = env.step(action)
        
        # Render the environment (slow)
        if render:
            env.render()
        
        rollout.append(Experience(state, action, reward, next_state, done))
        state = next_state
        
    return rollout

## Implement ```ReinforceAgent``` following ```TODO```

In [4]:
class ReinforceAgent(object):
    def __init__(self, sess, n_states, n_actions, n_hiddens, lr, gamma):
        '''
        param sess: tf session
        param n_states: dim of states
        param n_actions: dim of actions space
        param n_hiddens: dim of hidden state
        '''
        self.sess = sess
        self.n_states = n_states
        self.n_actions = n_actions
        self.n_hiddens = n_hiddens
        
        # Learning rate
        self.lr = lr
        
        # Discount factor
        self.gamma = gamma
       
        self.state = tf.placeholder(shape=[None, n_states], dtype=tf.float32)
        self.value = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        
        # TODO: Declare 1-st hidden layer
        # Define a fully-connected layer with:
        # input = self.state
        # n_units = self.n_hiddens
        # activation = relu
        # weight_initializer = random_normal(0.0, 0.1)
        # bias_initializer = constant (0.1)
        self.h = tf.layers.dense(
            inputs=self.state,
            units=n_hiddens,   
            activation=tf.nn.relu,
            kernel_initializer=tf.random_normal_initializer(0., .1), 
            bias_initializer=tf.constant_initializer(0.1), 
            name='h')
        
        # Declare 1-st hidden layer
        # Define a fully-connected layer with:
        # input = 1-st hidden layer
        # n_units = self.n_actions
        # activation = relu
        # weight_initializer = random_normal(0.0, 0.1)
        # bias_initializer = constant (0.1)
        self.policy = tf.layers.dense(
            inputs=self.h,
            units=n_actions,    
            activation=tf.nn.relu,  
            kernel_initializer=tf.random_normal_initializer(0., .1), 
            bias_initializer=tf.constant_initializer(0.1),
            name='policy')
        
        self.probs = tf.nn.softmax(self.policy)
        
        # TODO: negative log probability
        neg_log = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.action, logits=self.policy)  
        
        # TODO: policy gradient loss function
        self.loss = tf.reduce_mean(neg_log * self.value)
        
        # Optimizer
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        self.train_op = self.optimizer.minimize(self.loss)
        
    def layer(self, inputs, n_input, n_units, activation=tf.nn.relu):
        weights = tf.Variable(
            tf.random_normal([n_input, n_units], mean=0.0, stddev=0.1), name="weights")
        bias = tf.Variable(tf.constant(0.1, shape=[n_units]))
        outputs = activation(tf.matmul(inputs, weights) + bias) 
        
        return outputs
    
    def act(self, s):
        '''
        param s: a np.ndarray with shape [n_batches, n_states]
        return a batch of actions with shape [n_batches,]
        '''
        # TODO: Softmax stochastic policy
        probs = sess.run(self.probs, feed_dict={self.state: s})
        action = np.random.choice(self.n_actions, 1, p=probs[0])
        
        return action
    
    def train(self, rollout):
        '''
        param rollout: a list o
        '''
        states = np.array([np.asarray(e.state) for e in rollout ])
        actions = np.squeeze(np.array([ e.action for e in rollout ]))
        rewards = np.array([ e.reward for e in rollout ])
        discounted_rewards = discount(rewards, gamma=self.gamma)
        
        self.sess.run(self.train_op, feed_dict={self.state: states,
                                                self.action: actions,
                                                self.value: discounted_rewards})
   
        

In [5]:
LR = 0.001
GAMMA = 0.99

sess = tf.InteractiveSession()
env = CartpoleEnvironment()
agent = ReinforceAgent(sess=sess, 
                       n_states=env.observation_space.shape[0],
                       n_actions=env.action_space.n,
                       n_hiddens=20,
                       lr=LR,
                       gamma=GAMMA)
init = tf.global_variables_initializer()
sess.run(init)

[2017-10-19 13:25:46,278] Making new env: CartPole-v0


In [6]:
def policy(s):
    return agent.act([s])[0]

In [7]:
def calculate_episode_reward(rollout):
    rewards = [ e.reward for e in rollout ]
    return sum(rewards)

In [8]:
def eval_history_reward(history):
    arr = np.asarray(history)
    return arr.mean()

In [9]:
MAX_ITERATIONS = 100000

episode_reward = 0.0
history_episode_rewards = deque(maxlen=100)

plot_history_episode_rewards = []

episode = 0
for iter in range(MAX_ITERATIONS):
    episode = iter
    
    rollout = do_rollout(env=env, policy=policy, render=False)
    agent.train(rollout=rollout)
    
    episode_reward = calculate_episode_reward(rollout)
    history_episode_rewards.append(episode_reward)
    plot_history_episode_rewards.append(episode_reward)
    mean_rewards = eval_history_reward(history_episode_rewards)
    
    print('Episode %d: Reward = %f, Mean reward (over %d episodes) = %f' % (episode, 
                                                                            episode_reward,
                                                                            len(history_episode_rewards),
                                                                            mean_rewards))
    if mean_rewards > 195.0:
        print('Pass')
        break

Episode 0: Reward = -9.000000, Mean reward (over 1 episodes) = -9.000000
Episode 1: Reward = -10.000000, Mean reward (over 2 episodes) = -9.500000
Episode 2: Reward = 0.000000, Mean reward (over 3 episodes) = -6.333333
Episode 3: Reward = -6.000000, Mean reward (over 4 episodes) = -6.250000
Episode 4: Reward = 30.000000, Mean reward (over 5 episodes) = 1.000000
Episode 5: Reward = -8.000000, Mean reward (over 6 episodes) = -0.500000
Episode 6: Reward = 0.000000, Mean reward (over 7 episodes) = -0.428571
Episode 7: Reward = 0.000000, Mean reward (over 8 episodes) = -0.375000
Episode 8: Reward = 19.000000, Mean reward (over 9 episodes) = 1.777778
Episode 9: Reward = -8.000000, Mean reward (over 10 episodes) = 0.800000
Episode 10: Reward = -4.000000, Mean reward (over 11 episodes) = 0.363636
Episode 11: Reward = 0.000000, Mean reward (over 12 episodes) = 0.333333
Episode 12: Reward = -1.000000, Mean reward (over 13 episodes) = 0.230769
Episode 13: Reward = -2.000000, Mean reward (over 14 

Episode 123: Reward = -4.000000, Mean reward (over 100 episodes) = 4.690000
Episode 124: Reward = -8.000000, Mean reward (over 100 episodes) = 4.330000
Episode 125: Reward = 13.000000, Mean reward (over 100 episodes) = 4.490000
Episode 126: Reward = -6.000000, Mean reward (over 100 episodes) = 4.420000
Episode 127: Reward = -8.000000, Mean reward (over 100 episodes) = 4.430000
Episode 128: Reward = -13.000000, Mean reward (over 100 episodes) = 4.300000
Episode 129: Reward = -3.000000, Mean reward (over 100 episodes) = 4.260000
Episode 130: Reward = 5.000000, Mean reward (over 100 episodes) = 4.200000
Episode 131: Reward = 16.000000, Mean reward (over 100 episodes) = 4.090000
Episode 132: Reward = -12.000000, Mean reward (over 100 episodes) = 3.940000
Episode 133: Reward = 20.000000, Mean reward (over 100 episodes) = 4.140000
Episode 134: Reward = 4.000000, Mean reward (over 100 episodes) = 4.190000
Episode 135: Reward = 11.000000, Mean reward (over 100 episodes) = 4.300000
Episode 136:

Episode 236: Reward = -1.000000, Mean reward (over 100 episodes) = 8.940000
Episode 237: Reward = 34.000000, Mean reward (over 100 episodes) = 9.280000
Episode 238: Reward = 14.000000, Mean reward (over 100 episodes) = 9.470000
Episode 239: Reward = 2.000000, Mean reward (over 100 episodes) = 9.560000
Episode 240: Reward = 3.000000, Mean reward (over 100 episodes) = 9.680000
Episode 241: Reward = 90.000000, Mean reward (over 100 episodes) = 10.470000
Episode 242: Reward = 11.000000, Mean reward (over 100 episodes) = 10.460000
Episode 243: Reward = 47.000000, Mean reward (over 100 episodes) = 10.770000
Episode 244: Reward = 9.000000, Mean reward (over 100 episodes) = 10.920000
Episode 245: Reward = -2.000000, Mean reward (over 100 episodes) = 10.980000
Episode 246: Reward = 25.000000, Mean reward (over 100 episodes) = 11.030000
Episode 247: Reward = 23.000000, Mean reward (over 100 episodes) = 11.070000
Episode 248: Reward = 0.000000, Mean reward (over 100 episodes) = 10.950000
Episode 

Episode 348: Reward = 8.000000, Mean reward (over 100 episodes) = 14.880000
Episode 349: Reward = 92.000000, Mean reward (over 100 episodes) = 15.610000
Episode 350: Reward = 6.000000, Mean reward (over 100 episodes) = 15.130000
Episode 351: Reward = 20.000000, Mean reward (over 100 episodes) = 15.330000
Episode 352: Reward = 34.000000, Mean reward (over 100 episodes) = 15.360000
Episode 353: Reward = -8.000000, Mean reward (over 100 episodes) = 15.200000
Episode 354: Reward = 74.000000, Mean reward (over 100 episodes) = 15.910000
Episode 355: Reward = 58.000000, Mean reward (over 100 episodes) = 16.480000
Episode 356: Reward = 18.000000, Mean reward (over 100 episodes) = 16.710000
Episode 357: Reward = 30.000000, Mean reward (over 100 episodes) = 16.930000
Episode 358: Reward = 24.000000, Mean reward (over 100 episodes) = 17.150000
Episode 359: Reward = 12.000000, Mean reward (over 100 episodes) = 17.080000
Episode 360: Reward = 6.000000, Mean reward (over 100 episodes) = 17.110000
Ep

Episode 459: Reward = 58.000000, Mean reward (over 100 episodes) = 29.570000
Episode 460: Reward = 4.000000, Mean reward (over 100 episodes) = 29.550000
Episode 461: Reward = 10.000000, Mean reward (over 100 episodes) = 29.320000
Episode 462: Reward = -1.000000, Mean reward (over 100 episodes) = 28.890000
Episode 463: Reward = 18.000000, Mean reward (over 100 episodes) = 28.340000
Episode 464: Reward = 9.000000, Mean reward (over 100 episodes) = 27.620000
Episode 465: Reward = 70.000000, Mean reward (over 100 episodes) = 27.950000
Episode 466: Reward = 25.000000, Mean reward (over 100 episodes) = 28.120000
Episode 467: Reward = 38.000000, Mean reward (over 100 episodes) = 28.200000
Episode 468: Reward = 34.000000, Mean reward (over 100 episodes) = 28.290000
Episode 469: Reward = 58.000000, Mean reward (over 100 episodes) = 28.640000
Episode 470: Reward = 13.000000, Mean reward (over 100 episodes) = 28.750000
Episode 471: Reward = 20.000000, Mean reward (over 100 episodes) = 28.520000
E

Episode 570: Reward = 46.000000, Mean reward (over 100 episodes) = 50.500000
Episode 571: Reward = 93.000000, Mean reward (over 100 episodes) = 51.230000
Episode 572: Reward = -6.000000, Mean reward (over 100 episodes) = 51.080000
Episode 573: Reward = 43.000000, Mean reward (over 100 episodes) = 50.710000
Episode 574: Reward = 21.000000, Mean reward (over 100 episodes) = 49.600000
Episode 575: Reward = 51.000000, Mean reward (over 100 episodes) = 49.750000
Episode 576: Reward = 73.000000, Mean reward (over 100 episodes) = 50.290000
Episode 577: Reward = 115.000000, Mean reward (over 100 episodes) = 51.070000
Episode 578: Reward = 28.000000, Mean reward (over 100 episodes) = 50.380000
Episode 579: Reward = 113.000000, Mean reward (over 100 episodes) = 51.050000
Episode 580: Reward = -6.000000, Mean reward (over 100 episodes) = 50.710000
Episode 581: Reward = 85.000000, Mean reward (over 100 episodes) = 50.940000
Episode 582: Reward = 31.000000, Mean reward (over 100 episodes) = 51.1800

Episode 676: Reward = 105.000000, Mean reward (over 100 episodes) = 102.500000
Episode 677: Reward = 200.000000, Mean reward (over 100 episodes) = 103.350000
Episode 678: Reward = 166.000000, Mean reward (over 100 episodes) = 104.730000
Episode 679: Reward = 200.000000, Mean reward (over 100 episodes) = 105.600000
Episode 680: Reward = 200.000000, Mean reward (over 100 episodes) = 107.660000
Episode 681: Reward = 200.000000, Mean reward (over 100 episodes) = 108.810000
Episode 682: Reward = 64.000000, Mean reward (over 100 episodes) = 109.140000
Episode 683: Reward = 200.000000, Mean reward (over 100 episodes) = 110.470000
Episode 684: Reward = 131.000000, Mean reward (over 100 episodes) = 111.450000
Episode 685: Reward = 200.000000, Mean reward (over 100 episodes) = 112.740000
Episode 686: Reward = 200.000000, Mean reward (over 100 episodes) = 114.420000
Episode 687: Reward = 58.000000, Mean reward (over 100 episodes) = 114.810000
Episode 688: Reward = 174.000000, Mean reward (over 10

Episode 782: Reward = 200.000000, Mean reward (over 100 episodes) = 163.410000
Episode 783: Reward = 200.000000, Mean reward (over 100 episodes) = 163.410000
Episode 784: Reward = 108.000000, Mean reward (over 100 episodes) = 163.180000
Episode 785: Reward = 200.000000, Mean reward (over 100 episodes) = 163.180000
Episode 786: Reward = 200.000000, Mean reward (over 100 episodes) = 163.180000
Episode 787: Reward = 71.000000, Mean reward (over 100 episodes) = 163.310000
Episode 788: Reward = 200.000000, Mean reward (over 100 episodes) = 163.570000
Episode 789: Reward = 200.000000, Mean reward (over 100 episodes) = 164.370000
Episode 790: Reward = 200.000000, Mean reward (over 100 episodes) = 164.840000
Episode 791: Reward = 112.000000, Mean reward (over 100 episodes) = 163.960000
Episode 792: Reward = 118.000000, Mean reward (over 100 episodes) = 163.140000
Episode 793: Reward = 200.000000, Mean reward (over 100 episodes) = 164.870000
Episode 794: Reward = 200.000000, Mean reward (over 1

Episode 888: Reward = 200.000000, Mean reward (over 100 episodes) = 179.580000
Episode 889: Reward = 200.000000, Mean reward (over 100 episodes) = 179.580000
Episode 890: Reward = 200.000000, Mean reward (over 100 episodes) = 179.580000
Episode 891: Reward = 200.000000, Mean reward (over 100 episodes) = 180.460000
Episode 892: Reward = 200.000000, Mean reward (over 100 episodes) = 181.280000
Episode 893: Reward = 200.000000, Mean reward (over 100 episodes) = 181.280000
Episode 894: Reward = 200.000000, Mean reward (over 100 episodes) = 181.280000
Episode 895: Reward = 142.000000, Mean reward (over 100 episodes) = 181.330000
Episode 896: Reward = 154.000000, Mean reward (over 100 episodes) = 181.280000
Episode 897: Reward = 200.000000, Mean reward (over 100 episodes) = 181.280000
Episode 898: Reward = 200.000000, Mean reward (over 100 episodes) = 181.280000
Episode 899: Reward = 71.000000, Mean reward (over 100 episodes) = 179.990000
Episode 900: Reward = -1.000000, Mean reward (over 10

Episode 992: Reward = 168.000000, Mean reward (over 100 episodes) = 187.090000
Episode 993: Reward = 200.000000, Mean reward (over 100 episodes) = 187.090000
Episode 994: Reward = 200.000000, Mean reward (over 100 episodes) = 187.090000
Episode 995: Reward = 200.000000, Mean reward (over 100 episodes) = 187.670000
Episode 996: Reward = 200.000000, Mean reward (over 100 episodes) = 188.130000
Episode 997: Reward = 200.000000, Mean reward (over 100 episodes) = 188.130000
Episode 998: Reward = 200.000000, Mean reward (over 100 episodes) = 188.130000
Episode 999: Reward = 200.000000, Mean reward (over 100 episodes) = 189.420000
Episode 1000: Reward = 149.000000, Mean reward (over 100 episodes) = 190.920000
Episode 1001: Reward = 200.000000, Mean reward (over 100 episodes) = 190.920000
Episode 1002: Reward = 18.000000, Mean reward (over 100 episodes) = 189.100000
Episode 1003: Reward = 200.000000, Mean reward (over 100 episodes) = 189.100000
Episode 1004: Reward = 200.000000, Mean reward (o

Episode 1096: Reward = 200.000000, Mean reward (over 100 episodes) = 191.700000
Episode 1097: Reward = 200.000000, Mean reward (over 100 episodes) = 191.700000
Episode 1098: Reward = 200.000000, Mean reward (over 100 episodes) = 191.700000
Episode 1099: Reward = 200.000000, Mean reward (over 100 episodes) = 191.700000
Episode 1100: Reward = 137.000000, Mean reward (over 100 episodes) = 191.580000
Episode 1101: Reward = 200.000000, Mean reward (over 100 episodes) = 191.580000
Episode 1102: Reward = 162.000000, Mean reward (over 100 episodes) = 193.020000
Episode 1103: Reward = 93.000000, Mean reward (over 100 episodes) = 191.950000
Episode 1104: Reward = 200.000000, Mean reward (over 100 episodes) = 191.950000
Episode 1105: Reward = 200.000000, Mean reward (over 100 episodes) = 192.260000
Episode 1106: Reward = 138.000000, Mean reward (over 100 episodes) = 192.230000
Episode 1107: Reward = 200.000000, Mean reward (over 100 episodes) = 192.230000
Episode 1108: Reward = 35.000000, Mean re

Episode 1199: Reward = 200.000000, Mean reward (over 100 episodes) = 169.670000
Episode 1200: Reward = 200.000000, Mean reward (over 100 episodes) = 170.300000
Episode 1201: Reward = 200.000000, Mean reward (over 100 episodes) = 170.300000
Episode 1202: Reward = 200.000000, Mean reward (over 100 episodes) = 170.680000
Episode 1203: Reward = 200.000000, Mean reward (over 100 episodes) = 171.750000
Episode 1204: Reward = 152.000000, Mean reward (over 100 episodes) = 171.270000
Episode 1205: Reward = 200.000000, Mean reward (over 100 episodes) = 171.270000
Episode 1206: Reward = 118.000000, Mean reward (over 100 episodes) = 171.070000
Episode 1207: Reward = 200.000000, Mean reward (over 100 episodes) = 171.070000
Episode 1208: Reward = 200.000000, Mean reward (over 100 episodes) = 172.720000
Episode 1209: Reward = 91.000000, Mean reward (over 100 episodes) = 171.630000
Episode 1210: Reward = 200.000000, Mean reward (over 100 episodes) = 171.860000
Episode 1211: Reward = 146.000000, Mean r

Episode 1302: Reward = 200.000000, Mean reward (over 100 episodes) = 187.210000
Episode 1303: Reward = 175.000000, Mean reward (over 100 episodes) = 186.960000
Episode 1304: Reward = 200.000000, Mean reward (over 100 episodes) = 187.440000
Episode 1305: Reward = 200.000000, Mean reward (over 100 episodes) = 187.440000
Episode 1306: Reward = 200.000000, Mean reward (over 100 episodes) = 188.260000
Episode 1307: Reward = 200.000000, Mean reward (over 100 episodes) = 188.260000
Episode 1308: Reward = 200.000000, Mean reward (over 100 episodes) = 188.260000
Episode 1309: Reward = 200.000000, Mean reward (over 100 episodes) = 189.350000
Episode 1310: Reward = 200.000000, Mean reward (over 100 episodes) = 189.350000
Episode 1311: Reward = 200.000000, Mean reward (over 100 episodes) = 189.890000
Episode 1312: Reward = 200.000000, Mean reward (over 100 episodes) = 189.890000
Episode 1313: Reward = 200.000000, Mean reward (over 100 episodes) = 190.140000
Episode 1314: Reward = 200.000000, Mean 

In [10]:
import matplotlib.pyplot as plt

def plot(x, y, name):
    fig, ax = plt.subplots()
    ax.plot(x, y)
    
    ax.set(xlabel='Episode', ylabel='Reward', title=name)
    ax.grid()

    fig.savefig("%s.png" % name)
    plt.show()
    
plot(range(episode), plot_history_episode_rewards, 'REINFORCE')

NameError: name 'episode' is not defined

In [None]:
while True:
    do_rollout(env=env, policy=policy, render=True)