# Lab 3-1: REINFORCE
    In this lab, you need to implement a REINFORCE algorithm with Tensorflow and solve OpenAI Gym CartPole-v0

In [1]:
from cartpole_env import *

import numpy as np
import tensorflow as tf

from collections import namedtuple, deque

# Define the data structure of experience
Experience = namedtuple('Experience', 'state action reward next_state done')

## Implement ```discount``` function to compute discounted reward

In [2]:
def discount(rewards, gamma):
    '''
    param rewards: a rewards numpy array
    param gamma: discount factor
    '''
    discounted_rewards = np.zeros_like(rewards)
    
    # TODO: Calculate discounted rewards
    
    return discounted_rewards

## Implement ```do_rollout``` function to collect rollout

In [3]:
def do_rollout(env, policy, render=False):
    '''
    Collect a rollout from env with policy
    
    param env: RL Environment
    param policy: a function parameterized by environment state, return a action
    return a list of (state, action, reward, next_state, done)
    '''
    # Initialize done as False
    done = False
    
    # Reset the environment and get the initial state
    state = env.reset()
    
    # Empty list
    rollout = []
    
    while not done:
        action = policy(state)
        next_state, reward, done, info = env.step(action)
        
        # Render the environment (slow)
        if render:
            env.render()
        
        rollout.append(Experience(state, action, reward, next_state, done))
        state = next_state
        
    return rollout

## Implement ```ReinforceAgent``` following ```TODO```

In [4]:
class ReinforceAgent(object):
    def __init__(self, sess, n_states, n_actions, n_hiddens, lr, gamma):
        '''
        param sess: tf session
        param n_states: dim of states
        param n_actions: dim of actions space
        param n_hiddens: dim of hidden state
        '''
        self.sess = sess
        self.n_states = n_states
        self.n_actions = n_actions
        
        # Learning rate
        self.lr = lr
        
        # Discount factor
        self.gamma = gamma
       
        self.state = tf.placeholder(shape=[None, n_states], dtype=tf.float32)
        self.value = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        
        # TODO: Declare 1-st hidden layer
        # Define a fully-connected layer with:
        # input = self.state
        # n_units = self.n_hiddens
        # activation = relu
        # weight_initializer = random_normal(0.0, 0.1)
        # bias_initializer = constant (0.1)
        
        
        # TODO: Declare 1-st hidden layer
        # Define a fully-connected layer with:
        # input = 1-st hidden layer
        # n_units = self.n_actions
        # activation = relu
        # weight_initializer = random_normal(0.0, 0.1)
        # bias_initializer = constant (0.1)
        
        # TODO: negative log probability 
        
         
        # TODO: policy gradient loss function
        
        # Optimizer
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        self.train_op = self.optimizer.minimize(self.loss)
        
    def act(self, s):
        '''
        param s: a np.ndarray with shape [n_batches, n_states]
        return a batch of actions with shape [n_batches,]
        '''
        # TODO: Softmax stochastic policy
    
    def train(self, rollout):
        '''
        param rollout: a list o
        '''
        states = np.array([ np.asarray(e.state) for e in rollout ])
        actions = np.squeeze(np.array([ e.action for e in rollout ]))
        rewards = np.array([ e.reward for e in rollout ])
        discounted_rewards = discount(rewards, gamma=self.gamma)
        
        self.sess.run(self.train_op, feed_dict={self.state: states,
                                                self.action: actions,
                                                self.value: discounted_rewards})
   
        

In [5]:
LR = 0.001
GAMMA = 0.99

sess = tf.InteractiveSession()
env = CartpoleEnvironment()
agent = ReinforceAgent(sess=sess, 
                       n_states=env.observation_space.shape[0],
                       n_actions=env.action_space.n,
                       n_hiddens=20,
                       lr=LR,
                       gamma=GAMMA)
init = tf.global_variables_initializer()
sess.run(init)

[2017-10-16 18:56:28,425] Making new env: CartPole-v0


AttributeError: 'ReinforceAgent' object has no attribute 'loss'

In [None]:
def policy(s):
    return agent.act([s])[0]

In [None]:
def calculate_episode_reward(rollout):
    rewards = [ e.reward for e in rollout ]
    return sum(rewards)

In [None]:
def eval_history_reward(history):
    arr = np.asarray(history)
    return arr.mean()

In [None]:
MAX_ITERATIONS = 100000

episode_reward = 0.0
history_episode_rewards = deque(maxlen=100)

plot_history_episode_rewards = []

for iter in range(MAX_ITERATIONS):
    rollout = do_rollout(env=env, policy=policy, render=False)
    agent.train(rollout=rollout)
    
    episode_reward = calculate_episode_reward(rollout)
    history_episode_rewards.append(episode_reward)
    plot_history_episode_rewards.append(episode_reward)
    mean_rewards = eval_history_reward(history_episode_rewards)
    
    print('Episode %d: Reward = %f, Mean reward (over %d episodes) = %f' % (iter, 
                                                                            episode_reward,
                                                                            len(history_episode_rewards),
                                                                            mean_rewards))
    if mean_rewards > 195.0:
        print('Pass')
        break

In [None]:
import matplotlib.pyplot as plt

def plot(x, y, name):
    fig, ax = plt.subplots()
    ax.plot(x, y)
    
    ax.set(xlabel='Episode', ylabel='Reward', title=name)
    ax.grid()

    fig.savefig("%s.png" % name)
    plt.show()
    
plot(range(episode), plot_history_episode_rewards, 'REINFORCE')

In [None]:
while True:
    do_rollout(env=env, policy=policy, render=True)