In [2]:

import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

def true_value(p):
    """Closed-form value of the start state under a fixed stochastic policy.

    The formula comes from solving the Bellman equations of the short
    corridor by hand, using the known dynamics.

    Args:
        p (float): probability of choosing the action 'right'.
    Returns:
        Value of the first state when 'right' is chosen with probability p.
    """
    numerator = 2 * p - 4
    denominator = p * (1 - p)
    return numerator / denominator

class ShortCorridorEnv:
    """
    Short corridor environment, see Example 13.1

    States 0, 1 and 2 are non-terminal and state 3 is terminal.  In states 0
    and 2 the action 'right' moves one cell to the right and 'left' one cell
    to the left (moving left in state 0 leaves the state unchanged).  In
    state 1 the effect of the two actions is swapped.

    Every transition, including the one that reaches the terminal state, is
    rewarded with -1.  This is the reward structure that the closed form in
    ``true_value`` is derived for.
    """
    TERMINAL_STATE = 3
    STEP_REWARD = -1

    def __init__(self):
        self.reset()

    def reset(self):
        """Put the agent back into the start state and return that state (0)."""
        self.state = 0
        return self.state

    def step(self, go_right):  # follow gym interface
        """Apply one action and advance the environment.

        Args:
            go_right (bool): chosen action; truthy means 'right', falsy 'left'.
        Returns:
            tuple of (state, reward, episode terminated?, info) where info is
            an empty dict kept only for gym-style compatibility.
        """
        info = {}

        # Stepping after the episode has ended is a no-op: the terminal state
        # is absorbing and yields no further reward.
        if self.state == self.TERMINAL_STATE:
            return self.state, 0, True, info

        if self.state == 1:
            # Actions are reversed in the second state.
            move = -1 if go_right else 1
        else:
            move = 1 if go_right else -1
        # The left wall keeps the agent in state 0.
        self.state = max(0, self.state + move)

        terminated = self.state == self.TERMINAL_STATE
        return self.state, self.STEP_REWARD, terminated, info
#

The agent in Example 13.1 does not use state information when estimating the policy or the value function.
- The agent maintains a single action-probability distribution that is shared by all states. (This can be thought of as a constraint of the problem.)
- The policy is defined as a softmax over linear action preferences:
$$
    \pi = \mathrm{softmax}(\theta_0 x_0 + \theta_1 x_1)  \in \mathbb{R}^2   \quad\text{where}\quad    x_0 = [1,0],\; x_1 = [0, 1]
$$

- The agent would perform well if a separate action probability were defined for each state.


In [3]:
# REINFORCE algorithm for the problem in Example 13.1
def softmax(a):
    """Numerically stable softmax over all entries of ``a``.

    Args:
        a (array_like): action preferences (logits).
    Returns:
        np.ndarray of the same shape whose entries are positive and sum to 1.
    """
    a = np.asarray(a, dtype=float)
    if a.size == 0:
        return a
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing to inf (and the ratio from becoming NaN).
    exp = np.exp(a - np.max(a))
    return exp / exp.sum()

class ReinforceAgent:
    """REINFORCE (Monte-Carlo policy gradient) agent for Example 13.1.

    The policy is a softmax over two action preferences that do not depend on
    the state: index 0 is 'left' and index 1 is 'right'.

    Args:
        alpha (float): step size of the policy-parameter update.
        gamma (float): discount factor.
    """
    def __init__(self, alpha, gamma):
        self.alpha = alpha
        self.gamma = gamma
        # Must be a float array: ``learn`` updates it in place with float
        # increments, which NumPy refuses to cast into an integer array.
        self.theta = np.array([-1.0, 1.0])   # initial parameter for pi
        self.x = np.array([[1, 0], [0, 1]])  # feature vector per action (columns), constant
        self.pi = None  # most recently computed action probabilities

    def _policy(self):
        """Recompute, cache and return the action probabilities for the current theta."""
        self.pi = softmax(np.dot(self.theta, self.x))  # PMF
        return self.pi

    def get_action(self, ):  # state information is not required.
        """Sample an action from the current policy.

        Returns:
            0 for 'left' or 1 for 'right'.
        """
        pi = self._policy()
        if np.random.uniform() < pi[0]:
            return 0  # go left
        else:
            return 1

    def learn(self, rewards, actions):
        """Update theta from one finished episode.

        Args:
            rewards (sequence of float): reward received after each action.
            actions (sequence of int): actions taken, 0 = left, 1 = right.
        """
        num_steps = len(rewards)
        if num_steps == 0:
            return

        # Discounted return from every time step, filled back to front:
        # G_t = r_t + gamma * G_{t+1}.
        returns = np.empty(num_steps, dtype=float)
        G = 0.0
        for t in range(num_steps - 1, -1, -1):
            G = rewards[t] + self.gamma * G
            returns[t] = G

        # Data normalization; skip the scaling when all returns are equal so
        # that we never divide by zero.
        mean, std = returns.mean(), returns.std()
        returns = returns - mean
        if std > 0:
            returns = returns / std

        for t, (G, a) in enumerate(zip(returns, actions)):
            # Use the policy for the *current* theta, which changes at every
            # step of this loop.
            pi = self._policy()
            grad_log_pi = self.x[:, int(a)] - self.x @ pi
            self.theta += self.alpha * self.gamma ** t * G * grad_log_pi

In [5]:
def trial(num_episodes, agent_generator):
    """Run one independent learning run and record each episode's total reward.

    A fresh environment and a fresh agent are created once, then the agent
    plays ``num_episodes`` episodes and learns after each of them.

    Args:
        num_episodes (int): number of episodes to play.
        agent_generator (callable): zero-argument factory returning an agent
            that provides ``get_action()`` and ``learn(rewards, actions)``.
    Returns:
        np.ndarray of length ``num_episodes`` with the summed reward of each episode.
    """
    environment = ShortCorridorEnv()
    learner = agent_generator()
    episode_totals = np.zeros(num_episodes)

    for episode in range(num_episodes):
        episode_rewards = []
        episode_actions = []
        environment.reset()

        finished = False
        while not finished:
            action = learner.get_action()
            _, step_reward, finished, _ = environment.step(action)
            episode_rewards.append(step_reward)
            episode_actions.append(action)

        # The agent receives the rewards as an array and the actions as a list.
        episode_rewards = np.array(episode_rewards)
        learner.learn(episode_rewards, episode_actions)  # update agent parameters
        episode_totals[episode] = sum(episode_rewards)

    return episode_totals


In [6]:
def figure_13_1():
    """Plot REINFORCE learning curves for three step sizes and save the figure.

    For each step size, 100 independent trials of 1000 episodes are run and
    the total reward per episode is averaged over the trials.  The curves are
    drawn together with a dashed reference line at -11.6, which is roughly the
    best value ``true_value`` attains (near p = 0.59).  The figure is written
    to ``./images/figure_13_1.png``; the directory is created if missing.
    """
    num_trials = 100
    num_episodes = 1000
    gamma = 1
    agent_generators = [lambda : ReinforceAgent(alpha=2e-4, gamma=gamma),
                        lambda : ReinforceAgent(alpha=2e-5, gamma=gamma),
                        lambda : ReinforceAgent(alpha=2e-3, gamma=gamma)]
    labels = ['alpha = 2e-4',
              'alpha = 2e-5',
              'alpha = 2e-3']

    # rewards[agent, trial, episode] = total reward of that episode
    rewards = np.zeros((len(agent_generators), num_trials, num_episodes))

    for agent_index, agent_generator in enumerate(agent_generators):
        for i in tqdm(range(num_trials)):
            rewards[agent_index, i, :] = trial(num_episodes, agent_generator)

    episodes = np.arange(num_episodes) + 1
    # Draw on a dedicated figure so leftover pyplot state cannot leak into the plot.
    fig, ax = plt.subplots()
    ax.plot(episodes, -11.6 * np.ones(num_episodes), ls='dashed', color='red', label='-11.6')
    for i, label in enumerate(labels):
        ax.plot(episodes, rewards[i].mean(axis=0), label=label)
    ax.set_ylabel('total reward on episode')
    ax.set_xlabel('episode')
    ax.legend(loc='lower right')

    output_path = './images/figure_13_1.png'
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    fig.savefig(output_path)
    plt.close(fig)
#

# Run the step-size comparison and save the figure; in the recorded output below
# each of the three step sizes took roughly one to three minutes.
figure_13_1()

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:05<00:00,  1.54it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:44<00:00,  1.65s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:02<00:00,  1.22s/it]


In [7]:
def figure_13_2():
    """Plot REINFORCE with and without a baseline and save the figure.

    For each agent, 100 independent trials of 1000 episodes are run and the
    total reward per episode is averaged over the trials.  The curves are
    drawn together with a dashed reference line at -11.6, which is roughly the
    best value ``true_value`` attains (near p = 0.59).  The figure is written
    to ``./images/figure_13_2.png``; the directory is created if missing.
    """
    num_trials = 100
    num_episodes = 1000
    alpha = 2e-4
    gamma = 1
    # NOTE(review): ReinforceBaselineAgent is not defined in the cells visible
    # here -- confirm it is defined before this cell runs on a fresh kernel.
    agent_generators = [lambda : ReinforceAgent(alpha=alpha, gamma=gamma),
                        lambda : ReinforceBaselineAgent(alpha=alpha*10, gamma=gamma, alpha_w=alpha*100)]
    labels = ['Reinforce without baseline',
              'Reinforce with baseline']

    # rewards[agent, trial, episode] = total reward of that episode
    rewards = np.zeros((len(agent_generators), num_trials, num_episodes))

    for agent_index, agent_generator in enumerate(agent_generators):
        for i in tqdm(range(num_trials)):
            rewards[agent_index, i, :] = trial(num_episodes, agent_generator)

    episodes = np.arange(num_episodes) + 1
    # Draw on a dedicated figure so leftover pyplot state cannot leak into the plot.
    fig, ax = plt.subplots()
    ax.plot(episodes, -11.6 * np.ones(num_episodes), ls='dashed', color='red', label='-11.6')
    for i, label in enumerate(labels):
        ax.plot(episodes, rewards[i].mean(axis=0), label=label)
    ax.set_ylabel('total reward on episode')
    ax.set_xlabel('episode')
    ax.legend(loc='lower right')

    output_path = './images/figure_13_2.png'
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    fig.savefig(output_path)
    plt.close(fig)
#
# Run the baseline comparison and save the figure; in the recorded output below
# each of the two agents took about one minute.
figure_13_2()

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:04<00:00,  1.55it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:52<00:00,  1.89it/s]
