In [2]:
class Agent(object):
    """Base interface for an agent that plays an MDP.

    Besides declaring the interface, this class keeps a running total of the
    discounted reward that the agent has been informed about.
    """

    def __init__(self, gamma=1.0):
        """Stores the hyperparameters and resets the reward bookkeeping.

        gamma: Discount factor; the weight of each successive reward is
        multiplied by it.
        """
        self.gamma = gamma
        # Weight that will be applied to the next reward passed to
        # inform_minibatch.
        self.current_discount = 1.0
        # Discounted sum of all rewards seen so far.
        self.reward = 0.0

    def set_mdp(self, mdp):
        """Records the MDP that this agent is going to play."""
        self.mdp = mdp

    def get_action(self, state):
        """Samples one action for `state` from the action distribution.

        The agent should act as if it were in `state`. When acting in a real
        environment the environment always passes the agent's current state,
        but consecutive calls are not required to belong to one trajectory.

        state: State of the agent. An element of self.mdp.get_states().
        Returns: An action a such that a is in self.mdp.get_actions(state).
        """
        distribution = self.get_action_distribution(state)
        return distribution.sample()

    def get_action_distribution(self, state):
        """Returns a Distribution over the actions the agent takes in `state`.

        The agent should act as if it were in `state`. When acting in a real
        environment the environment always passes the agent's current state,
        but consecutive calls are not required to belong to one trajectory.

        state: State of the agent. An element of self.mdp.get_states().
        Returns: A Distribution over actions.
        Raises: NotImplementedError, always; subclasses must override this.
        """
        raise NotImplementedError("get_action_distribution not implemented")

    def inform_minibatch(self, state, action, next_state, reward):
        """Accumulates the discounted reward earned by the last action.

        Only `reward` is used here; the other arguments are part of the
        interface.
        """
        discounted_reward = self.current_discount * reward
        self.reward += discounted_reward
        self.current_discount *= self.gamma

In [3]:
class ValueIterationLikeAgent(Agent):
    """An agent that chooses actions using something similar to value iteration.

    Instead of working directly on states from the mdp, we perform value
    iteration on generalized states (called mus), following the formalism in
    "Learning the Preferences of Bounded Agents" from a NIPS 2015 workshop.

    In the default case, a single MDP provides all of the necessary
    information. However, to support evaluation of reward learning, you can
    optionally specify a reward_mdp in the set_mdp method, in which case all
    reward evaluations will be done by the reward_mdp (while everything else
    such as transition probabilities will still use the original MDP).

    The algorithm in this class is simply standard value iteration, but
    subclasses can easily change the behavior while reusing most of the code by
    overriding hooks into the algorithm.
    """

    def __init__(self, gamma=0.9, beta=None, num_iters=50):
        """Initializes the agent, setting any relevant hyperparameters.

        gamma: Discount factor.
        beta: Noise parameter when choosing actions. beta=None implies that
        there is no noise, otherwise actions are chosen with probability
        proportional to exp(beta * value).
        num_iters: The maximum number of iterations of value iteration to run.
        """
        super(ValueIterationLikeAgent, self).__init__(gamma)
        self.beta = beta
        self.num_iters = num_iters
        self.policy = None

    def set_mdp(self, mdp, reward_mdp=None):
        """Sets the MDP to play and immediately runs value iteration on it.

        mdp: MDP providing states, actions and transitions.
        reward_mdp: Optional MDP used only for reward lookups; defaults to
        `mdp` when None.
        """
        super(ValueIterationLikeAgent, self).set_mdp(mdp)
        self.reward_mdp = reward_mdp if reward_mdp is not None else mdp
        self.compute_values()

    def compute_values(self):
        """Computes the values for self.mdp using value iteration.

        Populates an object self.values, such that self.values[mu] is the value
        (a float) of the generalized state mu. Stops early once `converged`
        reports that an iteration changed nothing significantly.
        """
        values = defaultdict(float)
        for _ in range(self.num_iters):
            new_values = defaultdict(float)
            for mu in self.get_mus():
                actions = self.get_actions(mu)
                if not actions:
                    continue
                new_mu = self.get_mu_for_planning(mu)  # Typically new_mu == mu
                # The action is selected with the planning mu, but the stored
                # value is evaluated from the real mu. Ties on the Q-value are
                # broken by comparing the actions themselves.
                qvalues = [(self.qvalue(new_mu, a, values), a) for a in actions]
                chosen_action = max(qvalues)[1]
                new_values[mu] = self.qvalue(mu, chosen_action, values)

            if self.converged(values, new_values):
                self.values = new_values
                return

            values = new_values

        self.values = values

    def converged(self, values, new_values, tolerance=1e-3):
        """Returns True if value iteration has converged.

        Value iteration has converged if no value has changed by more than
        tolerance.

        values: The values from the previous iteration of value iteration.
        new_values: The new value computed during this iteration.
        tolerance: Largest absolute change that still counts as converged.
        """
        return not any(
            abs(values[mu] - new_values[mu]) > tolerance for mu in new_values)

    def value(self, mu):
        """Computes V(mu).

        mu: Generalized state
        """
        return self.values[mu]

    def qvalue(self, mu, a, values=None):
        """Computes Q(mu, a) from the values table.

        mu: Generalized state
        a: Action
        values: Dictionary such that values[mu] is the value of generalized
        state mu. If None, then self.values is used instead.
        Returns: reward plus gamma times the expected value of the successor.
        """
        if values is None:
            values = self.values
        r = self.get_reward(mu, a)
        transitions = self.get_transition_mus_and_probs(mu, a)
        return r + self.gamma * sum(p * values[mu2] for mu2, p in transitions)

    def get_action_distribution(self, s):
        """Returns a Distribution over actions.

        Note that this is a normal state s, not a generalized state mu.

        With beta set, actions are weighted by exp(beta * Q). Otherwise the
        distribution is uniform over the actions with the highest Q-value.
        """
        mu = self.extend_state_to_mu(s)
        actions = self.mdp.get_actions(s)
        if self.beta is not None:
            logits = self.beta * np.array([self.qvalue(mu, a) for a in actions])
            # Shift so the largest exponent is exactly 0. The normalized
            # distribution is unchanged, but exp() can no longer overflow
            # (subtracting the mean of the Q-values did not guarantee that).
            if logits.size:
                logits = logits - np.max(logits)
            return Distribution(dict(zip(actions, np.exp(logits))))

        best_value, best_actions = float("-inf"), []
        for a in actions:
            action_value = self.qvalue(mu, a)
            if action_value > best_value:
                best_value, best_actions = action_value, [a]
            elif action_value == best_value:
                best_actions.append(a)
        return Distribution({a: 1 for a in best_actions})
        # For more determinism, you can break ties deterministically:
        # return Distribution({best_actions[0] : 1})

    def get_mus(self):
        """Returns all possible generalized states the agent could be in.

        This is the equivalent of self.mdp.get_states() for generalized states.
        """
        return self.mdp.get_states()

    def get_actions(self, mu):
        """Returns all actions the agent could take from generalized state mu.

        This is the equivalent of self.mdp.get_actions() for generalized states.
        """
        s = self.extract_state_from_mu(mu)
        return self.mdp.get_actions(s)

    def get_reward(self, mu, a):
        """Returns the reward for taking action a from generalized state mu.

        This is the equivalent of self.mdp.get_reward() for generalized states.
        The lookup goes to self.reward_mdp rather than self.mdp.
        """
        s = self.extract_state_from_mu(mu)
        return self.reward_mdp.get_reward(s, a)

    def get_transition_mus_and_probs(self, mu, a):
        """Gets information about possible transitions for the action.

        This is the equivalent of self.mdp.get_transition_states_and_probs() for
        generalized states. So, it returns a list of (next_mu, prob) pairs,
        where next_mu must be a generalized state.
        """
        s = self.extract_state_from_mu(mu)
        return self.mdp.get_transition_states_and_probs(s, a)

    def get_mu_for_planning(self, mu):
        """Returns the generalized state that an agent uses for planning.

        Specifically, the returned state is used when looking forward to find
        the expected value of a future state.
        """
        return mu

    def extend_state_to_mu(self, state):
        """Converts a normal state to a generalized state."""
        return state

    def extract_state_from_mu(self, mu):
        """Converts a generalized state to a normal state."""
        return mu

class OptimalAgent(ValueIterationLikeAgent):
    """Regular value iteration: every hook of the parent is left unchanged."""

    def __str__(self):
        """Returns a name that encodes gamma, beta and num_iters."""
        return 'Optimal-gamma-{0.gamma}-beta-{0.beta}-numiters-{0.num_iters}'.format(self)

In [10]:
class GridworldMdpNoR(object):
    """A grid world where the objective is to navigate to one of many rewards.

    Specifies all of the static information that an agent has access to when
    playing in the given grid world, including the state space, action space,
    transition probabilities, start state, etc. The agent can take any of the
    four cardinal directions as an action, or the STAY action.

    The reward is by default *not present*, though subclasses may add in
    functionality for the reward.
    """

    def __init__(self, walls, start_state, noise=0):
        """Initializes the reward-free grid world.

        walls: Sequence of rows; walls[y][x] is truthy when (x, y) is a wall.
        start_state: The (x, y) start position.
        noise: Probability that a cardinal move slips to one of the two
        adjacent cardinal directions.
        """
        self.height = len(walls)
        self.width = len(walls[0])
        self.walls = walls
        self.start_state = start_state
        self.noise = noise
        # Lazily filled cache for get_transition_matrix().
        self.transition_matrix = None

    def get_start_state(self):
        """Returns the start state."""
        return self.start_state

    def get_states(self):
        """Returns a list of all possible states the agent can be in.

        Note it is not guaranteed that the agent can reach all of these states.
        """
        return [
            (x, y)
            for x in range(self.width)
            for y in range(self.height)
            if not self.walls[y][x]
        ]

    def get_actions(self, state):
        """Returns the list of valid actions for 'state'.

        Note that you can request moves into walls, which are equivalent to
        STAY. The order in which actions are returned is guaranteed to be
        deterministic, in order to allow agents to implement deterministic
        behavior.

        Raises: ValueError if `state` is inside a wall.
        """
        x, y = state
        if self.walls[y][x]:
            raise ValueError('Cannot be inside a wall!')
        return [Direction.NORTH, Direction.SOUTH, Direction.EAST, Direction.WEST, Direction.STAY]

    def get_reward(self, state, action):
        """Get reward for state, action transition.

        Raises: NotImplementedError, always; this MDP carries no reward.
        """
        # NotImplemented is a comparison sentinel and is not callable; the
        # exception type is NotImplementedError.
        raise NotImplementedError("Cannot call get_reward for GridworldMdpNoR")

    def is_terminal(self, state):
        """Returns False: no state of this grid world is terminal."""
        return False

    def get_transition_states_and_probs(self, state, action):
        """Gets information about possible transitions for the action.

        Returns list of (next_state, prob) pairs representing the states
        reachable from 'state' by taking 'action' along with their transition
        probabilities.

        Raises: ValueError if `action` is not legal in `state` (which includes
        `state` being inside a wall).
        """
        if action not in self.get_actions(state):
            raise ValueError("Illegal action %s in state %s" % (action, state))

        if action == Direction.STAY:
            return [(state, 1.0)]

        next_state = self._attempt_to_move_in_direction(state, action)
        if self.noise == 0.0:
            return [(next_state, 1.0)]

        # Accumulate in a dict because several directions can lead to the same
        # state (for example when they are blocked by walls).
        successors = defaultdict(float)
        successors[next_state] += 1.0 - self.noise
        for direction in Direction.get_adjacent_directions(action):
            next_state = self._attempt_to_move_in_direction(state, direction)
            successors[next_state] += (self.noise / 2.0)

        # Materialize the view so that every branch returns a real list.
        return list(successors.items())

    def get_transition_matrix(self):
        """Returns transition matrix. Very slow.

        The result has shape (width*height, num_actions, width*height), where a
        position (x, y) is flattened to y * width + x. It is computed once and
        cached on the instance.

        Raises: AssertionError if the MDP has noise.
        """
        if self.noise != 0:
            raise AssertionError("Transition matrix does not have computations set when MDP has noise")
        # Identity test: comparing a cached ndarray to None with != is
        # elementwise and has no single truth value.
        if self.transition_matrix is not None:
            return self.transition_matrix

        height = self.height
        width = self.width
        num_actions = len(Direction.ALL_DIRECTIONS)

        tran_shape = (width * height, num_actions, width * height)
        transition_matrix = np.zeros(tran_shape)

        # Init the array to stay action, even if in wall
        for x in range(width):
            for y in range(height):
                flat_outer = y * width + x

                for idx_a, action in enumerate(Direction.ALL_DIRECTIONS):
                    # Stay action is default for every state, even walls
                    transition_matrix[flat_outer, idx_a, flat_outer] = 1

                    # Compute s,a -> s' transitions. States are passed in the
                    # (x, y) format; wall positions raise ValueError and keep
                    # the self-loop set above.
                    try:
                        sa_transitions = self.get_transition_states_and_probs((x, y), action)
                    except ValueError:
                        sa_transitions = None

                    if sa_transitions:
                        transition_matrix[flat_outer, idx_a, flat_outer] = 0
                        for state, prob in sa_transitions:
                            flat_inner = state[1] * width + state[0]
                            transition_matrix[flat_outer, idx_a, flat_inner] = prob

        self.transition_matrix = transition_matrix
        return self.transition_matrix

    def _attempt_to_move_in_direction(self, state, action):
        """Return the new state an agent would be in if it took the action.

        A move into a wall leaves the agent where it is.
        Requires: action is in self.get_actions(state).
        """
        newx, newy = Direction.move_in_direction(state, action)
        return state if self.walls[newy][newx] else (newx, newy)



class GridworldMdp(GridworldMdpNoR):
    """A grid world where the objective is to navigate to one of many rewards.

    Specifies all of the static information that an agent has access to when
    playing in the given grid world, including the state space, action space,
    transition probabilities, rewards, start state, etc.

    The agent can take any of the four cardinal directions as an action, getting
    a living reward (typically negative in order to incentivize shorter
    paths). It can also take the STAY action, in which case it does not receive
    the living reward.
    """

    def __init__(self, grid, living_reward=-0.01, noise=0):
        """Initializes the MDP.

        grid: A sequence of sequences of spaces, representing a grid of a
        certain height and width. See _assert_valid_grid for details on the
        grid format.
        living_reward: The reward obtained when taking any action besides STAY.
        noise: Probability that when the agent takes a non-STAY action (that is,
        a cardinal direction), it instead moves in one of the two adjacent
        cardinal directions.
        Raises: AssertionError if the grid is invalid.
        """
        self._assert_valid_grid(grid)

        walls = [[space == 'X' for space in row] for row in grid]
        rewards, start_state = self._get_rewards_and_start_state(grid)
        GridworldMdpNoR.__init__(self, walls, start_state, noise)
        self.rewards = rewards
        self.living_reward = living_reward

    def _assert_valid_grid(self, grid):
        """Raises an AssertionError if the grid is invalid.

        grid:  A sequence of sequences of spaces, representing a grid of a
        certain height and width. grid[y][x] is the space at row y and column
        x. A space must be either 'X' (representing a wall), ' ' (representing
        an empty space), 'A' (representing the start state), or a value v so
        that float(v) succeeds (representing a reward).

        Often, grid will be a list of strings, in which case the rewards must be
        single digit positive rewards.
        """
        height = len(grid)
        width = len(grid[0])

        # Make sure the grid is not ragged
        assert all(len(row) == width for row in grid), 'Ragged grid'

        # Borders must all be walls
        for y in range(height):
            assert grid[y][0] == 'X', 'Left border must be a wall'
            assert grid[y][-1] == 'X', 'Right border must be a wall'
        for x in range(width):
            assert grid[0][x] == 'X', 'Top border must be a wall'
            assert grid[-1][x] == 'X', 'Bottom border must be a wall'

        def is_float(element):
            # TypeError covers elements that are neither strings nor numbers
            # (e.g. None), so they are reported as invalid elements below
            # instead of escaping as a raw TypeError.
            try:
                float(element)
            except (TypeError, ValueError):
                return False
            return True

        # An element can be 'X' (a wall), ' ' (empty element), 'A' (the agent),
        # or a value v such that float(v) succeeds and returns a float.
        def is_valid_element(element):
            return element in ['X', ' ', 'A'] or is_float(element)

        all_elements = [element for row in grid for element in row]
        assert all(is_valid_element(element) for element in all_elements), \
               'Invalid element: must be X, A, blank space, or a number'
        assert all_elements.count('A') == 1, "'A' must be present exactly once"
        floats = [element for element in all_elements if is_float(element)]
        assert len(floats) >= 1, 'There must at least one reward square'

    def _get_rewards_and_start_state(self, grid):
        """Extracts the rewards and start state from grid.

        Assumes that grid is a valid grid.

        grid: A sequence of sequences of spaces, representing a grid of a
        certain height and width. See _assert_valid_grid for details on the
        grid format.
        Returns two things -- a dictionary mapping states (x, y) to rewards,
        and a start state.
        """
        rewards = {}
        start_state = None
        for y in range(len(grid)):
            for x in range(len(grid[0])):
                if grid[y][x] not in ['X', ' ', 'A']:
                    rewards[(x, y)] = float(grid[y][x])
                elif grid[y][x] == 'A':
                    start_state = (x, y)
        return rewards, start_state

    def get_reward(self, state, action):
        """Get reward for state, action transition.

        The result is the reward stored for `state` (if any) plus the living
        reward unless the action is STAY.
        """
        result = 0
        if state in self.rewards:
            result += self.rewards[state]
        if action != Direction.STAY:
            result += self.living_reward
        return result

    def get_random_start_state(self):
        """Returns a state that would be a legal start state for an agent.

        Avoids walls and reward/exit states. Candidates are drawn by rejection
        sampling from the interior of the grid.
        Returns: Randomly chosen state (x, y).
        """
        y = random.randint(1, self.height - 2)
        x = random.randint(1, self.width - 2)
        while self.walls[y][x] or (x, y) in self.rewards:
            y = random.randint(1, self.height - 2)
            x = random.randint(1, self.width - 2)
        return (x, y)

    def convert_to_numpy_input(self):
        """Encodes this MDP in a format well-suited for deep models.

        Returns three things -- a grid of indicators for whether or not a wall
        is present, a grid of reward values (not including living reward), and
        the start state (a tuple in the format x, y).
        """
        walls = np.array(self.walls, dtype=int)
        rewards = np.zeros([self.height, self.width], dtype=float)
        for x, y in self.rewards:
            rewards[y, x] = self.rewards[(x, y)]
        return walls, rewards, self.start_state

    @staticmethod
    def from_numpy_input(walls, reward, start_state, noise=0):
        """Creates the MDP from the format output by convert_to_numpy_input.

        See convert_to_numpy_input for the types of the parameters. Assumes
        that the parameters were returned by convert_to_numpy_input, and in
        particular it does not check that they are valid (for example, it
        assumes that no space is both a wall and a reward).

        It is *not* the case that calling from_numpy_input on the result of
        convert_to_numpy_input will give exactly the same gridworld. In
        particular, the living reward is reset to its default value, and the
        noise is whatever is passed here (0 by default), because neither is
        part of the numpy encoding.
        """
        def get_elem(x, y):
            wall_elem, reward_elem = walls[y][x], reward[y][x]
            if wall_elem == 1:
                return 'X'
            elif reward_elem == 0:
                return ' '
            else:
                return reward_elem

        height, width = walls.shape
        grid = [[get_elem(x, y) for x in range(width)] for y in range(height)]
        x, y = start_state
        grid[y][x] = 'A'
        return GridworldMdp(grid, noise=noise)

    @staticmethod
    def get_random_state(grid, accepted_tokens):
        """Returns a random interior position whose token is accepted.

        grid: A sequence of sequences of tokens; border cells are never
        considered.
        accepted_tokens: Collection of tokens that a cell may hold to be chosen.
        Returns: Tuple (x, y).
        Raises: ValueError if no interior cell holds an accepted token (the
        rejection sampling below would otherwise never terminate).
        """
        height, width = len(grid), len(grid[0])
        has_candidate = any(
            grid[row][col] in accepted_tokens
            for row in range(1, height - 1)
            for col in range(1, width - 1))
        if not has_candidate:
            raise ValueError('No interior cell holds any of the accepted tokens')

        current_val = None
        while current_val not in accepted_tokens:
            y = random.randint(1, height - 2)
            x = random.randint(1, width - 2)
            current_val = grid[y][x]
        return x, y

    def without_reward(self):
        """Returns a GridworldMdpNoR with the same walls, start and noise."""
        return GridworldMdpNoR(self.walls, self.start_state, self.noise)

    @staticmethod
    def generate_random(height, width, pr_wall, pr_reward):
        """Generates a random instance of a Gridworld.

        Note that based on the generated walls and start position, it may be
        impossible for the agent to ever reach a reward.

        height, width: Dimensions of the grid, including the wall border.
        pr_wall: Probability that a non-reward interior cell stays a wall.
        pr_reward: Probability that an interior cell becomes a reward.
        """
        grid = [['X'] * width for _ in range(height)]
        for y in range(1, height - 1):
            for x in range(1, width - 1):
                if random.random() < pr_reward:
                    grid[y][x] = random.randint(-9, 9)
                    # Don't allow 0 rewards
                    while grid[y][x] == 0:
                        grid[y][x] = random.randint(-9, 9)
                elif random.random() >= pr_wall:
                    grid[y][x] = ' '

        def set_random_position_to(token):
            x, y = GridworldMdp.get_random_state(grid, ['X', ' '])
            grid[y][x] = token

        # Guarantee at least one reward and exactly one start square, as
        # required by the grid validation.
        set_random_position_to(3)
        set_random_position_to('A')
        return GridworldMdp(grid)

    @staticmethod
    def generate_random_connected(height, width, num_rewards, noise, goals=None):
        """Generates a random instance of a Gridworld.

        Unlike with generate_random, it is guaranteed that the agent
        can reach a reward. However, that reward might be negative.

        The start state is always the centre of the grid.
        goals: If not None, dictionary mapping (x, y) positions to rewards.
        """
        def get_random_reward():
            result = random.randint(-9, 9)
            while result == 0:
                result = random.randint(-9, 9)
            return result

        def generate_goals(start_state):
            states = [(x, y) for x in range(1, width-1) for y in range(1, height-1)]
            states.remove(start_state)
            indices = np.random.choice(len(states), num_rewards, replace=False)
            return {states[i] : get_random_reward() for i in indices}

        start_state = (width // 2, height // 2)
        if goals is None:
            goals = generate_goals(start_state)
        required_nonwalls = list(goals.keys())
        required_nonwalls.append(start_state)

        directions = [
            Direction.NORTH, Direction.SOUTH, Direction.EAST, Direction.WEST]
        grid = [['X'] * width for _ in range(height)]
        walls = [(x, y) for x in range(1, width-1) for y in range(1, height-1)]
        dsets = DisjointSets([])
        for x, y in required_nonwalls:
            grid[y][x] = ' '
            walls.remove((x, y))
            dsets.add_singleton((x, y))

        # Knock down random walls until enough cells are free and all free
        # cells (which include the start and every goal) are connected.
        min_free_spots = len(walls) / 2
        random.shuffle(walls)
        while dsets.get_num_elements() < min_free_spots or not dsets.is_connected():
            x, y = walls.pop()
            grid[y][x] = ' '
            dsets.add_singleton((x, y))
            for direction in directions:
                newx, newy = Direction.move_in_direction((x, y), direction)
                if dsets.contains((newx, newy)):
                    dsets.union((x, y), (newx, newy))

        grid[height // 2][width // 2] = 'A'
        for x, y in goals.keys():
            grid[y][x] = goals[(x, y)]

        return GridworldMdp(grid, noise=noise)

    def __str__(self):
        """Returns a string representation of this grid world.

        The returned string has a line for every row, and each space is exactly
        one character. These are encoded in the same way as the grid input to
        the constructor -- walls are 'X', empty spaces are ' ', the start state
        is 'A', and rewards are their own values. However, rewards like 3.5 or
        -9 cannot be represented with a single character. Such rewards are
        encoded as 'R' (if positive) or 'N' (if negative).
        """
        def get_char(x, y):
            if self.walls[y][x]:
                return 'X'
            elif (x, y) in self.rewards:
                reward = self.rewards[(x, y)]
                # Convert to an int if it would not lose information
                reward = int(reward) if int(reward) == reward else reward
                posneg_char = 'R' if reward >= 0 else 'N'
                reward_str = str(reward)
                return reward_str if len(reward_str) == 1 else posneg_char
            elif (x, y) == self.get_start_state():
                return 'A'
            else:
                return ' '

        def get_row_str(y):
            return ''.join([get_char(x, y) for x in range(self.width)])

        return '\n'.join([get_row_str(y) for y in range(self.height)])

class Direction(object):
    """A class that contains the five actions available in Gridworlds.

    Includes definitions of the actions as well as utility functions for
    manipulating them or applying them. Each action is an (dx, dy) offset;
    NORTH decreases y.
    """
    NORTH = (0, -1)
    SOUTH = (0, 1)
    EAST  = (1, 0)
    WEST  = (-1, 0)
    STAY = (0, 0)
    INDEX_TO_DIRECTION = [NORTH, SOUTH, EAST, WEST, STAY]
    DIRECTION_TO_INDEX = { a:i for i, a in enumerate(INDEX_TO_DIRECTION) }
    # Alias of INDEX_TO_DIRECTION (the same list object).
    ALL_DIRECTIONS = INDEX_TO_DIRECTION

    @staticmethod
    def move_in_direction(point, direction):
        """Takes a step in the given direction and returns the new point.

        point: Tuple (x, y) representing a point in the x-y plane.
        direction: One of the Directions. STAY has offset (0, 0), so it
        returns the point unchanged.
        """
        x, y = point
        dx, dy = direction
        return (x + dx, y + dy)

    @staticmethod
    def get_adjacent_directions(direction):
        """Returns the directions within 90 degrees of the given direction.

        direction: One of the Directions, except not Direction.STAY.
        Raises: ValueError if `direction` is not a cardinal direction.
        """
        if direction in [Direction.NORTH, Direction.SOUTH]:
            return [Direction.EAST, Direction.WEST]
        elif direction in [Direction.EAST, Direction.WEST]:
            return [Direction.NORTH, Direction.SOUTH]
        # Directions are tuples, so wrap in a 1-tuple: a bare tuple on the
        # right of % is unpacked as the argument list and raises TypeError.
        raise ValueError('Invalid direction: %s' % (direction,))

    @staticmethod
    def get_number_from_direction(direction):
        """Returns the index of `direction` within INDEX_TO_DIRECTION."""
        return Direction.DIRECTION_TO_INDEX[direction]

    @staticmethod
    def get_direction_from_number(number):
        """Returns the direction stored at index `number`."""
        return Direction.INDEX_TO_DIRECTION[number]

In [15]:
from collections import defaultdict

In [17]:
class Distribution(object):
    """Represents a probability distribution.

    The distribution is stored in a canonical form where items are mapped to
    their probabilities. The distribution is always normalized (so that the
    probabilities sum to 1).
    """
    def __init__(self, probability_mapping):
        """Builds a normalized distribution from unnormalized weights.

        probability_mapping: Dictionary mapping items to non-negative weights.
        Entries with weight 0 are dropped. The argument itself is left
        untouched; the distribution works on its own copy.
        Raises: ValueError if any weight is negative; AssertionError if no
        entry has a non-zero weight.
        """
        # Work on a copy so that neither the pruning below nor normalize()
        # modifies the caller's dictionary.
        dist = probability_mapping.copy()
        for key, prob in probability_mapping.items():
            if prob < 0:
                raise ValueError('Cannot have negative probability!')
            if prob == 0:
                del dist[key]

        assert len(dist) > 0, 'Distribution needs at least one non-zero entry'
        self.dist = dist
        self.normalize()

    def factor(self, key, factor):
        """Updates the probability distribution as though we see evidence that
        is `factor` times more likely for `key` than for any other key."""
        self.dist[key] *= factor
        self.normalize()

    def normalize(self):
        """Rescales the stored probabilities in place so that they sum to 1."""
        Z = float(sum(self.dist.values()))
        for key in list(self.dist.keys()):
            self.dist[key] /= Z

    def sample(self):
        """Returns one key drawn at random according to its probability."""
        keys, probabilities = zip(*self.dist.items())
        return keys[np.random.choice(np.arange(len(keys)), p=probabilities)]

    def get_dict(self):
        """Returns a copy of the item -> probability dictionary."""
        return self.dist.copy()

    def as_numpy_array(self, fn=None, length=None):
        """Returns the probabilities as a dense numpy array.

        fn: Function mapping each key to its integer index in the array.
        Defaults to the identity, so the keys must then be valid indices.
        length: Length of the result. Defaults to the largest index plus one.
        """
        if fn is None:
            fn = lambda x: x
        keys = list(self.dist.keys())
        numeric_keys = [fn(key) for key in keys]
        if length is None:
            length = max(numeric_keys) + 1

        result = np.zeros(length)
        for key, numeric_key in zip(keys, numeric_keys):
            result[numeric_key] = self.dist[key]
        return result

    def __eq__(self, other):
        """Two distributions are equal when their dictionaries are equal."""
        # Defer to the other operand for foreign types instead of failing on
        # a missing `dist` attribute.
        if not isinstance(other, Distribution):
            return NotImplemented
        return self.dist == other.dist

    def __str__(self):
        return str(self.dist)

    def __repr__(self):
        return 'Distribution(%s)' % repr(self.dist)

In [19]:
# Demo grid world: 'X' is a wall, 'A' the start square, digits are rewards.
grid = [
    'XXXXXXXXX',
    'X9X6X   X',
    'X X X XXX',
    'X  A   2X',
    'XXXXXXXXX',
]
n, s, e, w, stay = Direction.ALL_DIRECTIONS

# Build the environment first, then the planner that will act in it.
mdp = GridworldMdp(grid, living_reward=-0.1)
start_state = mdp.get_start_state()

agent = OptimalAgent(gamma=0.95, num_iters=20)
agent.set_mdp(mdp)  # runs value iteration on the grid world

# Distribution over the actions the agent takes from the start square.
action_dist = agent.get_action_distribution(start_state)
print(action_dist)

{(-1, 0): 1.0}
