# MDP Policy Evaluation - Gridworld

* **Description:** Perform iterative policy evaluation of the equiprobable random policy on the 4 x 4 gridworld example
* **Reference:** Reinforcement Learning: An Introduction, Second Edition, by Sutton and Barto
* **Section:** Section 4.1, Example 4.1, Pg. 76 to 77

# Import required libraries

In [21]:
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

# Define classes

## Class: Agent

In [2]:
class Agent(object):
    '''
    Agent for the gridworld problem.
    Arguments:
        policy_init: Policy the agent starts with. Only the string 'random'
            is acted upon by step(); it selects uniformly among the actions.
        actions: List of actions that the agent can take
    '''

    def __init__(self, policy_init, actions):
        self.actions = actions
        self.policy = policy_init

    def step(self, state):
        '''
        Choose an action for the given state.

        The state argument is accepted but not consulted. Returns one entry
        of self.actions drawn with np.random.choice when the policy is the
        string 'random'; returns None for any other policy.
        '''
        uses_random_policy = isinstance(self.policy, str) and self.policy == 'random'
        if not uses_random_policy:
            # No other policy is implemented, so there is no action to return.
            return None
        return np.random.choice(self.actions)
        

## Class: Environment

In [119]:
class Environment(object):
    '''
    Defines the environment class for an n x n gridworld problem.

    States are numbered 0 .. n*n - 1 in row-major order, so moving one row
    up or down changes the state index by n and moving one column left or
    right changes it by 1.

    Arguments:
        n: Defines the size of the gridworld. n x n gridworld is generated
        reward: Reward value for each transition
    '''

    def __init__(self, n, reward):
        self.n = n
        self.state_list = list(range(n ** 2))
        self.reward = reward
        self.state = 1 # Starting state; callers can override it with set_state()

    def set_state(self, state):
        '''Place the environment in the given state index.'''
        self.state = state

    def respond(self, action):
        '''
        Apply an action to the current state and return the outcome.

        Arguments:
            action: One of 'up', 'down', 'right', 'left'. Any other value
                leaves the state unchanged.
        Returns:
            Tuple (state, reward): the state after the move and the fixed
            per-transition reward. A move that would leave the grid keeps
            the state where it is. No state is treated specially here
            (e.g. as terminal).
        '''
        n = self.n
        if action == 'up':
            # Top row (indices 0 .. n-1) cannot move further up.
            if self.state >= n:
                self.state = self.state - n # One row up = n cells back
        elif action == 'down':
            # Bottom row starts at index n * (n - 1).
            if self.state < n * (n - 1):
                self.state = self.state + n # One row down = n cells forward
        elif action == 'right':
            # Right-most column holds the indices with (index + 1) % n == 0.
            if (self.state + 1) % n != 0:
                self.state = self.state + 1
        elif action == 'left':
            # Left-most column holds the indices divisible by n.
            if self.state % n != 0:
                self.state = self.state - 1
        return self.state, self.reward

# Run Iterative Policy Evaluation

In [121]:
# Problem setup: grid size, action set, and the policy to evaluate.
gw_size = 4 # Side length of the square grid (4 x 4 gridworld)
actions_list = ['up', 'down', 'right', 'left']
policy_init = 'random' # Equiprobable random policy

# Every transition carries a reward of -1.
gw_envir = Environment(n=gw_size, reward=-1)
gw_agent = Agent(policy_init=policy_init, actions=actions_list)

In [138]:
# Iterative policy evaluation for the equiprobable policy over actions_list.
# Each sweep builds v_(k+1) from v_k using the undiscounted expected update
#     v_(k+1)(s) = mean over actions of [ r + v_k(s') ]
# where (s', r) is the environment's response to taking the action in s.
v_init = np.zeros(gw_size ** 2) # v_0: Initialize value function array to all zeros.
num_iter = 500
num_states = gw_size ** 2

# Seed both arrays with v_0 so the first sweep needs no special case and
# v_next is still defined for printing when num_iter is 0.
v_curr = v_init
v_next = v_init

for ind in tqdm(range(num_iter)):
    v_curr = v_next # The previous sweep's result becomes v_k
    v_next = np.zeros(num_states) # v_(k+1): fresh array, so v_k is never modified mid-sweep
    # The first and last states are skipped (terminal states); their value stays 0.
    for s in range(1, num_states - 1):
        for act in actions_list:
            gw_envir.set_state(s) # Reset to s before trying each action
            s_pr, r = gw_envir.respond(act)
            v_next[s] += r + v_curr[s_pr]
        # Every action is equally likely, so the expectation is a plain mean.
        v_next[s] = v_next[s] / len(actions_list)

# Print the value function one grid row per line.
for row in range(gw_size):
    print(v_next[row * gw_size:(row + 1) * gw_size])

100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 4213.05it/s]

[  0. -14. -20. -22.]
[-14. -18. -20. -20.]
[-20. -20. -18. -14.]
[-22. -20. -14.   0.]



