# Second practical exercise: Grid World and Value iteration

# A deterministic grid world

Finite grid with some obstacles inside. The agent can move up, left, right and down.

![](imgs/grid_world.png)

In [None]:
%pip install gymnasium

In [2]:
# Imports 
import gymnasium as gym
from gymnasium import spaces
import random
import numpy as np

In [3]:
# Custom 2D GridWorld Enviroment
class GridWorld(gym.Env):
    metadata = {'render.modes': ['console']}

    # Actions available
    UP = 0
    LEFT = 1
    DOWN = 2
    RIGHT = 3

    def __init__(self, width, height):
        super(GridWorld, self).__init__()
        self.ACTION_NAMES = ["UP", "LEFT", "DOWN", "RIGHT"]                         # Mapping of the actions
        self.num_actions = 4

        self.width = width
        self.height = height
        self.size = width * height                                                  # Size of the grid world
        self.num_states = self.size                                                 # States are computed (width * height)
        self.num_obstacles = int((width + height) /2)                               # Number of obstacles present in our Grid
        self.end_state = np.array([height - 1, width - 1], dtype=np.uint8)          # Goal state = Bottom right cell

        self.action_space = spaces.Discrete(4)                                      # Actions space of agent : up, down, left and right
        self.observation_space = spaces.MultiDiscrete([self.height, self.width])    # Observation : Cell indices in the grid

        self.obstacles = np.zeros((height, width))                                  # Initialize the obstacles as a Grid with all zeros

        """
        Looping for the number of obstacles, we take a random number between height and width
        and until the two numbers are not (0, 0) -> Because it's the goal, we keep on looping.
        When we've found two suitable numbers, we put 1 in the Obstacles grid at coords height/width.
        """
        for i in range(self.num_obstacles):
            obstacle = random.randrange(height) , random.randrange(width)
            while obstacle == (0, 0):
                obstacle = random.randrange(height), random.randrange(width)
            self.obstacles[obstacle] = 1

        self.num_steps = 0
        self.max_steps = height * width

        self.current_state = np.zeros((2), np.uint8) # Initial state = [0,0]

        self.directions = np.array([
            [-1, 0], # UP
            [0, -1], # LEFT
            [1, 0],  # DOWN
            [0, 1]   # RIGHT
        ])
        
    def step(self, action):
        s_prime = self.transition_function(self.current_state, action)
        reward = self.reward_function(s_prime)
        terminated, truncated = self.termination_condition(s_prime)

        self.current_state = s_prime
        self.num_steps += 1

        return self.current_state, reward, terminated, truncated, None
    
    
    def transition_function(self, s, a):
        s_prime = s + self.directions[a, :]

        # Now we check if the agent is going out of the boundaries.
        #   - If I take s_prime[0], I'm working on the rows.
        #   - If I take s_prime[1], I'm working on the columns.
        if(s_prime[0] < 0 or s_prime[0] >= self.height):
            print("Agent is going outside of the grid. Staying in the same cell")
            return s
        if(s_prime[1] < 0 or s_prime[1] >= self.width):
            print("Agent is going outside of the grid. Staying in the same cell")
            return s
        
        # Check obstacles for the agent.
        # If the agent new coordinates are in the coordinates of an obstacle, exit.
        if(self.obstacles[s_prime[0], s_prime[1]] == 1):
            print("Agent is hitting an obstacle. Staying in the same cell")
            return s            
    
        return s_prime # We simply return s_prime
    
    
    def reward_function(self,s):
        r = 0

        # We check if both elements of the arrays s and self.end_state are equal
        if(s == self.end_state).all():
            r = 1

        return r
    
    def termination_condition(self, s):

        truncated = self.num_steps >= self.max_steps
        terminated = False
        
        if(truncated):
            print("Maximum steps reached. Exiting.")
            terminated = True
        if(s == self.end_state).all():
            print("Agent is in the goal state. Let's end the loop.")
            terminated = True

        return terminated, truncated
    
    def reset(self):
        self.current_state = np.zeros((2), np.uint8)
        self.num_steps = 0

        return self.current_state
    
    def render(self):
        '''
            Render the state
        '''

        row = self.current_state[0]
        col = self.current_state[1]

        for r in range(self.height):
            for c in range(self.width):
                if r == row and c == col:
                    print("| A ", end='')
                elif r == self.end_state[0] and c == self.end_state[1]:
                    print("| G ", end='')
                else:
                    if self.obstacles[r,c] == 1:
                        print('|///', end='')
                    else:
                        print('|___', end='')
            print('|')
        print('\n')

Simulate all the four actions

In [4]:
env = GridWorld(3,5) # GridWorld of 3 columns and 5 rows.
env.reset()
env.render()

action_sequence = [0, 1, 2, 3]

for a in action_sequence:
    print("The agent will now move: ", env.ACTION_NAMES[a])
    env.step(a)
    env.render()

| A |___|///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |


The agent will now move:  UP
Agent is going outside of the grid. Staying in the same cell
| A |___|///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |


The agent will now move:  LEFT
Agent is going outside of the grid. Staying in the same cell
| A |___|///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |


The agent will now move:  DOWN
Agent is hitting an obstacle. Staying in the same cell
| A |___|///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |


The agent will now move:  RIGHT
|___| A |///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |




Simulate a random episode

In [5]:
done = False
env.reset()

while not done:
    action = env.action_space.sample()
    print(env.ACTION_NAMES[action])
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    env.render()


LEFT
Agent is going outside of the grid. Staying in the same cell
| A |___|///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |


LEFT
Agent is going outside of the grid. Staying in the same cell
| A |___|///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |


RIGHT
|___| A |///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |


RIGHT
Agent is hitting an obstacle. Staying in the same cell
|___| A |///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |


LEFT
| A |___|///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |


DOWN
Agent is hitting an obstacle. Staying in the same cell
| A |___|///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |


DOWN
Agent is hitting an obstacle. Staying in the same cell
| A |___|///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |


LEFT
Agent is going outside of the grid. Staying in the same cell
| A |___|///|
|///|___|___|
|___|___|___|
|___|___|///|
|///|___| G |


LEFT
Agent is going outside of the

## A non deterministic grid world

The agent goes with probability p to the right cell, with probability 1 - p in a different cell

In [31]:
class NonDeterministicGridWorld(GridWorld):
    def __init__(self, width, height, p=0.8):
        super(NonDeterministicGridWorld, self).__init__(width, height)
        self.probability_right_action = p

    def transition_function(self, s, a):
        s_prime = s + self.directions[a]

        # With probability 1 - p, we have a diagonal movement
        # random.random() returns a number between 0 and 1
        if random.random() <= 1 - self.probability_right_action:
            print("Original action to perform:", self.ACTION_NAMES[a])
            if random.random() < 0.5:
                print(F"Actual action performed: {self.ACTION_NAMES[a]} + {self.ACTION_NAMES[(a+1) % self.num_actions]}")
                s_prime = s_prime + self.directions[(a+1) % self.num_actions]
            else:
                print(F"Actual action performed: {self.ACTION_NAMES[a]} + {self.ACTION_NAMES[(a-1) % self.num_actions]}")
                s_prime = s_prime + self.directions[(a-1) % self.num_actions]

        # Check if the agent goes out of the grid along with obstacles 
        if s_prime[0] < self.height and s_prime[1] < self.width and (s_prime >= 0).all():
            if self.obstacles[s_prime[0], s_prime[1]] == 0 :
                return s_prime

        return s

Simulate a random episode

In [47]:
env = NonDeterministicGridWorld(3,5)
env.reset()
env.render()

| A |___|___|
|///|___|___|
|___|___|___|
|///|___|___|
|///|___| G |




In [48]:
done = False
while not done:
    
    action = env.action_space.sample()
    print(env.ACTION_NAMES[action])
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    env.render()

DOWN
| A |___|___|
|///|___|___|
|___|___|___|
|///|___|___|
|///|___| G |


DOWN
| A |___|___|
|///|___|___|
|___|___|___|
|///|___|___|
|///|___| G |


RIGHT
|___| A |___|
|///|___|___|
|___|___|___|
|///|___|___|
|///|___| G |


RIGHT
|___|___| A |
|///|___|___|
|___|___|___|
|///|___|___|
|///|___| G |


UP
|___|___| A |
|///|___|___|
|___|___|___|
|///|___|___|
|///|___| G |


UP
|___|___| A |
|///|___|___|
|___|___|___|
|///|___|___|
|///|___| G |


UP
Original action to perform:  UP
Actual action performed: UP + RIGHT
|___|___| A |
|///|___|___|
|___|___|___|
|///|___|___|
|///|___| G |


UP
|___|___| A |
|///|___|___|
|___|___|___|
|///|___|___|
|///|___| G |


LEFT
Original action to perform:  LEFT
Actual action performed: LEFT + DOWN
|___|___|___|
|///| A |___|
|___|___|___|
|///|___|___|
|///|___| G |


LEFT
|___|___|___|
|///| A |___|
|___|___|___|
|///|___|___|
|///|___| G |


LEFT
|___|___|___|
|///| A |___|
|___|___|___|
|///|___|___|
|///|___| G |


UP
Original action t