# Gridworld


In [463]:
import gymnasium as gym
import gridworld  # this will auto-register the environment
import pickle
import numpy as np


In [None]:
# Load the pickled object and extract the policy stored under its "policy" key.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open("S03_pi_star.pkl", "rb") as file:
    data = pickle.load(file)
policy_optimal = data["policy"]
# Show the policy table (in the output below, each row holds one state's action probabilities).
print(policy_optimal)

[[0.25 0.25 0.25 0.25]
 [0.   0.   0.   1.  ]
 [0.   0.   0.   1.  ]
 [0.   0.   0.5  0.5 ]
 [1.   0.   0.   0.  ]
 [0.5  0.   0.   0.5 ]
 [0.   0.   0.5  0.5 ]
 [0.   0.   1.   0.  ]
 [1.   0.   0.   0.  ]
 [0.5  0.5  0.   0.  ]
 [0.   1.   0.   0.  ]
 [0.   0.   1.   0.  ]
 [1.   0.   0.   0.  ]
 [0.   1.   0.   0.  ]
 [0.   1.   0.   0.  ]
 [0.25 0.25 0.25 0.25]]


In [465]:
# Create the grid-world environment; its ID is registered by the `gridworld` import above.
env = gym.make("GridWorld-v0")

In [None]:
def get_action(policy, state, rng=None):
    """
    Sample an action for ``state`` from a stochastic policy table.

    Args:
        policy (np.ndarray): A 2D array where each row corresponds to a state's action
                             probability distribution (each row must sum to 1).
        state (int): The current state index, i.e. the row of ``policy`` to sample from.
        rng (np.random.Generator | np.random.RandomState | None): Optional random
            source. When omitted, NumPy's global random state is used, exactly as
            before this parameter existed; pass a seeded generator for
            reproducible rollouts.

    Returns:
        np.integer: The index of the sampled action (a NumPy integer scalar, which
                    can be used wherever a plain ``int`` is expected).

    Raises:
        IndexError: If ``state`` is not a valid row index of ``policy``.
        ValueError: If the selected row is not a valid probability distribution.
    """
    n_actions = policy.shape[1]  # Number of possible actions (e.g., 4)
    actions = np.arange(n_actions)  # Array of action indices: [0, 1, 2, 3]
    action_probs = policy[state]  # Distribution over actions for the given state
    # Fall back to the module-level RNG so existing callers (and np.random.seed)
    # keep behaving exactly as they did.
    sampler = np.random if rng is None else rng
    return sampler.choice(actions, p=action_probs)


In [None]:
# Sanity check for get_action.
# Action encoding:
# UP = 0
# RIGHT = 1
# DOWN = 2
# LEFT = 3
get_action(policy_optimal, 1)  # Row 1 of the policy is [0, 0, 0, 1], so state 1 always yields 3 (LEFT)

np.int64(3)

In [None]:
# Roll out a single episode, sampling every action from policy_optimal, and
# report each step plus the episode totals.
total_reward = 0.0
total_steps = 0

try:
    # reset() returns (observation, info); the info value is not needed here.
    curState, _ = env.reset()

    done = False
    while not done:
        action = get_action(policy_optimal, curState)
        curState, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        total_steps += 1
        # The episode ends on either signal reported by the environment.
        done = terminated or truncated
        env.render()
        print(
            f"Step {total_steps:2d}: action={action:2d}, Reward={reward:5.2f}, "
            f"Terminated={terminated}, Truncated={truncated}, info={info}"
        )

    print(
        f"Episode finished after {total_steps} steps with total reward: {total_reward:.2f}"
    )
finally:
    # Release the environment even if reset/step/render raises part-way through.
    env.close()

T  o  o  o
o  o  o  o
x  o  o  o
o  o  o  T

Step  1: action= 0, Reward=-1.00, Terminated=False, Truncated=False, info={}
T  o  o  o
x  o  o  o
o  o  o  o
o  o  o  T

Step  2: action= 0, Reward=-1.00, Terminated=False, Truncated=False, info={}
x  o  o  o
o  o  o  o
o  o  o  o
o  o  o  T

Step  3: action= 0, Reward=-1.00, Terminated=True, Truncated=False, info={}
Episode finished after 3 steps with total reward: -3.00
