In [1]:
import numpy as np

class DiscountGridEnv:
    """Deterministic grid-world MDP with an absorbing goal cell.

    Cells are flattened row-major into state indices
    (``index = row * n_cols + col``).  The model is exposed as two arrays:

    * ``transitions`` -- shape ``(n_states, n_actions, n_states)``;
      ``transitions[s, a, s2]`` is 1.0 when taking action ``a`` in ``s``
      leads to ``s2`` and 0.0 otherwise.
    * ``rewards`` -- float array of shape ``(n_states, n_actions)``.

    Dynamics:

    * In the goal cell every action yields ``goal_reward`` and loops back to
      the goal cell.
    * Elsewhere every action yields ``step_penalty``.  A move that would
      leave the grid or enter an obstacle cell leaves the agent in place.
    * Obstacle cells only block entry; they still get outgoing transitions
      of their own, so they also show up in any value/policy table.

    Args:
        grid_size: ``(n_rows, n_cols)`` of the grid.
        goal_state: ``(row, col)`` of the goal cell (any 2-item sequence).
        discount_factor: Stored on the instance for callers; the class
            itself does not use it.
        obstacle_states: Iterable of ``(row, col)`` cells that cannot be
            entered.  ``None`` (the default) means no obstacles.
        goal_reward: Reward for every action taken in the goal cell.
        step_penalty: Reward for every action taken in any other cell.
    """

    # Row/column offset applied by each action name.
    _MOVES = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

    def __init__(self, grid_size=(4, 4), goal_state=(3, 3), discount_factor=0.9, obstacle_states=None, goal_reward=10, step_penalty=-1):
        self.grid_size = grid_size
        # Normalise to a tuple so the goal test in build_environment also
        # matches when the caller passes a list such as [3, 3].
        self.goal_state = tuple(goal_state)
        self.discount_factor = discount_factor
        # A fresh list per instance: a mutable default argument would be
        # shared (and could be mutated) across every instance.
        if obstacle_states is None:
            self.obstacle_states = []
        else:
            self.obstacle_states = [tuple(cell) for cell in obstacle_states]
        self.goal_reward = goal_reward
        self.step_penalty = step_penalty
        self.actions = ['up', 'down', 'left', 'right']
        self.n_states = grid_size[0] * grid_size[1]
        self.n_actions = len(self.actions)
        self.transitions, self.rewards = self.build_environment()

    def build_environment(self):
        """Build the transition tensor and reward table for the grid.

        Returns:
            tuple: ``(transition_prob, rewards)`` with shapes
            ``(n_states, n_actions, n_states)`` and ``(n_states, n_actions)``.
        """
        n_rows, n_cols = self.grid_size[0], self.grid_size[1]
        transition_prob = np.zeros((self.n_states, self.n_actions, self.n_states))
        # Explicit float dtype: np.full would otherwise infer an integer
        # dtype from an int step_penalty and truncate fractional rewards
        # written into the table afterwards.
        rewards = np.full((self.n_states, self.n_actions), self.step_penalty, dtype=float)
        blocked = set(self.obstacle_states)

        def can_enter(row, col):
            return 0 <= row < n_rows and 0 <= col < n_cols and (row, col) not in blocked

        for row in range(n_rows):
            for col in range(n_cols):
                state = row * n_cols + col

                if (row, col) == self.goal_state:
                    # Absorbing goal: every action pays goal_reward and stays.
                    rewards[state, :] = self.goal_reward
                    transition_prob[state, :, state] = 1.0
                    continue

                for action_idx, action in enumerate(self.actions):
                    d_row, d_col = self._MOVES.get(action, (0, 0))
                    next_row, next_col = row + d_row, col + d_col
                    if not can_enter(next_row, next_col):
                        # Blocked by a wall or an obstacle: stay in place.
                        next_row, next_col = row, col
                    transition_prob[state, action_idx, next_row * n_cols + next_col] = 1.0
        return transition_prob, rewards

    def state_index_to_coordinates(self, state_index):
        """Convert a flat state index back to its ``(row, col)`` cell."""
        row, col = divmod(state_index, self.grid_size[1])
        return row, col

def _action_values(transitions, rewards, values, s, gamma):
    """Return the one-step look-ahead value of every action in state ``s``.

    Computes ``sum_s2 T[s, a, s2] * (R[s, a] + gamma * values[s2])`` for all
    actions ``a`` at once and returns a 1-D array of length ``n_actions``.
    """
    # Broadcast R[s, a] over successor states, weight by T, sum out s2.
    return (transitions[s] * (rewards[s][:, None] + gamma * values[None, :])).sum(axis=1)


def value_iteration_discount_grid(env, gamma=0.9, theta=1e-6):
    """Solve a tabular MDP with in-place value iteration.

    Args:
        env: Object exposing ``n_states``, ``n_actions``, ``transitions``
            (indexed ``[s, a, s2]``) and ``rewards`` (indexed ``[s, a]``).
        gamma: Discount applied to successor-state values.
        theta: Convergence threshold; sweeps stop once the largest change
            of any state value within a sweep falls below it.

    Returns:
        tuple: ``(V, policy)`` where ``V`` is a float array of state values
        and ``policy`` an int array holding the index of the greedy action
        for each state (ties resolve to the lowest action index).

    Raises:
        ValueError: If ``theta`` is not strictly positive, since the stop
            condition ``delta < theta`` could then never be met.
    """
    if not theta > 0:
        raise ValueError(f"theta must be > 0, got {theta!r}")

    transitions = np.asarray(env.transitions)
    rewards = np.asarray(env.rewards)

    V = np.zeros(env.n_states)
    while True:
        delta = 0.0
        for s in range(env.n_states):
            previous = V[s]
            # In-place update: states later in this sweep already see the
            # refreshed value of state s.
            V[s] = _action_values(transitions, rewards, V, s, gamma).max()
            delta = max(delta, abs(previous - V[s]))
        if delta < theta:
            break

    # Greedy policy with respect to the converged values.
    policy = np.zeros(env.n_states, dtype=int)
    for s in range(env.n_states):
        policy[s] = np.argmax(_action_values(transitions, rewards, V, s, gamma))
    return V, policy

# Demo: a 4x4 grid with the goal in the bottom-right corner and a single
# obstacle at (1, 1).
env = DiscountGridEnv(
    grid_size=(4, 4),
    goal_state=(3, 3),
    discount_factor=0.9,
    obstacle_states=[(1, 1)],
    goal_reward=10,
    step_penalty=-1,
)

# Take the discount from the environment rather than repeating the literal,
# so the solver and the environment cannot drift apart.
V, policy = value_iteration_discount_grid(env, gamma=env.discount_factor)

print("Optimal Value Function (V):")
print(V.reshape(env.grid_size))

print("\nOptimal Policy (actions corresponding to indices):")
# Map each greedy action index to its name and lay the result out as the grid.
policy_grid = np.array(env.actions)[policy].reshape(env.grid_size)
print(policy_grid)

Optimal Value Function (V):
[[48.45850102 54.95389102 62.17099102 70.18999102]
 [54.95389102 62.17099102 70.18999102 79.09999102]
 [62.17099102 70.18999102 79.09999102 88.99999102]
 [70.18999102 79.09999102 88.99999102 99.99999102]]

Optimal Policy (actions corresponding to indices):
[['down' 'right' 'down' 'down']
 ['down' 'down' 'down' 'down']
 ['down' 'down' 'down' 'down']
 ['right' 'right' 'right' 'up']]
