# Value iteration on small grdiworld

In [1]:
import os
os.chdir("..")
from src.gym_kalman.env_Gridworld import GridworldEnv

In [2]:
# Initialize value function
import numpy as np

# initialize the environment
reward_mean = -1
reward_std = 0.2
grid_size = 4

env = GridworldEnv(grid_size=grid_size, reward_mean=reward_mean, reward_std=reward_std)
num_states = env.observation_space.n
actions = np.arange(env.action_space.n)
gamma = 1
theta = 1e-1

V_mean = np.zeros(num_states)
V_std = np.zeros(num_states)

maximum_iterations = 10000
itr_num = 0

# Value Iteration Algorithm
def value_iteration():
    global itr_num
    while True:
        delta = 0
        for state in range(num_states):
            if state == 15:  # Skip terminal state (bottom-right)
                continue
            v_mean = V_mean[state]
            v_std = V_std[state]
            new_v_mean = []
            new_v_std = []
            for action in actions:
                env.state = state
                next_state, reward, done, _, _  = env.step(action)
                if done:
                    new_v_mean.append(reward_mean)
                    new_v_std.append(reward_std)
                else:
                    new_v_mean.append(reward_mean + gamma * V_mean[next_state])
                    new_v_std.append(np.sqrt(reward_std**2 + (gamma * V_std[next_state])**2))

            V_mean[state] = max(new_v_mean)
            V_std[state] = new_v_std[np.argmax(new_v_mean)]

            delta = max(delta, abs(v_mean - V_mean[state]))

        # Check if the values have converged
        if delta < theta:
            break

        itr_num += 1
        if itr_num > maximum_iterations:
            break

# Run the value iteration algorithm
value_iteration()

# Extract optimal policy
policy = np.zeros(num_states, dtype=str)
for state in range(num_states):
    if state == 15:  # Terminal state
        policy[state] = 'goal'
        continue
    action_values = []
    for action in actions:
        env.state = state
        next_state, reward, done, _, _  = env.step(action)
        if done:
            action_values.append(reward)
        else:
            action_values.append(reward + gamma * V_mean[next_state])
    best_action = actions[np.argmax(action_values)]
    policy[state] = best_action


# Reshape for easier visualization
V_grid = V_mean.reshape((grid_size, grid_size))
V_grid = np.round(V_grid, 2)
V_std_grid = V_std.reshape((grid_size, grid_size))
V_std_grid = np.round(V_std_grid, 2)

policy_grid = np.array(policy).reshape((grid_size, grid_size))

print("Optimal Value Function (V):")
print(V_grid)
print("\nStandard Deviation of Value Function (V):")
print(V_std_grid)

print("\nOptimal Policy:")
print(policy_grid)


Optimal Value Function (V):
[[-6. -5. -4. -3.]
 [-5. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]]

Standard Deviation of Value Function (V):
[[0.49 0.45 0.4  0.35]
 [0.45 0.4  0.35 0.28]
 [0.4  0.35 0.28 0.2 ]
 [0.35 0.28 0.2  0.  ]]

Optimal Policy:
[['1' '1' '1' '1']
 ['3' '3' '1' '1']
 ['3' '1' '3' '1']
 ['3' '3' '3' 'g']]
