# Value iteration on a small gridworld

In [1]:
import os
os.chdir("..")
from src.gym_kalman.env_Gridworld import GridworldEnv

In [2]:
# Problem setup: environment, solver hyper-parameters and the value table
import numpy as np

# --- solver hyper-parameters ---
grid_size = 4               # side length of the square grid
gamma = 1                   # discount factor (undiscounted)
theta = 1e-1                # stop once the largest per-sweep change drops below this
maximum_iterations = 10000  # cap on the number of sweeps
itr_num = 0                 # sweep counter, advanced by value_iteration()

# --- environment ---
# reward_std=0. presumably disables reward noise -- confirm against GridworldEnv
env = GridworldEnv(grid_size=grid_size, reward_std=0.)
num_states = env.observation_space.n
actions = np.arange(env.action_space.n)

# --- value function: one entry per state, starting from all zeros ---
V = np.zeros(num_states)

# Value Iteration Algorithm
def value_iteration():
    """Run in-place value iteration on the module-level value table ``V``.

    Each sweep visits every non-terminal state, forces the environment into
    that state, tries every action with ``env.step`` and overwrites
    ``V[state]`` with the best one-step backup
    ``reward + gamma * V[next_state]`` (just ``reward`` when the step reports
    ``done``). Updates are applied immediately, so later states in the same
    sweep already see the new values.

    The loop stops when the largest change in a sweep is below ``theta`` or
    after ``maximum_iterations`` sweeps, whichever comes first. The value
    table is printed after every sweep.

    Reads the globals ``env``, ``actions``, ``num_states``, ``grid_size``,
    ``gamma``, ``theta`` and ``maximum_iterations``; mutates ``V`` and
    ``itr_num``. Returns ``None``.
    """
    global itr_num
    # The goal is taken to be the last state index, i.e. the bottom-right
    # cell in row-major order (15 on the 4x4 grid) -- confirm against
    # GridworldEnv if the goal is ever moved.
    terminal_state = num_states - 1
    while True:
        delta = 0
        for state in range(num_states):
            if state == terminal_state:  # Skip terminal state (bottom-right)
                continue
            v = V[state]
            new_v = []
            for action in actions:
                # Force the environment into `state` so step() simulates
                # the transition from there.
                env.state = state
                next_state, reward, done, _, _ = env.step(action)
                if done:
                    new_v.append(reward)
                else:
                    new_v.append(reward + gamma * V[next_state])

            V[state] = max(new_v)
            delta = max(delta, abs(v - V[state]))

        print('Iteration:', itr_num)
        print(np.round(V.reshape((grid_size, grid_size))))
        print('=============')

        # Check if the values have converged
        if delta < theta:
            break

        itr_num += 1
        # `>=` so that at most `maximum_iterations` sweeps are performed
        # (the previous `>` allowed one extra sweep).
        if itr_num >= maximum_iterations:
            break

# Run the value iteration algorithm
value_iteration()

# Extract the greedy policy with respect to the converged V.
# Labels are collected in a plain list and converted to an array afterwards so
# NumPy picks a string width that fits every label. Pre-allocating with
# `dtype=str` yields a 1-character array, which truncated 'goal' to 'g'.
# The goal is taken to be the last state index (bottom-right cell, 15 on the
# 4x4 grid) -- confirm against GridworldEnv if the goal is ever moved.
terminal_state = num_states - 1
policy_labels = []
for state in range(num_states):
    if state == terminal_state:  # Terminal state
        policy_labels.append('goal')
        continue
    action_values = []
    for action in actions:
        # Force the environment into `state` so step() simulates from there.
        env.state = state
        next_state, reward, done, _, _ = env.step(action)
        if done:
            action_values.append(reward)
        else:
            action_values.append(reward + gamma * V[next_state])
    # argmax returns the first maximiser, so ties go to the lowest action index
    best_action = actions[np.argmax(action_values)]
    policy_labels.append(str(best_action))
policy = np.array(policy_labels)

# Reshape for easier visualization
V_grid = V.reshape((grid_size, grid_size))
V_grid = np.round(V_grid, 2)
policy_grid = np.array(policy).reshape((grid_size, grid_size))

print("Optimal Value Function (V):")
print(V_grid)

print("\nOptimal Policy:")
print(policy_grid)


Iteration: 0
[[-1. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1.  0.]]
Iteration: 1
[[-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-2. -2. -2. -1.]
 [-2. -2. -1.  0.]]
Iteration: 2
[[-3. -3. -3. -3.]
 [-3. -3. -3. -2.]
 [-3. -3. -2. -1.]
 [-3. -2. -1.  0.]]
Iteration: 3
[[-4. -4. -4. -3.]
 [-4. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]]
Iteration: 4
[[-5. -5. -4. -3.]
 [-5. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]]
Iteration: 5
[[-6. -5. -4. -3.]
 [-5. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]]
Iteration: 6
[[-6. -5. -4. -3.]
 [-5. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]]
Optimal Value Function (V):
[[-6. -5. -4. -3.]
 [-5. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]]

Optimal Policy:
[['1' '1' '1' '1']
 ['1' '1' '1' '1']
 ['1' '1' '1' '1']
 ['3' '3' '3' 'g']]
