In [1]:
import numpy as np

In [50]:
grid_size=4
empty_value=np.zeros((4,4))
reward_pots={
    (0,3): 10,
    (1,1): "-1",
    (3,0): 1,
    (3,2): "-20",
    (2,3): -45
}

In [51]:
rewards=np.zeros((grid_size,grid_size))
for pos,reward in reward_pots.items():
    rewards[pos]=reward
rewards

array([[  0.,   0.,   0.,  10.],
       [  0.,  -1.,   0.,   0.],
       [  0.,   0.,   0., -45.],
       [  1.,   0., -20.,   0.]])

In [52]:

# Define the deterministic transition dynamics
def move(position, action):
    x, y = position
    if action == "U" and x > 0:
        return (x - 1, y)
    elif action == "D" and x < grid_size - 1:
        return (x + 1, y)
    elif action == "L" and y > 0:
        return (x, y - 1)
    elif action == "R" and y < grid_size - 1:
        return (x, y + 1)
    else:
        return position  # If the move is not possible, stay in the same position


In [53]:
value_func=np.zeros((4,4))

# Value iteration parameters
gamma = 0.7  # Discount factor
theta = 0.00001  # Threshold for convergence
actions = ["U", "D", "L", "R"]
    

def eval_valuefunc(state, value_function, rewards, gamma=0.9):
    x, y = state
    actions = ["U", "D", "L", "R"]
    
    new_value = max(
        rewards[x, y] + gamma * value_function[move((x, y), action)]
        for action in actions
    )
    return new_value

In [54]:
def value_iter(value_func,rewards,gamma=0.9,theta=1e-5):
    
    while True:
        delta=0
        new_value_func = np.zeros((grid_size, grid_size))
        
        for x in range(grid_size):
            for y in range(grid_size):
                state=(x,y)
                
                v = value_func[x, y]
                new_value_func[x,y]=eval_valuefunc(state, value_func, rewards, gamma=0.9)
                delta = max(delta, abs(v - new_value_func[x, y]))
        value_func = new_value_func.copy()
        
        if delta < theta:
            break
    
    return value_func

In [55]:
def find_optimal_policy(value_func, rewards, gamma=0.9):
    grid_size = len(value_func)
    actions = ["U", "D", "L", "R"]
    policy = np.empty((grid_size, grid_size), dtype=str)
    
    for x in range(grid_size):
        for y in range(grid_size):
            state = (x, y)
            print(state)
            action_values = {}
            
            for action in actions:
                new_state = move(state, action)
                
                action_values[action] = rewards[x, y] + gamma * value_func[new_state]
                print(f'action:{action} and action_value: {action_values[action]}')
            
            best_action = max(action_values, key=action_values.get)
            print(best_action)
            policy[x, y] = best_action
    
    return policy

In [56]:

optimal_value_func = value_iter(empty_value, rewards, gamma, theta)
optimal_policy = find_optimal_policy(optimal_value_func, rewards, gamma)

# Print the optimal value function
print("Optimal Value Function:")
print(rewards)
print("\nOptimal Policy:")
for row in optimal_policy:
    print(" ".join(row))
print(optimal_value_func)

(0, 0)
action:U and action_value: 51.029942541829264
action:D and action_value: 45.92694254182927
action:L and action_value: 51.029942541829264
action:R and action_value: 56.69994254182926
R
(0, 1)
action:U and action_value: 56.69994254182926
action:D and action_value: 50.32994254182927
action:L and action_value: 51.029942541829264
action:R and action_value: 62.99994254182926
R
(0, 2)
action:U and action_value: 62.99994254182926
action:D and action_value: 56.69994254182926
action:L and action_value: 56.69994254182926
action:R and action_value: 69.99994254182926
R
(0, 3)
action:U and action_value: 79.99994254182926
action:D and action_value: 72.99994254182926
action:L and action_value: 72.99994254182926
action:R and action_value: 79.99994254182926
U
(1, 0)
action:U and action_value: 51.029942541829264
action:D and action_value: 41.334242541829276
action:L and action_value: 45.92694254182927
action:R and action_value: 50.32994254182927
U
(1, 1)
action:U and action_value: 55.6999425418292