In [1]:
import numpy as np
import random

In [2]:
# Non-zero rewards of the grid world, keyed by the (row, col) index tuple that
# is later used to index the `rewards` array. Cells not listed here keep a
# reward of 0. All values are plain ints (previously some were numeric
# strings that only worked because numpy coerced them on assignment).
reward_pots = {
    (0, 3): 10,
    (1, 1): -1,
    (3, 0): 1,
    (3, 3): 20,
    (0, 2): -20,
    (2, 0): -2,
    (2, 3): -45,
}

In [3]:

# Side length of the square grid world.
grid_size = 4

# Begin with a reward of zero everywhere, then drop each configured reward
# onto its cell; the keys of `reward_pots` double as array indices.
rewards = np.zeros(shape=(grid_size, grid_size))
for pos in reward_pots:
    rewards[pos] = reward_pots[pos]

# Bare expression so the notebook displays the finished reward grid.
rewards

array([[  0.,   0., -20.,  10.],
       [  0.,  -1.,   0.,   0.],
       [ -2.,   0.,   0., -45.],
       [  1.,   0.,   0.,  20.]])

In [4]:
    
# Define the deterministic transition dynamics
def move(position, action):
    """Return the state reached by taking `action` from `position`.

    Args:
        position: A pair of grid indices; the first is compared against the
            row bounds and the second against the column bounds.
        action: One of the string codes "0", "1", "2", "3":
            "3" decrements the first index, "1" increments it,
            "0" decrements the second index, "2" increments it.

    Returns:
        The new index pair as a tuple, or `position` itself (unchanged) when
        the move would leave the grid or the action code is not recognised.

    Note:
        Reads the module-level `grid_size` to find the upper bound, and only
        does so for the two incrementing actions.
    """
    row, col = position

    if action == "3":
        if row > 0:
            return (row - 1, col)
    elif action == "1":
        if row < grid_size - 1:
            return (row + 1, col)
    elif action == "0":
        if col > 0:
            return (row, col - 1)
    elif action == "2":
        if col < grid_size - 1:
            return (row, col + 1)

    # Blocked by a wall, or unknown action: stay where we are.
    return position


In [5]:


# Tabular TD(0) evaluation of the uniformly random policy.
gamma = 0.7  # Discount factor
theta = 0.00001  # Convergence threshold; kept for compatibility, unused by the fixed-episode loop below
actions = ["0", "1", "2", "3"]  # Action codes understood by move()
num_episodes = 100000
start_state = (0, 0)
terminal_state = (3, 3)  # Reaching this cell ends an episode
SEED = 42  # Fixed seed so Restart & Run All reproduces the same tables

# Dedicated generator: reproducible without reseeding the global `random` state.
rng = random.Random(SEED)

# One entry per grid cell: the value estimate V(s) and the number of updates
# applied to it. Sized from grid_size so they always match `rewards`.
value_func = np.zeros((grid_size, grid_size))
count_func = np.zeros((grid_size, grid_size))

for _ in range(num_episodes):
    # Every episode starts in the same cell.
    state = start_state
    # Transitions of this episode as (state, action, reward, next_state).
    episode = []

    # Roll out the random policy until the terminal cell is reached.
    while state != terminal_state:
        action = rng.choice(actions)
        next_state = move(state, action)
        # The reward is collected for the cell that is entered.
        reward = rewards[next_state]
        episode.append((state, action, reward, next_state))
        state = next_state

    # Replay the transitions last-to-first so that freshly updated successor
    # values are already available when their predecessors are updated.
    for visited, _action, step_reward, successor in reversed(episode):
        count_func[visited] += 1
        # Step size 1/N(s): a running average over all updates of this state.
        alpha = 1.0 / count_func[visited]
        # TD(0): V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s)).
        # The original line had unbalanced parentheses and multiplied the
        # whole difference (V(s') - V(s)) by gamma, which is not the TD error.
        td_target = step_reward + gamma * value_func[successor]
        value_func[visited] += alpha * (td_target - value_func[visited])

# Print the rewards, value function, and visit count
print(rewards, value_func, count_func, sep='\n')

