In [1]:
from env import *
from tabulate import tabulate

# 5x5 grid world with two special states, each keyed by a coordinate pair.
# NOTE(review): each value looks like (reward, state the agent is moved to) — confirm in env.
grid = Grid(
    size=(5, 5),
    special_states={
        (0, 1): (10, (4, 1)),
        (0, 3): (5, (2, 3)),
    },
)
# NOTE(review): the second argument is presumably the agent's initial state — confirm in env.
agent = Agent(grid.valid_states, (3, 4))

policy = agent.random_policy

### Solve the state-value function

In [2]:
# Iterative policy evaluation with synchronous sweeps: every backup in a sweep
# reads agent.state_value as left by the previous sweep and writes to a copy.
gamma = 0.9  # discount applied to the value of the next state
agent.reset_state_values()
for sweep in range(20):
    updated_values = agent.state_value.copy()
    for state in grid.valid_states:
        # Each action is weighted by 1 / (number of actions); the weighted
        # one-step returns are summed to give the new value of `state`.
        updated_values[state] = sum([
            (1 / len(agent.action_set)) * (reward + gamma * agent.state_value[successor])
            for successor, reward in (grid.reward(state, action) for action in agent.action_set)
        ])
    # Swap in the freshly computed values only after the whole sweep is done.
    agent.state_value = updated_values
value_grid = show_values((5, 5), agent.state_value)
table = np.around(value_grid, decimals=1)
print(table)

[[ 3.3  8.8  4.4  5.3  1.5]
 [ 1.5  3.   2.3  1.9  0.6]
 [ 0.1  0.8  0.7  0.4 -0.4]
 [-1.  -0.4 -0.3 -0.6 -1.2]
 [-1.8 -1.3 -1.2 -1.4 -2. ]]


### In-place version

In [3]:
# Same evaluation as a synchronous sweep, but agent.state_value is overwritten
# state by state, so states visited later in a sweep already read the values
# written earlier in that same sweep.
agent.reset_state_values()
gamma = 0.9  # discount applied to the value of the next state
for sweep in range(15):
    for state in grid.valid_states:
        # All one-step returns for `state` are computed before its value is replaced.
        action_returns = [
            (1 / len(agent.action_set)) * (reward + gamma * agent.state_value[successor])
            for successor, reward in (grid.reward(state, action) for action in agent.action_set)
        ]
        agent.state_value[state] = sum(action_returns)
value_grid = show_values((5, 5), agent.state_value)
table = np.around(value_grid, decimals=1)
print(table)

[[ 3.4  8.8  4.5  5.4  1.5]
 [ 1.6  3.   2.3  2.   0.6]
 [ 0.1  0.8  0.7  0.4 -0.4]
 [-0.9 -0.4 -0.3 -0.5 -1.1]
 [-1.8 -1.3 -1.2 -1.4 -1.9]]


### Optimal state-value function

In [13]:
# Optimal state values: each sweep backs up every state with the best
# (maximum) one-step return over all actions, reading the previous sweep's values.
gamma = 0.9  # discount applied to the value of the next state
agent.reset_state_values()
transition_prob = 1  # Transition prob is 1.0
for sweep in range(500):
    best_values = agent.state_value.copy()
    for state in grid.valid_states:
        best_values[state] = max([
            transition_prob * (reward + gamma * agent.state_value[successor])
            for successor, reward in (grid.reward(state, action) for action in agent.action_set)
        ])
    # Swap in the freshly computed values only after the whole sweep is done.
    agent.state_value = best_values
value_grid = show_values((5, 5), agent.state_value)
table = np.around(value_grid, decimals=1)
print(table)

[[22.  24.4 22.  19.4 17.5]
 [19.8 22.  19.8 17.8 16. ]
 [17.8 19.8 17.8 16.  14.4]
 [16.  17.8 16.  14.4 13. ]
 [14.4 16.  14.4 13.  11.7]]
