In [37]:
import numpy as np
import time
from grid_world import standard_grid
# Convergence threshold: the evaluation loops below stop once the largest
# per-state change in a sweep falls under this value.
# Note: 10e-8 evaluates to 1e-07 (it is not 1e-08).
SMALL_TOL=10e-8

def print_values(V, g):
  """Print the value table V as a text grid.

  V is looked up with (i, j) tuples; cells missing from V are shown as 0.
  The outer loop (one printed line each) runs over range(g.width) and the
  inner loop (one cell each) over range(g.height).
  """
  for row in range(g.width):
    print("---------------------------")
    cells = []
    for col in range(g.height):
      value = V.get((row, col), 0)
      # a negative number already occupies one character with its sign, so
      # only non-negative values get a leading pad space
      pad = " " if value >= 0 else ""
      cells.append(pad + "%.2f|" % value)
    print("".join(cells))


def print_policy(P, g):
  """Print the policy P as a text grid, one action symbol per cell.

  P is looked up with (i, j) tuples; cells with no entry are left blank.
  The outer loop runs over range(g.width), the inner over range(g.height).
  """
  for row in range(g.width):
    print("---------------------------")
    # build the whole line first, then emit it with a single print
    line = "".join("  %s  |" % P.get((row, col), ' ') for col in range(g.height))
    print(line)
    

In [38]:
# Main starts here

In [39]:
# Build the environment via grid_world.standard_grid and collect its states.
# `states` is what the evaluation cells below iterate over, both to
# initialise V and to sweep it.
grid = standard_grid()
states = grid.all_states()

In [44]:
### uniformly random actions ###
# Iterative policy evaluation for the policy that picks every action
# available in a state with equal probability.

# initialize V(s) = 0
V = {}
for s in states:
    V[s] = 0
gamma = 0.9 # discount factor (set once, not once per state)

# repeat until convergence
iter_n=1
while True:
    biggest_change = 0
    for s in states:
        old_v = V[s]
        # V(s) only has value if it's not a terminal state
        if s in grid.actions:
            new_v = 0 # we will accumulate the answer
            p_a = 1.0 / len(grid.actions[s]) # each action has equal probability
            for a in grid.actions[s]:
                # reset the grid to s before every move so each action is
                # tried from the same starting state
                grid.set_state(s)
                r = grid.move(a)
                # V is updated in place: a successor already visited in this
                # sweep contributes its new value, any other its value from
                # the previous sweep
                new_v += p_a * (r + gamma * V[grid.current_state()])
            V[s] = new_v
            biggest_change = max(biggest_change, np.abs(old_v - V[s]))
    if biggest_change < SMALL_TOL: # check is stopping criteria met
        break
    # short pause between sweeps; presumably there so the "\r" progress line
    # below is visible while it updates
    time.sleep(0.01)
    iter_n+=1
    print("Iter --> {1} Biggest_Change --> {0}".format(biggest_change,iter_n),end="\r")
print()
print("values for uniformly random actions:")
print_values(V, grid)
print("\n\n")

Iter --> 36 Biggest_Change --> 1.1317412559885831e-07
values for uniformly random actions:
---------------------------
 0.05| 0.14| 0.27| 0.00|
---------------------------
-0.03| 0.00|-0.37| 0.00|
---------------------------
-0.11|-0.22|-0.38|-0.67|





In [47]:
### fixed policy ###
# Deterministic policy: maps a state tuple to the single action taken there.
# States absent from the dict are printed as blank cells by print_policy and
# are skipped by the fixed-policy evaluation loop further down.
# NOTE(review): 'U' / 'R' presumably mean up / right in grid_world — confirm.
policy = {
(2, 0): 'U',
(1, 0): 'U',
(0, 0): 'R',
(0, 1): 'R',
(0, 2): 'R',
(1, 2): 'R',
(2, 1): 'R',
(2, 2): 'R',
(2, 3): 'U',
}
print_policy(policy, grid)

---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |


In [48]:
# Evaluate the fixed policy, starting every state at V(s) = 0
V = dict.fromkeys(states, 0)
# let's see how V(s) changes as we get further away from the reward
gamma = 0.9 # discount factor
# sweep over all states until the largest update in a sweep is below SMALL_TOL
while True:
    biggest_change = 0
    for s in states:
        # states with no policy entry are never updated and keep their value
        if s not in policy:
            continue
        old_v = V[s]
        a = policy[s]
        grid.set_state(s)
        r = grid.move(a)
        # one action per state, so there is no sum over actions here
        V[s] = r + gamma * V[grid.current_state()]
        biggest_change = max(biggest_change, np.abs(old_v - V[s]))
    if biggest_change < SMALL_TOL:
        break

print("values for fixed policy:")
print_values(V, grid)

values for fixed policy:
---------------------------
 0.81| 0.90| 1.00| 0.00|
---------------------------
 0.73| 0.00|-1.00| 0.00|
---------------------------
 0.66|-0.81|-0.90|-1.00|
