In [3]:
import numpy as np
import time
from grid_world import standard_grid,negative_grid
# Small numeric tolerance. Note that the literal 10e-10 evaluates to 1e-9.
# It is not referenced by any of the visible cells.
SMALL_TOL=10e-10
# Action labels (presumably Up / Down / Left / Right -- confirm against grid_world).
# Not referenced by any of the visible cells.
ALL_Actions=['U','D','L','R',]
# Discount factor applied to the running return inside play_game.
GAMMA=0.9
def print_values(V, g):
  """Print a table of per-state values, one text line per outer index.

  V maps (i, j) tuples to numbers; states missing from V are shown as 0.
  g must expose `width` and `height`; `width` drives the outer loop (one
  printed line each) and `height` drives the cells within a line.
  """
  print("Values:")
  for row in range(g.width):
    print("---------------------------")
    for col in range(g.height):
      value = V.get((row, col), 0)
      # A negative number already carries a '-' sign, so only non-negative
      # values get a leading space to keep the columns aligned.
      pad = " " if value >= 0 else ""
      print("%s%.2f|" % (pad, value), end="")
    print()


def print_policy(P, g):
  """Print a table of per-state actions, one text line per outer index.

  P maps (i, j) tuples to an action label; states missing from P are
  rendered as a blank cell. g must expose `width` (outer loop, one printed
  line each) and `height` (cells within a line).
  """
  print("Policy:")
  for row in range(g.width):
    print("---------------------------")
    for col in range(g.height):
      action = P.get((row, col), ' ')
      cell = "  %s  |" % action
      print(cell, end="")
    print()
    

In [9]:
def play_game(grid, policy):
    """Play one episode under `policy` and return (state, return) pairs.

    The episode starts from a uniformly random key of `grid.actions` and
    follows `policy[state]` until `grid.game_over()` is true. The result is
    a list of (state, G) tuples in visit order, where G is the discounted
    sum (discount GAMMA) of the rewards received after leaving that state.
    The final state of the episode is not included in the result.
    """
    candidates = list(grid.actions.keys())
    pick = np.random.choice(len(candidates))
    grid.set_state(candidates[pick])

    state = grid.current_state()
    trajectory = [(state, 0)]  # the starting state is paired with a reward of 0
    while not grid.game_over():
        reward = grid.move(policy[state])
        state = grid.current_state()
        trajectory.append((state, reward))

    # Walk the trajectory backwards, accumulating the discounted return.
    # The last state of the episode is skipped: its value is 0 by definition.
    G = 0
    backward = []
    for step, (state, reward) in enumerate(reversed(trajectory)):
        if step > 0:
            backward.append((state, G))
        G = reward + GAMMA * G
    return backward[::-1]

In [5]:
# Build the grid with standard_grid() and print its reward table.
grid=standard_grid()
print("rewards:")
print_values(grid.rewards,grid)

rewards:
Values:
---------------------------
 0.00| 0.00| 0.00| 1.00|
---------------------------
 0.00| 0.00| 0.00|-1.00|
---------------------------
 0.00| 0.00| 0.00| 0.00|


In [6]:
# Fixed policy to evaluate: maps each state (i, j) to the single action taken there.
# Keys use the same (i, j) tuples that print_values / print_policy look up.
policy = {
(2, 0): 'U',
(1, 0): 'U',
(0, 0): 'R',
(0, 1): 'R',
(0, 2): 'R',
(1, 2): 'R',
(2, 1): 'R',
(2, 2): 'R',
(2, 3): 'U',
}

In [28]:
# Main starts here

In [15]:
# Number of episodes sampled to estimate the value function.
N_EPISODES = 1000

# Initialise V(s) = 0 for every state. Only states that have an entry in
# grid.actions get a list for collecting sampled returns.
V = {}
returns = {}
states = grid.all_states()
for s in states:
    V[s] = 0
    if s in grid.actions:
        returns[s] = []

# First-visit Monte Carlo policy evaluation: play an episode under the fixed
# policy, and for the first occurrence of each state in that episode record
# its return and refresh V(s) as the mean of all returns seen so far.
for iter_n in range(1, N_EPISODES + 1):
    states_and_returns = play_game(grid, policy)
    seen_states = set()  # reset for every episode
    for s, G in states_and_returns:
        if s not in seen_states:
            returns[s].append(G)
            V[s] = np.mean(returns[s])
            seen_states.add(s)
    # '\r' rewrites the same output line so the counter updates in place.
    print(iter_n, end='\r')
    time.sleep(0.001)  # presumably a short pause so the counter can render

# Check the result.
print()
print_values(V, grid)
print_policy(policy, grid)

1000
Values:
---------------------------
 0.81| 0.90| 1.00| 0.00|
---------------------------
 0.73| 0.00|-1.00| 0.00|
---------------------------
 0.66|-0.81|-0.90|-1.00|
Policy:
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |
