# Policy Evaluation
> Find V(s) for every state s under a fixed policy

## Step 1: Imports

In [1]:
import random
from grid_world import Grid

In [2]:
# My own implementation, which differs from the one in src
def generate_random_action(g):
    """Build a random policy for the grid ``g``.

    For every state listed in ``g.actions``, one of that state's available
    actions is drawn with ``random.choice``.

    Returns:
        A dict mapping each state to the action chosen for it.
    """
    return {
        cell: random.choice(options)
        for cell, options in g.actions.items()
    }
def print_policy(P, g):
    """Print the policy ``P`` as a bordered table laid out like the grid.

    Args:
        P: dict mapping ``(row, col)`` to an action; cells missing from
            ``P`` are rendered blank.
        g: grid object providing ``width`` and ``height``.
    """
    # Each cell is 15 characters plus one "|", and the row opens with a "|".
    line_width = g.width * 16 + 1
    border = "=" * line_width
    divider = "-" * line_width

    print(border)
    for row in range(g.height):
        if row > 0:
            # Dividers go between rows only, never after the last one.
            print(divider)
        cells = [
            format(P.get((row, col), ' '), "^15") + "|"
            for col in range(g.width)
        ]
        print("|" + "".join(cells))
    print(border)
def print_results(V, P, g):
    """Print each cell's value and policy marker as a bordered table.

    Every cell is rendered as ``"<value> (<marker>)"``. The marker is ``T``
    for cells present in ``g.rewards``, ``X`` for cells absent from
    ``g.actions``, and otherwise the action stored in ``P`` (blank if none).

    Args:
        V: dict mapping ``(row, col)`` to a value; missing cells show ``0``.
        P: dict mapping ``(row, col)`` to an action.
        g: grid object providing ``width``, ``height``, ``rewards`` and
            ``actions``.
    """
    # Each cell is 15 characters plus one "|", and the row opens with a "|".
    line_width = g.width * 16 + 1
    border = "=" * line_width
    divider = "-" * line_width

    print(border)
    for row in range(g.height):
        if row > 0:
            # Dividers go between rows only, never after the last one.
            print(divider)
        pieces = ["|"]
        for col in range(g.width):
            cell = (row, col)
            # The reward check comes first, so it wins over the other markers.
            if cell in g.rewards:
                marker = "T"
            elif cell not in g.actions:
                marker = "X"
            else:
                marker = P.get(cell, " ")
            label = "{} ({})".format(V.get(cell, 0), marker)
            pieces.append("{:^15}|".format(label))
        print("".join(pieces))
    print(border)

## Step 2: Initialization
* Grid
* V
* gamma (discount factor)
* threshold (used to evaluate delta V max)
* policy (randomly generated)

In [3]:
# Grid
# Cells that carry a reward; neither of them has an entry in `actions`.
rewards = {(0, 3): 1, (1, 3): -1}

# Actions available in each cell, row by row. Cells (0, 3), (1, 1) and (1, 3)
# are intentionally left out.
actions = {
    # row 0
    (0, 0): ('D', 'R'),
    (0, 1): ('R', 'L'),
    (0, 2): ('R', 'L', 'D'),
    # row 1
    (1, 0): ('U', 'D'),
    (1, 2): ('U', 'D', 'R'),
    # row 2
    (2, 0): ('U', 'R'),
    (2, 1): ('R', 'L'),
    (2, 2): ('U', 'R', 'L'),
    (2, 3): ('U', 'L'),
}

# NOTE(review): presumably Grid(width, height, start_state) -- confirm against
# grid_world.Grid.
grid = Grid(4, 3, (2, 0))
grid.set(rewards, actions)

# V, gamma, threshold, policy
V = dict.fromkeys(grid.all_states(), 0)  # every state starts with value 0
gamma = 1.0  # discount factor
SMALL_ENOUGH = 1e-3  # stop once the largest per-sweep change drops below this
policy = generate_random_action(grid)

# Print Policy
print_policy(policy, grid)

|       D       |       R       |       R       |               |
-----------------------------------------------------------------
|       D       |               |       R       |               |
-----------------------------------------------------------------
|       R       |       R       |       U       |       U       |


## Step 3: Main Function

In [4]:
# Iterative policy evaluation: sweep over all states, updating V in place,
# until the largest change seen in one sweep drops below SMALL_ENOUGH.
while True:
    biggest_change = 0  # delta V max for the current sweep
    for state in grid.all_states():
        # States without a policy entry keep their current value.
        if state not in policy:
            continue
        previous = V[state]
        # Place the agent on this state and take the policy's action once.
        grid.set_state(state)
        step_reward = grid.move(policy[state])
        # The update is in place, so later states in this sweep see the new value.
        updated = step_reward + gamma * V[grid.current_state()]
        V[state] = updated
        biggest_change = max(biggest_change, abs(previous - updated))
    if biggest_change < SMALL_ENOUGH:
        break

## Step 4: Show Result

In [5]:
# Print every cell's evaluated value together with its policy action
# (or its T / X marker).
print_results(V, policy, grid)

|   -1.0 (D)    |    1.0 (R)    |    1.0 (R)    |     0 (T)     |
-----------------------------------------------------------------
|   -1.0 (D)    |     0 (X)     |   -1.0 (R)    |     0 (T)     |
-----------------------------------------------------------------
|   -1.0 (R)    |   -1.0 (R)    |   -1.0 (U)    |   -1.0 (U)    |
