# Policy Evaluation - Changed Topology
> Find V(s) for every state s under a fixed policy

## Step 1: Imports

In [1]:
import random
from grid_world import Grid

In [2]:
# These helpers are my own implementations and differ from the ones in src
def generate_random_action(g):
    """Build a random policy by picking one available action per state.

    Args:
        g: Object exposing ``actions``, a mapping from each state to the
            sequence of actions available in that state.

    Returns:
        dict: Maps every state in ``g.actions`` to a single action drawn
        with ``random.choice`` from that state's sequence.
    """
    return {
        state: random.choice(options)
        for state, options in g.actions.items()
    }
def print_policy(P, g):
    """Print the policy as a bordered text grid, one cell per position.

    Args:
        P: Mapping from ``(row, col)`` to the action to display; positions
            missing from the mapping are rendered as a blank cell.
        g: Object exposing integer ``width`` and ``height`` attributes.
    """
    # Each cell is 15 centred characters plus a closing "|"; the extra 1
    # accounts for the "|" that opens every row.
    rule_len = g.width * 16 + 1
    border = "=" * rule_len
    divider = "-" * rule_len
    last_row = g.height - 1

    print(border)
    for row in range(g.height):
        cells = [
            "{:^15}|".format(P.get((row, col), ' '))
            for col in range(g.width)
        ]
        print("|" + "".join(cells))
        # Rows are separated by a dashed rule; the last row is closed by the border.
        if row < last_row:
            print(divider)
    print(border)
def print_results(V, P, g):
    """Print each cell's value and policy marker as a bordered text grid.

    Every cell shows ``"<value> (<marker>)"`` centred in 15 characters.

    Args:
        V: Mapping from ``(row, col)`` to a value; missing positions show 0.
        P: Mapping from ``(row, col)`` to the action shown as the marker.
        g: Object exposing ``width``, ``height``, ``rewards`` and ``actions``.
            Positions in ``g.rewards`` are marked "T"; positions in neither
            ``g.rewards`` nor ``g.actions`` are marked "X".
    """
    rule_len = g.width * 16 + 1
    border = "=" * rule_len
    divider = "-" * rule_len

    def marker(cell):
        # Reward cells take precedence over everything else.
        if cell in g.rewards:
            return "T"
        if cell not in g.actions:
            return "X"
        return P.get(cell, " ")

    print(border)
    for row in range(g.height):
        line = "|"
        for col in range(g.width):
            cell = (row, col)
            label = "{} ({})".format(V.get(cell, 0), marker(cell))
            line += "{:^15}|".format(label)
        print(line)
        if row < g.height - 1:
            print(divider)
    print(border)

## Step 2: Initialization
* Grid
* V
* gamma (discount factor)
* threshold (used to evaluate the maximum change in V, delta V max)
* policy (manually assigned)

In [3]:
# Grid
# Reward values keyed by (row, col); print_results marks these cells "T".
rewards = {(1, 4): 1, (3, 4): -1}

# The action table spans rows 0-7 and columns 0-5.
_n_rows, _n_cols = 8, 6

# Cells left out of the action table. (1, 4) and (3, 4) are the reward cells;
# the others have neither actions nor rewards (print_results marks them "X").
_no_action_cells = {
    (0, 5),
    (1, 3), (1, 4), (1, 5),
    (2, 3), (2, 4), (2, 5),
    (3, 4), (3, 5),
    (4, 3), (4, 4), (4, 5),
}

# Every remaining cell may move in each direction that stays inside the
# rectangle, always listed in the order U, D, R, L. Neighbouring cells that
# are in _no_action_cells are NOT filtered out of a cell's moves.
actions = {
    (r, c): tuple(
        move
        for move, allowed in (
            ('U', r > 0),
            ('D', r < _n_rows - 1),
            ('R', c < _n_cols - 1),
            ('L', c > 0),
        )
        if allowed
    )
    for r in range(_n_rows)
    for c in range(_n_cols)
    if (r, c) not in _no_action_cells
}
grid = Grid(_n_cols, _n_rows, (7, 0))
grid.set(rewards, actions)

# V, gamma, threshold, policy
V = dict.fromkeys(grid.all_states(), 0)  # every state's value starts at 0
gamma = 1.0  # discount factor
SMALL_ENOUGH = 1e-3  # sweeps stop once the largest change in V is below this
policy = {  # Manually assigned fixed policy: one action per state, one grid row per line
    (0, 0): 'R', (0, 1): 'R', (0, 2): 'R', (0, 3): 'R', (0, 4): 'D',
    (1, 0): 'U', (1, 1): 'D', (1, 2): 'D',
    (2, 0): 'U', (2, 1): 'D', (2, 2): 'D',
    (3, 0): 'U', (3, 1): 'R', (3, 2): 'R', (3, 3): 'R',
    (4, 0): 'U', (4, 1): 'U', (4, 2): 'U',
    (5, 0): 'U', (5, 1): 'U', (5, 2): 'U', (5, 3): 'L', (5, 4): 'L', (5, 5): 'L',
    (6, 0): 'U', (6, 1): 'U', (6, 2): 'U', (6, 3): 'L', (6, 4): 'L', (6, 5): 'L',
    (7, 0): 'U', (7, 1): 'U', (7, 2): 'U', (7, 3): 'L', (7, 4): 'L', (7, 5): 'L',
}

# Print Policy
print_policy(policy, grid)

|       R       |       R       |       R       |       R       |       D       |               |
-------------------------------------------------------------------------------------------------
|       U       |       D       |       D       |               |               |               |
-------------------------------------------------------------------------------------------------
|       U       |       D       |       D       |               |               |               |
-------------------------------------------------------------------------------------------------
|       U       |       R       |       R       |       R       |               |               |
-------------------------------------------------------------------------------------------------
|       U       |       U       |       U       |               |               |               |
-------------------------------------------------------------------------------------------------
|       U       |   

## Step 3: Main Function

In [4]:
# Iterative policy evaluation: sweep over all states, updating V in place,
# until no state's value changes by SMALL_ENOUGH or more in a full sweep.
while True:
    biggest_change = 0  # delta V max for this sweep
    for s in grid.all_states():
        # States without a policy entry keep their current value.
        if s not in policy:
            continue
        old_v = V[s]
        # Put the grid in state s and take the policy's action from there.
        grid.set_state(s)
        reward = grid.move(policy[s])
        # Backup: returned reward plus the discounted value of the state
        # the grid reports after the move.
        V[s] = reward + gamma * V[grid.current_state()]
        biggest_change = max(biggest_change, abs(old_v - V[s]))
    if biggest_change < SMALL_ENOUGH:
        break

## Step 4: Show Result

In [5]:
# Show each cell's evaluated value together with its policy action
# ("T" for reward cells, "X" for cells with neither actions nor rewards).
print_results(V, policy, grid)

|    1.0 (R)    |    1.0 (R)    |    1.0 (R)    |    1.0 (R)    |    1.0 (D)    |     0 (X)     |
-------------------------------------------------------------------------------------------------
|    1.0 (U)    |   -1.0 (D)    |   -1.0 (D)    |     0 (X)     |     0 (T)     |     0 (X)     |
-------------------------------------------------------------------------------------------------
|    1.0 (U)    |   -1.0 (D)    |   -1.0 (D)    |     0 (X)     |     0 (X)     |     0 (X)     |
-------------------------------------------------------------------------------------------------
|    1.0 (U)    |   -1.0 (R)    |   -1.0 (R)    |   -1.0 (R)    |     0 (T)     |     0 (X)     |
-------------------------------------------------------------------------------------------------
|    1.0 (U)    |   -1.0 (U)    |   -1.0 (U)    |     0 (X)     |     0 (X)     |     0 (X)     |
-------------------------------------------------------------------------------------------------
|    1.0 (U)    |   