In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

In [None]:
# Grid-world definition. All grids are indexed as (row, col).

# True where the agent may stand; the single 0 entry is an impassable cell.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
movable = np.array(
    [[1, 1, 1, 1],
     [1, 0, 1, 1],
     [1, 1, 1, 1]]
).astype(bool)

# Per-cell reward added on every value update.
# The builtin `float` is used because the `np.float` alias was removed in NumPy 1.24.
rewards = np.array(
    [[0, 0, 0, 1],
     [0, 0, 0, -100],
     [0, 0, 0, 0]]
).astype(float)

# Each action maps a (row_delta, col_delta) move to the probability of that
# move actually happening. The order matches `action_label` below:
# stay, left, right, up, down. Directional actions go the intended way with
# probability 0.8 and slip to either perpendicular direction with 0.1 each.
actions = np.array([
    {(0, 0): 1},
    {(0, -1): 0.8, (-1, 0): 0.1, (1, 0): 0.1},
    {(0, 1): 0.8, (-1, 0): 0.1, (1, 0): 0.1},
    {(-1, 0): 0.8, (0, -1): 0.1, (0, 1): 0.1},
    {(1, 0): 0.8, (0, -1): 0.1, (0, 1): 0.1}
])

# Glyph drawn for each action index. The trailing blank is reached through
# index -1, which marks cells that have no action.
action_label = np.array(['o', '<', '>', '^', 'v', ' '])

# Multiplier applied to the best expected next-state value in each update.
gamma = 0.8

In [None]:
def new_pos(pos, move):
    """Return the position reached by applying `move` to `pos`.

    Both arguments are coordinate sequences of equal length; they are added
    element-wise and the result is returned as a tuple so it can be used
    directly as a NumPy index.
    """
    shifted = np.asarray(pos) + np.asarray(move)
    return tuple(shifted)

def valid_move(pos, move, movable):
    """Tell whether applying `move` at `pos` lands on a usable cell.

    Args:
        pos: current (row, col) position.
        move: (row_delta, col_delta) offset to apply.
        movable: boolean grid; True marks cells the agent may occupy.

    Returns:
        True when the target lies inside `movable` and is marked movable,
        False otherwise.
    """
    target = new_pos(pos, move)
    # Bounds come from the grid passed in, not from a module-level array, so
    # the function works before any value grid exists. The explicit lower
    # bound matters: a negative coordinate would otherwise wrap around when
    # used as a NumPy index.
    in_bounds = all(
        0 <= coord < size for coord, size in zip(target, movable.shape)
    )
    return bool(in_bounds and movable[target])

def valid_pos(pos, movable):
    """Look up `pos` in the `movable` grid and return the stored flag.

    No bounds checking is done here; `pos` must already be a valid index
    into `movable`.
    """
    is_open = movable[pos]
    return is_open

def action_value(pos, action, movable):
    """Return the probability-weighted value of the cells `action` can reach.

    `action` maps each possible move to its probability. Moves that would
    leave the grid or enter a non-movable cell are skipped entirely, so
    their probability mass contributes nothing to the sum.

    NOTE: the state values are read from the module-level `value` array,
    not from an argument.
    """
    weighted = []
    for move, probability in action.items():
        if not valid_move(pos, move, movable):
            continue
        weighted.append(value[new_pos(pos, move)] * probability)
    return np.sum(weighted)

def calc_new_value(pos, value, movable):
    """Compute the updated value of `pos` for one value-iteration sweep.

    The result is the cell's reward plus `gamma` times the best expected
    value over all actions.

    Args:
        pos: (row, col) cell to update.
        value: current value grid. NOTE: `action_value` reads the
            module-level `value` rather than this argument, so the two must
            refer to the same array.
        movable: boolean grid; True marks cells the agent may occupy.

    Returns:
        The new value for `pos`. A non-movable cell has no candidate
        actions, so the maximum falls back to negative infinity.
    """
    candidates = [
        action_value(pos, action, movable)
        for action in actions
        if valid_pos(pos, movable)
    ]
    # `-np.inf` replaces the `np.NINF` alias, which was removed in NumPy 2.0.
    return rewards[pos] + gamma * np.max(candidates, initial=-np.inf)

In [None]:
# Value iteration: start from an all-zero value grid and run 10 sweeps.
value = np.zeros_like(rewards)
for _ in range(10):
    # Write into a separate grid so every cell in this sweep is computed from
    # the previous sweep's values.
    new_value = np.zeros_like(value)
    for pos in np.ndindex(value.shape):
        new_value[pos] = calc_new_value(pos, value, movable)
    # Rebind only after the full sweep: action_value() reads the module-level
    # `value`, so rebinding earlier would mix old and new values.
    value = new_value
    # Show the grid after each sweep, as text and as a heat map.
    print(value)
    plt.imshow(value, norm=Normalize(-0.1, 4))
    plt.show()

In [None]:
def calc_best_action(pos, value, movable):
    """Return the index into `actions` with the highest expected value at `pos`.

    Returns -1 when `pos` is not a movable cell. Ties resolve to the lowest
    index, as with `np.argmax`.

    NOTE: the `value` argument is not used directly here; `action_value`
    reads the module-level `value` array.
    """
    if not valid_pos(pos, movable):
        return -1
    scores = [action_value(pos, action, movable) for action in actions]
    return np.argmax(scores)

In [None]:
# Draw the final value grid and overlay the greedy action for every cell.
plt.imshow(value, norm=Normalize(-0.5, 4))
# Integer grid of action indices. The builtin `int` is used because the
# `np.int` alias was removed in NumPy 1.24.
best_action = np.zeros_like(value, dtype=int)
for pos in np.ndindex(value.shape):
    best_action[pos] = calc_best_action(pos, value, movable)
    # imshow puts columns on the x axis and rows on the y axis, hence the
    # swapped order. Index -1 (non-movable cell) selects the trailing blank
    # entry of action_label.
    plt.text(pos[1], pos[0], action_label[best_action[pos]])