In [16]:
class MDP:
    def __init__(self, states, actions, dyn):

        # states: any list
        self.states = states

        # actions: any list
        self.actions = actions

        # self.dyn(s :: state, a :: action) :: [(real, state, real)]
        # The return value is a list of (p1, s1, r1), where p1 is the
        # probability, s1 is the state, and r1 is the reward.
        # When a is not allowed in s, then the None should be returned.
        # Thus, for a terminating state st, self.dyn(st, a) should be None
        # for any action a.
        self.dyn = dyn
    
    # 
    # pol: policy.  Dictionary.  Key Set is the list of states.  The value for a key s
    #   is a list of (probablity, action).  For a terminating state, the value must be
    #   the empty list.
    def policy_eval(self, pol, gamma, v = None, thr = 0.01):
        if v is None:
            v = {s:0.0 for s in self.states}
        while True:
            newV = [sum([p * p1 * r + gamma * v[s1]
                         for (p, a) in pol[s]
                         for (p1, s1, r) in self.dyn(s, a)])
                    for s in self.states]
            delta = sum([abs(newV[s] - v[s]) for s in self.states])
            if delta < thr:  break
        return newV

In [17]:
e41States = list(range(15))
e41Actions = "URDL"
e41Dir = { 'U': (-1,0), 'R': (0,1), 'D': (1,0), 'L': (0,-1) }
def e41Nxt(s, a):
    dy, dx = e41Dir[a]
    y, x = s // 4, s % 4
    nxt = 4 * ((y + dy) // 4) + ((x + dx) % 4)
    return 0 if nxt == 15 else nxt

def e41Dyn(s, a):
    if s == 0: return None
    return [(1, e41Nxt(s, a), -1)]

mdp_e41 = MDP(e41States, e41Actions, e41Dyn)

In [20]:
pol = { i : [ (0.25, e41Nxt(i, a)) for a in e41Actions ]
        for i in range(1, 15) }
pol[0] = []

In [21]:
mdp_e41.policy_eval(pol, 1.0)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


KeyError: -3

In [3]:
(-1) % 4

3