In [1]:
import numpy as np 

In [2]:
# Initialize the maze with all rewards on each cell
SMALL_ENOUGH = 0.005
GAMMA = 0.7         
NOISE = 0.10  

DIRECTION = {
    3: '\u2191', #U
    2: '\u2192', #R
    1: '\u2193', #D
    0: '\u2190' #L
}
# print(' '.join([ACTION[i] for i in range(4)]))


In [3]:
def maze_states(cols,rows):
    """Initialize all states for given grid size.
    """
    all_states = []
    for i in range(cols):
        for j in range(rows):
                all_states.append((i,j))
    return all_states

In [4]:
states = maze_states(4,4)
states

[(0, 0),
 (0, 1),
 (0, 2),
 (0, 3),
 (1, 0),
 (1, 1),
 (1, 2),
 (1, 3),
 (2, 0),
 (2, 1),
 (2, 2),
 (2, 3),
 (3, 0),
 (3, 1),
 (3, 2),
 (3, 3)]

In [5]:
# Each state which I printed above has their own reward, these are defined here
rewards = {}
for i in states:
    if i == (0,0):
        rewards[i] = +10
    elif i == (0,1):
        rewards[i] = -2
    elif i == (2,2):
        rewards[i] = -10
    elif i == (2,3):
        rewards[i] = -10
    elif i == (3,3):
        rewards[i] = +40
    else:
        rewards[i] = -1

In [6]:
rewards

{(0, 0): 10,
 (0, 1): -2,
 (0, 2): -1,
 (0, 3): -1,
 (1, 0): -1,
 (1, 1): -1,
 (1, 2): -1,
 (1, 3): -1,
 (2, 0): -1,
 (2, 1): -1,
 (2, 2): -10,
 (2, 3): -10,
 (3, 0): -1,
 (3, 1): -1,
 (3, 2): -1,
 (3, 3): 40}

In [7]:
# All possible directions at a given state. For example, you're only able to go Up and Left at state 0,0
ACTIONS = {
 (0, 0): ('U','L'),
 (0, 1):('U','R','L'),
 (0, 2):('U','L','R'),
 (0, 3):('L','U'),
 (1, 0):('U','D','R'),
 (1, 1):('U','D','L','R'),
 (1, 2):('U','D','L','R'),
 (1, 3):('U','D','L'),
 (2, 0):('U','D','R'),
 (2, 1):('U','D','L','R'),
 (2, 2):('U','D','L','R'),
 (2, 3):('U','D','L'),
 (3, 0):('D','R'),
 (3, 1):('D','L','R'),
 (3, 2):('D','L','R'),
 (3, 3):('D','L')
}

In [8]:
# The initial policy is defined here based on all the actions above
policy = {}
for s in ACTIONS.keys():
    policy[s] = np.random.choice(ACTIONS[s])

In [9]:
policy # This is an example policy/route the agent can take 

{(0, 0): 'U',
 (0, 1): 'R',
 (0, 2): 'R',
 (0, 3): 'L',
 (1, 0): 'R',
 (1, 1): 'R',
 (1, 2): 'L',
 (1, 3): 'L',
 (2, 0): 'R',
 (2, 1): 'D',
 (2, 2): 'U',
 (2, 3): 'U',
 (3, 0): 'D',
 (3, 1): 'R',
 (3, 2): 'D',
 (3, 3): 'L'}

In [10]:
# define initial value function
V = {}
for s in states:
    if s in ACTIONS.keys():
        V[s] = -1
    if s == (0,0):
        V[s] = +10
    if s == (0,1):
        V[s] = -2
    if s == (2,2):
        V[s] = -10
    if s == (2,3):
        V[s] = -10
    if s == (3,3):
        V[s] = +40


In [11]:
V

{(0, 0): 10,
 (0, 1): -2,
 (0, 2): -1,
 (0, 3): -1,
 (1, 0): -1,
 (1, 1): -1,
 (1, 2): -1,
 (1, 3): -1,
 (2, 0): -1,
 (2, 1): -1,
 (2, 2): -10,
 (2, 3): -10,
 (3, 0): -1,
 (3, 1): -1,
 (3, 2): -1,
 (3, 3): 40}

In [14]:
iteration = 0

while True:
    biggest_change = 0
    for s in states:            
        if s in policy:
            
            old_v = V[s]
            new_v = 0
            
            for a in ACTIONS[s]:
                if a == 'U':
                    nxt = [s[0]-1, s[1]]
                if a == 'D':
                    nxt = [s[0]+1, s[1]]
                if a == 'L':
                    nxt = [s[0], s[1]-1]
                if a == 'R':
                    nxt = [s[0], s[1]+1]

                #Choose a new random action to do (transition probability)
                random_1=np.random.choice([i for i in ACTIONS[s] if i != a])
                if random_1 == 'U':
                    act = [s[0]-1, s[1]]
                if random_1 == 'D':
                    act = [s[0]+1, s[1]]
                if random_1 == 'L':
                    act = [s[0], s[1]-1]
                if random_1 == 'R':
                    act = [s[0], s[1]+1]

                #Calculate the value
                nxt = tuple(nxt)
                act = tuple(act)
                print(nxt)
                print(act)
                v = rewards[s] + (GAMMA * ((1-NOISE)* V[nxt] + (NOISE * V[act]))) 
                if v > new_v: #Is this the best action so far? If so, keep it
                    new_v = v
                    policy[s] = a

       #Save the best of all actions for the state                                
            V[s] = new_v
            biggest_change = max(biggest_change, np.abs(old_v - V[s]))


(-1, 0)
(0, -1)


KeyError: (-1, 0)