In [1]:
import sys
import numpy as np

# Integer action codes; pacman_move compares its `action` argument against these
UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3

In [3]:
def calculate_reward(pacman_location, ghost_location, pellet_location):
    '''
    Return the reward for pacman's current location.

    All three arguments must use the same representation, i.e. [x,y] pairs or
    int grid values 0-8, because they are compared with ==.

    Returns 1000 when pacman is on the pellet, -1000 when pacman is on the
    ghost's cell, and 0 otherwise. The pellet is checked first, so it wins
    when pacman, ghost and pellet all share a cell.

    TODO: adjust to include a reward definition for bumping into walls.
    '''
    # Guard clauses, evaluated in the same order as the reward priority
    if pacman_location == pellet_location:
        return 1000
    if pacman_location == ghost_location:
        return -1000
    return 0

# TEST: every line printed should be True
pl = [2,2]

# [x,y] inputs, pellet at pl
xy_cases = [
    ([0,0], [0,0], -1000),
    ([2,2], [1,0], 1000),
    ([0,0], [0,1], 0),
]
for pacman_xy, ghost_xy, expected_reward in xy_cases:
    print(calculate_reward(pacman_xy, ghost_xy, pl) == expected_reward)

# grid-number inputs, pellet in cell 8
grid_cases = [
    (0, 0, -1000),
    (8, 0, 1000),
    (0, 1, 0),
]
for pacman_cell, ghost_cell, expected_reward in grid_cases:
    print(calculate_reward(pacman_cell, ghost_cell, 8) == expected_reward)

True
True
True
True
True
True


In [5]:
'''
enumerate states and associated rewards for a 3x3 grid ==> 81 states
(9 choices for pacman location x 9 choices for ghost location)

^    __|__|__
|    __|__|__
y      |  |
    x -->

x,y = (0,0) corresponds to the bottom-left corner
x,y = (2,2) corresponds to the top-right corner

each states_xy[s] = a tuple of 2 entries:
    the first is a nested array [[x_p, y_p], [x_g, y_g]]:
        the x,y coordinates of pacman's position, then
        the x,y coordinates of the ghost's position
    the second is the reward calculate_reward gives for those positions

Assume pellet is initialized to the top-right corner (2,2)

'''

states_xy = []

# The four nested loops visit every (pacman, ghost) combination exactly once
# (3 * 3 * 3 * 3 = 81), so they must not be wrapped in another loop over the
# state index -- doing so appends 81 copies of every state.
for x_p in range(3):
    for y_p in range(3):
        for x_g in range(3):
            for y_g in range(3):
                reward = calculate_reward([x_p, y_p], [x_g, y_g], [2,2])
                states_xy.append( ([[x_p,y_p],[x_g,y_g]], reward) )

for state in states_xy:
    print(state)

([[0, 0], [0, 0]], -1000)
([[0, 0], [0, 1]], 0)
([[0, 0], [0, 2]], 0)
([[0, 0], [1, 0]], 0)
([[0, 0], [1, 1]], 0)
([[0, 0], [1, 2]], 0)
([[0, 0], [2, 0]], 0)
([[0, 0], [2, 1]], 0)
([[0, 0], [2, 2]], 0)
([[0, 1], [0, 0]], 0)
([[0, 1], [0, 1]], -1000)
([[0, 1], [0, 2]], 0)
([[0, 1], [1, 0]], 0)
([[0, 1], [1, 1]], 0)
([[0, 1], [1, 2]], 0)
([[0, 1], [2, 0]], 0)
([[0, 1], [2, 1]], 0)
([[0, 1], [2, 2]], 0)
([[0, 2], [0, 0]], 0)
([[0, 2], [0, 1]], 0)
([[0, 2], [0, 2]], -1000)
([[0, 2], [1, 0]], 0)
([[0, 2], [1, 1]], 0)
([[0, 2], [1, 2]], 0)
([[0, 2], [2, 0]], 0)
([[0, 2], [2, 1]], 0)
([[0, 2], [2, 2]], 0)
([[1, 0], [0, 0]], 0)
([[1, 0], [0, 1]], 0)
([[1, 0], [0, 2]], 0)
([[1, 0], [1, 0]], -1000)
([[1, 0], [1, 1]], 0)
([[1, 0], [1, 2]], 0)
([[1, 0], [2, 0]], 0)
([[1, 0], [2, 1]], 0)
([[1, 0], [2, 2]], 0)
([[1, 1], [0, 0]], 0)
([[1, 1], [0, 1]], 0)
([[1, 1], [0, 2]], 0)
([[1, 1], [1, 0]], 0)
([[1, 1], [1, 1]], -1000)
([[1, 1], [1, 2]], 0)
([[1, 1], [2, 0]], 0)
([[1, 1], [2, 1]], 0)
([[1, 1], [2

In [7]:
# TEST: how to index into an entry of states_xy

print(states_xy[0]) # returns tuple: (nested array of 2 entries (pacman_loc, ghost_loc), reward value)
print(states_xy[0][0]) # returns nested array of 2 entries (pacman_loc, ghost_loc)
print(states_xy[0][1]) # returns reward value
print(states_xy[0][0][0]) # returns array of pacman_loc (x,y)
print(states_xy[0][0][1]) # returns array of ghost_loc (x,y)
print(states_xy[0][0][0][0]) # returns pacman_loc x-coordinate

([[0, 0], [0, 0]], -1000)
[[0, 0], [0, 0]]
-1000
[0, 0]
[0, 0]
0


In [8]:
def grid_to_xy(number):
    '''
    Returns the [x,y] coordinate pair for a valid grid location number (0-8).

    Numbering is x-major, i.e. number = 3*x + y:
    0 -> [0,0], 1 -> [0,1], 2 -> [0,2], 3 -> [1,0], ..., 8 -> [2,2]

    If number is not a valid grid location, returns the string
    'invalid entry' (it is returned, not raised).
    '''
    # Build the table fresh on every call so callers always get their own list
    lookup = {3*x + y: [x, y] for x in range(3) for y in range(3)}
    if number in lookup:
        return lookup[number]
    return "invalid entry"

# TEST: every line printed should be True
grid_cases = [
    (1, [0,1]),
    (8, [2,2]),
    (9, "invalid entry"),
]
for grid_number, expected_xy in grid_cases:
    print(grid_to_xy(grid_number) == expected_xy)

True
True
True


In [9]:
def xy_to_grid(x,y):
    '''
    Returns the grid location number (0-8) for a given (x,y) coordinate pair.

    The number is 3*x + y, e.g. (0,0) -> 0, (1,1) -> 4, (2,2) -> 8.

    If either coordinate is outside 0-2, returns the string 'invalid entry'
    (it is returned, not raised). x is checked first; y is only looked up
    once x is known to be valid.
    '''
    columns = {col: {row: 3*col + row for row in range(3)} for col in range(3)}

    if x not in columns:
        return "invalid entry"
    return columns[x].get(y, "invalid entry")

# TEST: every line printed should be True
xy_cases = [
    ((0,0), 0),
    ((0,1), 1),
    ((1,1), 4),
    ((2,2), 8),
    ((3,1), "invalid entry"),
    ((1,3), "invalid entry"),
]
for xy_pair, expected_number in xy_cases:
    print(xy_to_grid(*xy_pair) == expected_number)

True
True
True
True
True
True


In [10]:
def pacman_move(current_row, current_col, action):
    '''
    Return pacman's (row, col) after taking `action` from (current_row, current_col).

    UP / DOWN change the row by +1 / -1 and RIGHT / LEFT change the column by
    +1 / -1. The changed coordinate is capped at 2 when increasing and at 0
    when decreasing, so a move into a wall leaves pacman where he is.
    The other coordinate is passed through untouched.

    Any action other than UP, RIGHT, DOWN or LEFT returns the position unchanged.
    '''
    if action == UP:
        return (min(2, current_row + 1), current_col)
    if action == RIGHT:
        return (current_row, min(2, current_col + 1))
    if action == DOWN:
        return (max(0, current_row - 1), current_col)
    if action == LEFT:
        return (current_row, max(0, current_col - 1))
    # unrecognised action: stay put
    return (current_row, current_col)

# TEST: every line printed should be True
move_cases = [
    # from the (0,0) corner
    ((0,0,UP), (1,0)),
    ((0,0,LEFT), (0,0)),
    ((0,0,DOWN), (0,0)),
    ((0,0,RIGHT), (0,1)),
    # along the row-2 edge
    ((2,1,LEFT), (2,0)),
    ((2,1,UP), (2,1)),
    ((2,0,LEFT), (2,0)),
    # from the (2,2) corner
    ((2,2,RIGHT), (2,2)),
    ((2,2,UP), (2,2)),
    ((2,2,LEFT), (2,1)),
    ((2,2,DOWN), (1,2)),
    # from the (0,2) corner
    ((0,2,RIGHT), (0,2)),
    ((0,2,DOWN), (0,2)),
]
for move_args, expected_position in move_cases:
    print(pacman_move(*move_args) == expected_position)

True
True
True
True
True
True
True
True
True
True
True
True
True


In [14]:
# states_num[i] = (p, g): pacman's and the ghost's grid location numbers (0-8)
# for state index i, ordered so that i = 9*p + g.
# The two loops over p and g already produce all 9 * 9 = 81 pairs exactly once;
# wrapping them in a further loop over the state index would append 81 copies.
states_num = [(p, g) for p in range(9) for g in range(9)]

def return_state(p, g):
    '''
    Return the index of the state (p, g) within states_num.

    p and g are matched together as the tuple (p, g); the position of its
    first occurrence in the list is returned. list.index raises ValueError
    when the pair is not present.
    '''
    state = (p, g)
    return states_num.index(state)

# TEST: print the state index for a few (pacman, ghost) grid-number pairs
for pacman_cell, ghost_cell in [(0,0), (0,1), (2,2)]:
    print(return_state(pacman_cell, ghost_cell))

0
1
20


In [13]:
# P[s][a] starts as an empty list for each of the 81 states s and 4 actions a.
# Every (s, a) slot gets its own list object so slots can be filled independently.
P = {}
for s in range(81):
    P[s] = {}
    for a in range(4):
        P[s][a] = []

print(P)

{0: {0: [], 1: [], 2: [], 3: []}, 1: {0: [], 1: [], 2: [], 3: []}, 2: {0: [], 1: [], 2: [], 3: []}, 3: {0: [], 1: [], 2: [], 3: []}, 4: {0: [], 1: [], 2: [], 3: []}, 5: {0: [], 1: [], 2: [], 3: []}, 6: {0: [], 1: [], 2: [], 3: []}, 7: {0: [], 1: [], 2: [], 3: []}, 8: {0: [], 1: [], 2: [], 3: []}, 9: {0: [], 1: [], 2: [], 3: []}, 10: {0: [], 1: [], 2: [], 3: []}, 11: {0: [], 1: [], 2: [], 3: []}, 12: {0: [], 1: [], 2: [], 3: []}, 13: {0: [], 1: [], 2: [], 3: []}, 14: {0: [], 1: [], 2: [], 3: []}, 15: {0: [], 1: [], 2: [], 3: []}, 16: {0: [], 1: [], 2: [], 3: []}, 17: {0: [], 1: [], 2: [], 3: []}, 18: {0: [], 1: [], 2: [], 3: []}, 19: {0: [], 1: [], 2: [], 3: []}, 20: {0: [], 1: [], 2: [], 3: []}, 21: {0: [], 1: [], 2: [], 3: []}, 22: {0: [], 1: [], 2: [], 3: []}, 23: {0: [], 1: [], 2: [], 3: []}, 24: {0: [], 1: [], 2: [], 3: []}, 25: {0: [], 1: [], 2: [], 3: []}, 26: {0: [], 1: [], 2: [], 3: []}, 27: {0: [], 1: [], 2: [], 3: []}, 28: {0: [], 1: [], 2: [], 3: []}, 29: {0: [], 1: [], 2: [

In [None]:
ns = 81  # number of states; extract_policy loops over range(ns)
na = 4   # number of actions; width of each policy row in extract_policy

def calculate_action_values(current_state, V, gamma=0.9):
    '''
    Return the expected value of each action taken from current_state.

    For every action a, sums prob * (reward + gamma * V[next_state]) over the
    transitions listed in P[current_state][a]. An action with no transitions
    gets the value 0.

    Parameters:
        current_state: key into the notebook-level transition table P
        V: value estimate per state, indexed by next_state
        gamma: discount applied to the next state's value
               NOTE(review): the notebook never defines gamma, so 0.9 is a
               placeholder default -- confirm the intended discount factor

    Returns:
        numpy array of length na holding one value per action
    '''
    # Reads the notebook-level P and na (extract_policy relies on ns / na the
    # same way); no cell in this notebook creates an `env` object.
    action_values = np.zeros(na)
    for a in range(na):
        # each transition unpacks as (prob, next_state, reward, done); done is not used here
        for prob, next_state, reward, _done in P[current_state][a]:
            action_values[a] += prob * (reward + (gamma * V[next_state]))
    return action_values

def extract_policy(V):
    '''
    Return the greedy deterministic policy for the value function V.

    For each state s in range(ns), the action values are computed with
    calculate_action_values(s, V) and the action with the largest value is
    selected (np.argmax picks the first one on ties).

    Returns:
        numpy array of shape (ns, na); row s has a 1 in the column of the
        chosen action and 0 elsewhere
    '''
    policy = np.zeros([ns, na])

    for s in range(ns):
        action_values = calculate_action_values(s, V)
        best_action = np.argmax(action_values) # index of the action with the maximum value
        policy[s, best_action] = 1 # deterministic policy, i.e. always take best_action for given state

    # hand the table back to the caller; without this the function yields None
    return policy

# TEST: extract a policy from an all-zero value function (one entry per state)
V = np.zeros(ns)
policy = extract_policy(V)