In [1]:
import numpy as np

In [2]:
# Assignment for Adaptive Systems
REWARD = -1
GAMMA = 0.9

# Arrow symbols
# ACTIONS = {
#     3: '\u2191', #U
#     2: '\u2192', #R
#     1: '\u2193', #D
#     0: '\u2190' #L
# }
# print(' '.join([ACTION[i] for i in range(4)]))

# Initialize the actions which can be D, L, U, R
ACTIONS = [(1, 0), (0, -1), (-1, 0), (0, 1)] 
NUM_ACTIONS = len(ACTIONS)

# Grid size based on amount of rows and columns
ROW = 4
COL = 4

# Grid with all of the rewards
U = [[-1, -1, -1, 40],
    [-1, -1, -10, -10],
    [-1, -1, -1, -1],
    [10, -2, -1, -1]]


In [4]:
def maze_grid(arr, policy=False):
    """This function initializes the maze grid with all of the 
    given rewards per state and prints it out nicely.    
    The grid has 4 rows and 4 columns which is set in the cell above.

    Args:
        arr::[int]
            Multidimensional grid with rewards per state
    
    Returns:
        res::int
            Prints out the result
    """
    res = ""
    for r in range(ROW):
        res += "|"
        for c in range(COL):
            # val = "-1"
            # if r == 0 and c == 0:
            #     val = "+10" 
            # elif r == 0 and c == 1:
            #     val = "-2"
            # elif r == 2 and c == 2:
            #     val = "-10"
            # elif r == 2 and c == 3:
            #     val = "-10"
            # elif r == 3 and c == 3:
            #     val = "+40"
            # else:
            if policy:
                val = ['\u2193', '\u2190', '\u2191', '\u2192'][arr[r][c]]
            else:
                val = str(arr[r][c])
            res += " " + val[:5].ljust(5) + " |" # format
        res += "\n"
    print(res)


In [5]:
maze_grid(U)

| -1    | -1    | -1    | 40    |
| -1    | -1    | -10   | -10   |
| -1    | -1    | -1    | -1    |
| 10    | -2    | -1    | -1    |



In [6]:
# Get the utility of the state reached by performing the given action from the given state
def getU(U, r, c, action):
    """This function initializes the maze grid with all of the 
    given rewards per state and prints it out nicely.    
    The grid has 4 rows and 4 columns which is set in the cell above.

    Args:
        U::[int]
            Multidimensional grid with rewards per state
        r::[int]
            Multidimensional grid with rewards per state          
        c::[int]
            Multidimensional grid with rewards per state
        action::[int]
            Multidimensional grid with rewards per state 
    Returns:
        U[newR][newC]::int
            Prints out the result
    """
    dr, dc = ACTIONS[action]
    newR, newC = r+dr, c+dc
    if newR < 0 or newC < 0 or newR >= ROW or newC >= COL or (newR == newC == 1):
        return U[r][c]
    else:
        return U[newR][newC]

# Calculate the utility of a state given an action
def calculateU(U, r, c, action):
    """This function initializes the maze grid with all of the 
    given rewards per state and prints it out nicely.    
    The grid has 4 rows and 4 columns which is set in the cell above.

    Args:
        U::[int]
            Multidimensional grid with rewards per state
        r::[int]
            Multidimensional grid with rewards per state          
        c::[int]
            Multidimensional grid with rewards per state
        action::[int]
            Multidimensional grid with rewards per state 
    Returns:
        u::int
            Prints out the result
    """
    u = REWARD
    u += 0.1 * GAMMA * getU(U, r, c, (action-1)%4)
    u += 0.8 * GAMMA * getU(U, r, c, action)
    u += 0.1 * GAMMA * getU(U, r, c, (action+1)%4)
    return u

def valueIteration(U):
    """This function initializes the maze grid with all of the 
    given rewards per state and prints it out nicely.    
    The grid has 4 rows and 4 columns which is set in the cell above.

    Args:
        U::[int]
            Multidimensional grid with rewards per state
    
    Returns:
        U::int
            Prints out the result
    """
    print("During the value iteration:\n")
    while True:
        nextU = [[0, 0, 0, 1], [0, 0, 0, -1], [0, 0, 0, 0], [0, 0, 0, 0]]
        error = 0
        for r in range(ROW):
            for c in range(COL):
                if (r <= 1 and c == 3) or (r == c == 1):
                    continue
                nextU[r][c] = max([calculateU(U, r, c, action) for action in range(NUM_ACTIONS)]) # Bellman update
                error = max(error, abs(nextU[r][c]-U[r][c]))
        U = nextU
        maze_grid(U)
        if error < ((1-GAMMA) / GAMMA):
            break
    return U

# Get the optimal policy from U
def getOptimalPolicy(U):
    """This function initializes the maze grid with all of the 
    given rewards per state and prints it out nicely.    
    The grid has 4 rows and 4 columns which is set in the cell above.

    Args:
        U::[int]
            Multidimensional grid with rewards per state
    
    Returns:
        policy::int
            returns the optimal policy
    """
    policy = [[-1, -1, -1, -1] for i in range(ROW)]
    for r in range(ROW):
        for c in range(COL):
            if (r <= 1 and c == 3) or (r == c == 1):
                continue
            # Choose the action that maximizes the utility
            maxAction, maxU = None, -float("inf")
            for action in range(NUM_ACTIONS):
                u = calculateU(U, r, c, action)
                if u > maxU:
                    maxAction, maxU = action, u
            policy[r][c] = maxAction
    return policy

In [7]:
# Print the initial environment
print("The initial U is:\n")
maze_grid(U)

# Value iteration
U = valueIteration(U)

# Get the optimal policy from U and print it
policy = getOptimalPolicy(U)
print("The optimal policy is:\n")
maze_grid(policy, True)

The initial U is:

| -1    | -1    | -1    | 40    |
| -1    | -1    | -10   | -10   |
| -1    | -1    | -1    | -1    |
| 10    | -2    | -1    | -1    |

During the value iteration:

| -1.90 | -1.90 | 26.81 | 1     |
| -1.90 | 0     | -3.52 | -1    |
| 6.020 | -1.90 | -1.90 | -1.90 |
| 7.010 | 5.930 | -1.90 | -1.90 |

| -2.71 | 17.96 | 18.22 | 1     |
| 2.992 | 0     | 17.89 | -1    |
| 4.418 | 3.697 | -2.71 | -2.06 |
| 5.219 | 4.409 | 2.927 | -2.71 |

| 11.95 | 15.35 | 15.18 | 1     |
| 2.719 | 0     | 13.64 | -1    |
| 3.488 | 2.910 | 12.03 | -2.14 |
| 3.625 | 3.487 | 2.194 | 0.678 |

| 11.37 | 12.69 | 12.64 | 1     |
| 8.098 | 0     | 11.27 | -1    |
| 2.186 | 8.239 | 8.889 | 7.634 |
| 2.250 | 2.186 | 8.038 | 0.447 |

| 9.893 | 10.39 | 10.29 | 1     |
| 8.647 | 0     | 9.053 | -1    |
| 5.863 | 6.338 | 8.543 | 5.350 |
| 1.019 | 5.858 | 5.637 | 5.515 |

| 8.150 | 8.298 | 8.223 | 1     |
| 7.679 | 0     | 7.213 | -1    |
| 6.324 | 6.249 | 6.570 | 5.557 |
| 3.840 | 4.163 | 6.174 | 4.