In [1]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
%pylab inline
import random

Populating the interactive namespace from numpy and matplotlib


## Another maze
This is the third grid world which I create

In [2]:
gamma = 1 # discounting rate
gridSize = 5
terminationStates = [[2,2]]
actions = [[-1, 0], [1, 0], [0, 1], [0, -1]]
numIterations = 1000

In [3]:
rewardValue = np.zeros((gridSize,gridSize)) -1
rewardValue[1]=np.array([-1,-10,-1,-10,-1])
rewardValue[3]=np.array([-1,-10,-1,-10,-1])
rewardValue

array([[ -1.,  -1.,  -1.,  -1.,  -1.],
       [ -1., -10.,  -1., -10.,  -1.],
       [ -1.,  -1.,  -1.,  -1.,  -1.],
       [ -1., -10.,  -1., -10.,  -1.],
       [ -1.,  -1.,  -1.,  -1.,  -1.]])

In [4]:
def actionValue(initialPosition,action):
    if initialPosition in terminationStates:
        finalPosition = initialPosition
        reward=0
    else:
        #Compute final position
        finalPosition = np.array(initialPosition) + np.array(action)
        
        # If the action moves the finalPosition out of the grid, stay in same cell
        if -1 in finalPosition or gridSize in finalPosition:
                finalPosition = initialPosition
                reward= rewardValue[finalPosition[0],finalPosition[1]]
        else:
                reward= rewardValue[finalPosition[0],finalPosition[1]]
    
    #print(finalPosition)
    return finalPosition, reward

In [5]:
valueMap = np.zeros((gridSize, gridSize))
valueMap1 = np.zeros((gridSize, gridSize))
states = [[i, j] for i in range(gridSize) for j in range(gridSize)]

In [6]:
def policy_evaluation(numIterations,gamma,theta,valueMap):
    for i in range(numIterations):
        delta=0
        #print("iterations=",i)
        for state in states:
            weightedRewards=0
            for action in actions:
                finalPosition,reward = actionValue(state,action)
                #print("reward=",reward,"valueMap=",valueMap[finalPosition[0],finalPosition][1])
                weightedRewards += 1/4* (reward + gamma * valueMap[finalPosition[0],finalPosition][1])
            #print(weightedRewards)
            valueMap1[state[0],state[1]]=weightedRewards
            #print("wr=",weightedRewards,"va=",valueMap[state[0],state[1]]) 
            delta =max(delta,abs(weightedRewards-valueMap[state[0],state[1]]))
        valueMap = np.copy(valueMap1)
        #print(valueMap1)
        if(delta < 0.01):
            print(delta)                                                   
            print(valueMap)
            break

In [7]:
valueMap = np.zeros((gridSize, gridSize))
valueMap1 = np.zeros((gridSize, gridSize))
states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
policy_evaluation(1000,1,0.0001,valueMap)

0.009697101372182715
[[-82.49768079 -80.51647225 -74.9345659  -80.51647225 -82.49768079]
 [-80.51647225 -71.15241689 -59.80375072 -71.15241689 -80.51647225]
 [-74.9345659  -59.80375072   0.         -59.80375072 -74.9345659 ]
 [-80.51647225 -71.15241689 -59.80375072 -71.15241689 -80.51647225]
 [-82.49768079 -80.51647225 -74.9345659  -80.51647225 -82.49768079]]


## Greedify

In [8]:
valueMap = np.zeros((gridSize, gridSize))
valueMap1 = np.zeros((gridSize, gridSize))
states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
pi = np.ones((gridSize,gridSize))/4
pi1 = np.chararray((gridSize, gridSize))
pi1[:] = 'a'

In [9]:
# Compute the value state function for the Grid
def policy_evaluate(states,actions,gamma,valueMap):
    #print("iterations=",i)
    for state in states:
        weightedRewards=0
        for action in actions:
            finalPosition,reward = actionValue(state,action)
            weightedRewards += 1/4* (reward + gamma * valueMap[finalPosition[0],finalPosition][1])
        # Set the computed weighted rewards to valueMap1
        valueMap1[state[0],state[1]]=weightedRewards
    # Copy to original valueMap
    valueMap = np.copy(valueMap1)
    return(valueMap)

In [10]:
def argmax(q_values):
    idx=np.argmax(q_values)
    return(np.random.choice(np.where(a==a[idx])[0].tolist()))


# Compute the best action in each state
def greedify_policy(state,pi,pi1,gamma,valueMap):  
        q_values=np.zeros(len(actions))
        for idx,action in enumerate(actions):
            finalPosition,reward = actionValue(state,action)
            q_values[idx] += 1/4* (reward + gamma * valueMap[finalPosition[0],finalPosition][1])
        # Find the index of the action for which the q_value is 
        idx=q_values.argmax()
        pi[state[0],state[1]]=idx 
        if(idx == 0):
            pi1[state[0],state[1]]='u'
        elif(idx == 1):
            pi1[state[0],state[1]]='d'
        elif(idx == 2):
            pi1[state[0],state[1]]='r'
        elif(idx == 3):
            pi1[state[0],state[1]]='l'

        


In [11]:
def improve_policy(pi, pi1,gamma,valueMap):
    policy_stable = True
    for state in states:
        old = pi[state].copy()
        # Greedify policy for state
        greedify_policy(state,pi,pi1,gamma,valueMap)
        if not np.array_equal(pi[state], old):
            policy_stable = False
    print(pi)
    print(pi1)
    return pi, pi1, policy_stable



In [12]:
def policy_iteration(gamma, theta):
    valueMap = np.zeros((gridSize, gridSize))
    pi = np.ones((gridSize,gridSize))/4
    pi1 = np.chararray((gridSize, gridSize))
    pi1[:] = 'a'
    policy_stable = False
    print("here")
    while not policy_stable:
        valueMap = policy_evaluate(states,actions,gamma,valueMap)
        pi,pi1, policy_stable = improve_policy(pi,pi1,  gamma,valueMap)
    return valueMap, pi,pi1

In [13]:
theta=0.1
valueMap, pi,pi1 = policy_iteration(gamma, theta)

here
[[0. 2. 0. 2. 0.]
 [0. 0. 1. 0. 0.]
 [3. 2. 0. 3. 2.]
 [0. 1. 0. 1. 0.]
 [1. 2. 1. 2. 1.]]
[[b'u' b'r' b'u' b'r' b'u']
 [b'u' b'u' b'd' b'u' b'u']
 [b'l' b'r' b'u' b'l' b'r']
 [b'u' b'd' b'u' b'd' b'u']
 [b'd' b'r' b'd' b'r' b'd']]
[[0. 3. 0. 2. 0.]
 [0. 0. 1. 0. 0.]
 [3. 2. 0. 3. 2.]
 [1. 1. 0. 1. 1.]
 [1. 3. 1. 2. 1.]]
[[b'u' b'l' b'u' b'r' b'u']
 [b'u' b'u' b'd' b'u' b'u']
 [b'l' b'r' b'u' b'l' b'r']
 [b'd' b'd' b'u' b'd' b'd']
 [b'd' b'l' b'd' b'r' b'd']]
[[0. 3. 0. 2. 0.]
 [0. 0. 1. 0. 0.]
 [3. 2. 0. 3. 2.]
 [1. 1. 0. 1. 1.]
 [1. 3. 1. 2. 1.]]
[[b'u' b'l' b'u' b'r' b'u']
 [b'u' b'u' b'd' b'u' b'u']
 [b'l' b'r' b'u' b'l' b'r']
 [b'd' b'd' b'u' b'd' b'd']
 [b'd' b'l' b'd' b'r' b'd']]


In [14]:
gamma = 1 # discounting rate
gridSize=5
rewardValue = np.zeros((gridSize,gridSize)) -1
rewardValue = np.zeros((gridSize,gridSize)) -1
rewardValue[1]=np.array([-1,-10,-1,-10,-1])
rewardValue[3]=np.array([-1,-10,-1,-10,-1])
print(rewardValue)


terminationStates = [[2,2]]
actions = [[-1, 0], [1, 0], [0, 1], [0, -1]]
numIterations = 1000


[[ -1.  -1.  -1.  -1.  -1.]
 [ -1. -10.  -1. -10.  -1.]
 [ -1.  -1.  -1.  -1.  -1.]
 [ -1. -10.  -1. -10.  -1.]
 [ -1.  -1.  -1.  -1.  -1.]]


In [15]:
valueMap = np.zeros((gridSize, gridSize))
valueMap1 = np.zeros((gridSize, gridSize))
states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
pi = np.ones((gridSize,gridSize))/4
pi1 = np.chararray((gridSize, gridSize))
pi1[:] = 'a'

In [16]:
def bellman_optimality_update(valueMap, state, gamma):

    q_values=np.zeros(len(actions))
    
    for idx,action in enumerate(actions):
        finalPosition,reward = actionValue(state,action)
        q_values[idx] += 1/4* (reward + gamma * valueMap[finalPosition[0],finalPosition][1])
    # Find the index of the action for which the q_value is 
    idx=q_values.argmax()
            
    max = np.argmax(q_values)
    valueMap[state[0],state[1]] = q_values[max]    
    #print(q_values[max])


In [17]:
def value_iteration(gamma, theta):
    valueMap = np.zeros((gridSize, gridSize))
    while True:
        delta = 0
        for state in states:
            v_old=valueMap[state[0],state[1]]
            bellman_optimality_update(valueMap, state, gamma)
            delta = max(delta, abs(v_old - valueMap[state[0],state[1]]))
        if delta < theta:
            break
    pi = np.ones((gridSize,gridSize))/4
    for state in states:
        greedify_policy(state,pi,pi1,gamma,valueMap)
    print(pi)
    print(pi1)
    return valueMap, pi,pi1

In [18]:
gamma = 1
theta = 0.000001
valueMap,pi,pi1=value_iteration(gamma, theta)
pi
pi1

[[1. 2. 1. 3. 1.]
 [1. 1. 1. 1. 1.]
 [2. 2. 0. 3. 3.]
 [0. 0. 0. 0. 0.]
 [0. 2. 0. 3. 0.]]
[[b'd' b'r' b'd' b'l' b'd']
 [b'd' b'd' b'd' b'd' b'd']
 [b'r' b'r' b'u' b'l' b'l']
 [b'u' b'u' b'u' b'u' b'u']
 [b'u' b'r' b'u' b'l' b'u']]


chararray([[b'd', b'r', b'd', b'l', b'd'],
           [b'd', b'd', b'd', b'd', b'd'],
           [b'r', b'r', b'u', b'l', b'l'],
           [b'u', b'u', b'u', b'u', b'u'],
           [b'u', b'r', b'u', b'l', b'u']], dtype='|S1')