In [12]:
import numpy as np

#Grid Dimensions, Special Cells, Actions and Discount Factor
rows = 3
cols = 4
wall = (1, 1)  #Wall cell
terminals = [(0, 3), (1, 3)]  #Terminal cells
actions = ['up', 'down', 'left', 'right']
gamma = 0.9

#Rewards: -0.04 in every cell, overridden at the two terminal cells
rewards = np.full((rows, cols), -0.04)
rewards[0, 3], rewards[1, 3] = 1, -1

#Value Function: zero everywhere except the terminals, which start at +1 / -1
values = np.zeros((rows, cols))
values[0, 3], values[1, 3] = 1, -1

#Initial Policy: one action name per cell (None sits at the wall cell)
policy = np.array(
    [
        ['up', 'right', 'right', 'down'],
        ['up', None, 'down', 'down'],
        ['left', 'left', 'up', 'left'],
    ],
    dtype=object,  #Spelled out; the None entry already makes this an object array
)

#Move Function for each action
def move(r, c, action, *, grid_shape=None, wall_cell=None):
    """Return the cell reached by taking ``action`` from cell ``(r, c)``.

    The agent stays in ``(r, c)`` when the step would leave the grid, when it
    would enter the wall cell, or when ``action`` is not one of 'up', 'down',
    'left' or 'right'.

    Args:
        r: Row index of the current cell.
        c: Column index of the current cell.
        action: Direction to move in.
        grid_shape: Optional ``(n_rows, n_cols)`` pair. Defaults to the
            module-level ``rows`` and ``cols``.
        wall_cell: Optional ``(row, col)`` of the cell that cannot be
            entered. Defaults to the module-level ``wall``.

    Returns:
        A ``(row, col)`` tuple for the resulting cell.
    """
    n_rows, n_cols = (rows, cols) if grid_shape is None else grid_shape
    blocked = tuple(wall if wall_cell is None else wall_cell)

    target = (r, c)
    if action == 'up' and r > 0:
        target = (r - 1, c)
    elif action == 'down' and r < n_rows - 1:
        target = (r + 1, c)
    elif action == 'left' and c > 0:
        target = (r, c - 1)
    elif action == 'right' and c < n_cols - 1:
        target = (r, c + 1)

    #Bumping into the wall leaves the agent where it was; without this check
    #the wall would behave like an ordinary cell that can be entered.
    if target == blocked:
        return r, c
    return target

#Evaluate and Improve Policy
def _evaluate_policy(policy, values, tolerance):
    """Update ``values`` in place until they settle under ``policy``."""
    while True:
        delta = 0
        for r in range(rows):
            for c in range(cols):
                #Skip Wall and Terminal States
                if (r, c) in terminals or (r, c) == wall:
                    continue

                previous = values[r, c]
                next_r, next_c = move(r, c, policy[r, c])
                #Bellman backup for the single action the policy prescribes;
                #cells updated earlier in this sweep are used immediately.
                values[r, c] = rewards[r, c] + gamma * values[next_r, next_c]
                delta = max(delta, abs(previous - values[r, c]))
        #Stop once the largest change over a full sweep is below tolerance
        if delta < tolerance:
            return


def _improve_policy(policy, values):
    """Make ``policy`` greedy for ``values``; return True if nothing changed."""
    policy_stable = True
    for r in range(rows):
        for c in range(cols):
            #Skip Wall and Terminal States
            if (r, c) in terminals or (r, c) == wall:
                continue

            best_action = None
            best_value = float('-inf')
            #Strict '>' keeps the first action in ``actions`` on ties
            for action in actions:
                next_r, next_c = move(r, c, action)
                candidate = rewards[r, c] + gamma * values[next_r, next_c]
                if candidate > best_value:
                    best_value = candidate
                    best_action = action

            if policy[r, c] != best_action:
                policy_stable = False
            policy[r, c] = best_action
    return policy_stable


def evaluate_and_improve_policy(policy, values, tolerance=1e-6):
    """Run one round of policy iteration on the grid, updating both arrays.

    First the values of the current policy are recomputed by repeated
    in-place sweeps, then every non-wall, non-terminal cell is switched to
    the action with the highest one-step value.

    Args:
        policy: 2-D array of action names indexed by ``[row, col]``;
            modified in place. The wall and terminal cells are overwritten
            with display labels.
        values: 2-D array of state values indexed by ``[row, col]``;
            modified in place.
        tolerance: Evaluation stops when no cell changes by this much or
            more during a full sweep.

    Returns:
        True when the improvement step left every action unchanged, so a
        caller may stop iterating; False otherwise.
    """
    _evaluate_policy(policy, values, tolerance)
    policy_stable = _improve_policy(policy, values)

    #Display labels: taken from ``terminals`` and ``rewards`` so they follow
    #the grid definition ("+1" and "-1" for rewards of 1 and -1).
    policy[wall] = "Wall"
    for cell in terminals:
        policy[cell] = f"{rewards[cell]:+g}"

    return policy_stable

#Run ten evaluate/improve rounds; policy and values are updated in place
for _ in range(10):
    evaluate_and_improve_policy(policy, values)

#Print Final Results
print("Values:", values, sep="\n")

#One policy row per output line
print("Policy:")
print(*policy, sep="\n")

Values:
[[ 0.6206    0.734     0.86      1.      ]
 [ 0.51854   0.        0.734    -1.      ]
 [ 0.426686  0.51854   0.6206    0.51854 ]]
Policy:
['right' 'right' 'right' '+1']
['up' 'Wall' 'up' '-1']
['up' 'right' 'up' 'left']
