# Markov Decision Process

### Transition matrix

In [1]:
import numpy as np

# Transition weights for action UP: entry [s, s2] is the weight placed on
# state s2 when UP is taken in state s. Rows 4 and 5 are left all-zero.
T_up = np.array(
    [
        # s2: 0    1    2    3    4    5
        [0.1, 0.1, 0.8, 0.0, 0.0, 0.0],  # s = 0
        [0.1, 0.1, 0.0, 0.8, 0.0, 0.0],  # s = 1
        [0.0, 0.0, 0.1, 0.1, 0.8, 0.0],  # s = 2
        [0.0, 0.0, 0.1, 0.1, 0.0, 0.8],  # s = 3
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],  # s = 4
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],  # s = 5
    ]
)

# Transition weights for action DOWN: entry [s, s2] is the weight placed on
# state s2 when DOWN is taken in state s. Rows 4 and 5 are left all-zero.
T_down = np.array(
    [
        # s2: 0    1    2    3    4    5
        [0.9, 0.1, 0.0, 0.0, 0.0, 0.0],  # s = 0
        [0.1, 0.9, 0.0, 0.0, 0.0, 0.0],  # s = 1
        [0.8, 0.0, 0.1, 0.1, 0.0, 0.0],  # s = 2
        [0.0, 0.8, 0.1, 0.1, 0.0, 0.0],  # s = 3
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],  # s = 4
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],  # s = 5
    ]
)

# Transition weights for action LEFT: entry [s, s2] is the weight placed on
# state s2 when LEFT is taken in state s. Rows 4 and 5 are left all-zero.
T_left = np.array(
    [
        # s2: 0    1    2    3    4    5
        [0.9, 0.0, 0.1, 0.0, 0.0, 0.0],  # s = 0
        [0.8, 0.1, 0.0, 0.1, 0.0, 0.0],  # s = 1
        [0.1, 0.0, 0.8, 0.0, 0.1, 0.0],  # s = 2
        [0.0, 0.1, 0.8, 0.0, 0.0, 0.1],  # s = 3
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],  # s = 4
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],  # s = 5
    ]
)

# Transition weights for action RIGHT: entry [s, s2] is the weight placed on
# state s2 when RIGHT is taken in state s. Rows 4 and 5 are left all-zero.
T_right = np.array(
    [
        # s2: 0    1    2    3    4    5
        [0.1, 0.8, 0.1, 0.0, 0.0, 0.0],  # s = 0
        [0.0, 0.9, 0.0, 0.1, 0.0, 0.0],  # s = 1
        [0.1, 0.0, 0.0, 0.8, 0.1, 0.0],  # s = 2
        [0.0, 0.1, 0.0, 0.8, 0.0, 0.1],  # s = 3
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],  # s = 4
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],  # s = 5
    ]
)

### Reward

In [2]:
# Per-state reward as a single row: -0.04 for states 0-3, -1 for state 4
# and +1 for state 5.
R = np.array([[-0.04, -0.04, -0.04, -0.04, -1.0, 1.0]])

### Utility

In [3]:
# Initial utilities: one row with a zero for each of the six states.
U = np.zeros(shape=(1, 6), dtype=np.float64)

### Update Value and Policy

In [4]:
def update_value(T_up, T_down, T_right, T_left, R, U, gamma=1):
    """Apply one synchronous utility update and pick the greedy action per state.

    For every state ``s`` the new utility is
    ``R[0, s] + gamma * max_a (T_a[s] @ U)`` and the policy entry is the
    action ``a`` that attains that maximum.

    Args:
        T_up, T_down, T_right, T_left: ``(n, n)`` arrays; row ``s`` of each
            is multiplied with the utilities to score that action in ``s``.
        R: ``(1, n)`` array of per-state rewards.
        U: ``(1, n)`` array of current utilities. It is read, never modified.
        gamma: factor applied to the best action score before adding the
            reward.

    Returns:
        A pair ``(U_updated, policy)``: ``U_updated`` is a fresh ``(1, n)``
        array and ``policy`` maps each state index to one of ``"UP"``,
        ``"RG"``, ``"DW"`` or ``"LF"``.
    """
    # Take the number of states from the input instead of assuming 6.
    n_states = U.shape[1]
    # The column view is the same for every state and action; build it once.
    u_col = U.transpose()

    # Insertion order doubles as the tie-break order: max() keeps the first
    # maximal key, so equal scores resolve UP, then RG, then DW, then LF.
    transitions = {"UP": T_up, "RG": T_right, "DW": T_down, "LF": T_left}

    U_updated = np.zeros((1, n_states))
    policy = {}

    for s in range(n_states):
        # (n,) @ (n, 1) gives a length-1 array; take element 0 so a true
        # scalar is compared and stored (assigning a size-1 array to a single
        # element is deprecated in recent NumPy).
        expected = {
            action: (T[s] @ u_col)[0] for action, T in transitions.items()
        }
        best_action = max(expected, key=expected.get)

        U_updated[0, s] = R[0, s] + gamma * expected[best_action]
        policy[s] = best_action

    return U_updated, policy

In [5]:
# Run 200 sweeps of update_value, feeding each sweep's utilities back in as
# the next sweep's input, and print the utilities and policy after every sweep.
for i in range(200):
    U, policy = update_value(T_up, T_down, T_right, T_left, R, U)

    # Lay the six utilities out on three lines: states 4 and 5 on the first
    # line, 2 and 3 on the second, 0 and 1 on the third. `.{3}` resolves to
    # the format spec `.3` (no type code), i.e. three significant digits.
    show_U = f"{U[0][4]:.{3}} {U[0][5]:.{3}}\n{U[0][2]:.{3}} {U[0][3]:.{3}}\n{U[0][0]:.{3}} {U[0][1]:.{3}}"
    # Same layout for the policy; the first line is the fixed text "-1 1"
    # rather than policy[4] and policy[5].
    show_policy = f"-1 1\n{policy[2]} {policy[3]}\n{policy[0]} {policy[1]}"
    
    # Iterations are reported 1-based.
    print(f"- Iteration {i + 1}\n\n{show_U}\n\n{show_policy}\n")

- Iteration 1

-1.0 1.0
-0.04 -0.04
-0.04 -0.04

-1 1
UP UP
UP UP

- Iteration 2

-1.0 1.0
-0.08 0.752
-0.08 -0.08

-1 1
DW UP
UP UP

- Iteration 3

-1.0 1.0
0.454 0.827
-0.12 0.546

-1 1
RG UP
UP UP

- Iteration 4

-1.0 1.0
0.51 0.888
0.43 0.664

-1 1
RG UP
RG UP

- Iteration 5

-1.0 1.0
0.613 0.9
0.585 0.78

-1 1
RG UP
RG UP

- Iteration 6

-1.0 1.0
0.638 0.911
0.704 0.816

-1 1
RG UP
RG UP

- Iteration 7

-1.0 1.0
0.678 0.915
0.747 0.841

-1 1
DW UP
RG UP

- Iteration 8

-1.0 1.0
0.717 0.919
0.775 0.851

-1 1
DW UP
RG UP

- Iteration 9

-1.0 1.0
0.744 0.924
0.79 0.858

-1 1
DW UP
RG UP

- Iteration 10

-1.0 1.0
0.759 0.927
0.8 0.864

-1 1
DW UP
RG UP

- Iteration 11

-1.0 1.0
0.768 0.929
0.807 0.868

-1 1
DW UP
RG UP

- Iteration 12

-1.0 1.0
0.775 0.93
0.812 0.87

-1 1
DW UP
RG UP

- Iteration 13

-1.0 1.0
0.78 0.93
0.815 0.872

-1 1
DW UP
RG UP

- Iteration 14

-1.0 1.0
0.783 0.931
0.817 0.873

-1 1
DW UP
RG UP

- Iteration 15

-1.0 1.0
0.785 0.931
0.818 0.874

-1 1
DW UP
RG UP

-