In [2]:
import numpy as np

# LineWorld MDP

In [None]:
S = [0, 1, 2, 3, 4] # agent position in line world
A = [0, 1] # 0: Left, 1: Right
R = [-1.0, 0.0, 1.0]
p = np.zeros((len(S), len(A), len(S), len(R))) # state, action, next_state, reward_index
T = [0, 4]

p[3, 0, 2, 1] = 1.0
p[2, 0, 1, 1] = 1.0
p[1, 0, 0, 0] = 1.0

p[3, 1, 4, 2] = 1.0
p[2, 1, 3, 1] = 1.0
p[1, 1, 2, 1] = 1.0

# Iterative Policy Evaluation

In [33]:
from typing import List

In [34]:
def iterative_policy_evaluation(
    pi: np.ndarray,
    S: List[int],
    A: List[int],
    R: List[float],
    T: List[int], # terminal states
    p: np.ndarray,
    theta: float = 0.00001,
    gamma: float = 0.9999999,
):
  V = np.random.random((len(S),))
  V[T] = 0.0

  while True:
    delta = 0.0

    for s in S:
      v = V[s]
      total = 0.0
      for a in A:
        sub_total = 0.0
        for s_p in S:
          for r_index in range(len(R)):
            r = R[r_index]
            sub_total += p[s, a, s_p, r_index] * (r + gamma * V[s_p])
        total += pi[s, a] * sub_total
      V[s] = total
      abs_diff = np.abs(v - V[s])
      delta = np.maximum(delta, abs_diff)

    if delta < theta:
      break
  return V




In [35]:
pi_always_right = np.zeros((len(S), len(A)))
pi_always_right[:, 1] = 1.0

In [36]:
iterative_policy_evaluation(pi_always_right, S, A, R, T, p)

array([0.       , 0.9999998, 0.9999999, 1.       , 0.       ])

In [37]:
pi_always_left = np.zeros((len(S), len(A)))
pi_always_left[:, 0] = 1.0

In [38]:
iterative_policy_evaluation(pi_always_left, S, A, R, T, p)

array([ 0.       , -1.       , -0.9999999, -0.9999998,  0.       ])

In [39]:
pi_uniform_random = np.ones((len(S), len(A))) * 0.5

In [40]:
iterative_policy_evaluation(pi_uniform_random, S, A, R, T, p)

array([ 0.00000000e+00, -4.99994737e-01,  5.26278752e-06,  5.00002631e-01,
        0.00000000e+00])

In [41]:
pi_weird_random = np.zeros((len(S), len(A)))
pi_weird_random[:, 1] = 0.7
pi_weird_random[:, 0] = 0.3

In [42]:
iterative_policy_evaluation(pi_weird_random, S, A, R, T, p)

array([0.        , 0.18275314, 0.68965181, 0.90689552, 0.        ])

In [43]:
(0.7 * (0 + 0.9999999 * 0.68965317) + 0.3 * (-1 + 0.99999999 * 0.0))

0.1827571707242781

# Policy Iteration

In [44]:
def policy_iteration(
    S: List[int],
    A: List[int],
    R: List[int],
    T: List[int],
    p: np.ndarray,
    theta: float = 0.00001,
    gamma: float = 0.999999,
):
  V = np.random.random((len(S),))
  V[T] = 0.0
  pi = np.array([np.random.choice(A) for s in S])
  pi[T] = 0

  while True:

    # Policy Evaluation
    while True:
      delta = 0.0

      for s in S:
        v = V[s]
        total = 0.0
        for s_p in S:
          for r_index in range(len(R)):
            r = R[r_index]
            total += p[s, pi[s], s_p, r_index] * (r + gamma * V[s_p])
        V[s] = total
        abs_diff = np.abs(v - V[s])
        delta = np.maximum(delta, abs_diff)

      if delta < theta:
        break

    # Policy Improvement

    policy_stable = True
    for s in S:
      old_action = pi[s]
      best_a = None
      best_a_score = -999999999.99999
      for a in A:
        score = 0.0
        for s_p in S:
          for r_index in range(len(R)):
            r = R[r_index]
            score += p[s, a, s_p, r_index] * (r + gamma * V[s_p])
        if best_a is None or score > best_a_score:
          best_a = a
          best_a_score = score
      if best_a != old_action:
        policy_stable = False
      pi[s] = best_a

    if policy_stable:
      break

  return pi, V

In [45]:
policy_iteration(S, A, R, T, p)

(array([0, 1, 1, 1, 0]),
 array([0.      , 0.999998, 0.999999, 1.      , 0.      ]))