In [1]:
import numpy as np
import gymnasium as gym
import time

In [2]:
env = gym.make("FrozenLake-v1")
print(env.observation_space)
print(env.action_space)
print(env.spec.reward_threshold)
print(env.spec.max_episode_steps)

Discrete(16)
Discrete(4)
0.7
100


In [3]:
P = env.env.env.env.P
P

{0: {0: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  2: [(0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)],
  3: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 1: {0: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False)],
  2: [(0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  3: [(0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 2:

In [4]:
P[14]

{0: [(0.3333333333333333, 10, 0.0, False),
  (0.3333333333333333, 13, 0.0, False),
  (0.3333333333333333, 14, 0.0, False)],
 1: [(0.3333333333333333, 13, 0.0, False),
  (0.3333333333333333, 14, 0.0, False),
  (0.3333333333333333, 15, 1.0, True)],
 2: [(0.3333333333333333, 14, 0.0, False),
  (0.3333333333333333, 15, 1.0, True),
  (0.3333333333333333, 10, 0.0, False)],
 3: [(0.3333333333333333, 15, 1.0, True),
  (0.3333333333333333, 10, 0.0, False),
  (0.3333333333333333, 13, 0.0, False)]}

In [5]:
env # Location alone cannot fully determine the state, rather location + step.

<TimeLimit<OrderEnforcing<PassiveEnvChecker<FrozenLakeEnv<FrozenLake-v1>>>>>

In [6]:
def play_policy(env, policy, render=False):
    episode_reward = 0
    observation, _ = env.reset()
    while True:
        if render: env.render()
        action = np.random.choice(env.action_space.n, p=policy[observation])
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        if terminated or truncated:
            break
    return episode_reward

In [7]:
# Create a random policy that selects each action via a uniform distribution.
random_policy = np.ones((env.observation_space.n, env.action_space.n)) / env.action_space.n

episode_rewards = [play_policy(env, random_policy) for _ in range(100)]
np.mean(episode_rewards), np.std(episode_rewards)

(np.float64(0.04), np.float64(0.19595917942265423))

## Book Implementation of Policy Evaluation

In [8]:
def v2q(env, P, v, state=None, gamma=1):
    # Calculate action value from state value
    if state is not None:
        q = np.zeros(env.action_space.n)
        for action in range(env.action_space.n):
            for prob, next_state, reward, terminated in P[state][action]:
                q[action] += prob * (reward + gamma * v[next_state] * (1 - terminated))
    else:
        q = np.zeros((env.observation_space.n, env.action_space.n))
        for state in range(env.observation_space.n):
            q[state] = v2q(env, P, v, state, gamma)
    return q

In [9]:
rewards = set()

n_states = env.observation_space.n
n_actions = env.action_space.n
n_rewards = 2

P_tensor = np.zeros((n_states, n_rewards, n_states, n_actions))
terminals = np.zeros(n_states).astype(bool)
for state in P:
    s_d = P[state]
    for action in s_d:
        sa_d = s_d[action]
        for prob, next_state, reward, terminated in sa_d:
            P_tensor[next_state, int(reward), state, action] += prob
            terminals[next_state] = terminals[next_state] or terminated
P_tensor.shape

(16, 2, 16, 4)

In [10]:
terminals = terminals.astype(int)
rewards = np.array([0,1])
terminals

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1])

In [11]:
np.sum(P_tensor, axis=(0,1))

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

## Vectorised v2q Function

In [12]:
def v2q_vectorised(P_tensor, rewards, v, terminals, gamma=1):
    r_sa = np.sum(rewards[np.newaxis, :, np.newaxis, np.newaxis] * P_tensor, axis=(0,1)) # (S, A)
    q = r_sa + gamma * np.sum(np.sum(P_tensor, axis=1) * 
                              v[:, np.newaxis, np.newaxis] *
                              (1 - terminals[:, np.newaxis, np.newaxis]), axis=0)
    return q

In [13]:
# Test that the vectorised and non-vectorised v2q functions are the same.
v_test = np.random.rand(n_states)
start = time.time()
non_vectorised = v2q(env, P, v_test)
non_vectorised_time = time.time() - start
start = time.time()
vectorised = v2q_vectorised(P_tensor, rewards, v_test, terminals)
vectorised_time = time.time() - start
print("Vectorised is", non_vectorised_time / vectorised_time, " times faster.")
print("L2 norm difference:", np.linalg.norm(vectorised - non_vectorised))

Vectorised is 0.9036458333333334  times faster.
L2 norm difference: 1.1443916996305594e-16


## Policy Evaluation

In [14]:
def evaluate_policy(env, P, policy, gamma=1, tolerance=1e-6):
    """Book implementation."""
    v = np.zeros(env.observation_space.n)
    while True:
        delta = 0
        for state in range(env.observation_space.n):
            vs = sum(policy[state] * v2q(env, P, v, state, gamma))
            delta = max(delta, abs(v[state] - vs))
            v[state] = vs
        if delta < tolerance: break
    return v

In [15]:
def evaluate_polcy_vectorised(P_tensor, policy, rewards, terminals, gamma=1, tolerance=1e-6):
    """Our implementation."""
    v = np.zeros(P_tensor.shape[0])
    
    while True:
        delta = 0
        v_next = np.sum(policy * v2q_vectorised(P_tensor, rewards, v, terminals, gamma), axis=1)
        delta = max(delta, np.max(np.abs(v - v_next)))
        v = v_next
        if delta < tolerance: break
    return v

In [16]:
start = time.time()
non_vectorised = evaluate_policy(env, P, random_policy, tolerance=1e-20)
non_vectorised_time = time.time() - start

start = time.time()
vectorised = evaluate_polcy_vectorised(P_tensor, random_policy, rewards, terminals, tolerance=1e-20)
vectorised_time = time.time() - start

print("Vectorised is", non_vectorised_time / vectorised_time, " times faster.")
print("L2 norm difference:", np.linalg.norm(vectorised - non_vectorised))

Vectorised is 3.1137452872388014  times faster.
L2 norm difference: 1.2827573411350428e-16


In [17]:
v_random = vectorised
v_random

array([0.0139398 , 0.01163093, 0.02095299, 0.01047649, 0.01624867,
       0.        , 0.04075154, 0.        , 0.0348062 , 0.08816993,
       0.14205316, 0.        , 0.        , 0.17582037, 0.43929118,
       0.        ])

In [18]:
q_random = v2q_vectorised(P_tensor, rewards, v_random, terminals)
q_random

array([[0.01470942, 0.0139398 , 0.0139398 , 0.01317017],
       [0.00852357, 0.01163093, 0.0108613 , 0.0155079 ],
       [0.02444515, 0.02095299, 0.02406034, 0.01435347],
       [0.01047649, 0.01047649, 0.00698433, 0.01396866],
       [0.02166489, 0.01701829, 0.01624867, 0.01006282],
       [0.        , 0.        , 0.        , 0.        ],
       [0.05433538, 0.04735105, 0.05433538, 0.00698433],
       [0.        , 0.        , 0.        , 0.        ],
       [0.01701829, 0.04099204, 0.0348062 , 0.04640827],
       [0.07020886, 0.11755991, 0.10595784, 0.05895312],
       [0.18940422, 0.17582037, 0.16001424, 0.04297382],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.08799677, 0.20503718, 0.23442716, 0.17582037],
       [0.25238824, 0.53837052, 0.52711478, 0.43929118],
       [0.        , 0.        , 0.        , 0.        ]])

## Policy Improvement

In [19]:
def improve_policy(env, P, v, policy, gamma=1):
    """Book implementation."""
    optimal = True
    for state in range(env.observation_space.n):
        q = v2q(env, P, v, state, gamma)
        action = np.argmax(q)
        if policy[state][action] != 1:
            optimal = False
            policy[state] = 0
            policy[state][action] = 1
    return optimal

In [20]:
def improve_policy_vectorised(P_tensor, v, rewards, policy, gamma=1):
    """Our implementation."""
    optimal = True
    q = v2q_vectorised(P_tensor, rewards, v, terminals, gamma)
    actions = np.argmax(q, axis=1)
    updated_policy = np.zeros(policy.shape)
    updated_policy[np.arange(P_tensor.shape[0]), actions] = 1
    if not np.all(np.isclose(policy - updated_policy, 0)):
        optimal = False
    return optimal, updated_policy

In [21]:
policy = random_policy.copy()

In [22]:
vectorised_optimal, vectorised_policy = improve_policy_vectorised(P_tensor, v_random, rewards, policy)
vectorised_optimal, vectorised_policy

(False,
 array([[1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.]]))

In [33]:
optimal = improve_policy(env, P, v_random, policy)
policy

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.]])

## Policy Iteration

In [24]:
def iterate_policy(env, P, gamma=1, tolerance=1e-6):
    """Book Implementation."""
    # Can initialise to any policy that we want.
    policy = np.ones((env.observation_space.n, env.action_space.n)) / env.action_space.n
    iterations = 0
    while True:
        v = evaluate_policy(env, P, policy, gamma, tolerance)
        iterations += 1
        if improve_policy(env, P, v, policy, gamma): break
    return policy, v, iterations

In [25]:
def iterate_policy_vectorised(P_tensor, rewards, terminals, gamma=1, tolerance=1e-6):
    """Our Implementation."""
    policy = np.ones(P_tensor.shape[2:]) / P_tensor.shape[3]
    iterations = 0
    while True:
        v = evaluate_polcy_vectorised(P_tensor, policy, rewards, terminals, gamma, tolerance)
        iterations += 1
        optimal, policy = improve_policy_vectorised(P_tensor, v, rewards, policy, gamma)
        if optimal: break
    return policy, v, iterations

In [26]:
start = time.time()
new_policy, v, iterations = iterate_policy(env, P, tolerance=1e-20)
non_vectorised_time = time.time() - start

start = time.time()
new_policy_vec, v_vec, iterations_vec = iterate_policy(env, P, tolerance=1e-20)
vectorised_time = time.time() - start
print("Vectorised version is", non_vectorised_time / vectorised_time, "times faster.")
np.isclose(np.linalg.norm(v - v_vec), 0), np.all(np.isclose(new_policy, new_policy_vec)), iterations, iterations_vec

Vectorised version is 0.9970250240446551 times faster.


(np.True_, np.True_, 3, 3)

In [32]:
v_vec.reshape(4,4)

array([[0.82352941, 0.82352941, 0.82352941, 0.82352941],
       [0.82352941, 0.        , 0.52941176, 0.        ],
       [0.82352941, 0.82352941, 0.76470588, 0.        ],
       [0.        , 0.88235294, 0.94117647, 0.        ]])

In [30]:
np.argmax(new_policy_vec, axis=1).reshape(4,4)

array([[0, 3, 3, 3],
       [0, 0, 0, 0],
       [3, 1, 0, 0],
       [0, 2, 1, 0]])

In [31]:
episode_rewards = [play_policy(env, new_policy_vec) for _ in range(100)]
np.mean(episode_rewards), np.std(episode_rewards)

(np.float64(0.79), np.float64(0.40730823708832603))

Mean average reward is only 0.79 because the calculation of optimal state values does not consider upper bounds on the steps.

## Using Value Iteration (VI)

In [34]:
def iterate_value(env, P, gamma=1, tolerance=1e-6):
    """Book Implementation."""
    v = np.zeros(env.observation_space.n)
    while True:
        delta = 0
        for state in range(env.observation_space.n):
            vmax = max(v2q(env, P, v, state, gamma))
            delta = max(delta, abs(v[state] - vmax))
            v[state] = vmax
        if delta < tolerance: break

    # Calculate optimal policy
    policy = np.zeros((env.observation_space.n, env.action_space.n))
    for state in range(env.observation_space.n):
        action = np.argmax(v2q(env, P, v, state, gamma))
        policy[state][action] = 1
    return policy, v

In [42]:
def iterate_value_vectorised(P_tensor, rewards, terminals, gamma=1, tolerance=1e-6):
    """Our implementation."""
    v = np.zeros(P_tensor.shape[0])
    while True:
        delta = 0
        v_next = np.max(v2q_vectorised(P_tensor, rewards, v, terminals, gamma), axis=1)
        delta = max(delta, np.max(np.abs(v - v_next)))
        v = v_next
        if delta < tolerance: break

    q = v2q_vectorised(P_tensor, rewards, v_next, terminals, gamma)
    actions = np.argmax(q, axis=1)
    policy = np.zeros(P_tensor.shape[2:])
    policy[np.arange(len(actions)), actions] = 1
    return policy, v

In [46]:
start = time.time()
policy_non_vec, v_non_vec = iterate_value(env, P, tolerance=1e-20)
time_non_vec = time.time() - start

start = time.time()
policy_vec, v_vec = iterate_value_vectorised(P_tensor, rewards, terminals, tolerance=1e-20)
time_vec = time.time() - start
print("Vectorised version is", time_non_vec / time_vec, "times faster.")
np.all(np.isclose(policy_non_vec, policy_vec)), np.all(np.isclose(v_non_vec, v_vec))

Vectorised version is 2.671810001475144 times faster.


(np.True_, np.True_)