In [29]:
import gymnasium as gym
import numpy as np

# Cliffwalking Environment

In [30]:
env = gym.make("CliffWalking-v0")
for k, v in vars(env).items():
    print(f"{k}: {v}")

_saved_kwargs: {'disable_render_order_enforcing': False}
env: <PassiveEnvChecker<CliffWalkingEnv<CliffWalking-v0>>>
_action_space: None
_observation_space: None
_metadata: None
_cached_spec: None
_has_reset: False
_disable_render_order_enforcing: False


In [31]:
env.action_space, env.observation_space

(Discrete(4), Discrete(48))

In [32]:
dynamics = env.env.env.P
dynamics

{0: {0: [(1.0, np.int64(0), -1, False)],
  1: [(1.0, np.int64(1), -1, False)],
  2: [(1.0, np.int64(12), -1, False)],
  3: [(1.0, np.int64(0), -1, False)]},
 1: {0: [(1.0, np.int64(1), -1, False)],
  1: [(1.0, np.int64(2), -1, False)],
  2: [(1.0, np.int64(13), -1, False)],
  3: [(1.0, np.int64(0), -1, False)]},
 2: {0: [(1.0, np.int64(2), -1, False)],
  1: [(1.0, np.int64(3), -1, False)],
  2: [(1.0, np.int64(14), -1, False)],
  3: [(1.0, np.int64(1), -1, False)]},
 3: {0: [(1.0, np.int64(3), -1, False)],
  1: [(1.0, np.int64(4), -1, False)],
  2: [(1.0, np.int64(15), -1, False)],
  3: [(1.0, np.int64(2), -1, False)]},
 4: {0: [(1.0, np.int64(4), -1, False)],
  1: [(1.0, np.int64(5), -1, False)],
  2: [(1.0, np.int64(16), -1, False)],
  3: [(1.0, np.int64(3), -1, False)]},
 5: {0: [(1.0, np.int64(5), -1, False)],
  1: [(1.0, np.int64(6), -1, False)],
  2: [(1.0, np.int64(17), -1, False)],
  3: [(1.0, np.int64(4), -1, False)]},
 6: {0: [(1.0, np.int64(6), -1, False)],
  1: [(1.0, np.in

In [33]:
dynamics[14][2] # each tuple has values (probability, next_state, reward, terminal)

[(1.0, np.int64(26), -1, False)]

The above result represents the following equation:
$$p(s_{26}, -1 \mid s_{14}, a_2) = 1$$
The lack of other entries in the list of `dynamics[14][2]` is enforced by the only outcome having probability 1.

In [34]:
dynamics[35] # The position right next to the completed state

{0: [(1.0, np.int64(23), -1, False)],
 1: [(1.0, np.int64(35), -1, False)],
 2: [(1.0, np.int64(47), -1, True)],
 3: [(1.0, np.int64(34), -1, False)]}

In [35]:
dynamics[44] # One of the failure states. It is technically impossible to reach this state, but if you happen to be here, moving anywhere but up will move you to the start.

{0: [(1.0, np.int64(32), -1, False)],
 1: [(1.0, np.int64(36), -100, False)],
 2: [(1.0, np.int64(36), -100, False)],
 3: [(1.0, np.int64(36), -100, False)]}

In [36]:
dynamics[32] # One of the states in which moving down results in a transition to a state with reward -100. We can see that the -100 reward action moves the agent to state 36 - the start.

{0: [(1.0, np.int64(20), -1, False)],
 1: [(1.0, np.int64(33), -1, False)],
 2: [(1.0, np.int64(36), -100, False)],
 3: [(1.0, np.int64(31), -1, False)]}

In [37]:
# Investigate the dictionary dynamics
actionable_states = set() # states you can perform an action from.
reachable_states = set() # states you can reach from transitioning there.
terminal_states = set()
probabilities = set()
rewards = set()

for state, actions in dynamics.items():
    actionable_states.add(state)
    for action, lst in actions.items():
        for prob, next_state, reward, terminal in lst:
            reachable_states.add(int(next_state))
            rewards.add(reward)
            probabilities.add(prob)
            if terminal:
                terminal_states.add(next_state)

In [38]:
actionable_states == set(range(48))

True

In [39]:
reachable_states == set(range(48)) - set(range(37, 47))

True

In [40]:
terminal_states

{np.int64(47)}

In [41]:
rewards

{-100, -1}

In [42]:
probabilities

{1.0}

So we will define the following sets:
- $R = \{-100, -1\}$,
- $S = \{0,1,\dots,47\}$,
- $A = \{0,1,2,3\}$

We now construct the dynamics matrix $P_{S_{t+1}, R_{t+1}\mid S_t, A_t}$

In [62]:
def encode_dynamics_row(next_state, reward, no_rewards, reward_map):
    return next_state * no_rewards + reward_map[reward]

def encode_dynamics_col(previous_state, no_actions, action):
    return previous_state * no_actions + action

no_states = 48
no_actions = 4
no_rewards = 2
reward_map = {-100: 0, -1: 1}
index_to_reward = {0: -100, 1: -1}

np.set_printoptions(threshold=np.inf)
dynamics_matrix = np.zeros((48, 2, 48, 4))
for state, actions in dynamics.items():
    for action, lst in actions.items():
        for prob, next_state, reward, terminal in lst:
            reward_idx = reward_map[reward]
            #dynamics_matrix[encode_dynamics_row(next_state, reward, no_rewards, reward_map), encode_dynamics_col(state, no_actions, action)] = prob
            dynamics_matrix[next_state, reward_idx, state, action] = prob

dynamics_matrix

array([[[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
 

In [63]:
np.all(np.sum(dynamics_matrix, axis=(0,1)) == 1) # check that the sum of s and r (dimension 0) always results in 1.

np.True_

## Policy Evaluation
We're going to combine the initial Bellman equations from `feed_full.ipynb`. As a reminder, we list them here:
$$
\begin{align*}
v_\pi(s) &= \sum_{a\in A}\pi(a\mid s)q_\pi(s)\\
q_\pi(s, a) &= \sum_{s' \in S^+, r \in \mathbb{R}}p(s', r\mid s, a)[r + \gamma v_\pi(s')]
\end{align*}$$
When put together, we have
$$v_\pi(s) = \sum_{a\in A}\pi(a\mid s)\sum_{s' \in S^+, r \in \mathbb{R}}p(s', r\mid s, a)[r + \gamma v_\pi(s')]$$
Which simplifies nicely to:
$$v_\pi(s) = \sum_{r \in R}r \cdot \mathrm{Pr}[R_{t+1} = r\mid S_t = s] + \gamma \sum_{s' \in S} v_\pi(s')p(s'\mid s)$$
When vectorised, this looks like:
$$\vec{v}_\pi = P^\top_{R_{t + 1}\mid S_t}\vec{r} + \gamma P^\top_{S_{t+1}\mid S_t}\vec{v}_\pi$$
Rearranged:
$$(I - \gamma P^\top_{S_{t+1}\mid S_t})\vec{v}_\pi = P^\top_{R_{t + 1}\mid S_t}\vec{r}$$

In [None]:
def evaluate_bellman(dynamics_matrix, policy, r, gamma=1, verbose=True):
    p_sp_given_s = np.sum(dynamics_matrix * policy[np.newaxis, np.newaxis, :, :], axis=(1,3)) # sum over r and a.
    p_r_given_s = np.sum(dynamics_matrix * policy[np.newaxis, np.newaxis, :, :], axis=(0,3)) # sum over s' and a
    
    lhs = np.eye(p_sp_given_s.shape[0]) - gamma * p_sp_given_s
    rhs = p_r_given_s.transpose(1,0) @ r[:, np.newaxis]
    return np.linalg.solve(lhs, rhs)

gamma = 1
rewards = np.array([-100, -1])
policy = np.ones((no_states, no_actions)) / no_actions
x = evaluate_bellman(dynamics_matrix, policy, rewards, gamma)
x.reshape(-1)

array([-5.57953084e+17, -5.57953084e+17, -5.57953084e+17, -5.57953084e+17,
       -5.57953084e+17, -5.57953084e+17, -5.57953084e+17, -5.57953084e+17,
       -5.57953084e+17, -5.57953084e+17, -5.57953084e+17, -5.57953084e+17,
       -5.57953084e+17, -5.57953084e+17, -5.57953084e+17, -5.57953084e+17,
       -5.57953084e+17, -5.57953084e+17, -5.57953084e+17, -5.57953084e+17,
       -5.57953084e+17, -5.57953084e+17, -5.57953084e+17, -5.57953084e+17,
       -5.57953084e+17, -5.57953084e+17, -5.57953084e+17, -5.57953084e+17,
       -5.57953084e+17, -5.57953084e+17, -5.57953084e+17, -5.57953084e+17,
       -5.57953084e+17, -5.57953084e+17, -5.57953084e+17, -5.57953084e+17,
       -5.57953084e+17, -5.57953084e+17, -5.57953084e+17, -5.57953084e+17,
       -5.57953084e+17, -5.57953084e+17, -5.57953084e+17, -5.57953084e+17,
       -5.57953084e+17, -5.57953084e+17, -5.57953084e+17, -5.57953084e+17])

In [79]:
def evaluate_bellman1(env, policy, gamma=1., verbose=False):
    if verbose:
        print('policy = %s', policy)
    a, b = np.eye(no_states), np.zeros((no_states))
    for state in range(no_states - 1):
        for action in range(no_actions):
            pi = policy[state][action]
            for p, next_state, reward, terminated in env.env.env.P[state][action]:
                a[state, next_state] -= (pi * gamma * p)
                b[state] += (pi * reward * p)
    v = np.linalg.solve(a, b)
    q = np.zeros((no_states, no_actions))
    for state in range(no_states - 1):
        for action in range(no_actions):
            for p, next_state, reward, terminated in env.env.env.P[state][action]:
                q[state][action] += ((reward + gamma * v[next_state]) * p)
    if verbose:
        print('state values = %s', v)
        print('action values = %s', q)
    return v, q
v_test, q_test = evaluate_bellman1(env, policy)

In [80]:
v_test

array([-65104.83759923, -65038.58278728, -64886.60152705, -64615.82843642,
       -64172.32462724, -63470.85252573, -62382.03961361, -60720.87142908,
       -58253.74184103, -54780.58582606, -50443.22723535, -46534.94093822,
       -65167.09241118, -65120.30923556, -65001.39335745, -64784.55915496,
       -64426.29291957, -63854.19333635, -62950.39488602, -61522.83283259,
       -59255.76826794, -55640.7884018 , -50010.15494176, -42622.6546411 ,
       -65272.13039875, -65270.16838633, -65210.10351224, -65090.72190641,
       -64890.09455971, -64565.23301409, -64038.51376154, -63160.29674731,
       -61601.70999636, -58512.64457143, -51329.94948881, -31318.8680433 ,
       -65375.13039875, -65399.38989565, -65409.12367712, -65379.27827567,
       -65329.12143899, -65247.90605259, -65116.22623945, -64896.67198589,
       -64507.02529816, -63734.75894192, -45570.55257158,      0.        ])