# Feed Full Env Using Tensors
If you are following along with the book, look through `feed_full.ipynb` first. This is an implementation of the same environment using tensors instead of 2d matrices.

In [1]:
import numpy as np
import scipy as sp

In [2]:
p_S0 = np.array([1/2, 1/2])

In [5]:
dynamics_raw = {("hungry", -2, "hungry", "ignore"): 1,
                ("hungry", -3, "hungry", "feed"): 1/3,
                ("full", 1, "hungry", "feed"): 2/3,
                ("hungry", -2, "full", "ignore"): 3/4,
                ("full", 2, "full", "ignore"): 1/4,
                ("full", 1, "full", "feed"): 1}

state_map = {"hungry": 0,
             "full": 1}
reward_map = {-3: 0,
              -2: 1,
               1: 2,
               2: 3}
action_map = {"ignore": 0,
              "feed": 1}

In [16]:
reward_vec = np.array([-3,-2,1,2])

In [None]:
dynamics_matrix = np.zeros((2, 4, 2, 2))
for (next_state, reward, state, action), prob in dynamics_raw.items():
    dynamics_matrix[state_map[next_state], reward_map[reward], state_map[state], action_map[action]] = prob
dynamics_matrix.reshape(2,4,4).reshape(8,4) # The dynamics matrix is basically the same.

array([[0.        , 0.33333333, 0.        , 0.        ],
       [1.        , 0.        , 0.75      , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.66666667, 0.        , 1.        ],
       [0.        , 0.        , 0.25      , 0.        ]])

In [13]:
p_sp_given_s_a = np.sum(dynamics_matrix, axis=1)
p_sp_given_s_a.reshape(2,4)

array([[1.        , 0.33333333, 0.75      , 0.        ],
       [0.        , 0.66666667, 0.25      , 1.        ]])

In [82]:
r = np.sum(dynamics_matrix * reward_vec[np.newaxis, :, np.newaxis, np.newaxis], axis=(0,1))
r

array([[-2.        , -0.33333333],
       [-1.        ,  1.        ]])

In [81]:
# another way to compute r.
r1 = reward_vec[np.newaxis, np.newaxis, :] @ np.sum(dynamics_matrix, axis=0).transpose(1,0,2)
r1[:,0,:]

array([[-2.        , -0.33333333],
       [-1.        ,  1.        ]])

In [27]:
r_sp = np.sum(dynamics_matrix * reward_vec[np.newaxis, :, np.newaxis, np.newaxis], axis=1) / p_sp_given_s_a
r_sp = np.nan_to_num(r_sp)
r_sp.reshape(2,4)

  r_sp = np.sum(dynamics_matrix * reward_vec[np.newaxis, :, np.newaxis, np.newaxis], axis=1) / p_sp_given_s_a


array([[-2., -3., -2.,  0.],
       [ 0.,  1.,  2.,  1.]])

In [28]:
pi = np.array([[1/4, 3/4], 
               [5/6, 1/6]])
pi

array([[0.25      , 0.75      ],
       [0.83333333, 0.16666667]])

In [30]:
p_S0_A0_pi = p_S0[:, np.newaxis] * pi
p_S0_A0_pi

array([[0.125     , 0.375     ],
       [0.41666667, 0.08333333]])

In [55]:
# (S, A) and (S', S, A)
# (S, 1, A) and (S, A, S') # The S dimension must line up because you can only put pi(a|s) and p(s'|s, a) 
# when s and a are the same. furthermore, we are summing over all the actions a.
p_pi_sp_given_s = pi[:, np.newaxis, :] @ p_sp_given_s_a.transpose(1, 2, 0)
p_pi_sp_given_s = p_pi_sp_given_s[:, 0, :].transpose(1,0) # (S, S') shape 0 index is like squeezing in pytorch.
p_pi_sp_given_s

array([[0.5  , 0.625],
       [0.5  , 0.375]])

In [None]:
p_sp_given_s_a.shape # (S', S, A)

(2, 2, 2)

In [None]:
pi.shape # (S', A)

(2, 2)

In [None]:
def get_p_pi_sp_ap_given_s_a(pi, p_sp_given_s_a):
    # (S', A', 1, 1) * (S', 1, S, A)
    p_pi = pi[:, :, np.newaxis, np.newaxis] * p_sp_given_s_a[:, np.newaxis, :, :]
    return p_pi

p_pi_sp_ap_given_s_a = get_p_pi_sp_ap_given_s_a(pi, p_sp_given_s_a)
p_pi_sp_ap_given_s_a.reshape(4,4)

array([[0.25      , 0.08333333, 0.1875    , 0.        ],
       [0.75      , 0.25      , 0.5625    , 0.        ],
       [0.        , 0.55555556, 0.20833333, 0.83333333],
       [0.        , 0.11111111, 0.04166667, 0.16666667]])

In [71]:
r_pi = np.sum(r * pi, axis=1)
r_pi

array([-0.75      , -0.66666667])

In [None]:
def get_vq2(p_pi_sp_given_s, p_sp_given_s_a, gamma, pi, r):
    q_pi = np.linalg.inv(np.eye(4) - gamma * p_pi_sp_ap_given_s_a.T) @ r
    v_pi = np.array([[1,1,0,0],[0,0,1,1]]) @ (pi * q_pi)
    return v_pi, q_pi

array([[0.5  , 0.625],
       [0.5  , 0.375]])