In [23]:
import numpy as np
import scipy

### Description

We have the below Markov Decision Process:<br>
<img src='images/MDP.png'>
<br>
For this MDP, we want to find a value of $\lambda$ such that the expected TD($\lambda$) estimate of the start state's value is the same as the expected estimate we would get were we to use TD(1).

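First, let's program the k-step estimators for k=1 through k=5. Every estimator with k=6 or more looks past the end of the episode, so it is the same as k=infinity, i.e. TD(1), which we compute separately.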

In [16]:
def k_step_estimators(prob_state_1, value_estimates, rewards):
    '''
    Computes the expected k-step estimators of the start state's value for
    k = 1 through 5 (undiscounted, i.e. gamma = 1).

    From state 0 the process moves to state 1 with probability prob_state_1
    and to state 2 otherwise; both branches then continue through states
    3, 4, 5 and 6. Each estimator is the probability-weighted average, over
    the two branches, of (sum of the first k rewards on the branch) +
    (value estimate of the state reached after k steps).
    INPUT
        prob_state_1: probability of transferring to state 1 (versus to state 2)
        value_estimates: list of 7 numbers. Each number represents the value
            estimate for the state of the same index.
        rewards: list of 7 numbers. The branch through state 1 collects
            rewards[0], rewards[2], rewards[4], rewards[5], rewards[6] (in
            that order); the branch through state 2 collects rewards[1],
            rewards[3], rewards[4], rewards[5], rewards[6].
    OUTPUT
        k_est_values: list of 5 floats. Element i is the (i + 1)-step
            estimator, so k_est_values[0] is E_1 and k_est_values[4] is E_5.
    '''
    # States visited, and reward indices collected, along each branch.
    states_via_1 = [0, 1, 3, 4, 5, 6]
    states_via_2 = [0, 2, 3, 4, 5, 6]
    reward_idx_via_1 = [0, 2, 4, 5, 6]
    reward_idx_via_2 = [1, 3, 4, 5, 6]

    reward_arr = np.array(rewards)
    value_arr = np.array(value_estimates)

    rewards_1 = reward_arr[reward_idx_via_1]
    rewards_2 = reward_arr[reward_idx_via_2]
    val_ests_1 = value_arr[states_via_1]
    val_ests_2 = value_arr[states_via_2]

    k_est_values = []
    for k in range(1, 6):
        # k-step return plus the bootstrapped value of the state reached.
        # value_estimates[0] is deliberately NOT added: the start state's own
        # estimate is not part of the k-step target, and including it would
        # make E_5 disagree with the TD(1) return from get_td_1 whenever
        # value_estimates[0] is nonzero.
        est_via_1 = np.sum(rewards_1[:k]) + val_ests_1[k]
        est_via_2 = np.sum(rewards_2[:k]) + val_ests_2[k]
        expected = prob_state_1 * est_via_1 + (1 - prob_state_1) * est_via_2
        # Plain floats, as documented, rather than numpy scalars.
        k_est_values.append(float(expected))

    return k_est_values

In [104]:
def get_td_1(prob_state_1, value_estimates, rewards):
    '''
    Computes the TD(1) estimate for the above MDP: the expected undiscounted
    sum of all rewards collected from the start state to the end of the
    episode, averaged over the two possible branches.
    INPUT
        prob_state_1: probability of transferring to state 1 (versus to state 2)
        value_estimates: accepted so the signature mirrors k_step_estimators;
            it is not used in the computation.
        rewards: list of 7 numbers. The branch through state 1 collects
            rewards[0], rewards[2], rewards[4], rewards[5], rewards[6]; the
            branch through state 2 collects rewards[1], rewards[3],
            rewards[4], rewards[5], rewards[6].
    OUTPUT
        td_1: float.
    '''
    reward_arr = np.array(rewards)

    # Total reward along each branch.
    return_via_1 = reward_arr[[0, 2, 4, 5, 6]].sum()
    return_via_2 = reward_arr[[1, 3, 4, 5, 6]].sum()

    # Weight each branch's return by the probability of taking it.
    return (prob_state_1 * return_via_1) + ((1 - prob_state_1) * return_via_2)

Now let's run through an example to ensure that this function is working correctly.

In [126]:
# Small example whose estimators can be checked by hand.
k_est_values = k_step_estimators(
    prob_state_1=0.5,
    value_estimates=[0, 3, 8, 2, 1, 2, 0],
    rewards=[0, 0, 0, 4, 1, 1, 1],
)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(k_est_values)

[5.5, 4.0, 4.0, 6.0, 5.0]


In [106]:
# TD(1) for the same example inputs used for the k-step estimators.
td_1 = get_td_1(
    prob_state_1=0.5,
    value_estimates=[0, 3, 8, 2, 1, 2, 0],
    rewards=[0, 0, 0, 4, 1, 1, 1],
)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(td_1)

5.0


Now when performing TD($\lambda$), we know that our Expected Value of this MDP will be:<br>
$(1 - \lambda) E_{1} \, + \, \lambda(1-\lambda)E_{2} \, + \,
\lambda^{2}(1-\lambda)E_{3} \, + \, \lambda^{3}(1-\lambda)E_{4} \, + \,
\lambda^{4}(1-\lambda)E_{5}
\, + \, [(1-\lambda) \cdot TD(1) \cdot \sum_{i=6}^{\infty}{\lambda^{i-1}}]
$
<br><br>
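However, because we have (tacitly) assumed that $\gamma=1$ up to this point, we need a terminating state. Here the episode terminates after 5 steps (T=5), so every k-step estimator with $k \geq 6$ equals TD(1). Collapsing the geometric series, $(1-\lambda)\sum_{i=6}^{\infty}{\lambda^{i-1}} = \lambda^{5}$, and setting the whole expression equal to TD(1), we can rewrite this as:

$(1 - \lambda) E_{1} \, + \, \lambda(1-\lambda)E_{2} \, + \,
\lambda^{2}(1-\lambda)E_{3} \, + \, \lambda^{3}(1-\lambda)E_{4} \, + \,
\lambda^{4}(1-\lambda)E_{5} \, + \, \lambda^{5} \, TD(1) = TD(1)$ 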

Doing some algebra, we can rewrite this as a polynomial with respect to $\lambda$:
<br>
$
(E_{1} - TD(1)) \, + \,
\lambda(E_{2} - E_{1}) \, +\, 
\lambda^{2}(E_{3} - E_{2}) \, +\, 
\lambda^{3}(E_{4} - E_{3}) \, +\, 
\lambda^{4}(E_{5} - E_{4}) \, +\, 
\lambda^{5}(TD(1) - E_{5}) \, = 0
$

Now, we just need to solve this polynomial for $\lambda$

In [135]:
def find_lambda(k_ests, td_1):
    '''
    Solves the polynomial from the formulas above for lambda:

        (E1 - TD1) + lam (E2 - E1) + lam^2 (E3 - E2) + lam^3 (E4 - E3)
            + lam^4 (E5 - E4) + lam^5 (TD1 - E5) = 0

    INPUT
        k_ests: list of floats. Output of k_step_estimators; k_ests[0] is E1
            and k_ests[4] is E5.
        td_1: Our Expected value given TD(1)
    OUTPUT
        lam: numpy array holding every root of the polynomial, exactly as
            returned by np.roots. The roots are not filtered, so the array
            can contain lambda = 1 and values outside [0, 1]; fewer than five
            roots come back when the leading coefficient is zero (for
            example when td_1 equals k_ests[4]).
    '''
    # Walk TD1 -> E5 -> E4 -> E3 -> E2 -> E1 -> TD1; each consecutive
    # difference is one coefficient, highest power of lambda first, which is
    # the ordering np.roots expects.
    chain = [td_1] + [k_ests[i] for i in (4, 3, 2, 1, 0)] + [td_1]
    coeffs = [chain[i] - chain[i + 1] for i in range(6)]

    return np.roots(coeffs)

In [136]:
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(find_lambda(k_est_values, td_1))

[-0.85463768  1.45160596  1.          0.40303172]


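Of these roots, $\lambda = 1$ is the trivial solution (TD(1) itself) and two of the others lie outside $[0, 1]$, which leaves $\lambda \approx 0.40303$. We can verify, by hand, that .40303 will result in the same Expected Value.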

Now that we've verified this is working for a simple case, let's build a pipeline to take in 
our original inputs and output our $\lambda$ values.

In [137]:
def expectation_pipeline(prob_state_1, value_estimates, rewards):
    '''
    Goes from the raw MDP inputs to the lambda solutions in one call: builds
    the k-step estimators, computes TD(1), then solves for lambda.
    INPUT
        prob_state_1: probability of transferring to state 1 (versus to state 2)
        value_estimates: list of numbers. Each number represents the value
            estimate for the state of the same index.
        rewards: list of numbers, indexed as expected by k_step_estimators
            and get_td_1.
    OUTPUT
        lam: the result of find_lambda for these inputs (all candidate
            lambda roots, not a single value).
    '''
    estimators = k_step_estimators(prob_state_1, value_estimates, rewards)
    full_return = get_td_1(prob_state_1, value_estimates, rewards)
    return find_lambda(estimators, full_return)

In [138]:
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(expectation_pipeline(
    prob_state_1=0.31,
    value_estimates=[0.0, 22.1, 20.4, 8.6, -4.1, 0.0, 0.0],
    rewards=[7.0, 3.7, 5.2, -0.3, -4.4, 7.7, 9.3],
))

[-1.80023857 -0.91864104  1.          0.45006241]
