# Stanford CME 241 (Winter 2021) - Assignment 12

In [2]:
import pandas as pd
import numpy as np

from gridWorldEnvironment import GridWorld
# Shared grid-world environment used by the cells below.
# NOTE(review): gamma is presumably the discount factor later read as env.gamma;
# theta is not used in this section -- confirm its meaning in gridWorldEnvironment.
gw = GridWorld(gamma=0.9, theta=0.5)

### n-Step TD Prediction

In [3]:
def generate_random_policy(env):
    """Build a uniform-random policy over ``env.actions`` for every state.

    Parameters
    ----------
    env : object
        Must expose ``states`` (iterable of hashable states) and ``actions``
        (iterable of actions available in every state).

    Returns
    -------
    dict
        Maps each state to a tuple ``(actions, probabilities)`` where both are
        lists of equal length and every probability is ``1 / len(actions)``.
        Each state gets its own list objects, so mutating one entry does not
        affect the others.
    """
    actions = list(env.actions)
    n_actions = len(actions)
    # Derive the probability from the action count instead of hard-coding 0.25,
    # so the distribution sums to 1 for any number of actions. The guard keeps
    # an action-less environment returning empty lists rather than dividing by 0.
    uniform_prob = 1.0 / n_actions if n_actions else 0.0
    return {
        state: (list(actions), [uniform_prob] * n_actions)
        for state in env.states
    }

In [6]:
def n_step_td_prediction(env, alpha = .5, n = 3, num_iter = 100):
    """Estimate state values of the uniform-random policy with n-step TD prediction.

    Every episode starts from a state drawn uniformly from ``env.states`` and
    follows the policy built by ``generate_random_policy`` until
    ``env.state_transition`` returns state ``0``, which this function treats
    as the terminal state. The terminal state's value is never read, so it is
    implicitly zero.

    Parameters
    ----------
    env : object
        Must expose ``states``, ``actions``, ``gamma`` and
        ``state_transition(state, action) -> (next_state, reward)``.
    alpha : float
        Step size applied to each n-step TD update.
    n : int
        Number of rewards accumulated before bootstrapping; must be >= 1.
    num_iter : int
        Number of episodes to simulate.

    Returns
    -------
    dict
        Maps every state in ``env.states`` to its estimated value. Initial
        values are drawn from a standard normal, so results differ between
        runs unless NumPy's global random state is seeded by the caller.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    # n < 1 would index past the recorded trajectory and fail deep in the loop;
    # reject it up front with a clear message instead.
    if n < 1:
        raise ValueError("n must be a positive integer")

    value_func = {}
    for state in env.states:
        value_func[state] = np.random.normal()
    policy = generate_random_policy(env)

    for _ in range(num_iter):
        current_state = np.random.choice(env.states)
        state_trace = [current_state]   # S_0, S_1, ...
        reward_trace = []               # reward_trace[k] holds R_{k+1}
        # total_trace plays the role of the episode length T. It starts as a
        # large sentinel and is fixed once the terminal state is reached; an
        # episode that runs for 10000 steps is cut off there and its tail is
        # updated without bootstrapping.
        trace, total_trace = 0, 10000
        while True:
            if trace < total_trace:
                actions, probs = policy[current_state]
                action = np.random.choice(actions, p = probs)
                next_state, reward = env.state_transition(current_state, action)
                state_trace.append(next_state)
                reward_trace.append(reward)
                if next_state == 0:
                    total_trace = trace + 1
                current_state = next_state

            # step is the time index whose state estimate is updated now.
            step = trace - n + 1
            if step >= 0:
                returns = 0
                last = min(step + n, total_trace)
                for j in range(step + 1, last + 1):
                    returns += (env.gamma ** (j - step - 1)) * reward_trace[j - 1]
                # Bootstrap only while the n-step horizon ends before the
                # episode does; otherwise the return is the plain reward sum.
                if step + n < total_trace:
                    returns += (env.gamma ** n) * value_func[state_trace[step + n]]
                updated_state = state_trace[step]
                value_func[updated_state] += alpha * (returns - value_func[updated_state])

            if step == total_trace - 1:
                break
            trace += 1
    return value_func

In [13]:
# Run 10,000 episodes with the default alpha and n, then show the estimates as
# an array ordered like the keys of the returned dict.
values = n_step_td_prediction(env=gw, num_iter=10_000)
np.array([*values.values()])

array([-6.89560882, -7.22369492, -8.68058755, -3.61373811, -5.96515275,
       -8.63247721, -7.48758891, -4.70309762, -7.41797356, -6.19668666,
       -8.26693171, -5.09803511, -7.15102475, -5.69806612])

### MC Error & TD errors

Recall two points about the TD error: it is the error in the estimate made at time $t$; and, because it depends on the next state and next reward, it is not available until one time step later. The TD error is defined as: $$\delta_t \doteq R_{t+1} + \gamma V(S_{t+1}) -V(S_t)$$
The Monte Carlo error can be written as:
$$
  \begin{align*}
    G_t - V(S_t) &= R_{t+1} + \gamma G_{t+1} - V(S_t) + \gamma V(S_{t+1}) - \gamma V(S_{t+1}) \\
    &= \delta_t + \gamma (G_{t+1} - V(S_{t+1})) \\
    &= \delta_t + \gamma \delta_{t+1} + \gamma^2 (G_{t+2} - V(S_{t+2})) \\ 
    &= \delta_t + \gamma \delta_{t+1} + \gamma^2 \delta_{t+2} + \cdots + \gamma^{T-t-1} \delta_{T-1} + \gamma^{T-t} (G_T - V(S_T)) \\
    &= \delta_t + \gamma \delta_{t+1} + \gamma^2 \delta_{t+2} + \cdots + \gamma^{T-t-1} \delta_{T-1} + \gamma^{T-t} (0 - 0) \\
    &= \displaystyle\sum_{k=t}^{T-1} \gamma^{k-t} \delta_k \\
  \end{align*}
$$
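which shows that the MC error can be written as the sum of discounted TD errors. This identity is exact only if $V$ is not updated during the episode; if $V$ changes along the way (as in online TD), it holds approximately when the step size is small.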