# Stanford CME 241 (Winter 2021) - Assignment 16 

In [2]:
import itertools
import types

import numpy as np

### Policy Gradient Algorithms

#### Monte-Carlo (REINFORCE)

In [3]:
def reinforce(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
    """Run Monte-Carlo policy gradient (REINFORCE) with a value baseline.

    Each episode is rolled out to completion with actions sampled from the
    current policy; afterwards, every visited step is used once to update
    the value estimator (towards the observed return) and the policy
    estimator (with the return minus the baseline as the advantage).

    Args:
        env: Environment exposing ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done, info)``.
        estimator_policy: Object with ``predict(state)`` returning a vector
            of action probabilities and ``update(state, advantage, action)``.
        estimator_value: Object with ``predict(state)`` returning the
            baseline value and ``update(state, target)``.
        num_episodes: Number of episodes to run.
        discount_factor: Discount applied per time step when computing
            returns.

    Returns:
        An object with two NumPy arrays of length ``num_episodes``:
        ``episode_rewards`` (undiscounted reward sum per episode) and
        ``episode_lengths`` (index of the last step of each episode, i.e.
        number of steps minus one).
    """
    # The statistics container is created here so the function is
    # self-contained instead of relying on a name defined elsewhere.
    values = types.SimpleNamespace(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
    )

    for i_episode in range(num_episodes):
        state = env.reset()
        # One (state, action, reward) tuple per step; nothing else from the
        # transition is needed for the Monte-Carlo update below.
        episode = []

        for t in itertools.count():
            action_probs = estimator_policy.predict(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            episode.append((state, action, reward))

            values.episode_rewards[i_episode] += reward
            values.episode_lengths[i_episode] = t

            if done:
                break
            state = next_state

        # Discounted return from every step, accumulated backwards in a
        # single pass: G_t = r_t + discount_factor * G_{t+1}.
        returns = [0.0] * len(episode)
        running_return = 0.0
        for step in reversed(range(len(episode))):
            running_return = episode[step][2] + discount_factor * running_return
            returns[step] = running_return

        for (step_state, step_action, _), total_return in zip(episode, returns):
            # The baseline is read before the value estimator is updated so
            # the advantage uses the pre-update estimate.
            baseline_value = estimator_value.predict(step_state)
            advantage = total_return - baseline_value
            estimator_value.update(step_state, total_return)
            estimator_policy.update(step_state, advantage, step_action)

    return values

#### ACTOR-CRITIC-ELIGIBILITY-TRACES

In [4]:
def actor_critic(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
    """Run a one-step (TD(0)) actor-critic algorithm.

    After every environment step the critic is moved towards the TD target
    and the actor is updated with the TD error as the advantage estimate.
    No eligibility traces are maintained by this function; updates use the
    most recent transition only.

    Args:
        env: Environment exposing ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done, info)``.
        estimator_policy: Object with ``predict(state)`` returning a vector
            of action probabilities and ``update(state, td_error, action)``.
        estimator_value: Object with ``predict(state)`` returning a state
            value and ``update(state, td_target)``.
        num_episodes: Number of episodes to run.
        discount_factor: Discount applied to the bootstrapped next-state
            value.

    Returns:
        An object with two NumPy arrays of length ``num_episodes``:
        ``episode_rewards`` (undiscounted reward sum per episode) and
        ``episode_lengths`` (index of the last step of each episode, i.e.
        number of steps minus one).
    """
    # The statistics container is created here so the function is
    # self-contained instead of relying on a name defined elsewhere.
    values = types.SimpleNamespace(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
    )

    for i_episode in range(num_episodes):
        state = env.reset()

        for t in itertools.count():
            action_probs = estimator_policy.predict(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            values.episode_rewards[i_episode] += reward
            values.episode_lengths[i_episode] = t

            # A terminal state has no future reward, so do not bootstrap
            # from the critic's estimate once the episode has ended.
            value_next = 0.0 if done else estimator_value.predict(next_state)
            td_target = reward + discount_factor * value_next
            # The TD error is computed before the critic update so it
            # reflects the pre-update value of the current state.
            td_error = td_target - estimator_value.predict(state)

            estimator_value.update(state, td_target)
            estimator_policy.update(state, td_error, action)

            if done:
                break
            state = next_state

    return values

#### Evaluate the score function

$$
  \begin{align*}
    \log \pi(s,a;\theta) &= \theta^T \phi(s,a) - \log\Big(\displaystyle\sum_{b \in A} e^{\theta^T \phi(s,b)}\Big) \\
    \frac{\partial}{\partial \theta_i} \log \pi(s,a;\theta) &= \phi_i (s,a) - \frac{\displaystyle\sum_{b \in A} \phi_i (s,b) e^{\theta^T \phi(s,b)}}{\displaystyle\sum_{b \in A} e^{\theta^T \phi(s,b)}} \\
    &= \phi_i (s,a) - \displaystyle\sum_{b \in A} \pi (s,b;\theta) \phi_i (s,b) \\
    &= \phi_i (s,a) - \mathbb{E}_{\pi} [\phi_i (s,\cdot)] \\
  \end{align*}
$$

#### Construct the Action-Value function approximation

$$ Q (s,a;w) = w^T \frac{d}{d \theta} \log \pi(s,a; \theta) $$

#### Action-Value function approximation has zero mean for any state s

$$
  \begin{align*}
    \displaystyle\sum_{a \in A} \pi(s,a;\theta) Q(s,a;w) &= \displaystyle\sum_{a \in A} \pi(s,a;\theta) w^T \frac{d}{d \theta} \log \pi(s,a; \theta) \\
    &= \displaystyle\sum_{a \in A} w^T \frac{d}{d \theta} \pi(s,a; \theta) \qquad \text{since } \pi \frac{d}{d\theta} \log \pi = \frac{d}{d\theta} \pi \\
    &= w^T \frac{d}{d\theta} \displaystyle\sum_{a \in A} \pi(s,a;\theta) \\
    &= w^T \frac{d}{d\theta} (1) \\
    &= 0 \\
  \end{align*}
$$