# Stanford CME 241 (Winter 2021) - Assignment 15

In [None]:
import random
from typing import Sequence, Tuple, Mapping

# Type aliases shared by the sample-preparation and value-function helpers.
# A state is identified by its string label.
S = str
# A data set: a sequence of episodes, each a sequence of (state, reward) pairs.
DataType = Sequence[Sequence[Tuple[S, float]]]
# Nested mapping state -> state -> float; presumably transition
# probabilities (not used by the visible code) -- TODO confirm.
ProbFunc = Mapping[S, Mapping[S, float]]
# Mapping state -> float; presumably a per-state reward (not used by the
# visible code) -- TODO confirm.
RewardFunc = Mapping[S, float]
# Mapping state -> float: the estimated value of each state.
ValueFunc = Mapping[S, float]

def get_state_return_samples(
    data: DataType
) -> Sequence[Tuple[S, float]]:
    """
    Flatten episodes into (state, return) pairs, one pair per visited step.

    The return attached to a step is the undiscounted sum of that step's
    reward and every later reward in the same episode, so a (state, return)
    pair is not the same thing as a (state, reward) pair.
    """
    pairs = []
    for episode in data:
        for start, (state, _) in enumerate(episode):
            # Rewards from this step through the end of the episode.
            tail_rewards = [reward for _, reward in episode[start:]]
            pairs.append((state, sum(tail_rewards)))
    return pairs


def get_mc_value_function(
    state_return_samples: Sequence[Tuple[S, float]]
) -> ValueFunc:
    """
    Estimate a tabular Monte Carlo value function from (state, return) pairs.

    Every pair counts as one visit (every-visit MC): the value of a state is
    the arithmetic mean of all returns recorded for it.

    Args:
        state_return_samples: flat sequence of (state, return) pairs.

    Returns:
        A dict mapping each state seen in the samples to its mean return.
        Empty input yields an empty dict.
    """
    return_totals = {}
    visit_counts = {}
    for state, ret in state_return_samples:
        return_totals[state] = return_totals.get(state, 0.0) + ret
        visit_counts[state] = visit_counts.get(state, 0) + 1
    return {
        state: total / visit_counts[state]
        for state, total in return_totals.items()
    }
                
def get_state_reward_next_state_samples(
    data: DataType
) -> Sequence[Tuple[S, float, S]]:
    """
    Flatten episodes into (state, reward, next_state) triples.

    The next state of the final step of each episode is the marker 'T'.
    """
    triples = []
    for episode in data:
        final_index = len(episode) - 1
        for idx, (state, reward) in enumerate(episode):
            if idx >= final_index:
                # Last step of the episode: no successor in the data.
                successor = 'T'
            else:
                successor = episode[idx + 1][0]
            triples.append((state, reward, successor))
    return triples

def get_td_value_function(
    srs_samples: Sequence[Tuple[S, float, S]],
    num_updates: int = 300000,
    learning_rate: float = 0.3,
    learning_rate_decay: int = 30
) -> ValueFunc:
    """
    Estimate a tabular TD(0) value function using experience replay.

    Each update draws one stored (state, reward, next_state) transition
    uniformly at random and moves the value of `state` towards the
    undiscounted target `reward + V(next_state)`. The step size for update
    number `updates` (starting at 0) is
    learning_rate * (updates / learning_rate_decay + 1) ** -0.5
    so that the Robbins-Monro condition is satisfied for the sequence of
    step sizes.

    Args:
        srs_samples: sequence of (state, reward, next_state) transitions.
        num_updates: number of replayed updates to perform.
        learning_rate: initial step size.
        learning_rate_decay: controls how quickly the step size shrinks.

    Returns:
        A dict mapping every state that appears as the source of a
        transition to its estimated value. States that only ever appear as
        a next state (e.g. a terminal marker) are treated as having value 0
        and are not included. Empty input yields an empty dict.
    """
    if not srs_samples:
        return {}

    # Only source states are ever updated; all start at 0.
    values = {state: 0.0 for state, _, _ in srs_samples}
    for updates in range(num_updates):
        state, reward, next_state = random.choice(srs_samples)
        alpha = learning_rate * (updates / learning_rate_decay + 1) ** -0.5
        # A next state with no outgoing transition has no entry: value 0.
        target = reward + values.get(next_state, 0.0)
        values[state] += alpha * (target - values[state])
    return values
            