# Stanford CME 241 (Winter 2021) - Assignment 11

In [14]:
import pandas as pd
import numpy as np

from gridWorldEnvironment import GridWorld
gw = GridWorld(gamma = .9, theta = .5)

## MC Prediction

### Generate random episode

In [13]:
def generate_random_episode(env):
    episode = []
    done = False
    current_state = np.random.choice(env.states)
    episode.append((current_state, -1))
    while not done:
        action = np.random.choice(env.actions)
        next_state, reward = gw.state_transition(current_state, action)
        episode.append((next_state, reward))
        if next_state == 0:
            done = True
        current_state = next_state
    return episode

print(generate_random_episode(gw))

# create and initialize value array (include 2 terminal states)
def value_array(env):
    return np.zeros(len(env.states)+2)

[(10, -1), (6, -1), (7, -1), (7, -1), (3, -1), (3, -1), (3, -1), (3, -1), (3, -1), (3, -1), (3, -1), (7, -1), (7, -1), (7, -1), (11, -1), (11, -1), (7, -1), (6, -1), (5, -1), (4, -1), (4, -1), (0, -1)]


#### First-visit MC Prediction

In [22]:
def first_visit_mc(env, num_iter):
    values = value_array(env)
    returns = dict()
    for state in env.states:
        returns[state] = list()
    
    for i in range(num_iter):
        episode = generate_random_episode(env)
        # use 'already_visited' as the variable to exclude states we have visited
        # initialize with the terminal state in it
        already_visited = set({0})   
        for s, r in episode:
            if s not in already_visited:
                already_visited.add(s)
                idx = episode.index((s, r))
                G = 0
                j = 1
                # walk through all states in the random generated episode in each iteration
                while j + idx < len(episode):
                    G = env.gamma * (G + episode[j + idx][1])
                    j += 1
                returns[s].append(G)
                # we only visit each state once; values[s] = returns[s]
                values[s] = np.mean(returns[s])
    return values, returns

In [23]:
values, returns = first_visit_mc(gw, 10000)
values

array([ 0.        , -4.77835972, -6.40212876, -6.86471444, -4.77615712,
       -5.95115629, -6.43682686, -6.40968011, -6.38685514, -6.46450301,
       -5.90752721, -4.70317018, -6.89330328, -6.44994181, -4.74261567,
        0.        ])

#### Every-visit MC Prediction

In [16]:
def every_visit_mc(env, num_iter):
    values = value_array(env)
    returns = dict()
    for state in env.states:
        returns[state] = list()
    
    for i in range(num_iter):
        episode = generate_random_episode(env)
        for s, r in episode:
            if s != 0: # a simple judgement, we don't need to make a set here
                idx = episode.index((s, r))
                G = 0
                j = 1
                while j + idx < len(episode):
                    G = env.gamma * (G + episode[j + idx][1])
                    j += 1
                returns[s].append(G)
                # we assume f(n) = 1/n; see 2/17 - RL Slide 10/44
                values[s] = np.mean(returns[s])
    return values, returns

In [17]:
values, returns = every_visit_mc(gw, 10000)
values

array([ 0.        , -5.83560098, -7.44156575, -7.75810965, -5.88156501,
       -7.06147093, -7.52677057, -7.48542171, -7.47299368, -7.52990292,
       -7.08556877, -5.95335775, -7.81574431, -7.51473732, -5.97236212,
        0.        ])

## Tabular TD Prediction

### Generate policy and episode

In [9]:
def generate_policy(env):
    random_policy = {}
    for state in env.states:
        random_num = sorted(np.random.sample(3))
        actions = env.actions
        prob = [random_num[0], random_num[1] - random_num[0], random_num[2] - random_num[1], 1-random_num[2]]
        random_policy[state] = (actions, prob)
    return random_policy

def generate_episode(env, s0, a0, policy):
    episode = []
    done = False
    current_state, action = s0, a0
    episode.append((current_state, action, -1))
    
    while not done:
        next_state, reward = gw.state_transition(current_state, action)
        prob = policy[current_state][1]
        prob[np.argmax(prob)] -= .2
        prob[np.random.choice(np.delete(np.arange(4), np.argmax(prob)))] += .1
        prob[np.random.choice(np.delete(np.arange(4), np.argmax(prob)))] += .05
        prob[np.random.choice(np.delete(np.arange(4), np.argmax(prob)))] += .05

        action = np.random.choice(policy[current_state][0], p = prob)
        episode.append((next_state, action, reward))
        
        if next_state == 0:   
            done = True
        current_state = next_state
    return episode[:-1]

### Tabular TD(0)

In [10]:
def tabular_td(env, alpha, num_iter):
    value_fuc = np.zeros(len(env.states)+2)
    pi = generate_policy(env)
    
    for _ in range(num_iter):
        s0, a0 = np.random.choice(env.states), np.random.choice(env.actions)
        episode = generate_episode(env, s0, a0, pi)
        already_visited = set()
        
        for step in range(len(episode)):
            current_state, action = episode[step][0], episode[step][1]
            next_state, reward = env.state_transition(current_state, action)
            value_fuc[current_state] += alpha * (reward + env.gamma * value_fuc[next_state] - value_fuc[current_state])            
    return value_fuc

In [13]:
values = tabular_td(gw, .5, 1000)
values

array([ 0.        , -7.56807923, -7.77505403, -8.11106428, -4.60630788,
       -8.04859408, -7.33351423, -7.81881166, -8.24851365, -7.8158604 ,
       -7.29291106, -3.56236681, -8.23071182, -7.54831316, -6.12672111,
        0.        ])