In [19]:
from __future__ import division
import numpy as np
import pandas as pd

In [41]:
def simulate_single_random_walk(boundaries=(0, 4), start=None, rng=None):
    '''
    Runs a single symmetric random walk over the integer states
    boundaries[0]..boundaries[1] (inclusive). At every step the agent moves
    one state left or right with equal probability; the walk ends as soon as
    the agent steps outside the boundaries.
    INPUT
        boundaries: pair of integers (lower, upper). Inclusive range of valid
            states. Defaults to (0, 4).
        start: Integer or None. Starting state. None (default) starts in the
            middle of the range, i.e. state 2 for the default boundaries.
        rng: object with a `choice` method (e.g. np.random.RandomState) or
            None. Source of randomness. None (default) uses the global
            np.random state, so np.random.seed() controls the walk.
    OUTPUT
        outcome: Integer. 0 or 1. 0 if the agent fell out the left side of the
            bounds. 1 if the agent fell outside of the right side of the bounds.
        path: list of integers. The states the agent visited, in order,
            starting with the start state. The out-of-bounds state that ended
            the walk is not included.
    RAISES
        ValueError: if the boundaries are inverted or start lies outside them.
    '''
    lower, upper = boundaries
    if lower > upper:
        raise ValueError('boundaries must satisfy lower <= upper, got %r'
                         % (boundaries,))
    if start is None:
        start = (lower + upper) // 2
    if not lower <= start <= upper:
        raise ValueError('start=%r lies outside boundaries %r'
                         % (start, boundaries))
    if rng is None:
        rng = np.random

    moves = [-1, 1]
    current_state = start
    path = []

    # keep walking while the agent is within the bounds
    while lower <= current_state <= upper:
        path.append(current_state)
        # int() keeps every entry of `path` a plain Python integer instead of
        # a mix of int and numpy scalar types
        current_state += int(rng.choice(moves))

    # compare against the configured lower bound rather than a literal 0
    if current_state < lower:
        return 0, path
    return 1, path

In [45]:
def rmse(predictions, targets):
    '''
    Root-mean-squared error between predictions and targets.
    INPUT
        predictions: array-like of numbers (list, tuple or numpy array).
        targets: array-like of numbers, broadcastable against predictions.
    OUTPUT
        float. sqrt(mean((predictions - targets) ** 2)).
    '''
    # Coerce to float arrays so plain Python sequences are accepted and
    # unsigned / narrow integer inputs cannot wrap around on subtraction.
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(np.mean((predictions - targets) ** 2))

In [48]:
def _td_lambda_increment(w, X, path, outcome, alpha, lam):
    '''
    Returns the summed TD(lambda) weight increment for one finished walk.
    The prediction for a state is np.dot(w, X[state]); the prediction that
    follows the last visited state is the walk outcome itself. `w` is read
    only, never modified.
    '''
    increment = np.zeros_like(w)
    # eligibility trace: sum over k <= t of lam**(t - k) * x_k
    trace = np.zeros_like(w)
    last = len(path) - 1
    for t, state in enumerate(path):
        x_t = X[state]
        trace = lam * trace + x_t
        prediction = np.dot(w, x_t)
        if t < last:
            next_prediction = np.dot(w, X[path[t + 1]])
        else:
            next_prediction = outcome
        increment += alpha * (next_prediction - prediction) * trace
    return increment


def simulate_multiple_walks(alpha=0.35, lam=0, n_batches=100, walks_per_batch=10):
    '''
    Runs multiple random walks and updates expected values (w vector) according
    to the TD(lambda) rule
        delta_w_t = alpha * (P_{t+1} - P_t) * sum_{k<=t} lam**(t-k) * x_k
    where P_t = np.dot(w, x_t) and the prediction after the final step is the
    walk outcome. The increments of one walk are summed and applied to w once
    that walk has finished. Every simulated walk is used for training.
    INPUT
        alpha: float between 0 and 1. Learning rate.
        lam: float between 0 and 1. Lambda value (trace decay).
        n_batches: Integer. Number of batches of walks to simulate.
        walks_per_batch: Integer. Number of walks per batch.
    OUTPUT
        RMS: float. Root Mean Squared Error between the learned w and the true
            probabilities of exiting on the right from each state.
    '''
    # set up initial expected weights
    w = np.array([0.5, 0.5, 0.5, 0.5, 0.5])
    # one-hot feature vector per state
    X = np.array([[1,0,0,0,0], [0,1,0,0,0], [0,0,1,0,0], [0,0,0,1,0], [0,0,0,0,1]])
    # true right-exit probabilities 1/6 .. 5/6; the float divisor keeps this
    # correct even without `from __future__ import division`
    actual_probs = np.arange(1, 6) / 6.0

    # TODO: update to batches per page 20 of Sutton (repeated presentation of
    # a training set until convergence); for now w is updated after each walk.
    for _ in range(n_batches):
        for _ in range(walks_per_batch):
            walk_outcome, path = simulate_single_random_walk()
            # update weights based on this walk
            w += _td_lambda_increment(w, X, path, walk_outcome, alpha, lam)

    rms = rmse(w, actual_probs)
    return rms

In [49]:
# NOTE(review): no random seed is set in the visible cells, so the RMSE
# displayed for this call will differ from run to run.
simulate_multiple_walks(alpha=0.035, lam=0)

0.024388065961979177