In [1]:
from __future__ import division

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd


In [44]:

def simulate_single_random_walk():
    '''
    Runs a sinlge random walk starting in the middle of range(0,4).
    If the current_state goes outside this range, the random walk ends.
    INPUT
        NONE
    OUTPUT 
        path: list of integers. The indices that the agent covered in the random walk.
    '''
    boundaries = [0,6]
    moves = [-1, 1]
    current_state = 3
    path = []
    
    # while x is within the bounds
    while current_state > boundaries[0] and current_state < boundaries[1]:
        path.append(current_state)
        move = np.random.choice(moves)
        current_state += move
    # add terminal state
    path.append(current_state)
        
    return path

In [45]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [52]:
def w_update(lam, alpha, P_t_1, P_t_0, e_t, x_i):
    # e_t update per page 16 of Learning to Predict by Methods of Temporal Difference
    delta_w_t = (alpha * (P_t_1 - P_t_0) * (lam * e_t + P_t_1)) * x_i
    return delta_w_t

def convergence_simulator(alpha, lam):
    states = [np.zeros(7) for _ in range(7)]
    for idx, state in enumerate(states):
        state[idx] = 1.0
    w = np.array([0, .5, .5, .5, .5, .5, 1])
    actual_probs = np.array([0, 1/6, 1/3, 1/2, 2/3, 5/6, 1])
    rmse_list = []
    
    for training_set in range(100):
        # paths will be a list of lists. Each sublist will be a path walked
        paths = []
        for sequence in range(10):
            path = simulate_single_random_walk()
            paths.append(path)
            
        iter_count = 0
        sigma = .01
        iteration_delta = 100
        while iteration_delta > sigma and iter_count < 1000:
            iter_count += 1
                
            for path in paths:
                original_w = w.copy()
                for idx in range(len(path) - 1):
                    if idx == 0:
                        e_t = path[0]
                    t = idx + 1
                    P_t_1 = np.dot(states[path[idx + 1]], w)
                    P_t_0 = np.dot(states[path[idx]], w)
                    w += w_update(lam, alpha, P_t_1, P_t_0, e_t, states[path[idx]])
                iteration_delta = np.linalg.norm(w - original_w)
                
        rmse_estimate = rmse(w, actual_probs)
        rmse_list.append(rmse_estimate)
        
    print("w: ", w)
    return np.average(rmse_estimate)

In [54]:
convergence_simulator(alpha=.035, lam=0.5)

0.026058920170284119