In [1]:
import numpy as np
import random

In [2]:
import grid_mdp

In [3]:
def update_valuepolicy(valuepolicy, f, a, tvalue, alpha):
    
    pvalue = valuepolicy.qfunc(f,a)
    error = pvalue - tvalue
    fea = valuepolicy.get_fea_vec(f, a)
    valuepolicy.theta -= alpha * error * fea

In [4]:
def update_softmaxpolicy(softmaxpolicy, f, a, qvalue, alpha):
    
    fea = softmaxpolicy.get_fea_vec(f,a)
    prob = softmaxpolicy.pi(f)
    
    delta_logj = fea
    
    for i, ax in enumerate(softmaxpolicy.actions):
        feax = softmaxpolicy.get_fea_vec(f, ax)
        delta_logj -= feax * prob[i]
        
    delta_logj *= -1.0
    
    softmaxpolicy.theta -= alpha *delta_logj * qvalue

In [5]:
def mc(grid, softmaxpolicy, num_iters, alpha):
    
    actions = grid.actions
    gamma = grid.gamma
    
    
    for _ in range(num_iters):
        
        f_sample = []
        a_sample = []
        r_sample = []
        
        f = grid.start()
        t = False
        while t == False:
            
            a = softmaxpolicy.take_action(f)
            t, f1, r = grid.receive(a)
            
            f_sample.append(f)
            r_sample.append(r)
            a_sample.append(a)
            
            f = f1
            
        g = 0.0
        for i in range(len(f_sample)-1, -1, -1):
            
            g *= gamma
            g += r_sample[i]
            
        
        for i in range( len(f_sample)):
            
            update_softmaxpolicy(softmaxpolicy, f_sample[i], a_sample[i], g, alpha)
            
            g -= r_sample[i]
            g /= gamma
            
        return softmaxpolicy

In [6]:
grid = grid_mdp.Grid_Mdp()
softmaxpolicy = grid_mdp.SoftmaxPolicy(grid, epsilon=0.01)

In [7]:
rep = mc(grid, softmaxpolicy, 10000, 0.01)

In [8]:
rep.pi([0,0,0,0,1,0,0,0])

[0.25, 0.25, 0.25, 0.25]

In [16]:
def sarsa(grid, policy, value, num_iter1, alpha):
    
    actions = grid.actions
    gamma   = grid.gamma
    
    for i in xrange(len(policy.theta)):
        
        value.theta[i]  = 0.1
        policy.theta[i] = 0.0

    for iter1 in xrange(num_iter1):
        
        f = grid.start();
        a = actions[int(random.random() * len(actions))]
        t = False

        while False == t:
            
            t,f1,r      = grid.receive(a)
            a1          = policy.take_action(f1)
            
            update_valuepolicy(value, f, a,  r + gamma * value.qfunc(f1, a1), alpha)
            update_softmaxpolicy(policy, f, a, value.qfunc(f,a), alpha);

            f           = f1
            a           = a1

    return policy

In [17]:
grid            = grid_mdp.Grid_Mdp()
softmaxpolicy   = grid_mdp.SoftmaxPolicy(grid, epsilon = 0.01)
valuepolicy     = grid_mdp.ValuePolicy(grid, epsilon = 0.01)

In [19]:
softmaxpolicy  = sarsa(grid, softmaxpolicy, valuepolicy, 2000, 0.01)

In [23]:
softmaxpolicy.pi([0,0,1,0,0,0,0,0])

[0.049158300478542687,
 0.03502352101574302,
 0.88173130885287632,
 0.034086869652838074]