In [None]:
%matplotlib notebook

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from mountaincar import MountainCar, MountainCarViewer

In [None]:
# Single simulator instance; the training cell below resets it and steps it.
car = MountainCar()

In [None]:
# NOTE(review): neither of these values is referenced by the cells visible in this
# notebook — confirm whether they are still needed before relying on or removing them.
parameter1 = 3.0
n_steps = 200

In [None]:
# Hyperparameters and grid sizes shared by the cells below.
lmbda = 0.8    # multiplied with gamma to decay the eligibility traces after each step
gamma = 0.95   # discount applied to the next state's value in the TD error
eta = 0.01     # step size of the weight update
dt = 0.01      # second argument passed to car.simulate_timesteps
n = 100        # NOTE(review): not referenced in the visible cells
tau = 1        # NOTE(review): not referenced in the visible cells
steps = 100    # first argument passed to car.simulate_timesteps
width = 20     # number of grid points in xx; size of the first axis of w and e
height = 20    # number of grid points in yy; size of the second axis of w and e
epsilon = 0.2  # threshold compared with a uniform random draw when choosing the next action

In [None]:
# Grid of basis-function centres over the two state components.
# xx is compared against s[0] (car.x) and yy against s[1] (car.x_d).
xx = np.linspace(-150, 30, width)
yy = np.linspace(-15, 15, height)
# Column-vector view of yy, shape (height, 1), used for broadcasting against xx.
yy_t = np.array([yy]).T

In [None]:
def Q(s, a, w, s_x = 1, s_y = 1):
    """Approximate the action value Q(s, a) with Gaussian basis functions.

    Each grid point (xx[i], yy[j]) carries a Gaussian bump; the value is the sum of
    the bumps evaluated at ``s`` weighted by ``w[i, j, a]``.

    Parameters
    ----------
    s : sequence of two floats
        State; ``s[0]`` is compared against ``xx`` and ``s[1]`` against ``yy``.
    a : int
        Action index selecting the weight sheet ``w[:, :, a]``.
    w : ndarray, shape (len(xx), len(yy), n_actions)
        Weights, first axis along ``xx`` and second axis along ``yy``.
    s_x, s_y : float, optional
        Width of the Gaussians along each state component.

    Returns
    -------
    float
        Weighted sum of the basis-function activations.
    """
    # Build the activations with shape (len(xx), len(yy)) so that entry [i, j]
    # belongs to the same grid point as w[i, j, a]. The previous layout was
    # (len(yy), len(xx)), i.e. transposed with respect to the weights, and only
    # ran at all because the grid happened to be square.
    dx = (xx[:, np.newaxis] - s[0]) / s_x
    dy = (yy_t.T - s[1]) / s_y
    activation = np.exp(- dx ** 2 - dy ** 2)
    return np.sum(w[:, :, a] * activation)

In [None]:
def softmax(x, t=1):
    """Return the softmax of ``x`` at temperature ``t``.

    Parameters
    ----------
    x : array_like
        Scores to normalise.
    t : float, optional
        Temperature; the scores are divided by ``t`` before exponentiation.

    Returns
    -------
    ndarray
        Non-negative values of the same shape as ``x`` that sum to 1.
    """
    scaled = np.asarray(x, dtype=float) / t
    if scaled.size == 0:
        # Nothing to normalise; mirror the empty input.
        return scaled
    # Subtracting the maximum leaves the result unchanged mathematically but keeps
    # np.exp from overflowing to inf (which would yield NaN after the division).
    e_x = np.exp(scaled - np.max(scaled))
    return e_x / e_x.sum()

In [None]:
def toClosest(s):
    """Map a state to the indices of the nearest grid point.

    Returns a pair ``(i, j)`` where ``xx[i]`` is the grid value closest to ``s[0]``
    and ``yy[j]`` is the grid value closest to ``s[1]``. Ties resolve to the
    lowest index.
    """
    first, second = s[0], s[1]
    i_near = np.argmin(np.abs(xx - first))
    j_near = np.argmin(np.abs(yy - second))
    return i_near, j_near

In [None]:
# Weights of the basis functions, one (width x height) sheet per action,
# initialised uniformly at random in [0, 1).
w = np.random.rand(width, height, 3)

for epi in range(1):

    # initial state
    car.reset()
    s = car.x, car.x_d
    # initial action
    a = np.random.randint(3)
    # Eligibility traces start at zero: nothing has been visited yet, so no weight
    # should receive credit for the first TD error. (Starting from ones made the
    # first updates shift every weight at once.)
    e = np.zeros((width, height, 3))

    mv = MountainCarViewer(car)
    mv.create_figure(5000, 5000)

    for trial in range(5000):
        # Action indices 0/1/2 are passed to the simulator as -1/0/+1.
        car.apply_force(a - 1)
        car.simulate_timesteps(steps, dt)

        sp = car.x, car.x_d
        sd = toClosest(s)     # grid cell of the state in which `a` was taken
        spd = toClosest(sp)   # grid cell of the successor state

        # epsilon-greedy selection of the next action: explore with probability
        # epsilon, otherwise take the action with the largest weight at the
        # successor's grid cell. (The branches used to be swapped, which made the
        # agent act randomly with probability 1 - epsilon. The softmax wrapper was
        # dropped because it does not change which entry is the largest.)
        if np.random.rand() < epsilon:
            ap = np.random.randint(3)
        else:
            ap = np.argmax(w[spd[0], spd[1], :])

        # SARSA TD error for the transition (s, a) -> (sp, ap)
        delta = car.R + gamma * Q(sp, ap, w) - Q(s, a, w)

        # Credit the state-action pair that was actually executed. The trace was
        # previously bumped at the successor's cell paired with the old action.
        e[sd[0], sd[1], a] += 1

        w += eta * delta * e
        e *= gamma * lmbda

        a = ap
        s = sp

        mv.update_figure()
        plt.draw()

        # Stop the episode as soon as the simulator reports a positive reward.
        if car.R > 0.0:
            print("\rreward obtained at t = ", car.t)
            break
        
