In [None]:
%matplotlib inline

In [None]:
import sys
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
from mountaincar import MountainCar, MountainCarViewer

In [None]:
# Make floating-point overflow raise FloatingPointError instead of only
# emitting a warning, so an overflowing np.exp fails loudly where it happens.
np.seterr(over='raise')

In [None]:
# Module-level simulator instance; sarsa() resets and drives this same object.
car = MountainCar()

### Plot functions

In [None]:
def vec_plot(p):
    """Draw the preferred action of every grid cell as a horizontal arrow.

    Parameters
    ----------
    p : array-like, shape (n_rows, n_cols, n_actions)
        Per-cell action preferences (probabilities or Q-values). The action
        with the largest value along the last axis is the one drawn. Rows are
        laid out along the y axis (speed) and columns along the x axis
        (position), which is the layout of the grids returned by ``sarsa``.

    Notes
    -----
    Action index ``a`` is drawn as the force ``a - 1``, the same mapping
    ``sarsa`` uses in ``car.apply_force(a - 1)``: action 0 points left,
    action 1 has no horizontal component and action 2 points right.
    """
    p_max = np.argmax(p, axis=2)

    # horizontal component = applied force; there is never a vertical one
    U = p_max - 1
    V = np.zeros_like(U)
    n_rows, n_cols = U.shape

    plt.quiver(U, V, alpha=1, scale=1.8, units='xy')

    # quiver places cell (row, col) at x=col, y=row; leave a one-cell margin
    plt.xlim(-1, n_cols)
    plt.xticks(())
    plt.ylim(-1, n_rows)
    plt.yticks(())

    plt.xlabel('position $x$')
    # raw string: '\d' is not a valid escape sequence in a normal literal
    plt.ylabel(r'speed $\dot x$')
    plt.title('Q-values direction vector field (arrows show the direction of applied force)')

    plt.show()

In [None]:
def plot3D(q):
    """Show ``q`` as a grey wireframe surface over the (position, speed) grid.

    ``q`` is handed to ``plot_wireframe`` together with
    ``np.meshgrid(x_pos, y_speed)``, so it has to match that grid's shape,
    ``(len(y_speed), len(x_pos))``. The coordinates come from the
    module-level ``x_pos`` and ``y_speed``.
    """
    grid_x, grid_y = np.meshgrid(x_pos, y_speed)

    axes = plt.figure().add_subplot(111, projection='3d')
    axes.plot_wireframe(grid_x, grid_y, q, color='grey')

    axes.set_xlabel('position')
    axes.set_ylabel('speed')
    axes.set_zlabel('max q')

    plt.show()

### Helper coordinates

In [None]:
# Grid discretisation: number of grid points along the position axis and
# along the speed axis.
ngrid_pos, ngrid_speed = 20, 20

In [None]:
# Grid corners: (lowest, highest) coordinate covered along each axis.
int_pos = (-150, 30)
int_speed = (-15, 15)

In [None]:
# Evenly spaced grid coordinates per axis. retstep=True also returns the
# spacing between neighbouring points, which activity() uses to scale distances.
x_pos, center_dist_pos = np.linspace(*int_pos, num=ngrid_pos, retstep=True)
y_speed, center_dist_speed = np.linspace(*int_speed, num=ngrid_speed, retstep=True)

# Column-vector view of the speeds so they broadcast against the row x_pos.
y_speed_t = y_speed[:, np.newaxis]

### Helper functions

In [None]:
def activity(s):
    """Gaussian response of every grid point to the state ``s``.

    ``s`` is indexed as ``(position, speed)``. Each grid point responds with
    ``exp(-dx**2 - dy**2)``, where ``dx`` and ``dy`` are its offsets from
    ``s`` divided by the grid spacing of the respective axis. The result is
    transposed so that it is indexed ``[position, speed]``.
    """
    dx = (x_pos - s[0]) / center_dist_pos          # row: one entry per position
    dy = (y_speed_t - s[1]) / center_dist_speed    # column: one entry per speed
    response = np.exp(-(dx ** 2) - (dy ** 2))      # broadcasts to [speed, position]
    return response.T

In [None]:
def Q(s, a, w):
    """Return the Q-value of action ``a`` in state ``s``.

    The value is the sum over the grid of the weight slice ``w[:, :, a]``
    multiplied element-wise by ``activity(s)``.
    """
    weights_for_action = w[:, :, a]
    return (weights_for_action * activity(s)).sum()

In [None]:
def softmax(x, tau):
    """Softmax over min-max rescaled values, with temperature ``tau``.

    The values are first rescaled linearly to the interval [0, 1], so ``tau``
    has the same meaning whatever the magnitude of ``x``: a small ``tau``
    concentrates the probability on the largest entry, a large ``tau``
    approaches the uniform distribution.

    Parameters
    ----------
    x : 1-D sequence of numbers
        Values to turn into probabilities (e.g. one Q-value per action).
    tau : float
        Temperature; must be non-zero.

    Returns
    -------
    numpy.ndarray
        Probabilities with the same length as ``x`` that sum to 1. When all
        entries of ``x`` are equal (all zero included) the result is uniform.
        An empty ``x`` gives an empty array.
    """
    values = np.asarray(x, dtype=float)

    if values.size == 0:
        return values

    lowest = values.min()
    spread = values.max() - lowest

    # No entry is preferred over another. This also avoids the 0 / 0 that the
    # rescaling below would produce for equal, non-zero values.
    if spread == 0:
        return np.full(values.shape, 1.0 / values.size)

    scaled = (values - lowest) / spread

    # Shift so the largest exponent is 0: softmax is unchanged by a constant
    # shift, and exp() can then only underflow, never overflow, for small tau.
    e_x = np.exp((scaled - 1.0) / tau)
    return e_x / e_x.sum()

In [None]:
# Sanity check of softmax(): selection probability of three fixed Q-values
# as the temperature sweeps from 0.1 to 1000 (log-spaced).
qs = np.array([0.3, 0.2, 0.1])
taus = np.logspace(-1, 3, 40)
action_probs = np.array([softmax(qs, tau_i) for tau_i in taus])  # one row per tau

plt.xscale("log")
for q_value, prob_curve in zip(qs, action_probs.T):
    plt.plot(taus, prob_curve, label='Q = {}'.format(q_value))
plt.xlabel(r'temperature $\tau$')
plt.ylabel('selection probability')
plt.title('Softmax action probabilities vs. temperature')
plt.legend()
plt.show()

In [None]:
# Preview of an exponential temperature schedule: tau starts at tau_max and is
# multiplied by the same factor at every step.
tau_max, tau_min, tau_steps = 1, 1e-1, 100

# Per-step factor, chosen so that tau_steps multiplications map tau_max to tau_min.
_tau_factor = np.exp((1 / tau_steps) * np.log(tau_min / tau_max))

x = range(tau_steps)
y = [tau_max * _tau_factor ** step for step in x]

In [None]:
# Plot the temperature schedule held in x (step index) and y (tau value).
plt.plot(x, y)
plt.xlabel('step')
plt.ylabel(r'temperature $\tau$')
plt.title('Exponential temperature decay')
plt.show()

In [None]:
def sarsa(n_epi = 500,
          tau_max=1, # exploration/exploitation parameter
          tau_min=1e-2,
          tau_steps=500,
          gamma=0.95, 
          lmbda = 0.8, 
          eta = 0.01, 
          dt=0.01, 
          steps=100,
          max_steps=2000):
    """Train a linear SARSA(lambda) agent with softmax exploration.

    Relies on the module-level ``car`` simulator, the grid globals
    (``ngrid_pos``, ``ngrid_speed``, ``x_pos``, ``y_speed``) and the helpers
    ``activity``, ``Q`` and ``softmax``. Progress is printed for every episode.

    Parameters
    ----------
    n_epi : int
        Number of episodes to run.
    tau_max : float
        Initial softmax temperature.
    tau_min : float
        Lower bound for the temperature.
    tau_steps : int
        Number of successful episodes after which the temperature has decayed
        from ``tau_max`` to ``tau_min``.
    gamma : float
        Discount factor, used in the TD error and in the trace decay.
    lmbda : float
        Eligibility-trace decay factor.
    eta : float
        Learning rate of the weight update.
    dt, steps :
        Forwarded as ``car.simulate_timesteps(steps, dt)`` after each action;
        presumably the number of integration steps and their size
        (confirm against the mountaincar module).
    max_steps : int
        Maximum number of actions per episode before it is abandoned.

    Returns
    -------
    w : numpy.ndarray, shape (ngrid_pos, ngrid_speed, 3)
        Learned weights, one grid per action.
    probs : list of numpy.ndarray
        One entry per episode: softmax action probabilities at every grid
        point, indexed ``[speed, position, action]``.
    times : list
        Value of ``car.t`` at the end of each episode.
    """
    # action index a is applied as the force a - 1, i.e. -1, 0 or +1
    n_actions = 3

    probs = []
    times = []

    # per-success decay factor: tau_max * tau_coef ** tau_steps == tau_min
    tau_coef = np.exp((1 / tau_steps) * np.log(tau_min / tau_max))
    tau = tau_max

    # weights start at zero, so every action is equally likely at first
    w = np.zeros((ngrid_pos, ngrid_speed, n_actions))

    for epi in range(n_epi):
        print("episode :", epi)

        # eligibility traces are cleared at the start of every episode
        e = np.zeros_like(w)

        # initial state
        car.reset()
        s0 = car.x, car.x_d

        # initial random action
        a0 = np.random.randint(n_actions)

        # An explicit flag (rather than comparing the step counter with the
        # limit) also counts a goal reached on the very last allowed step.
        reached_goal = False
        for _ in range(max_steps):
            # apply the chosen force and advance the simulation
            car.apply_force(a0 - 1)
            car.simulate_timesteps(steps, dt)

            # state reached after the action
            s1 = car.x, car.x_d

            # softmax policy over the Q-values of the new state
            p = softmax([Q(s1, a, w) for a in range(n_actions)], tau)
            a1 = np.random.choice(n_actions, p=p)

            # decay all traces, then reinforce the (state, action) just taken
            e *= gamma * lmbda
            e[:, :, a0] += activity(s0)

            # TD error and weight update
            # NOTE(review): on the goal step this still bootstraps from
            # Q(s1, a1); confirm whether the goal should count as terminal.
            delta = car.R + gamma * Q(s1, a1, w) - Q(s0, a0, w)
            w += eta * delta * e

            # propagate next action and state
            a0 = a1
            s0 = s1

            if car.R > 0.0:
                print('reward obtained at t =', car.t)
                reached_goal = True
                break

        # the temperature is lowered only after a successful episode,
        # and never below tau_min
        if reached_goal:
            tau = max(tau * tau_coef, tau_min)
            print("new tau", tau)

        # snapshot of the policy over the whole grid, indexed [speed, position]
        prob = np.array([[softmax([Q((x, y), a, w) for a in range(n_actions)], tau)
                          for x in x_pos]
                         for y in y_speed])

        probs.append(prob)
        times.append(car.t)
    return w, probs, times

In [None]:
# Train with the default hyper-parameters. This runs every episode of the
# simulation and prints a progress line per episode.
w, probs, times = sarsa()

In [None]:
# Learning curve: value of car.t at the end of every episode, plus a moving
# average over `smooth` consecutive episodes.
smooth = 10
plt.plot(times, label='per episode')
if len(times) >= smooth:
    # mode='valid' keeps only windows that lie fully inside the data.
    # mode='same' pads with zeros, which drags both ends of the average
    # towards zero and makes the last episodes look faster than they were.
    running_mean = np.convolve(times, np.ones(smooth) / smooth, mode='valid')
    # centre every averaged value on the window it summarises
    window_centres = np.arange(len(running_mean)) + (smooth - 1) / 2
    plt.plot(window_centres, running_mean,
             label='moving average ({} episodes)'.format(smooth))
plt.xlabel('episode')
plt.ylabel('time t at end of episode')
plt.legend()
plt.show()

In [None]:
# Policy snapshot recorded after the episode with index 100
# (probs holds one grid per episode, so this needs more than 100 episodes).
vec_plot(probs[100])