In [None]:
%matplotlib inline

In [None]:
import sys
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
from mountaincar import MountainCar, MountainCarViewer

In [None]:
# Simulation environment driven by the Sarsa loop further down, which calls
# reset / apply_force / simulate_timesteps and reads the x, x_d, R and t attributes.
car = MountainCar()

### Parameters

In [None]:
# Number of grid points (basis-function centres) along the position and speed axes.
ngrid_pos, ngrid_speed = 20, 20

In [None]:
# (lower, upper) bounds of the grid along each state dimension.
int_pos = (-150, 30)
int_speed = (-15, 15)

In [None]:
# Softmax temperature: trades exploration against exploitation.
#   tau -> inf : more exploration
#   tau -> 0   : more exploitation
tau = 1.0

In [None]:
gamma = 0.95  # discount applied to the next Q-value in the TD error
lmbda = 0.95  # together with gamma, sets the per-step decay of the eligibility traces
eta = 1e-3    # step size of the weight update

In [None]:
# Arguments handed to car.simulate_timesteps(steps, dt) after every chosen action
# (presumably the number of integration steps and the integration time step).
steps = 100
dt = 1e-2

### Initialisation

In [None]:
# Centres of the basis functions along each axis, together with the spacing between
# neighbouring centres (reward_activity divides by that spacing).
x_pos, center_dist_pos = np.linspace(*int_pos, num=ngrid_pos, retstep=True)
y_speed, center_dist_speed = np.linspace(*int_speed, num=ngrid_speed, retstep=True)
# Column-vector view of the speed centres so that they broadcast against x_pos.
y_speed_t = y_speed[:, np.newaxis]

### Helper functions

In [None]:
def reward_activity(s):
    """Evaluate the Gaussian basis functions of the grid at state ``s``.

    Parameters
    ----------
    s : sequence of two numbers
        ``s[0]`` is compared against the position centres ``x_pos`` and
        ``s[1]`` against the speed centres ``y_speed_t``.

    Returns
    -------
    numpy.ndarray
        Activities indexed as ``[position, speed]``.
    """
    # Distances to every centre, measured in units of the grid spacing.
    offset_pos = (x_pos - s[0]) / center_dist_pos
    offset_speed = (y_speed_t - s[1]) / center_dist_speed
    # Row (positions) broadcast against column (speeds) yields [speed, position];
    # the transpose on return flips that to [position, speed].
    exponent = -(offset_pos ** 2) - offset_speed ** 2
    return np.exp(exponent).T

In [None]:
def Q(s, a, w):
    """Return the Q-value of action ``a`` in state ``s`` under the weights ``w``.

    The value is the sum over the whole grid of the weight slice ``w[:, :, a]``
    multiplied element-wise by ``reward_activity(s)``.
    """
    action_weights = w[:, :, a]
    weighted_activity = action_weights * reward_activity(s)
    return np.sum(weighted_activity)

In [None]:
def softmax(x, tau):
    """Return the softmax (Boltzmann) distribution of the scores ``x``.

    Parameters
    ----------
    x : array-like of numbers
        Scores, e.g. the Q-values of the available actions.
    tau : float
        Temperature. Larger values flatten the distribution, smaller values
        concentrate it on the largest score.

    Returns
    -------
    numpy.ndarray
        Probabilities with the same shape as ``x`` that sum to 1.
    """
    scaled = np.asarray(x, dtype=float) / tau
    if scaled.size == 0:
        # Nothing to normalise; mirror the empty input.
        return scaled
    # Shifting by the maximum leaves the result mathematically unchanged but keeps
    # every exponent <= 0: no overflow, no need to clip (clipping made all scores
    # above the threshold equally likely), and at least one term equals 1 so the
    # normaliser can never be zero.
    e_x = np.exp(scaled - scaled.max())
    return e_x / e_x.sum()

### Plot functions

In [None]:
def vec_plot(p):
    """Draw the preferred force direction of every grid state as a quiver plot.

    Parameters
    ----------
    p : array-like, shape (n_pos, n_speed, n_actions)
        Per-state action scores or probabilities indexed as
        ``[position, speed, action]``. The three-action encoding of the
        training loop is assumed, where action ``a`` applies the force ``a - 1``.
    """
    p_max = np.argmax(p, axis=2)
    # Action index -> force direction: 0 -> -1 (left), 1 -> 0 (none), 2 -> +1 (right),
    # matching car.apply_force(a - 1) in the training loop.
    # quiver puts array rows on the y axis and columns on the x axis, so transpose
    # to get position along x and speed along y, as the axis labels state.
    U = (p_max - 1).T
    V = np.zeros(U.shape)
    n_speed, n_pos = U.shape

    plt.quiver(U, V, alpha=1, scale=1.8, units='xy')

    # Leave a one-cell margin around the grid, whatever its size.
    plt.xlim(-1, n_pos)
    plt.xticks(())
    plt.ylim(-1, n_speed)
    plt.yticks(())

    plt.xlabel('position $x$')
    # Raw string: '\d' is not a valid escape sequence in a normal string literal.
    plt.ylabel(r'speed $\dot x$')
    plt.title('Q-values direction vector field (arrows show the direction of applied force)')

    plt.show()

In [None]:
def plot3D(q):
    """Show ``q`` as a grey wireframe surface over the (position, speed) grid.

    Parameters
    ----------
    q : 2-D array-like
        Values laid out like ``np.meshgrid(x_pos, y_speed)``, i.e. indexed as
        ``[speed, position]``.
    """
    grid_pos, grid_speed = np.meshgrid(x_pos, y_speed)

    axes = plt.figure().add_subplot(111, projection='3d')
    axes.plot_wireframe(grid_pos, grid_speed, q, color='grey')
    axes.set_xlabel('position')
    axes.set_ylabel('speed')
    axes.set_zlabel('max q')

    plt.show()

## Sarsa Algorithm

In [None]:
# Sarsa(lambda) training with linear function approximation over the Gaussian grid.
# probs collects one action-probability snapshot per episode, times the value of
# car.t at which each episode obtained the reward.
probs = []
times = []
np.random.seed(345435)
# Raise on floating-point overflow instead of silently producing inf
# (this changes NumPy's global error state for the whole kernel).
np.seterr(over='raise')

# weights start at zero: one weight per (position centre, speed centre, action)
w = np.zeros((ngrid_pos, ngrid_speed, 3))

# i counts simulation steps across all episodes; it only drives the periodic plot
i = 0
for epi in np.arange(1000):
    print("episode :", epi)
    
    # null eligibility traces
    e = np.zeros((ngrid_pos, ngrid_speed, 3))
    
    # initial state
    car.reset()
    s0 = car.x, car.x_d
    
    # initial random action
    a0 = np.random.randint(3)
    
    # j counts the steps taken within the current episode
    j = 0
    # NOTE(review): the episode only ends once car.R > 0.0; there is no step limit
    while True:
        i += 1
        j += 1
        
        # action index 0, 1, 2 is mapped to the force -1, 0, 1
        car.apply_force(a0 - 1)
        car.simulate_timesteps(steps, dt)
        
        # retrieve the state reached after applying the action
        s1 = car.x, car.x_d
        
        # compute proba for each action and choose among them
        p = softmax([Q(s1, a, w) for a in range(3)], tau)
        a1 = np.random.choice(range(3), p=p)
                
        # decay all eligibility traces, then reinforce the trace of the action just taken
        e *= gamma * lmbda
        e[:, :, a0] += reward_activity(s0)[:, :]
        
        # TD error; the extra - j/1000. term is a penalty that grows with every step of the episode
        # NOTE(review): on the rewarded (final) step the target still bootstraps from
        # Q(s1, a1, w) - confirm that this is intended
        delta = car.R + gamma * Q(s1, a1, w) - Q(s0, a0, w) - j/1000.
        w += eta * delta * e

        # propagate next action and state
        a0 = a1
        s0 = s1
        
        # every 1000 total steps, plot the negated max-over-actions Q-value over the grid
        # (the nested list is indexed [speed, position, action], hence axis=2)
        if i % 1000 == 0:
            plot3D(-np.max([[[Q((x, y), a, w) for a in range(3)] for x in x_pos] for y in y_speed], axis=2))
            # plot3D has already called plt.show() at this point
            plt.show()
        
        # a positive reward ends the episode
        if car.R > 0.0:
            print('reward obtained at t = ', car.t)
            break
    
    # snapshot of the softmax policy on the grid, indexed [position, speed, action]
    prob = np.array([[softmax(np.array([Q((x, y), a, w) for a in range(3)]), tau) for y in y_speed] for x in x_pos])
    probs.append(prob)
    times.append(car.t)

In [None]:
# Most probable action per grid state after the last episode
vec_plot(probs[-1])
# vec_plot has already called plt.show() at this point
plt.show()

In [None]:
# Learning curve: value of car.t at which each episode obtained the reward,
# overlaid with a moving average.
smooth_window = 25
plt.plot(times, label='per episode')
if len(times) >= smooth_window:
    # mode='valid' keeps only windows that lie fully inside the data. mode='full'
    # adds partial sums at both ends (an artificial ramp up and down) and returns
    # more points than there are episodes.
    smoothed = np.convolve(times, np.ones(smooth_window) / smooth_window, mode='valid')
    # Each average is drawn at the last episode of its window.
    plt.plot(np.arange(smooth_window - 1, len(times)), smoothed,
             label='moving average (%d episodes)' % smooth_window)
plt.xlabel('episode')
plt.ylabel('t at reward')
plt.legend()
plt.show()

In [None]:
# Ten most recent episodes; the slice is open-ended so the final episode is included
# (times[-10:-1] silently dropped it and showed only nine values).
times[-10:]