### Imports

In [54]:
import gym
import random
import numpy as np
from bento.client import Client
from bento.sim import Simulation
from bento.example.mountcar import MountainCar, Action, State
from bento.example.specs import Velocity, Position
from IPython.display import clear_output
from tqdm.auto import tqdm

### Load Environment (OpenAI Gym)

In [55]:
# Reference Gym environment; its observation bounds and action count are
# reused by the discretization and Q-learning cells further down.
gym_env = gym.make('MountainCar-v0')
lows, highs = (
    gym_env.observation_space.low,
    gym_env.observation_space.high,
)
n_actions = gym_env.action_space.n
# Single print call, one summary line per item
print(
    f'Number of Features: {len(lows)}',
    f'Position: [{lows[0]}, {highs[0]}]',
    f'Velocity: [{lows[1]}, {highs[1]}]',
    f'Number of Actions: {n_actions}',
    sep='\n',
)

Number of Features: 2
Position: [-1.2000000476837158, 0.6000000238418579]
Velocity: [-0.07000000029802322, 0.07000000029802322]
Number of Actions: 3


### Create Environment (BentoBox)

In [56]:
class MountainCarEnv:
    """Wrapper exposing the BentoBox `MountainCar` simulation through
    `reset()` / `step(action)` methods shaped like the classic Gym API.

    Args:
        host: BentoBox engine host to connect to.
        port: BentoBox engine port to connect to.
        max_steps: number of steps after which `step` reports `done=True`
            regardless of what the simulation says (defaults to 200).
    """
    def __init__(self, host='bento.mrzzy.co', port='54242', max_steps=200):
        client = Client(host=host, port=port)
        client.connect(timeout_sec=30)
        # Remove an already-registered 'mountain_car' simulation before
        # creating a new one (presumably the name MountainCar registers
        # under -- verify against the definition).
        if 'mountain_car' in client.list_sims():
            client.remove_sim('mountain_car')
        self.sim = Simulation.from_def(MountainCar, client)
        self.max_steps = max_steps
        # Initialised here so step() before reset() does not raise AttributeError
        self.t = 0

    def get_state(self):
        """Return the observation as `np.array([position.x, velocity.x])`."""
        car = self.sim.entity(components=[Velocity, Position])
        return np.array([car[Position].x, car[Velocity].x])

    def reset(self):
        """Restart the simulation, zero the step counter and return the
        initial observation."""
        try:
            self.sim.stop()
        except Exception:
            # Best effort: stop() presumably fails when the simulation was
            # never started. Only ordinary errors are ignored, so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass
        self.t = 0
        self.sim.start()
        return self.get_state()

    def step(self, action):
        """Apply `action`, advance the simulation by one step and return
        `(state, reward, done, info)`; `info` is always None."""
        env = self.sim.entity(components=[Action, State])
        env[Action].accelerate = action
        self.sim.step()
        self.t += 1
        state = self.get_state()
        reward = env[State].reward
        # Episode ends when the step budget is used up, otherwise when the
        # simulation's own `ended` flag says so.
        done = self.t >= self.max_steps or env[State].ended
        return state, reward, done, None

### Load Environment (BentoBox)

In [57]:
# Instantiating the wrapper connects to the BentoBox server at the class's
# default host/port, so this cell needs that server to be reachable.
bento_env = MountainCarEnv()

### Preprocessing (Discretize state space)

In [58]:
def discretize(lows, highs, intervals):
    """Build a function mapping a continuous state to a flat bin index.

    Args:
        lows: per-feature lower bounds.
        highs: per-feature upper bounds (inclusive).
        intervals: number of bins per feature; either one int applied to
            every feature or a sequence with one count per feature.

    Returns:
        Callable taking a state `x` (one value per feature) and returning an
        integer in `[0, prod(intervals))`. Values outside the bounds are
        clipped into the first/last bin of their feature.
    """
    lows = np.asarray(lows)
    # Broadcast into a fresh array so a caller-supplied array is never
    # modified in place and scalar / 0-d inputs are accepted.
    n_bins = np.asarray(intervals, dtype=int) * np.ones(len(lows), dtype=int)
    # Nudge the upper bound up by one ulp so x == highs lands in the last bin
    highs = np.nextafter(np.asarray(highs), np.inf)
    widths = (highs - lows) / n_bins
    # Mixed-radix strides [1, n0, n0*n1, ...]: every combination of
    # per-feature bins gets a unique index, also for non-uniform bin counts.
    keys = np.concatenate(([1], np.cumprod(n_bins[:-1])))
    clipmax = n_bins - 1
    return lambda x: np.dot(keys, np.clip(
        ((x - lows) / widths).astype(int), 0, clipmax))

In [59]:
# Bins per feature; the table has one row per combination of per-feature bins
intervals = 6
preprocess = discretize(lows, highs, intervals)
# Derived from the number of features instead of hard-coding two of them
n_states = intervals ** len(lows)

### Q-Learning

In [60]:
def tabular_qlearning(
    env, n_states, n_actions, preprocess=lambda x: x,
    alpha=0.1, gamma=0.9, epsilon=0.1, episodes=1000):
    """Train a tabular Q-learning agent and return its greedy policy.

    Args:
        env: environment with `reset() -> state` and
            `step(action) -> (state, reward, done, info)`.
        n_states: number of rows of the Q-table.
        n_actions: number of columns of the Q-table; actions are the
            integers `0 .. n_actions - 1`.
        preprocess: maps a raw state to a Q-table row index.
        alpha: learning rate.
        gamma: discount factor.
        epsilon: probability of taking a uniformly random action.
        episodes: number of training episodes.

    Returns:
        Callable mapping a raw state to the action with the highest learned
        Q-value for that state.
    """
    q_table = np.zeros([n_states, n_actions])
    for _ in tqdm(range(episodes)):
        state = preprocess(env.reset())
        done = False
        while not done:
            # Act: sample from the same 0..n_actions-1 range the table uses,
            # so environments without an `action_space` attribute work too.
            if random.random() < epsilon:
                action = random.randrange(n_actions)
            else:
                action = np.argmax(q_table[state])
            # Observe
            next_state, reward, done, _ = env.step(action)
            next_state = preprocess(next_state)
            # NOTE(review): the bootstrap term is also applied on the final
            # transition of an episode, as in the original code -- confirm
            # this is intended.
            backup = reward + gamma * np.max(q_table[next_state])
            q_table[state, action] += \
                alpha * (backup - q_table[state, action])
            # Loop
            state = next_state
    # Greedy policy must index the Q-table; argmax of the bare state index
    # would always be 0.
    return lambda state: np.argmax(q_table[preprocess(state)])

In [61]:
# Short 3-episode run against the BentoBox-backed environment.
# epsilon=0.0 disables random exploration, so every action is the greedy one.
policy = tabular_qlearning(
    bento_env, n_states, n_actions, preprocess=preprocess,
    alpha=0.1, gamma=0.99, epsilon=0.0, episodes=3)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [62]:
# 1000-episode run on the Gym environment. This rebinds `policy`, replacing
# the one returned by the BentoBox run.
policy = tabular_qlearning(
    gym_env, n_states, n_actions, preprocess=preprocess,
    alpha=0.15, gamma=0.99, epsilon=0.0, episodes=1000)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))




### Test

In [63]:
def test(env, policy):
    state = env.reset()
    while True:
        # Act
        action = policy(state)
        # Observe
        state, reward, done, info = env.step(action)
        # Render
        clear_output(wait=True)
        env.render()
        # Loop
        if done: break
    env.close()

In [65]:
# Render one episode of the current `policy` in the Gym environment;
# test() closes the environment when the episode finishes.
test(gym_env, policy)