In [3]:
import gym
import math
import time
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
np.random.seed(0)

In [4]:
# Create the MountainCar-v0 environment; the later cells read its observation
# bounds and action count, and qlearn() steps it directly.
env = gym.make('MountainCar-v0')

In [5]:
#?env.env

Type:        MountainCarEnv
String form: <MountainCarEnv<MountainCar-v0>>
File:        c:\users\taha\anaconda3\lib\site-packages\gym\envs\classic_control\mountain_car.py
Docstring:
Description:
    The agent (a car) is started at the bottom of a valley. For any given
    state the agent may choose to accelerate to the left, right or cease
    any acceleration.

Source:
    The environment appeared first in Andrew Moore's PhD Thesis (1990).

Observation:
    Type: Box(2)
    Num    Observation               Min            Max
    0      Car Position              -1.2           0.6
    1      Car Velocity              -0.07          0.07

Actions:
    Type: Discrete(3)
    Num    Action
    0      Accelerate to the Left
    1      Don't accelerate
    2      Accelerate to the Right

    Note: This does not affect the amount of velocity affected by the
    gravitational pull acting on the car.

Reward:
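     Reward of 0 is awarded if the agent reached the flag (position = 0.5)
     on top of the mountain.
     Reward of -1 is awarded if the position of the agent is less than 0.5.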

In [6]:
# Number of bins per observation dimension: (position, velocity).
n_bins = (6, 6)
lower_bounds = [env.observation_space.low[0], env.observation_space.low[1]]
upper_bounds = [env.observation_space.high[0], env.observation_space.high[1]]

# Fit the binner once, on the two corner points of the observation box, rather
# than on every call: discretizer() runs once per environment step, and the fit
# depends only on the constants above. Re-run this cell after changing n_bins
# or the bounds so the binner is rebuilt.
_state_binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
_state_binner.fit([lower_bounds, upper_bounds])


def discretizer(pos, velocity):
    """Map a continuous (position, velocity) observation to a tuple of bin indices.

    Parameters
    ----------
    pos : float
        Car position (first observation component).
    velocity : float
        Car velocity (second observation component).

    Returns
    -------
    tuple of int
        One ordinal bin index per dimension, usable directly as an index
        into the leading axes of ``Q_table``.
    """
    return tuple(map(int, _state_binner.transform([[pos, velocity]])[0]))

In [7]:
# Zero-initialised action values: one entry per (position bin, velocity bin, action).
Q_table = np.zeros((*n_bins, env.action_space.n))
Q_table.shape

(6, 6, 3)

In [8]:
def qlearn(n_episodes=10000, report_every=100, target_avg_reward=-110.0):
    """Train the module-level ``Q_table`` on ``env`` with epsilon-greedy Q-learning.

    Episodes are run until either the mean episode reward over a block of
    ``report_every`` consecutive episodes reaches ``target_avg_reward`` or
    ``n_episodes`` episodes have been played. ``Q_table`` is updated in place,
    so calling this function again continues from the previously learned values.

    Parameters
    ----------
    n_episodes : int, optional
        Maximum number of training episodes (default 10000).
    report_every : int, optional
        Size of the episode block over which the reward is averaged and the
        stopping criterion is checked (default 100).
    target_avg_reward : float, optional
        Block-average reward that counts as success (default -110.0).

    Returns
    -------
    str or None
        A summary string when the target is reached; ``None`` otherwise, in
        which case a failure summary is printed instead.

    Notes
    -----
    The update uses no discount factor (the bootstrap term is not scaled).
    NOTE(review): the target bootstraps from ``next_state`` even when ``done``
    is True; left unchanged here to keep the learning behaviour as it was.
    """
    tic = time.time()
    reward_list = []
    total_reward = 0
    try:
        for i in range(1, n_episodes + 1):
            # The learning rate and exploration rate depend only on the episode
            # index, so compute them once per episode instead of once per step.
            decay = 1.0 - math.log10((i + 1) / 25)
            lr = max(0.01, min(1.0, decay))
            epsilon = max(0.05, min(1, decay))

            current, done = discretizer(*env.reset()), False
            while not done:
                # Epsilon-greedy: explore with probability epsilon, else exploit.
                if np.random.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(Q_table[current])
                observation, reward, done, info = env.step(action)
                next_state = discretizer(*observation)
                Q_table[current][action] += lr*(reward + np.max(Q_table[next_state]) - Q_table[current][action])
                current = next_state
                total_reward += reward

            if i % report_every == 0:
                avg_reward = total_reward / report_every
                reward_list.append(avg_reward)
                # Earlier blocks were already checked when they were appended,
                # so only the newest average can newly satisfy the target.
                if avg_reward >= target_avg_reward:
                    toc = time.time()
                    return f'Took {toc-tic:.2f} seconds and achieved average reward of {avg_reward} in {i} episodes'
                total_reward = 0

        toc = time.time()
        print(f'Took {toc-tic:.2f} seconds and failed')
        if reward_list:
            print(f'Max average reward achieved: {np.max(reward_list):.2f}')
    finally:
        # Release the environment on every exit path (success, failure, error).
        env.close()

In [None]:
# Run training; on success the returned summary string is the cell's output,
# otherwise the function prints a failure summary.
qlearn()