In [3]:
import gym
import math
import time
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
np.random.seed(0)

In [9]:
env = gym.make('MountainCar-v0')

In [10]:
?env.env

[1;31mType:[0m        MountainCarEnv
[1;31mString form:[0m <MountainCarEnv<MountainCar-v0>>
[1;31mFile:[0m        c:\users\taha\anaconda3\lib\site-packages\gym\envs\classic_control\mountain_car.py
[1;31mDocstring:[0m  
Description:
    The agent (a car) is started at the bottom of a valley. For any given
    state the agent may choose to accelerate to the left, right or cease
    any acceleration.

Source:
    The environment appeared first in Andrew Moore's PhD Thesis (1990).

Observation:
    Type: Box(2)
    Num    Observation               Min            Max
    0      Car Position              -1.2           0.6
    1      Car Velocity              -0.07          0.07

Actions:
    Type: Discrete(3)
    Num    Action
    0      Accelerate to the Left
    1      Don't accelerate
    2      Accelerate to the Right

    Note: This does not affect the amount of velocity affected by the
    gravitational pull acting on the car.

Reward:
     Reward of 0 is awarded if the agent 

In [11]:
state_space = (10,10)
lower_bounds = [-1.2, -1.5]
upper_bounds = [0.6, 1.5]

def discretizer(pos, velocity):
    est = KBinsDiscretizer(n_bins=state_space, encode='ordinal', strategy='uniform')
    est.fit([lower_bounds, upper_bounds])
    return tuple(map(int,est.transform([[pos, velocity]])[0]))

In [12]:
Q_table = np.zeros(state_space + (env.action_space.n,))
Q_table.shape

(10, 10, 3)

In [13]:
tic = time.time()
Q_table = np.zeros(state_space + (env.action_space.n,))
reward_list = []
total_reward = 0
success = False
for i in range(1, 20000):
    current, done = discretizer(*env.reset()), False
    while not done:
        lr = max(0.01, min(1.0, 1.0 - math.log10((i + 1) / 25)))
        epsilon =  max(0.05, min(1, 1.0 - math.log10((i  + 1) / 25)))
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q_table[current])
        observation, reward, done, info = env.step(action) 
        next_state = discretizer(*observation)
        Q_table[current][action] += lr*(reward + np.max(Q_table[next_state]) - Q_table[current][action])
        current = next_state
        total_reward += reward
    if i % 100 == 0:
#        print(total_reward/100)
        reward_list.append(total_reward/100)
        if reward_list[-1] >= -110.0:
            toc = time.time()
            success = True
            print(f'Took {toc-tic:.2f} seconds and achieved average reward of {reward_list[-1]} in {(reward_list.index(reward_list[-1]) + 1)*100} episodes')
            env.close()
            break
        total_reward = 0
toc = time.time()
if not success:
    print(f'Took {toc-tic:.2f} seconds and failed')
    print(f'Max average reward achieved: {np.max(reward_list):.2f}')
env.close()
print(Q_table)

Took 1528.58 seconds and failed
Max average reward achieved: -158.60
[[[   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [-350.01142788 -350.02397224 -350.00641633]
  [-350.48803978 -350.49554891 -350.36860047]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]]

 [[   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [-351.73969794 -351.78366714 -351.78746713]
  [-347.65383548 -346.46173148 -347.6229646 ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]
  [   0.            0.            0.        ]]

 [[   0