In [1]:
import gym
import math
import time
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
np.random.seed(0)

In [8]:
env = gym.make('MountainCar-v0')

In [9]:
?env.env

[1;31mType:[0m        MountainCarEnv
[1;31mString form:[0m <MountainCarEnv<MountainCar-v0>>
[1;31mFile:[0m        c:\users\taha\anaconda3\lib\site-packages\gym\envs\classic_control\mountain_car.py
[1;31mDocstring:[0m  
Description:
    The agent (a car) is started at the bottom of a valley. For any given
    state the agent may choose to accelerate to the left, right or cease
    any acceleration.

Source:
    The environment appeared first in Andrew Moore's PhD Thesis (1990).

Observation:
    Type: Box(2)
    Num    Observation               Min            Max
    0      Car Position              -1.2           0.6
    1      Car Velocity              -0.07          0.07

Actions:
    Type: Discrete(3)
    Num    Action
    0      Accelerate to the Left
    1      Don't accelerate
    2      Accelerate to the Right

    Note: This does not affect the amount of velocity affected by the
    gravitational pull acting on the car.

Reward:
     Reward of 0 is awarded if the agent 

In [12]:
n_bins = (6 , 6)
lower_bounds = [env.observation_space.low[0], env.observation_space.low[1]]
upper_bounds = [env.observation_space.high[0], env.observation_space.high[1]]

def discretizer(pos, velocity):
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    est.fit([lower_bounds, upper_bounds])
    return tuple(map(int,est.transform([[pos, velocity]])[0]))

In [13]:
Q_table = np.zeros(n_bins + (env.action_space.n,))
Q_table.shape

(6, 6, 3)

In [16]:
def qlearn():
    tic = time.time()
    reward_list = []
    total_reward = 0
    t = 0
    for i in range(1, 10001):
        current, done = discretizer(*env.reset()), False
        while not done:
            lr = max(0.01, min(1.0, 1.0 - math.log10((i + 1) / 25)))
            epsilon =  max(0.05, min(1, 1.0 - math.log10((i  + 1) / 25)))
            action = np.argmax(Q_table[current])
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            next_state = discretizer(*observation)
            Q_table[current][action] += lr*(reward + np.max(Q_table[next_state]) - Q_table[current][action])
            current = next_state
            total_reward += reward
        if i % 100 == 0:
            t += 1
            reward_list.append(total_reward/100)
#            print(total_reward/100)
            for index in range(len(reward_list)):
                if reward_list[index] >= -110.0:
                    toc = time.time()
                    first_avg_r = index
                    return f'Took {toc-tic:.2f} seconds and achieved average reward of {reward_list[first_avg_r]} in {(first_avg_r+1)*100} episodes'
            total_reward = 0
    toc = time.time()
    print(f'Took {toc-tic:.2f} seconds and failed')
    print(f'Max average reward achieved: {np.max(reward_list):.2f}')
    env.close()

In [17]:
qlearn()

-200.0
-200.0
-170.62
-177.32
-151.92
-170.71
-144.03
-132.51
-138.14
-149.5
-196.9
-194.8
-196.03
-192.96
-187.46
-183.84
-187.18
-175.43
-195.2
-186.37
-160.43
-168.88
-188.26
-191.59
-189.8
-186.08
-183.37
-164.19
-192.61
-187.9
-197.63
-164.69
-169.04
-175.54
-170.56
-179.31
-167.79
-171.45
-194.97
-153.43
-179.73
-188.54
-194.28
-189.71
-196.81
-181.22
-190.76
-174.56
-169.59
-185.74
-195.42
-196.33
-198.09
-191.2
-194.61
-194.13
-195.5
-193.72
-188.98
-194.4
-190.9
-192.01
-185.76
-175.75
-161.42
-171.63
-196.99
-173.42
-159.42
-166.54
-165.15
-169.75
-163.89
-168.03
-190.31
-185.21
-177.65
-189.4
-195.49
-196.7
-171.09
-197.48
-194.3
-196.84
-178.09
-173.01
-185.75
-199.59
-193.91
-175.04
-186.74
-175.12
-169.13
-174.74
-181.07
-180.85
-192.68
-190.85
-196.84
-190.38


-132.51