In [5]:
import gym
import numpy as np
import matplotlib.pyplot as plt

In [6]:
# Create the MountainCar-v0 environment through gym's registry.
env = gym.make("MountainCar-v0")
# reset() is the last expression of the cell, so the notebook displays its
# return value (the initial observation array).
env.reset()

array([-0.5063934,  0.       ], dtype=float32)

In [7]:
# Show the observation bounds (upper, then lower) and the number of discrete
# actions, one value per output line.
print(
    env.observation_space.high,
    env.observation_space.low,
    env.action_space.n,
    sep="\n",
)

[0.6  0.07]
[-1.2  -0.07]
3


In [8]:
# Number of buckets per observation dimension: 20 for each component of the
# observation vector.
DISCRETE_OS_SIZE = [20 for _ in env.observation_space.high]

In [9]:
# Display the bucket counts (one entry per observation dimension).
DISCRETE_OS_SIZE

[20, 20]

In [10]:
# Width of a single bucket along each observation dimension: the full span of
# the observation space divided by the number of buckets for that dimension.
discrete_os_win_size = np.divide(
    env.observation_space.high - env.observation_space.low,
    DISCRETE_OS_SIZE,
)

In [11]:
# --- Q-learning update ---
LEARNING_RATE = 0.1      # weight given to the new estimate in each Q update
DISCOUNT_FACTOR = 0.95   # multiplier applied to the best next-state Q-value

# --- Run length and reporting ---
EPISODES = 2_000         # total number of training episodes
MOMENT = 500             # render and report statistics every MOMENT episodes

# --- Exploration schedule ---
EPSILON = 0.5            # probability threshold for taking a random action
START_EPSILON_DECAY = 1
END_EPSILON_DECAY = EPISODES // 2
# Amount subtracted from EPSILON after each episode inside the decay window.
EPSILON_DECAY_VALUE = EPSILON / (END_EPSILON_DECAY - START_EPSILON_DECAY)

In [12]:
# Q-table with one axis per observation dimension plus a trailing action
# axis; every entry starts as a uniform random draw from [-2, 0).
q_table = np.random.uniform(
    low=-2,
    high=0,
    size=(*DISCRETE_OS_SIZE, env.action_space.n),
)

In [13]:
# Total reward of every finished episode, in order.
episodes_rewards = []

# Periodic summary statistics: each key maps to its own list, and the lists
# grow in lockstep (one entry per reporting point).
aggregate_rewards = {stat: [] for stat in ('ep', 'avg', 'min', 'max')}

In [14]:
def get_discrete_state(state, low=None, win_size=None, sizes=None):
    """Map a continuous observation to a tuple of bucket indices.

    Parameters
    ----------
    state : array-like
        Continuous observation, one value per observation dimension.
    low : array-like, optional
        Lower bound of each dimension. Defaults to
        ``env.observation_space.low``.
    win_size : array-like, optional
        Bucket width of each dimension. Defaults to ``discrete_os_win_size``.
    sizes : sequence of int, optional
        Number of buckets of each dimension. Defaults to ``DISCRETE_OS_SIZE``.

    Returns
    -------
    tuple of int
        Bucket index per dimension, usable directly as a ``q_table`` index.
    """
    if low is None:
        low = env.observation_space.low
    if win_size is None:
        win_size = discrete_os_win_size
    if sizes is None:
        sizes = DISCRETE_OS_SIZE

    # `np.int` was removed from NumPy; the builtin `int` is the equivalent dtype.
    indices = ((np.asarray(state) - low) / win_size).astype(int)
    # An observation sitting exactly on the upper bound would otherwise land in
    # bucket `size`, one past the last valid index; clip into [0, size - 1].
    indices = np.clip(indices, 0, np.asarray(sizes) - 1)
    return tuple(indices)

In [10]:
# discrete_state = get_discrete_state(env.reset())
# print(discrete_state)
# print(np.argmax(q_table[discrete_state]))

In [None]:
# Tabular Q-learning over the discretised observation space.
# NOTE: this cell mutates q_table, EPSILON, episodes_rewards and
# aggregate_rewards in place, so re-running it continues from the current
# state instead of starting over.
for episode in range(EPISODES):
    episode_reward = 0
    # Every MOMENT-th episode (including episode 0) is announced and rendered.
    render = episode % MOMENT == 0
    if render:
        print(episode)

    discrete_state = get_discrete_state(env.reset())
    done = False
    while not done:
        # Epsilon-greedy selection: exploit the table unless the random draw
        # falls at or below EPSILON, in which case pick a random action.
        if np.random.random() > EPSILON:
            action = np.argmax(q_table[discrete_state])
        else:
            action = np.random.randint(0, env.action_space.n)

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)
        if render:
            env.render()

        if not done:
            # Blend the old estimate with the reward plus the discounted best
            # Q-value of the next state.
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[discrete_state + (action,)]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT_FACTOR * max_future_q)
            q_table[discrete_state + (action,)] = new_q
        elif new_state[0] >= env.goal_position:
            # The episode ended with the position at or past the goal: set the
            # Q-value of the action that got there to 0.
            print(f"we made it on episode: {episode}")
            q_table[discrete_state + (action,)] = 0

        discrete_state = new_discrete_state

    # The decay window is inclusive on both ends, so the step is applied one
    # more time than EPSILON_DECAY_VALUE was sized for; clamp at zero so
    # EPSILON never goes negative.
    if START_EPSILON_DECAY <= episode <= END_EPSILON_DECAY:
        EPSILON = max(0.0, EPSILON - EPSILON_DECAY_VALUE)

    episodes_rewards.append(episode_reward)

    if not episode % MOMENT:
        # Summarise the most recent (at most MOMENT) episode rewards.
        current = episodes_rewards[-MOMENT:]
        min_current = min(current)
        max_current = max(current)
        avg_current = sum(current) / len(current)
        aggregate_rewards['ep'].append(episode)
        aggregate_rewards['avg'].append(avg_current)
        aggregate_rewards['min'].append(min_current)
        aggregate_rewards['max'].append(max_current)

        print(f"episode:{episode}, avg:{avg_current}, min:{min_current}, max:{max_current}")

env.close()

# Plot the periodic reward statistics collected during training. The explicit
# Axes interface is used so the figure carries its own labels and title.
fig, ax = plt.subplots()
ax.plot(aggregate_rewards['ep'], aggregate_rewards['avg'], label="avg")
ax.plot(aggregate_rewards['ep'], aggregate_rewards['min'], label="min")
ax.plot(aggregate_rewards['ep'], aggregate_rewards['max'], label="max")
ax.set_xlabel("episode")
ax.set_ylabel("episode reward")
ax.set_title("Reward statistics per reporting window")
ax.legend(loc=4)  # loc=4 is the lower-right corner
plt.show();

episode:0, avg:-200.0, min:-200.0, max:-200.0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
