In [1]:
"""
Q-Learning example using OpenAI gym MountainCar environment
Author: Moustafa Alzantot (malzantot@ucla.edu)
"""


import numpy as np

import gym
from gym import wrappers

# Hyperparameters shared by the training loop and the evaluation helper.
# NOTE(review): 2 bins per dimension is a very coarse discretization; the
# value may have been lowered for experimentation -- confirm it is intended.
n_states = 2 # number of discretization bins per observation dimension
iter_max = 1000 # number of training episodes

initial_lr = 1.0 # initial learning rate
min_lr = 0.003 # lower bound for the decayed learning rate
gamma = 1.0 # discount factor applied to rewards
t_max = 10000 # maximum number of environment steps per episode
eps = 0.02  # probability of taking a uniformly random action while training

def run_episode(env, policy=None, render=False):
    """Run a single episode and return its discounted return.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning a
            4-tuple ``(obs, reward, done, info)``, ``render()`` and
            ``action_space.sample()``.
        policy: Optional 2-D table indexed as ``policy[a][b]`` by the
            discrete state from ``obs_to_state``. When ``None``, actions are
            sampled from ``env.action_space``.
        render: When true, ``env.render()`` is called before every step.

    Returns:
        Sum over steps of ``gamma ** step * reward``, for at most ``t_max``
        steps or until the environment reports ``done``.
    """
    observation = env.reset()
    discounted_return = 0
    for step in range(t_max):
        if render:
            env.render()
        if policy is not None:
            # Discretize the observation and look the action up in the table.
            row, col = obs_to_state(env, observation)
            action = policy[row][col]
        else:
            action = env.action_space.sample()
        observation, reward, finished, _info = env.step(action)
        discounted_return += gamma ** step * reward
        if finished:
            break
    return discounted_return

def obs_to_state(env, obs, n_bins=None):
    """Map a continuous observation to a pair of discrete bin indices.

    The first two observation components are each bucketed into equal-width
    bins spanning ``env.observation_space.low`` .. ``env.observation_space.high``.

    Args:
        env: Object whose ``observation_space.low`` and
            ``observation_space.high`` are arrays supporting element-wise
            subtraction and division (at least two entries each).
        obs: Observation; only ``obs[0]`` and ``obs[1]`` are read.
        n_bins: Number of bins per dimension. Defaults to the module-level
            ``n_states``.

    Returns:
        Tuple ``(a, b)`` of ints, each within ``[0, n_bins - 1]``.
    """
    if n_bins is None:
        n_bins = n_states
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_dx = (env_high - env_low) / n_bins
    last = n_bins - 1
    # An observation sitting exactly on the upper bound would otherwise map
    # to index n_bins (one past the end of the Q-table), and values below the
    # lower bound could go negative, so clamp both indices into range.
    a = min(max(int((obs[0] - env_low[0]) / env_dx[0]), 0), last)
    b = min(max(int((obs[1] - env_low[1]) / env_dx[1]), 0), last)
    return a, b

if __name__ == '__main__':
    # Train a tabular Q-learning agent on MountainCar, then evaluate and
    # render the greedy policy derived from the learned Q-table.
    env_name = 'MountainCar-v0'
    env = gym.make(env_name)  # create the environment
    np.random.seed(0)  # seed NumPy's global RNG used for action selection
    print ('----- using Q Learning -----')
    # One Q-value per (first-dim bin, second-dim bin, action).
    q_table = np.zeros((n_states, n_states, env.action_space.n))
    for i in range(iter_max):
        obs = env.reset()  # start a new episode and get its first observation
        total_reward = 0  # discounted return accumulated over this episode
        # eta: learning rate, multiplied by 0.85 every 100 episodes and
        # floored at min_lr.
        eta = max(min_lr, initial_lr * (0.85 ** (i//100)))
        for j in range(t_max):
            a, b = obs_to_state(env, obs)
            if np.random.uniform(0, 1) < eps:
                # Explore: with probability eps pick a uniformly random action.
                action = np.random.choice(env.action_space.n)
            else:
                # Otherwise sample from a softmax over this state's Q-values.
                # Subtracting the max leaves the probabilities unchanged but
                # keeps np.exp from underflowing to an all-zero vector when
                # the Q-values become very negative.
                logits = q_table[a][b]
                logits_exp = np.exp(logits - np.max(logits))
                probs = logits_exp / np.sum(logits_exp)
                action = np.random.choice(env.action_space.n, p=probs)
            obs, reward, done, _ = env.step(action)  # apply the action
            total_reward += (gamma ** j) * reward
            # Q-learning update: bootstrap from the best action value of the
            # NEXT state (a_, b_), not from the maximum over the whole table.
            a_, b_ = obs_to_state(env, obs)
            td_target = reward + gamma * np.max(q_table[a_][b_])
            q_table[a][b][action] += eta * (td_target - q_table[a][b][action])
            if done:
                break
        if i % 100 == 0:
            print('Iteration #%d -- Total reward = %d.' %(i+1, total_reward))
    # Greedy policy: the highest-valued action for every discrete state.
    solution_policy = np.argmax(q_table, axis=2)
    solution_policy_scores = [run_episode(env, solution_policy, False) for _ in range(100)]
    print("Average score of solution = ", np.mean(solution_policy_scores))
    # Render one episode driven by the learned policy.
    run_episode(env, solution_policy, True)

----- using Q Learning -----
[[[0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]]]
Iteration #1 -- Total reward = -200.
Iteration #101 -- Total reward = -200.
Iteration #201 -- Total reward = -200.
Iteration #301 -- Total reward = -200.
Iteration #401 -- Total reward = -200.
Iteration #501 -- Total reward = -200.
Iteration #601 -- Total reward = -200.
Iteration #701 -- Total reward = -200.
Iteration #801 -- Total reward = -200.
Iteration #901 -- Total reward = -200.
Average score of solution =  -200.0
