In [2]:
import gym
env = gym.make("Taxi-v3",render_mode='ansi')
env.reset()
print(env.render())
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+


Action Space Discrete(6)
State Space Discrete(500)


In [3]:
state = env.encode(3,1,2,0)
print("State:",state)

env.s = state
print(env.render()) #说明这个状态位于0~499的328索引处

State: 328
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+




In [6]:
env.P[328]
# {action: [(probability, nextstate, reward, done)]}.

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

P是初始的Reward矩阵，states x actions 矩阵。
所有的操作都有-1的reward, 接送操作有-10的reward，
如果使用蛮力解决问题，思路如下：

In [9]:
env.s = 328

epochs = 0
penalties,reward = 0,0

frames = []

done  = False
while not done:
    action = env.action_space.sample()
    state,reward,done,_,info = env.step(action)
    if reward==-10:
        penalties +=1
    
    frames.append({
        'frame':env.render(),
        'state':state,
        'action':action,
        'reward':reward
    })

    epochs +=1

print("Timesteps take: {}".format(epochs))
print("Penaltied incurred: {}".format(penalties))


Timesteps take: 3864
Penaltied incurred: 1259


`env.action_space.sample()`随机选择一个可以采取的行动


In [11]:
from IPython.display import clear_output
from time import sleep
def print_frames(frames):
    for i,frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"TimeStep:{i+1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward:{frame['reward']}")
        sleep(0.1)

print_frames(frames)

+---------+
|R: | :[43m [0m:[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)

TimeStep:596
State: 69
Action: 1
Reward:-1


KeyboardInterrupt: 

### Q-learning 学习
提供记忆的强化学习算法
Reward: `P`，可以从agent中学习而得
Q-values,Q-table: (state,action)
$$
Q(state,action) \leftarrow 
(1-\alpha) Q(state,action)+ \alpha(reward + \gamma max_{a}Q(next state,all actions))
$$
Q-table是一个表，存储了每个状态下 每个action的Q-value

| ~~~ | Actions| | | | | | 
| ------ | ------ | ------ | ------ | ------ |------ | ------ |
|States| South(0) | North(1) | East(2) | West(3)| Pickup(4)| Dropoff(5)|
|0 |  |  |  |  |  | |
|... |  |  |  |  |  | |
|327 |  |  |  |  |  | |
|... |  |  |  |  |  | |
|499|  |  |  |  |  | |


基本步骤如下：
* 初始化Q-table为0
* 开始探索动作，从当前状态s中任意选择一个action(a)
* 通过action(a) 转移到下一个状态$S^{\prime}$
* 对于所有的状态$S^{\prime}$，选择最高Q值的那个
* 更新Q-table
* 令next state为当前state
* 如果到达目标，则结束并重复当前过程

## Traning the Agent

In [34]:
## 初始化Q-table为500x6的表格
import numpy as np
q_table = np.zeros([env.observation_space.n,env.action_space.n])

import random
from IPython.display import clear_output

alpha = 0.1 # 学习率，每次更新的程度
gamma = 0.6 # 折扣因子，对未来奖励的重视程度
epsilon = 0.1 #探索率

all_epochs = []
all_penalties = []

for i in range(1,100001):
    state = env.reset()[0]

    epochs,penalties,reward = 0,0,0
    done = False

    while not done:
        if random.uniform(0,1)<epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(q_table[state]))
        
        next_state,reward,done,_,info = env.step(action)

        old_value = q_table[state,action]
        next_max = np.max(q_table[next_state])

        new_value = (1-alpha)*old_value + alpha*(reward + gamma*next_max)
        q_table[state,action] = new_value
        if reward == -10:
            penalties +=1
            
        state = next_state
        epochs += 1
    
    if i% 100 ==0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")


Episode: 100000
Training finished.



In [32]:
a = np.array([1,2,3])
print(type(np.argmax(a).item()))
print(env.reset())

<class 'int'>
(308, {'prob': 1.0, 'action_mask': array([1, 1, 0, 0, 0, 0], dtype=int8)})


In [35]:
q_table[328]

array([ -2.40917971,  -2.27325184,  -2.40333878,  -2.36018312,
       -11.00438004, -10.07730281])

### Evaluate agent: performance after Q-learning



In [38]:
total_epochs, total_penalties = 0,0
episodes = 100
for _ in range(episodes):
    state = env.reset()[0]
    epochs, penalties,reward = 0,0,0

    done = False

    while not done:
        action = np.argmax(q_table[state])
        state,reward,done,_,info = env.step(action)

        if reward == -10:
            penalties += 1
        epochs += 1

    total_epochs += epochs
    total_penalties += penalties

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")



Results after 100 episodes:
Average timesteps per episode: 13.6
Average penalties per episode: 0.0


$\alpha$: 随着知识库变大而降低
$\gamma$: 如果
$\epsilon$: 探索率，更少的探索更多的利用策略应该减少