In [6]:

%load_ext autoreload 
# %aimport rl_envs.grid_world_env

%autoreload 2
import torch
import math
from torch.utils.tensorboard import SummaryWriter # type: ignore

from rl_envs.gym_grid_world_env import GridWorldEnv
from agents.policy_gradient import PGAgent
from tools.helper import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
LEARN_RATE = 1e-4
DISCOUNTED_FACTOR = 0.9

FORBIDDEN_REWARD = -1
HITWALL_REWARD = -1
TARGET_REWARD = 1

In [8]:
env = GridWorldEnv(size=3,fixed_map = True, forbidden_grids=[(1,1)], target_grids=[(2,2)], forbidden_reward=FORBIDDEN_REWARD, hit_wall_reward=HITWALL_REWARD, target_reward=TARGET_REWARD)
# env = GridWorldEnv(fixed_map = True, forbidden_grids=[(1,1),(1,2), (2,2),(3,1),(3,3),(4,1)], target_grids=[(3,2)], forbidden_reward=FORBIDDEN_REWARD, hit_wall_reward=HITWALL_REWARD, target_reward=TARGET_REWARD)

In [9]:
episode_rewards = []
episode_lengths = []


In [10]:
agent = PGAgent(2, env.action_n, lr = LEARN_RATE, discounted_factor=DISCOUNTED_FACTOR)
writer = SummaryWriter()
num_episodes = 20000
episode_len = 100
epochs = 1
iter_counter = 0
# 第一次收集改为随机收集
trajectory = []
obs, _ = env.reset()
# for _ in range(1000):
#     state = tuple(obs['agent'])
#     action = agent.get_behavior_action(state)
#     obs, reward, terminated, truncated, info = env.step(action)
#     trajectory.append((state, action, reward+10))
running_reward = -10
for episode in range(num_episodes):
    # 首先, 根据 policy 生成 episode
    obs, _ = env.reset()
    trajectory = []
    ep_reward = 0
    real_episode_len = 0
    # 初始策略是不是有比较大的影响? 
    for real_episode_len in range(episode_len):
        state = tuple(obs['agent'])
        action = agent.get_action(state) # action 这里也有随机性
        obs, reward, terminated, truncated, info = env.step(action)
        trajectory.append((state, action, reward))
        ep_reward += reward
        if terminated or truncated:
            break
    running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
    for epoch in range(epochs):
        total_l = 0
        discounted_reward = 0
        for t, (state, action, reward) in enumerate(trajectory):
        # for state, action, reward in reversed(trajectory):
            # discounted_reward = sum(DISCOUNTED_FACTOR**i * t[2] for i, t in enumerate(trajectory[t:]))
            discounted_reward = discounted_reward * agent.discounted_factor + reward
            # policy update
            """
            特别注意: 这里 log π 中的 π(a|s) 是选择 a 的概率, policy network 得输出一个概率, 而不是什么 a 的值
            当然我们可以用输出的值, 归一化一下作为 action 的概率

            有一个 变体可能, 在 sample action 时就计算 prob并存储, 然后在 update 时就只是计算 reward 从而计算 loss, 将一个 episode 的loss 都加到一起来一起 backward, 然后更新一次 policy network
            感觉这种变体才是对的, action_prob , 现在计算的有点迟 (但是影响应该不大)
            主要还是得加 advantage 的操作

            """
            action_probs = agent.policy_net(torch.tensor(state, dtype=torch.float))
            agent.q[state][action] = discounted_reward 
            agent.v[state] = sum([agent.q[state][a] * action_probs[a] for a in agent.q[state].keys()])
            # agent.v[state] = np.mean(agent.q[state])

            # action_probs = actions_val/actions_val.sum()
            agent.optimizer.zero_grad()
            """
            当 discounted reward < 0 时, loss < 0. 若是 action 选择错误, 则 discounted_reward 小, 使得 loss 小 (或者说负地厉害) 
            梯度下降会将 loss 减地更小, 也就使得对应错误 action 的 action_probs[action] 减小

            相反, 当选择正确 action 时, discounted_reward 理想下应该更大, 则 loss 也更大, 梯度下降同样降低 loss,
            使得对应正确 action 的 action_probs[action] 减小. 
            
            关键就在于, 要使得 loss 小的时候梯度下降地比 loss 大的时候要更快.

            (若是训练地成功 下一轮时 discounted_reward 就会变大, 那么 loss 也就是越来越大, 自然就是向上走,
            至于为什么 loss 会趋近于 0, 我猜测是因为 discounted_reward 有一个由负变正的过程, 而在其中当 loss 变为 0 时
            ) 
            """
            # loss = -torch.log(action_probs[action]) * (discounted_reward)
            loss = -torch.log(action_probs[action]) * (discounted_reward - agent.v[state]) # add baselline advantage
            # [parms.grad for name, parms in agent.policy_net.named_parameters()]
            # loss = abs(loss)
            loss.backward()
            torch.nn.utils.clip_grad.clip_grad_norm_(agent.policy_net.parameters(), 100)
            agent.optimizer.step()
            writer.add_scalar('Loss', loss, iter_counter)
            iter_counter+=1
            total_l += loss
        writer.add_scalar('episodeLoss', total_l, episode*epochs + epoch)
        writer.add_scalar('episodeReward', discounted_reward, episode*epochs + epoch)
        writer.add_scalar('ep_reward', ep_reward, episode*epochs + epoch)


    if episode % 100 == 0:
        print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                episode, ep_reward, running_reward))
    if running_reward > 0.6:
        print("Solved! Running reward is now {} and "
                "the last episode runs to {} time steps!".format(running_reward, real_episode_len))
        break


writer.flush()
writer.close()

Episode 0	Last reward: -6.00	Average reward: -9.80
Episode 100	Last reward: -14.00	Average reward: -6.77
Episode 200	Last reward: 0.00	Average reward: -7.14
Episode 300	Last reward: -1.00	Average reward: -4.57
Episode 400	Last reward: -1.00	Average reward: -6.95
Episode 500	Last reward: -3.00	Average reward: -5.83
Episode 600	Last reward: -3.00	Average reward: -7.40
Episode 700	Last reward: -7.00	Average reward: -6.24
Episode 800	Last reward: -5.00	Average reward: -4.97
Episode 900	Last reward: -8.00	Average reward: -5.16
Episode 1000	Last reward: 0.00	Average reward: -4.32
Episode 1100	Last reward: -8.00	Average reward: -5.16
Episode 1200	Last reward: -2.00	Average reward: -5.53
Episode 1300	Last reward: -13.00	Average reward: -5.51
Episode 1400	Last reward: -1.00	Average reward: -5.05
Episode 1500	Last reward: 0.00	Average reward: -3.65
Episode 1600	Last reward: 0.00	Average reward: -3.29
Episode 1700	Last reward: -20.00	Average reward: -5.16
Episode 1800	Last reward: -3.00	Average r

In [11]:

policy = agent.generate_policy_table(env.height, env.width)

print_by_dict(env, policy)

for i in range(env.height):
    print("[", end=" ")
    for j in range(env.width):
        state = (i,j)
        action = np.argmax(policy[state])
        print(env.action_mappings[action], end=" ")
    print("]")

[ [[0.27010414 0.18948609 0.09536619 0.09522164 0.34982193]] [[0.46058697 0.08359051 0.06711915 0.06711915 0.32158417]] [[0.64286083 0.02568266 0.02568266 0.02568266 0.28009114]] ]
[ [[0.09970145 0.48710915 0.06999383 0.06440948 0.27878618]] [[0.29346988 0.27921018 0.07374656 0.07013351 0.28343987]] [[0.7018638  0.0395993  0.02760158 0.02760158 0.20333369]] ]
[ [[0.01471615 0.79551375 0.01542892 0.01471615 0.15962514]] [[0.02852456 0.77766603 0.01956342 0.01946699 0.15477894]] [[0.23398921 0.3908543  0.02886593 0.02886593 0.31742465]] ]
[  ↺   ↓   ↓  ]
[  →   ↓   ↓  ]
[  →   →   →  ]


In [12]:
# env.max_steps = 10
# gridworld_demo(agent, env, repeat_times=500)
# gridworld_demo(agent, forbidden_reward=FORBIDDEN_REWARD, hit_wall_reward=HITWALL_REWARD, target_reward=TARGET_REWARD)