In [120]:

%load_ext autoreload 
# %aimport rl_envs.grid_world_env

%autoreload 2
import torch
import math
from torch.utils.tensorboard import SummaryWriter # type: ignore

from rl_envs.gym_grid_world_env import GridWorldEnv
from agents.policy_gradient import PGAgent
from tools.helper import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [121]:
# --- Optimisation hyperparameters -------------------------------------------
# Step size handed to the agent's optimizer and the discount applied to
# future rewards when computing returns.
LEARN_RATE, DISCOUNTED_FACTOR = 1e-4, 0.9

# --- Reward shaping ----------------------------------------------------------
# Only reaching the target is rewarded; forbidden cells and wall hits carry
# no penalty in this configuration.
FORBIDDEN_REWARD, HITWALL_REWARD, TARGET_REWARD = 0, 0, 1

In [122]:
# Fixed-layout grid world: six forbidden cells and a single target cell, with
# the reward values taken from the configuration constants.
_forbidden_cells = [(1, 1), (1, 2), (2, 2), (3, 1), (3, 3), (4, 1)]
_target_cells = [(3, 2)]
env = GridWorldEnv(
    fixed_map=True,
    forbidden_grids=_forbidden_cells,
    target_grids=_target_cells,
    forbidden_reward=FORBIDDEN_REWARD,
    hit_wall_reward=HITWALL_REWARD,
    target_reward=TARGET_REWARD,
)

In [123]:
# Two independent, initially empty lists for per-episode statistics.
# NOTE(review): neither list is referenced by the cells visible here.
episode_rewards, episode_lengths = [], []


In [125]:
# Policy-gradient training run.
#
# Outline:
#   1. Collect a warm-up trajectory with the agent's behavior policy.
#   2. For each episode: run `epochs` passes of per-step policy-gradient
#      updates over the current trajectory, then collect a fresh trajectory
#      with the (updated) policy for the next episode.
#
# Scalars are logged to TensorBoard under 'Loss' (per update) and
# 'episodeLoss' (summed over one pass of the trajectory).
agent = PGAgent(2, env.action_n, lr = LEARN_RATE, discounted_factor=DISCOUNTED_FACTOR)
writer = SummaryWriter()
num_episodes = 20
episode_len = 3000
epochs = 100
warmup_steps = 1000   # length of the initial behavior-policy rollout
max_grad_norm = 100   # gradient-norm clipping threshold
iter_counter = 0


def _collect_trajectory(env, select_action, num_steps):
    """Reset ``env`` and roll out ``num_steps`` transitions.

    Args:
        env: environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns a 5-tuple starting with ``(obs, reward)``.
            Observations are indexed with ``obs['agent']``.
        select_action: callable mapping a state tuple to an action.
        num_steps: number of environment steps to take.

    Returns:
        List of ``(state, action, reward)`` tuples in time order.
    """
    obs, _ = env.reset()
    steps = []
    for _ in range(num_steps):
        state = tuple(obs['agent'])
        action = select_action(state)
        # NOTE(review): the terminated/truncated flags are ignored, exactly as
        # in the original loop; confirm the environment is meant to keep
        # stepping after reaching the target.
        obs, reward, _terminated, _truncated, _info = env.step(action)
        steps.append((state, action, reward))
    return steps


def _discounted_returns(rewards, gamma):
    """Return G_t = sum_k gamma**k * rewards[t + k] for every index t.

    Uses a single reverse pass (G_t = r_t + gamma * G_{t+1}), so the cost is
    linear in ``len(rewards)`` instead of quadratic.
    """
    returns = [0.0] * len(rewards)
    running = 0.0
    for idx in range(len(rewards) - 1, -1, -1):
        running = rewards[idx] + gamma * running
        returns[idx] = running
    return returns


# The first collection uses the behavior policy instead of the learned policy
# (presumably a random/exploratory policy; see PGAgent.get_behavior_action).
trajectory = _collect_trajectory(env, agent.get_behavior_action, warmup_steps)

try:
    for episode in range(num_episodes):
        # The returns depend only on the trajectory, which stays fixed for all
        # epochs of this episode, so compute them once here.
        returns = _discounted_returns(
            [reward for _, _, reward in trajectory], DISCOUNTED_FACTOR
        )

        for epoch in range(epochs):
            total_l = 0.0
            for (state, action, _), discounted_reward in zip(trajectory, returns):
                # Important: in log pi(a|s), pi(a|s) is the probability of
                # choosing action a, so the policy network must output a
                # probability rather than some value for a. (Raw output values
                # could also be normalised and used as action probabilities.)
                #
                # Possible variant: compute and store the probability when the
                # action is sampled, then at update time only compute the
                # returns and the loss, sum the losses of a whole episode,
                # call backward once and update the policy network once.
                action_probs = agent.policy_net(torch.tensor(state, dtype=torch.float))
                agent.optimizer.zero_grad()
                loss = -torch.log(action_probs[action]) * discounted_reward
                loss.backward()
                torch.nn.utils.clip_grad.clip_grad_norm_(agent.policy_net.parameters(), max_grad_norm)
                agent.optimizer.step()

                # Convert to a Python float before logging/accumulating so the
                # running total does not keep autograd history alive.
                loss_value = loss.item()
                writer.add_scalar('Loss', loss_value, iter_counter)
                iter_counter += 1
                total_l += loss_value
            writer.add_scalar('episodeLoss', total_l, episode*epochs + epoch)

        # Generate the next episode with the current policy; action selection
        # is stochastic here as well.
        # Open question from the author: does the initial policy have a large
        # influence on the outcome?
        trajectory = _collect_trajectory(env, agent.get_action, episode_len)
finally:
    # Always persist the logged scalars, even when the (long-running) cell is
    # interrupted from the keyboard.
    writer.flush()
    writer.close()

KeyboardInterrupt: 

In [126]:

# Build the policy table for every (row, col) cell of the grid and display it
# with the project helper.
policy = agent.generate_policy_table(env.height, env.width)
print_by_dict(env, policy)

# Print the arg-max action of each cell, one bracketed line per grid row,
# e.g. "[ a b c ]".
for i in range(env.height):
    row_symbols = []
    for j in range(env.width):
        state = (i, j)
        action = np.argmax(policy[state])
        row_symbols.append(env.action_mappings[action])
    print("[", *row_symbols, "]")

[ [[0.39561397 0.12332556 0.05199818 0.22307943 0.20598286]] [[0.5060537  0.07135594 0.04576454 0.21237789 0.16444787]] [[0.44437984 0.1300606  0.06464551 0.24568307 0.11523104]] [[0.45169136 0.13823198 0.08859667 0.24091221 0.08056778]] [[0.44508648 0.14166999 0.11594912 0.24101715 0.05627727]] ]
[ [[0.41710952 0.28278688 0.02887618 0.15853842 0.11268894]] [[0.5682794  0.14071025 0.03349277 0.1442297  0.11328786]] [[0.47954184 0.13724673 0.03768829 0.2255518  0.11997138]] [[0.36380222 0.17916325 0.0440016  0.33160406 0.08142883]] [[0.33177203 0.17673226 0.05406205 0.38231882 0.05511483]] ]
[ [[0.20720705 0.54485005 0.04130831 0.10563801 0.10099661]] [[0.41630587 0.27707475 0.06078083 0.11398135 0.13185729]] [[0.53760934 0.07217819 0.04869492 0.15366492 0.18785273]] [[0.2905602  0.14592473 0.04948127 0.40153116 0.11250268]] [[0.16875313 0.1806551  0.0341997  0.5577638  0.05862822]] ]
[ [[0.13277824 0.5957768  0.06302667 0.09551053 0.11290771]] [[0.15818414 0.45173243 0.1574121  0.09228

In [None]:
# gridworld_demo(agent, forbidden_reward=FORBIDDEN_REWARD, hit_wall_reward=HITWALL_REWARD, target_reward=TARGET_REWARD)