In [45]:
import numpy as np
from mapgen import Dungeon
from PIL import Image

In [2]:
from ray.tune.registry import register_env

# Reward

There are two main modifications to the reward. The first modification alters the original env reward by giving +10 for discovering new cells and zero otherwise. This way, we reinforce exploration, and since we do not need to think about collisions in this problem, we can use zero as the default reward. The second modification is based on the Intrinsic Curiosity Module from the paper "Curiosity-driven Exploration by Self-supervised Prediction". This method uses another network to estimate the novelty of a state as the error between the predicted and actual latent spaces of a randomly initialized network. It serves as an additional incentive to explore states even if the environment reward is 0.

In [137]:
from gym import spaces

class ModifiedDungeon(Dungeon):
    """Dungeon variant whose reward depends only on discovering new cells.

    The constructor forwards all of its arguments unchanged to ``Dungeon``.
    The only behavioural difference is in ``step``, which discards the base
    environment's reward and substitutes an exploration-based one.
    """
    def __init__(self,
        width=20,
        height=20,
        max_rooms=3,
        min_room_xy=5,
        max_room_xy=12,
        observation_size=11,
        vision_radius=5,        
        max_steps: int = 400
    ):
        # The caller's observation_size is forwarded as given. It used to be
        # overwritten with a hard-coded 11 here, which silently ignored any
        # value supplied through the env config.
        super().__init__(
            width=width,
            height=height,
            max_rooms=max_rooms,
            min_room_xy=min_room_xy,
            max_room_xy=max_room_xy,
            observation_size=observation_size,
            vision_radius=vision_radius,
            max_steps=max_steps
        )

    def step(self, action):
        """Advance the base environment and replace its reward.

        Returns the base ``(observation, reward, done, info)`` tuple with the
        reward set to 10.0 when ``info["is_new"]`` is truthy and 0.0 otherwise.
        """
        observation, _, done, info = super().step(action)
        reward = 10. if info["is_new"] else 0.
        return observation, reward, done, info

In [138]:
# Make the modified environment available to RLlib under the name "Dungeon";
# the env config dict is expanded into ModifiedDungeon's keyword arguments.
register_env(
    "Dungeon",
    lambda env_config: ModifiedDungeon(**env_config),
)

In [139]:
import ray.rllib.agents.ppo as ppo

def ppo_config():
    """Build the PPO trainer config used for the "Dungeon" environment.

    Starts from a copy of ``ppo.DEFAULT_CONFIG`` and overrides the runtime,
    environment, model, optimisation and exploration entries. Exploration is
    set to the "Curiosity" type with "StochasticSampling" as sub-exploration.

    Returns:
        The resulting config dict.
    """
    def conv_stack():
        # A fresh list on every call, so the policy model and the curiosity
        # feature net do not share the same filter objects.
        return [
            [channels, (3, 3), stride]
            for channels, stride in ((16, 2), (32, 2), (32, 1))
        ]

    config = ppo.DEFAULT_CONFIG.copy()

    # Runtime and environment selection.
    config.update({
        "num_gpus": 0,
        "log_level": "INFO",
        "framework": "torch",
        "env": "Dungeon",
        "env_config": dict(
            width=20,
            height=20,
            max_rooms=3,
            min_room_xy=5,
            max_room_xy=10,
            observation_size=11,
            vision_radius=5,
        ),
    })

    # Policy / value network.
    config["model"] = dict(
        conv_filters=conv_stack(),
        post_fcnet_hiddens=[32],
        post_fcnet_activation="relu",
        vf_share_layers=False,
    )

    # Rollout and loss settings.
    config.update({
        "rollout_fragment_length": 100,
        "entropy_coeff": 0.1,
        "lambda": 0.95,
        "vf_loss_coeff": 1.0,
        "num_workers": 0,
    })

    # Curiosity-based exploration settings.
    feature_net = dict(
        fcnet_hiddens=[],
        fcnet_activation="relu",
        conv_filters=conv_stack(),
    )
    config["exploration_config"] = dict(
        type="Curiosity",
        eta=1.0,
        lr=0.001,
        feature_dim=288,
        feature_net_config=feature_net,
        inverse_net_hiddens=[256],
        inverse_net_activation="relu",
        forward_net_hiddens=[256],
        forward_net_activation="relu",
        beta=0.2,
        sub_exploration={"type": "StochasticSampling"},
    )
    return config

In [140]:
from torch.utils.tensorboard import SummaryWriter


def _flatten_hparams(config, prefix=""):
    """Flatten a nested config dict into values accepted by ``add_hparams``.

    Nested dicts are expanded into dotted keys. bool/int/float/str values are
    kept as they are; every other value (lists, None, classes, ...) is
    stored as its ``str()`` representation.
    """
    flat = {}
    for key, value in config.items():
        name = f"{prefix}{key}"
        if isinstance(value, dict):
            flat.update(_flatten_hparams(value, prefix=f"{name}."))
        elif isinstance(value, (bool, int, float, str)):
            flat[name] = value
        else:
            flat[name] = str(value)
    return flat


def train_agent(config):
    """Train a PPO agent, checkpointing and logging to TensorBoard.

    Each iteration runs one ``agent.train()`` step, saves a checkpoint under
    the module-level ``CHECKPOINT_ROOT`` (which must be defined before this
    function is called) and logs the learner stats. Every fifth iteration a
    rollout of at most 500 steps is played in a fresh ``Dungeon``; its
    exploration statistics are logged and its frames are written to
    ``out.gif``.

    Args:
        config: trainer config dict passed to ``ppo.PPOTrainer``.
    """
    agent = ppo.PPOTrainer(config)
    writer = SummaryWriter()
    N_ITER = 500
    s = "{:3d} reward {:6.2f}/{:6.2f}/{:6.2f} len {:6.2f} saved {}"

    rates = []
    try:
        for n in range(N_ITER):
            result = agent.train()
            file_name = agent.save(CHECKPOINT_ROOT)

            timesteps = result['timesteps_total']
            learner_stats = result["info"]["learner"]["default_policy"]["learner_stats"]
            writer.add_scalar('Loss/Entropy Loss', learner_stats["entropy"], timesteps)
            writer.add_scalar('Loss/Value Loss', learner_stats["vf_loss"], timesteps)
            writer.add_scalar('Loss/Policy Loss', learner_stats["policy_loss"], timesteps)
            writer.add_scalar('Eprewmean', result['episode_reward_mean'], timesteps)
            print(s.format(
                n + 1,
                result["episode_reward_min"],
                result["episode_reward_mean"],
                result["episode_reward_max"],
                result["episode_len_mean"],
                file_name
            ))
            if (n+1)%5 == 0:
                env = Dungeon(20, 20, 3, min_room_xy=5, max_room_xy=10, vision_radius=5)
                obs = env.reset()
                Image.fromarray(env._map.render(env._agent)).convert('RGB').resize((500, 500), Image.NEAREST).save('tmp.png')

                frames = []

                for _ in range(500):
                    action = agent.compute_single_action(obs)

                    # Render before stepping so the first frame shows the initial state.
                    frame = Image.fromarray(env._map.render(env._agent)).convert('RGB').resize((500, 500), Image.NEAREST).quantize()
                    frames.append(frame)
                    obs, _, done, info = env.step(action)

                    if done:
                        break

                explore_rate = info["total_explored"] / info["total_cells"]
                print('Explore rate: ', explore_rate)
                rates.append(explore_rate)
                writer.add_scalar('Eval/Explored cells', info["total_explored"], timesteps)
                writer.add_scalar('Eval/Total cells', info["total_cells"], timesteps)
                writer.add_scalar('Eval/Avg Explore Rate', info["avg_explored_per_step"], timesteps)
                frames[0].save("out.gif", save_all=True, append_images=frames[1:], loop=0, duration=1000/60)

        # add_hparams rejects dict/list values, so the nested trainer config
        # has to be flattened before it is logged.
        if rates:
            writer.add_hparams(
                _flatten_hparams(config),
                {"hparam/Explored": float(np.mean(rates))},
            )
    finally:
        # Flush and release the event file even when training is interrupted.
        writer.close()

In [141]:
import shutil
import os

# Output locations for this run. Anything left over from a previous run is
# removed first; directories that do not exist are ignored.
CHECKPOINT_ROOT = "tmp/ppo/dungeon"
ray_results = "tmp/ray_results1/"

for _stale_dir in (CHECKPOINT_ROOT, ray_results):
    shutil.rmtree(_stale_dir, ignore_errors=True)

In [142]:
# Build the PPO config and start the training loop.
training_config = ppo_config()
train_agent(training_config)

2021-10-25 21:40:47,660	INFO torch_policy.py:136 -- TorchPolicy (worker=local) running on CPU.
2021-10-25 21:40:47,734	INFO rollout_worker.py:1345 -- Built policy map: {'default_policy': <ray.rllib.policy.policy_template.PPOTorchPolicy object at 0x200953af0>}
2021-10-25 21:40:47,735	INFO rollout_worker.py:1346 -- Built preprocessor map: {'default_policy': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x2009538e0>}
2021-10-25 21:40:47,736	INFO rollout_worker.py:603 -- Built filter map: {'default_policy': <ray.rllib.utils.filter.NoFilter object at 0x200953d00>}


  1 reward 290.00/407.00/610.00 len 392.00 saved tmp/ppo/dungeon/checkpoint_000001/checkpoint-1
  2 reward 290.00/479.05/810.00 len 377.86 saved tmp/ppo/dungeon/checkpoint_000002/checkpoint-2
  3 reward 290.00/532.19/980.00 len 371.44 saved tmp/ppo/dungeon/checkpoint_000003/checkpoint-3
  4 reward 290.00/566.98/1010.00 len 365.91 saved tmp/ppo/dungeon/checkpoint_000004/checkpoint-4
  5 reward 290.00/585.93/1010.00 len 366.20 saved tmp/ppo/dungeon/checkpoint_000005/checkpoint-5
Explore rate:  1.0
  6 reward 290.00/589.06/1010.00 len 369.05 saved tmp/ppo/dungeon/checkpoint_000006/checkpoint-6
  7 reward 290.00/588.53/1010.00 len 369.75 saved tmp/ppo/dungeon/checkpoint_000007/checkpoint-7
  8 reward 290.00/594.82/1010.00 len 372.68 saved tmp/ppo/dungeon/checkpoint_000008/checkpoint-8
  9 reward 290.00/592.81/1010.00 len 371.12 saved tmp/ppo/dungeon/checkpoint_000009/checkpoint-9
 10 reward 270.00/617.10/1240.00 len 366.92 saved tmp/ppo/dungeon/checkpoint_000010/checkpoint-10
Explore rate:

Explore rate:  1.0
 81 reward  60.00/702.80/1450.00 len 359.27 saved tmp/ppo/dungeon/checkpoint_000081/checkpoint-81
 82 reward  60.00/717.50/1450.00 len 364.42 saved tmp/ppo/dungeon/checkpoint_000082/checkpoint-82
 83 reward  60.00/735.80/1450.00 len 362.62 saved tmp/ppo/dungeon/checkpoint_000083/checkpoint-83
 84 reward  60.00/739.40/1450.00 len 365.00 saved tmp/ppo/dungeon/checkpoint_000084/checkpoint-84
 85 reward  60.00/747.70/1450.00 len 363.29 saved tmp/ppo/dungeon/checkpoint_000085/checkpoint-85
Explore rate:  1.0
 86 reward  60.00/754.10/1450.00 len 368.23 saved tmp/ppo/dungeon/checkpoint_000086/checkpoint-86


KeyboardInterrupt: 