In [1]:
from pathlib import Path

import gym
import navstack_gym
import torch

import numpy as np
import matplotlib.pyplot as plt
import h5py

from gym.wrappers import TimeLimit, FrameStack
from algo.wrapper import ResizeObservation, TensorObservation

from pprint import pprint as p
from algo import utils
from algo.agent import make_DrQ_agent



In [2]:
################# Task Setting ####################
# Configuration files are looked up under ./conf/
TARGET = 'treasure_hunt_with_fixed_all'
EXPERIMENT_NAME = 'maintask_meta_fixed_all'

# Experiment names of the three pretrained subtask agents.
map_agent = 'subtask_map_fixed_room'
chest_agent = 'subtask_chest_fixed_room'
key_agent = 'subtask_key_fixed_room'

# Each saved agent lives in the 'agent' directory of its own workspace.
map_agent_path, chest_agent_path, key_agent_path = (
    utils.get_workspace_path(name) / 'agent'
    for name in (map_agent, chest_agent, key_agent)
)

# Number of subtask policies the meta agent chooses between.
n_subtask = 3

# Load the training config with the task / experiment overrides applied.
overrides = [f'task={TARGET}', f'experiment_name={EXPERIMENT_NAME}']
cfg = utils.load_param('conf', 'train_config.yaml', overrides)

utils.create_workspace(cfg.experiment_name)
workspace_path = utils.get_workspace_path(cfg.experiment_name)

p(cfg)

# Presets picked from the loaded config: keyword arguments for gym.make()
# and for env.reset() respectively.
env_mode = cfg.env_config.full
room_mode = cfg.room_config.hodoyoi

# Current CUDA device index when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    gpu = torch.cuda.current_device()
else:
    gpu = -1

set workspace: work/maintask_meta
{'room_config': {'easy': {'scene_obstacle_count': 10, 'scene_obstacle_size': 0.7, 'scene_target_size': 0.2, 'scene_key_size': 0.2, 'scene_obstacle_zone_thresh': 1.5, 'scene_distance_key_placing': 0.7, 'scene_range_key_placing': 0.3, 'scene_room_length_max': 9, 'scene_room_wall_thickness': 0.05, 'scene_wall_threshold': 0.1}, 'crowded': {'scene_obstacle_count': 15, 'scene_obstacle_size': 0.6, 'scene_target_size': 0.2, 'scene_key_size': 0.2, 'scene_obstacle_zone_thresh': 1.5, 'scene_distance_key_placing': 0.7, 'scene_range_key_placing': 0.3, 'scene_room_length_max': 9, 'scene_room_wall_thickness': 0.05, 'scene_wall_threshold': 0.1}, 'hodoyoi': {'scene_obstacle_count': 13, 'scene_obstacle_size': 0.6, 'scene_target_size': 0.3, 'scene_key_size': 0.2, 'scene_obstacle_zone_thresh': 1.0, 'scene_distance_key_placing': 0.7, 'scene_range_key_placing': 0.3, 'scene_room_length_max': 9, 'scene_room_wall_thickness': 0.05, 'scene_wall_threshold': 0.2}}, 'env_config': {

In [3]:
# Build the environment and wrap it, innermost to outermost:
# base env -> resize -> frame stack -> tensor conversion -> time limit.
env = TimeLimit(
    TensorObservation(
        FrameStack(
            ResizeObservation(
                gym.make(cfg.env_id, **env_mode),
                cfg.observation_resize,
            ),
            cfg.stack_num,
        )
    ),
    max_episode_steps=cfg.max_episode_steps,
)

# Optionally fix NumPy's global RNG before the first reset.
if cfg.set_seed:
    np.random.seed(cfg.env_seed)

# First reset uses the selected room preset.
obs = env.reset(**room_mode)

  logger.warn(
  return torch.as_tensor(observation, dtype=torch.float32)


In [4]:
# Every subtask agent is constructed with the same settings; the pretrained
# weights are loaded into them in a later cell.
drq_agent_kwargs = dict(
    experiment_name=cfg.experiment_name,
    obs_space=env.observation_space,
    action_space=env.action_space,
    feature_dim=cfg.feature_dim,
    hidden_dim=cfg.hidden_dim,
    lr=cfg.lr,
    image_pad=cfg.image_pad,
    gamma=cfg.gamma,
    replay_start_size=cfg.replay_start_size,
    capacity=cfg.capacity,
    gpu=gpu,
    batch_size=cfg.batch_size,
    update_interval=cfg.update_interval,
    is_persistent_buffer=False,
)

# One independent DrQ agent per subtask slot.
subtasks = [make_DrQ_agent(**drq_agent_kwargs) for _ in range(n_subtask)]


set workspace: work/maintask_meta
set workspace: work/maintask_meta
set workspace: work/maintask_meta


In [5]:
# Slot order is fixed: 0 = map, 1 = chest, 2 = key.
pretrained_agent_dirs = (map_agent_path, chest_agent_path, key_agent_path)
for slot, agent_dir in enumerate(pretrained_agent_dirs):
    subtasks[slot].load(str(agent_dir))

In [6]:
# Switch every loaded subtask agent out of training mode.
for sub_agent in subtasks:
    sub_agent.training = False

## DQN Agent

In [7]:
import pfrl
# Q-function of the meta agent: one action per subtask, with the number of
# input channels set to the frame-stack size.
qfunc = pfrl.q_functions.DuelingDQN(
    n_actions=n_subtask,
    n_input_channels=cfg.stack_num,
)

# Epsilon-greedy schedule constants.
final_eps = 0.01
eval_eps = 0.001
final_exploration_frames = 50_000


In [8]:
# Optimizer over the meta agent's Q-function parameters.
opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
    qfunc.parameters(),
    lr=cfg.lr,
    alpha=0.95,
    momentum=0.0,
    eps=1e-2,
    centered=True,
)

# Replay buffer stored under the experiment workspace.
rbuf = pfrl.replay_buffers.PersistentEpisodicReplayBuffer(
    f'{str(workspace_path)}/rbuf',
    cfg.capacity,
)


def _random_subtask_index():
    """Return a uniformly random subtask index in [0, n_subtask)."""
    return np.random.randint(n_subtask)


# Epsilon decays linearly from 1.0 to final_eps over final_exploration_frames.
explorer = pfrl.explorers.LinearDecayEpsilonGreedy(
    start_epsilon=1.0,
    end_epsilon=final_eps,
    decay_steps=final_exploration_frames,
    random_action_func=_random_subtask_index,
)


def phi(x):
    """Convert an observation to a float32 NumPy array and divide it by 100."""
    return np.asarray(x, dtype=np.float32) / 100

In [9]:
# Meta agent: a Double DQN that picks which subtask policy acts next.
agent = pfrl.agents.DoubleDQN(
    q_function=qfunc,
    optimizer=opt,
    replay_buffer=rbuf,
    gamma=cfg.gamma,
    explorer=explorer,
    gpu=gpu,
    replay_start_size=cfg.replay_start_size,
    minibatch_size=cfg.batch_size,
    update_interval=cfg.update_interval,
    target_update_interval=5000,
    phi=phi,
)

## Train

In [10]:
# Per-episode training metrics are collected in one HDF5 file in the workspace.
result_filename = f'{str(workspace_path)}/result.hdf5'

# Group names and dataset labels; the training loop reuses the group names.
agent_statics_group = 'agent_statics'
agent_statics_labels = [s[0] for s in agent.get_statistics()]
episode_rewards_group = 'episode_rewards'
episode_rewards_labels = ['episode', 'total_reward']

with h5py.File(result_filename, 'a') as f:
    # The file is opened in append mode, so on a second run the datasets are
    # already there and create_dataset() would raise. require_dataset() creates
    # them when missing and otherwise reopens them, raising TypeError only if
    # the stored shape/dtype is incompatible (e.g. n_episodes was changed).
    # 'f4' is the dtype create_dataset() uses when none is given.
    for group, labels in (
        (agent_statics_group, agent_statics_labels),
        (episode_rewards_group, episode_rewards_labels),
    ):
        for l in labels:
            f.require_dataset(f'{group}/{l}', shape=(cfg.n_episodes,), dtype='f4')

import datetime
start = datetime.datetime.now()
print(f'\n====================\nStart Training: {start}\n====================\n')


Start Training: 2022-02-01 18:08:29.985541



In [11]:
# Train the meta agent: at every step it picks a subtask policy, and that
# policy produces the primitive action sent to the environment.
for i in range(cfg.n_episodes):
    obs = env.reset(is_generate_room=cfg.change_room, is_generate_pose=cfg.change_pose, **room_mode)
    R = 0  # return (sum of rewards)
    while True:
        # Uncomment to watch the behavior in a GUI window
        # env.render()
        select = agent.act(obs)

        ## select subtask
        sub = subtasks[select]
        action = sub.act(obs)

        obs, reward, done, info = env.step(action)
        R += reward

        # TimeLimit reports a step-limit cut-off as done=True and marks it in
        # info['TimeLimit.truncated']. Such a state is not terminal, so tell the
        # learner about it through `reset` instead of `done`; otherwise the
        # Q-targets would stop bootstrapping at every time-limited episode end.
        truncated = bool(info.get('TimeLimit.truncated', False))
        terminal = done and not truncated
        agent.observe(obs, reward, terminal, truncated)
        # The subtask agents were switched out of training mode earlier; they
        # keep receiving the plain end-of-episode signal, as before.
        sub.observe(obs, reward, done, done)

        if done:
            break

    # Fetch the statistics once and reuse them for both the log and the file.
    episode_stats = agent.get_statistics()

    print('=============')
    print(f'episode: {i}, reward: {R}')
    print(f'statics: {episode_stats}')

    ## record result
    with h5py.File(result_filename, 'a') as f:
        f[episode_rewards_group]['episode'][i] = i+1
        f[episode_rewards_group]['total_reward'][i] = R
        for label, value in episode_stats:
            f[agent_statics_group][label][i] = value

print('Finished')
print(f'\n====================\nTraining Time: {datetime.datetime.now() - start}\n====================\n')

agent.save(f'{str(workspace_path)}/agent')

episode: 0, reward: 48.000000000000036
statics: [('average_q', nan), ('average_loss', nan), ('cumulative_steps', 50), ('n_updates', 0), ('rlen', 50)]
episode: 1, reward: 97.99999999999977
statics: [('average_q', nan), ('average_loss', nan), ('cumulative_steps', 100), ('n_updates', 0), ('rlen', 100)]
episode: 2, reward: 48.00000000000003
statics: [('average_q', nan), ('average_loss', nan), ('cumulative_steps', 150), ('n_updates', 0), ('rlen', 150)]
episode: 3, reward: -2.000000000000001
statics: [('average_q', nan), ('average_loss', nan), ('cumulative_steps', 200), ('n_updates', 0), ('rlen', 200)]
episode: 4, reward: 148.0000000000001
statics: [('average_q', nan), ('average_loss', nan), ('cumulative_steps', 250), ('n_updates', 0), ('rlen', 250)]
episode: 5, reward: 97.99999999999996
statics: [('average_q', nan), ('average_loss', nan), ('cumulative_steps', 300), ('n_updates', 0), ('rlen', 300)]
episode: 6, reward: -2.000000000000001
statics: [('average_q', nan), ('average_loss', nan), ('

	addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
	addcmul_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1050.)
  square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)


episode: 40, reward: 48.00000000000003
statics: [('average_q', 0.17526743), ('average_loss', 34.688764980492685), ('cumulative_steps', 2050), ('n_updates', 26), ('rlen', 2050)]
episode: 41, reward: -2.000000000000001
statics: [('average_q', 0.18551989), ('average_loss', 39.119288754521634), ('cumulative_steps', 2100), ('n_updates', 51), ('rlen', 2100)]
episode: 42, reward: 48.000000000000036
statics: [('average_q', 0.17880005), ('average_loss', 39.32541632377788), ('cumulative_steps', 2150), ('n_updates', 76), ('rlen', 2150)]
episode: 43, reward: 97.99999999999997
statics: [('average_q', 0.1758468), ('average_loss', 40.32349971607327), ('cumulative_steps', 2200), ('n_updates', 101), ('rlen', 2200)]
episode: 44, reward: 48.000000000000014
statics: [('average_q', 0.17595136), ('average_loss', 37.77058039620519), ('cumulative_steps', 2250), ('n_updates', 126), ('rlen', 2250)]
episode: 45, reward: -2.000000000000001
statics: [('average_q', 0.17961743), ('average_loss', 35.27226596355438), 