This experiment: higher learning rate and larger rollouts.

In [1]:
import deep_rl

In [2]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [3]:
import torch
from torch.nn import functional as F
from torch.autograd import Variable
from torch import nn, optim
import torch.utils.data

# load as dask array
import time

import logging
import sys
import os
import glob
import numpy as np
import datetime
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

In [4]:
from deep_rl.utils import Config
from deep_rl.utils.logger import get_logger, get_default_log_dir

from deep_rl.network.network_heads import CategoricalActorCriticNet, QuantileNet, OptionCriticNet, DeterministicActorCriticNet, GaussianActorCriticNet
from deep_rl.network.network_bodies import FCBody
from deep_rl.utils.normalizer import RunningStatsNormalizer
from deep_rl.component.task import ParallelizedTask

In [5]:
from world_models_sonic.models.vae import VAE5, loss_function_vae
from world_models_sonic.helpers.summarize import TorchSummarizeDf
from world_models_sonic.models.rnn import MDNRNN2
from world_models_sonic.models.inverse_model import InverseModel
from world_models_sonic.models.world_model import WorldModel
from world_models_sonic.custom_envs.env import make_env
from world_models_sonic.custom_envs.wrappers import RandomGameReset
from world_models_sonic import config
from world_models_sonic.helpers.deep_rl import PPOAgent, run_iterations, SonicWorldModelDeepRL, CategoricalWorldActorCriticNet

Importing 0 potential games...
Imported 0 games


# Init

In [6]:
# TODO: build save names from a hash of
# agent_name, config.tag, agent.task.name
# and a timestamp

In [7]:
# Run-wide settings shared by every later cell.
cuda = torch.cuda.is_available()
env_name = 'sonic256'
z_dim = 512  # latent dimensions
channels = 3*4

# RNN
action_dim = 10
image_size = 128

verbose = True  # Set this true to render (and make it go slower)

# Checkpoints and logs for this run all live under one directory, so the
# path is built once and reused instead of being re-formatted at each use.
NAME = 'RNN_v3b_128im_512z_1512_v6i_VAE5_all'
log_dir = './outputs/{NAME}'.format(NAME=NAME)
ppo_save_file = '{log_dir}/PPO_512z_all_g.pkl'.format(log_dir=log_dir)

# exist_ok makes this safe to re-run and avoids a check-then-create race.
os.makedirs(log_dir, exist_ok=True)

# Log to file and stream
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(NAME)

print(log_dir)

deep_rl_logger = get_logger(
    NAME,
    file_name='deep_rl_ppo.log',
    level=logging.INFO,
    log_dir=log_dir, )

./outputs/RNN_v3b_128im_512z_1512_v6i_VAE5_all


# World model

In [8]:
# Construct the three sub-models, then wrap them in a single WorldModel.

# VAE
# TODO swap z and k dim, since it's inconsistent with other models
vae = VAE5(
    image_size=image_size,
    z_dim=128,
    conv_dim=64,
    code_dim=8,
    k_dim=z_dim,
    channels=channels,
)

# MDN-RNN (action_dim is reused unchanged from the init cell)
hidden_size = z_dim*2
n_mixture = 5
temp = 0.0
mdnrnn = MDNRNN2(z_dim, action_dim, hidden_size, n_mixture, temp)

# Inverse model
finv = InverseModel(z_dim, action_dim, hidden_size=z_dim*2)

world_model = WorldModel(
    vae,
    mdnrnn,
    finv,
    logger=deep_rl_logger,
    lambda_vae_kld=1 / 1024.,
    lambda_finv=1/100,
    lambda_vae=1,
    lambda_loss=1000,
)

# Put the model in training mode and move it to the GPU when one is available
world_model = world_model.train()
if cuda:
    world_model = world_model.cuda()

In [9]:
import torch.optim.lr_scheduler

# Ask the CUDA caching allocator to release memory it is not currently using
torch.cuda.empty_cache()

# One Adam optimizer over every world-model parameter, also attached to the
# model object itself as `world_model.optimizer`.
world_model.optimizer = optimizer = optim.Adam(
    world_model.parameters(),
    lr=1e-5,
)

# Train

In [10]:
# Sum of the MDN-RNN's latent, hidden and action sizes; used below as the
# input width of the policy's FCBody.
z_state_dim = sum((
    world_model.mdnrnn.z_dim,
    world_model.mdnrnn.hidden_size,
    world_model.mdnrnn.action_dim,
))


def task_fn(log_dir):
    """Create one SonicWorldModelDeepRL task that logs to `log_dir`.

    The task is handed a factory rather than an environment, so the Sonic
    environment is only built when the task calls that factory. `verbose` and
    `image_size` are read from the notebook's globals at call time.
    """
    def env_fn():
        sonic_env = make_env(
            'sonic',
            max_episode_steps=1000,
            to_gray=False,
            image_size=image_size,
        )
        return RandomGameReset(sonic_env)

    return SonicWorldModelDeepRL(env_fn=env_fn, log_dir=log_dir, verbose=verbose)

# NOTE(review): this rebinds `config`, shadowing the `config` module imported
# from world_models_sonic in the imports cell. Later cells rely on this name,
# so it is kept as-is.
config = Config()

verbose = False  # Set this true to render (and make it go slower)
config.num_workers = 1 if verbose else 8
config.task_fn = lambda: ParallelizedTask(
    task_fn, config.num_workers, single_process=config.num_workers == 1)
config.optimizer_fn = lambda params: torch.optim.RMSprop(params, 3e-4)
config.network_fn = lambda state_dim, action_dim: CategoricalWorldActorCriticNet(
    state_dim, action_dim, FCBody(z_state_dim, hidden_units=(64, 64), gate=F.relu), gpu=0 if cuda else -1, world_model_fn=lambda: world_model,
    render=(config.num_workers==1 and verbose),
    z_shape=(32, 16)
)
config.discount = 0.99
config.logger = deep_rl_logger
config.use_gae = True
config.gae_tau = 0.95
config.entropy_weight = 0.001
config.gradient_clip = 0.4
# Rollout length is per worker, so the total rollout stays at 64 steps
# regardless of how many workers are used.
config.rollout_length = 1*64//config.num_workers
config.optimization_epochs = 10
config.num_mini_batches = 8*1
config.ppo_ratio_clip = 0.2
config.iteration_log_interval = 10

# I tuned these so the intrinsic reward was 1) within an order of magnitude of the extrinsic. 2) smaller, 3) negative when stuck
# TODO use reward normalisers to avoid the need for these hyperparameters
config.curiosity_only = False
config.curiosity_weight = 0.1
config.curiosity_boredom = 0 # how many standard deviations above the mean does its new experience need to be, so it's not bored
config.intrinsic_reward_normalizer = RunningStatsNormalizer()
config.reward_normalizer = RunningStatsNormalizer()
agent = PPOAgent(config)

print('rollout of ', config.rollout_length*config.num_workers)
print('mini batch', (config.rollout_length*config.num_workers)//config.num_mini_batches)

# Resume from the previous checkpoint when there is one.
if os.path.isfile(ppo_save_file):
    print('loading ppo_save_file', ppo_save_file, 'modified', time.ctime(os.path.getmtime(ppo_save_file)))
    agent.load(ppo_save_file)

    # also load normalizers
    # Each normalizer is stored next to the agent checkpoint under its own
    # suffix. A checkpoint restored from a timestamped backup may not have
    # them, so a missing file is reported and skipped (the normalizer then
    # starts from fresh statistics) instead of crashing in torch.load.
    normalizer_saves = (
        ('-intrinsic_reward_normalizer.pkl', config.intrinsic_reward_normalizer),
        ('-reward_normalizer.pkl', config.reward_normalizer),
    )
    for suffix, normalizer in normalizer_saves:
        normalizer_file = ppo_save_file.replace('.pkl', suffix)
        if os.path.isfile(normalizer_file):
            normalizer.load_state_dict(torch.load(normalizer_file))
        else:
            print("couldn't find normalizer save", normalizer_file)
else:
    print("couldn't find save")

game: SonicTheHedgehog2-Genesis state: ChemicalPlantZone.Act2
game: SonicTheHedgehog2-Genesis state: WingFortressZone
game: SonicAndKnuckles3-Genesis state: HiddenPalaceZone
game: SonicTheHedgehog2-Genesis state: ChemicalPlantZone.Act1
game: SonicTheHedgehog2-Genesis state: AquaticRuinZone.Act2
game: SonicAndKnuckles3-Genesis state: IcecapZone.Act2
game: SonicTheHedgehog2-Genesis state: AquaticRuinZone.Act1
game: SonicTheHedgehog2-Genesis state: OilOceanZone.Act2
rollout of  64
mini batch 8
loading ppo_save_file ./outputs/RNN_v3b_128im_512z_1512_v6i_VAE5_all/PPO_512z_all_g.pkl modified Wed Jun  6 07:14:54 2018


Process ProcessWrapper-7:
Process ProcessWrapper-5:
Process ProcessWrapper-2:
Process ProcessWrapper-8:
Process ProcessWrapper-3:
Process ProcessWrapper-1:
Process ProcessWrapper-4:
Traceback (most recent call last):
  File "/home/wassname/.pyenv/versions/3.5.3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Process ProcessWrapper-6:
  File "/home/wassname/.pyenv/versions/3.5.3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/media/oldhome/wassname/Documents/projects/retro_sonic_comp/DeepRL/deep_rl/component/task.py", line 177, in run
    op, data = self.pipe.recv()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/wassname/.pyenv/versions/3.5.3/lib/python3.5/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/home/wassname/.pyenv/versions

In [11]:
# Display the layer structure of the actor-critic network held by the agent
agent.network.network

ActorCriticNet(
  (phi_body): FCBody(
    (layers): ModuleList(
      (0): Linear(in_features=1546, out_features=64, bias=True)
      (1): Linear(in_features=64, out_features=64, bias=True)
    )
  )
  (actor_body): DummyBody()
  (critic_body): DummyBody()
  (fc_action): Linear(in_features=64, out_features=10, bias=True)
  (fc_critic): Linear(in_features=64, out_features=1, bias=True)
)

In [12]:
# # if we want to reset the actor
# from deep_rl.network.network_heads import ActorCriticNet
# agent.network.network = ActorCriticNet(agent.network.z_state_dim, action_dim, FCBody(z_state_dim, hidden_units=(64, 64), gate=F.relu), None, None)
# agent.network.network.cuda()

In [13]:
# Train until interrupted. Whatever stops the run, the agent and its
# normalizers are checkpointed before the exception is re-raised.
try:
    run_iterations(agent, log_dir=log_dir)
except BaseException:
    # BaseException on purpose: stopping the run by hand raises
    # KeyboardInterrupt, which `except Exception` would not catch.
    try:
        if config.num_workers == 1:
            agent.task.tasks[0].env.close()
        else:
            for t in agent.task.tasks:
                t.close()
    except Exception as close_error:
        # Closing is best effort; a failure here must not prevent the
        # checkpoint below from being written.
        print("failed to close tasks", repr(close_error))

    print("saving", ppo_save_file)
    agent.save(ppo_save_file)
    torch.save(config.intrinsic_reward_normalizer.state_dict(), ppo_save_file.replace('.pkl', '-intrinsic_reward_normalizer.pkl'))
    torch.save(config.reward_normalizer.state_dict(), ppo_save_file.replace('.pkl', '-reward_normalizer.pkl'))

    # Backup since it sometimes gets corrupted
    ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')
    backup_file = ppo_save_file.replace('.pkl', '-%s.pkl' % ts)
    print("saving backup", backup_file)
    agent.save(backup_file)
    # Back up the normalizers too, so a backup can be restored as a complete set
    torch.save(config.intrinsic_reward_normalizer.state_dict(), backup_file.replace('.pkl', '-intrinsic_reward_normalizer.pkl'))
    torch.save(config.reward_normalizer.state_dict(), backup_file.replace('.pkl', '-reward_normalizer.pkl'))
    raise

rollout extrinsic, intrinsic reward [min/mean/max]: 0.0287/0.0287/0.0287, -0.0000/0.0000/-0.0000


2018-06-06 07:15:19,562 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=35.9779, loss_inv= 7.4266=0.0100 * 742.6586, loss_vae=29.9502=1.0000 * (29.9279 + 0.0010 * 22.7637)
2018-06-06 07:15:19,566 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 64, min/mean/max reward 0.0000/0.0000/0.0000 of 8
2018-06-06 07:15:19,570 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/0.0000/0.0000 of 8 0.4950 s/rollout


rollout extrinsic, intrinsic reward [min/mean/max]: 0.0250/0.0250/0.0250, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.0861/0.0861/0.0861, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.1231/0.1231/0.1231, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.1164/0.1164/0.1164, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.2051/-0.2051/-0.2051, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.1996/-0.1996/-0.1996, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.0785/-0.0785/-0.0785, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.0421/-0.0421/-0.0421, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.2103/-0.2103/-0.2103, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.3021/-0.3021/-0.3021, -0.0000/0.0000/-0.0000


2018-06-06 07:15:58,840 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=35.9779, loss_inv= 7.3698=0.0100 * 736.9768, loss_vae=28.2404=1.0000 * (28.2189 + 0.0010 * 22.0458)
2018-06-06 07:15:58,841 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 704, min/mean/max reward 0.0000/0.9397/5.1114 of 8
2018-06-06 07:15:58,843 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/0.1401/5.1114 of 88 0.4913 s/rollout


rollout extrinsic, intrinsic reward [min/mean/max]: -0.0453/-0.0453/-0.0453, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.1433/-0.1433/-0.1433, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.4131/0.4131/0.4131, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.0352/0.0352/0.0352, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.1648/0.1648/0.1648, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.4260/0.4260/0.4260, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.4758/0.4758/0.4758, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.1531/-0.1531/-0.1531, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.1012/0.1012/0.1012, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.1218/0.1218/0.1218, -0.0000/0.0000/-0.0000


2018-06-06 07:16:35,826 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=35.9776, loss_inv= 7.4028=0.0100 * 740.2777, loss_vae=27.9973=1.0000 * (27.9758 + 0.0010 * 22.0457)
2018-06-06 07:16:35,827 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 1344, min/mean/max reward 0.0000/0.9397/5.1114 of 8
2018-06-06 07:16:35,831 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/0.5209/5.1114 of 168 0.4775 s/rollout


rollout extrinsic, intrinsic reward [min/mean/max]: -0.1903/-0.1903/-0.1903, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.0945/0.0945/0.0945, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.1447/0.1447/0.1447, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.1569/0.1569/0.1569, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.1811/-0.1811/-0.1811, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.4098/0.4098/0.4098, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.1342/-0.1342/-0.1342, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.2515/-0.2515/-0.2515, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.2644/-0.2644/-0.2644, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.1497/-0.1497/-0.1497, -0.0000/0.0000/-0.0000


2018-06-06 07:17:14,467 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=35.9777, loss_inv= 7.3701=0.0100 * 737.0135, loss_vae=27.6704=1.0000 * (27.6490 + 0.0010 * 21.8947)
2018-06-06 07:17:14,468 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 1984, min/mean/max reward 0.0000/0.9397/5.1114 of 8
2018-06-06 07:17:14,476 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/0.6560/5.1114 of 248 0.4793 s/rollout


rollout extrinsic, intrinsic reward [min/mean/max]: 0.3261/0.3261/0.3261, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.6927/0.6927/0.6927, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.2823/0.2823/0.2823, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.0124/0.0124/0.0124, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.0278/-0.0278/-0.0278, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.0098/0.0098/0.0098, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.0420/0.0420/0.0420, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.2584/0.2584/0.2584, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.4391/0.4391/0.4391, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.7386/0.7386/0.7386, -0.0000/0.0000/-0.0000


2018-06-06 07:17:52,207 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=35.9777, loss_inv= 7.3595=0.0100 * 735.9513, loss_vae=27.4282=1.0000 * (27.4068 + 0.0010 * 21.9552)
2018-06-06 07:17:52,209 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 2624, min/mean/max reward 1.6981/6.8708/13.7001 of 8
2018-06-06 07:17:52,210 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/2.2051/16.0815 of 328 0.4775 s/rollout


rollout extrinsic, intrinsic reward [min/mean/max]: 0.7757/0.7757/0.7757, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.5904/0.5904/0.5904, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.4027/0.4027/0.4027, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.1298/0.1298/0.1298, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.0913/-0.0913/-0.0913, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.0717/0.0717/0.0717, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.0027/-0.0027/-0.0027, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.0687/-0.0687/-0.0687, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.1318/-0.1318/-0.1318, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.3691/0.3691/0.3691, -0.0000/0.0000/-0.0000


2018-06-06 07:18:29,648 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=35.9777, loss_inv= 7.3310=0.0100 * 733.0998, loss_vae=27.4590=1.0000 * (27.4374 + 0.0010 * 22.1388)
2018-06-06 07:18:29,649 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 3264, min/mean/max reward 0.7896/7.9297/23.2307 of 8
2018-06-06 07:18:29,650 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/3.2277/23.2307 of 408 0.4756 s/rollout


rollout extrinsic, intrinsic reward [min/mean/max]: 0.3729/0.3729/0.3729, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.2044/-0.2044/-0.2044, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.1986/-0.1986/-0.1986, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.1871/-0.1871/-0.1871, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.1793/-0.1793/-0.1793, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.0722/0.0722/0.0722, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.3556/0.3556/0.3556, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.0450/-0.0450/-0.0450, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.2374/-0.2374/-0.2374, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: -0.1734/-0.1734/-0.1734, -0.0000/0.0000/-0.0000


2018-06-06 07:19:06,793 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=35.9777, loss_inv= 7.3139=0.0100 * 731.3881, loss_vae=27.5882=1.0000 * (27.5662 + 0.0010 * 22.5347)
2018-06-06 07:19:06,794 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 3904, min/mean/max reward 2.2185/8.7887/23.2307 of 8
2018-06-06 07:19:06,795 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/4.1145/23.2307 of 488 0.4737 s/rollout


rollout extrinsic, intrinsic reward [min/mean/max]: -0.2302/-0.2302/-0.2302, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.0774/0.0774/0.0774, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.3486/0.3486/0.3486, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.1410/0.1410/0.1410, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.0873/0.0873/0.0873, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.4504/0.4504/0.4504, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.6999/0.6999/0.6999, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.6864/0.6864/0.6864, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.4327/0.4327/0.4327, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.5012/0.5012/0.5012, -0.0000/0.0000/-0.0000


2018-06-06 07:19:43,520 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=35.9778, loss_inv= 7.3210=0.0100 * 732.0953, loss_vae=28.0035=1.0000 * (27.9816 + 0.0010 * 22.4887)
2018-06-06 07:19:43,521 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 4544, min/mean/max reward 1.9422/7.9349/20.4118 of 8
2018-06-06 07:19:43,523 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/5.3381/23.2307 of 500 0.4717 s/rollout


rollout extrinsic, intrinsic reward [min/mean/max]: 0.0914/0.0914/0.0914, -0.0000/0.0000/-0.0000
rollout extrinsic, intrinsic reward [min/mean/max]: 0.3282/0.3282/0.3282, -0.0000/0.0000/-0.0000
saving ./outputs/RNN_v3b_128im_512z_1512_v6i_VAE5_all/PPO_512z_all_g.pkl
saving backup ./outputs/RNN_v3b_128im_512z_1512_v6i_VAE5_all/PPO_512z_all_g-20180605_23-19-57.pkl


KeyboardInterrupt: 

In [None]:
# Manual checkpoint: agent first, then each reward normalizer beside it
agent.save(ppo_save_file)
for suffix, normalizer in (
        ('-intrinsic_reward_normalizer.pkl', config.intrinsic_reward_normalizer),
        ('-reward_normalizer.pkl', config.reward_normalizer),
):
    torch.save(normalizer.state_dict(), ppo_save_file.replace('.pkl', suffix))



To monitor with tensorboard
```sh
cd ~/Documents/projects/retro_sonic_comp/world-models-pytorch/log 
tensorboard  --logdir .
#then open http://localhost:6006/#scalars
```


In [None]:
# Write the agent checkpoint, then echo the path it was written to
agent.save(ppo_save_file)
ppo_save_file

# summarize

In [None]:
from IPython.display import display

# Push one random dummy batch through each model and display a per-layer
# summary table for it. No gradients are needed for this.
with torch.no_grad():
    # NOTE(review): the VAE was constructed with channels=3*4 but this dummy
    # image has 3 channels - confirm the VAE accepts this input.
    img = np.random.randn(image_size, image_size, 3)
    action = np.array(np.random.randint(0,action_dim))[np.newaxis]
    action = Variable(torch.from_numpy(action)).float()[np.newaxis]
    # HWC image -> NCHW batch of one
    gpu_img = Variable(torch.from_numpy(img[np.newaxis].transpose(0, 3, 1, 2))).float()
    # Only move inputs to the GPU when the models were moved there as well;
    # an unconditional .cuda() fails on CPU-only machines.
    if cuda:
        action = action.cuda()
        gpu_img = gpu_img.cuda()
    with TorchSummarizeDf(vae) as tdf:
        x, mu_vae, logvar_vae = vae.forward(gpu_img)
        z = vae.sample(mu_vae, logvar_vae)
        df_vae = tdf.make_df()

    display(df_vae[df_vae.level<2])

    with TorchSummarizeDf(mdnrnn) as tdf:
        pi, mu, sigma, hidden_state = mdnrnn.forward(z.unsqueeze(1).repeat((1,2,1)))
        z_next = mdnrnn.sample(pi, mu, sigma)
        df_mdnrnn = tdf.make_df()

    display(df_mdnrnn)

    with TorchSummarizeDf(finv) as tdf:
        finv(z.repeat((1,2,1)), z_next)
        df_finv = tdf.make_df()
    display(df_finv)

    with TorchSummarizeDf(world_model) as tdf:
        world_model(gpu_img, action)
        df_world_model = tdf.make_df()
    display(df_world_model[df_world_model.level<2])

    # Drop the temporaries so they do not linger in the kernel namespace
    del img, action, gpu_img, x, mu, z, z_next, mu_vae, pi, sigma, logvar_vae