This experiment uses a higher learning rate and larger rollouts.

In [1]:
import deep_rl

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 reloads all modules before each
# cell executes, so edits to project code are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

In [3]:
import torch
from torch.nn import functional as F
from torch.autograd import Variable
from torch import nn, optim
import torch.utils.data

# load as dask array
import time

import logging
import sys
import os
import glob
import numpy as np
import datetime
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

In [4]:
from deep_rl.utils import Config
from deep_rl.utils.logger import get_logger, get_default_log_dir

from deep_rl.network.network_heads import CategoricalActorCriticNet, QuantileNet, OptionCriticNet, DeterministicActorCriticNet, GaussianActorCriticNet
from deep_rl.network.network_bodies import FCBody

from deep_rl.component.task import ParallelizedTask

In [5]:
from world_models_sonic.models.vae import VAE5, loss_function_vae
from world_models_sonic.helpers.summarize import TorchSummarizeDf
from world_models_sonic.helpers.dataset import load_cache_data
from world_models_sonic.models.rnn import MDNRNN2
from world_models_sonic.models.inverse_model import InverseModel
from world_models_sonic.models.world_model import WorldModel
from world_models_sonic.custom_envs.env import make_env
from world_models_sonic.custom_envs.wrappers import RandomGameReset
from world_models_sonic import config
from world_models_sonic.helpers.deep_rl import PPOAgent, run_iterations, SonicWorldModelDeepRL, CategoricalWorldActorCriticNet

Importing 0 potential games...
Imported 0 games


  from ._conv import register_converters as _register_converters


# Init

In [6]:
# TODO: name save files with a hash of
# agent_name, config.tag, agent.task.name
# and a timestamp

In [7]:
# Global run configuration shared by the cells below.
cuda = torch.cuda.is_available()
env_name = 'sonic256'
z_dim = 512  # latent dimensions
channels = 3*4  # presumably 4 stacked RGB frames -- TODO confirm against make_env

# RNN
action_dim = 10
image_size = 128

# NOTE: the training cell reassigns `verbose` to False before the agent is built.
verbose = True  # Set this true to render (and make it go slower)

NAME = 'RNN_v3b_128im_512z_1512_v6i_VAE5_all'

# Single source of truth for this run's output directory; the checkpoint path
# and both loggers are derived from it so they cannot drift apart.
log_dir = './outputs/{NAME}'.format(NAME=NAME)
ppo_save_file = '{log_dir}/PPO_512z_all_f.pkl'.format(log_dir=log_dir)

# exist_ok avoids the check-then-create race of a separate isdir() test.
os.makedirs(log_dir, exist_ok=True)

# Root logging goes to stdout so messages show up in the notebook output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(NAME)

print(log_dir)

# deep_rl's logger is additionally given a file name inside the run directory.
deep_rl_logger = get_logger(
    NAME,
    file_name='deep_rl_ppo.log',
    level=logging.INFO,
    log_dir=log_dir,
)

./outputs/RNN_v3b_128im_512z_1512_v6i_VAE5_all


# World model

In [8]:
# Construct the VAE (built fresh here; this cell does not read any weights).
# TODO swap z and k dim, since it's inconsistent with other models
vae = VAE5(
    image_size=image_size,
    z_dim=128,
    conv_dim=64,
    code_dim=8,
    k_dim=z_dim,
    channels=channels,
)

# Construct the MDN-RNN; its hidden state is twice as wide as the latent.
hidden_size = z_dim*2
n_mixture = 5
temp = 0.0
mdnrnn = MDNRNN2(z_dim, action_dim, hidden_size, n_mixture, temp)

# Construct the inverse model with the same hidden width as the MDN-RNN.
finv = InverseModel(z_dim, action_dim, hidden_size=z_dim*2)

# Bundle the three sub-models with their loss weights and switch to train mode.
world_model = WorldModel(
    vae,
    mdnrnn,
    finv,
    logger=deep_rl_logger,
    lambda_vae_kld=1 / 4.,
    lambda_finv=1,
    lambda_vae=1/100,
    lambda_loss=1000,
).train()

# Move to the GPU only when one is available.
if cuda:
    world_model = world_model.cuda()

In [9]:
import torch.optim.lr_scheduler

# Release unused cached GPU memory before the optimizer state is allocated.
torch.cuda.empty_cache()

# One Adam optimizer over every world-model parameter; it is stored on the
# model object as well as being kept as the notebook-level `optimizer`.
world_model.optimizer = optimizer = optim.Adam(
    world_model.parameters(),
    lr=1e-4,
)

# Train

In [10]:
# Input width of the policy body: sum of the MDN-RNN's z_dim, hidden_size
# and action_dim.
z_state_dim = (
    world_model.mdnrnn.z_dim
    + world_model.mdnrnn.hidden_size
    + world_model.mdnrnn.action_dim
)

# Rendering switch. It is read lazily (when the agent builds its tasks and
# network), so it only has to be set before PPOAgent(config) is called.
verbose = False  # Set this true to render (and make it go slower)


def task_fn(log_dir):
    """Create one SonicWorldModelDeepRL task that writes to `log_dir`.

    The environment factory wraps `make_env` in `RandomGameReset`; `verbose`
    is looked up from the notebook globals at call time.
    """
    def env_fn():
        return RandomGameReset(make_env(
            'sonic', max_episode_steps=1000, to_gray=False, image_size=image_size))

    return SonicWorldModelDeepRL(env_fn=env_fn, log_dir=log_dir, verbose=verbose)


def _build_network(state_dim, action_dim):
    """Create the actor-critic head that shares the notebook's world model."""
    body = FCBody(z_state_dim, hidden_units=(64, 64), gate=F.relu)
    return CategoricalWorldActorCriticNet(
        state_dim,
        action_dim,
        body,
        gpu=0 if cuda else -1,
        world_model_fn=lambda: world_model,
        render=(config.num_workers == 1 and verbose),
        z_shape=(32, 16),
    )


# NOTE: this rebinds `config`, shadowing the `world_models_sonic.config`
# module imported at the top of the notebook.
config = Config()

# A single worker when rendering, eight parallel workers otherwise.
config.num_workers = 8 if not verbose else 1
config.task_fn = lambda: ParallelizedTask(
    task_fn,
    config.num_workers,
    single_process=(config.num_workers == 1),
)
config.optimizer_fn = lambda params: torch.optim.Adam(params, lr=3e-4, eps=1e-5)
config.network_fn = _build_network

# PPO / GAE hyperparameters.
config.discount = 0.99
config.logger = deep_rl_logger
config.use_gae = True
config.gae_tau = 0.95
config.entropy_weight = 0.001
config.gradient_clip = 0.4
# 64 environment steps per rollout in total, split evenly across workers.
config.rollout_length = 64 // config.num_workers
config.optimization_epochs = 10
config.num_mini_batches = 8
config.ppo_ratio_clip = 0.2
config.iteration_log_interval = 10
config.curiosity_weight = 1
config.curiosity_only = True
agent = PPOAgent(config)

print('rollout of ', config.rollout_length * config.num_workers)
print('mini batch', (config.rollout_length * config.num_workers) // config.num_mini_batches)

# Resume from the previous checkpoint when one exists.
if not os.path.isfile(ppo_save_file):
    print("couldn't find save")
else:
    print('loading ppo_save_file', ppo_save_file, 'modified', time.ctime(os.path.getmtime(ppo_save_file)))
    agent.load(ppo_save_file)

game: SonicTheHedgehog-Genesis state: MarbleZone.Act1
game: SonicAndKnuckles3-Genesis state: HydrocityZone.Act2
game: SonicTheHedgehog2-Genesis state: MysticCaveZone.Act1
game: SonicAndKnuckles3-Genesis state: SandopolisZone.Act1
game: SonicTheHedgehog-Genesis state: LabyrinthZone.Act1
game: SonicAndKnuckles3-Genesis state: DeathEggZone.Act1
game: SonicTheHedgehog2-Genesis state: ChemicalPlantZone.Act2
game: SonicAndKnuckles3-Genesis state: IcecapZone.Act1
rollout of  64
mini batch 8
loading ppo_save_file ./outputs/RNN_v3b_128im_512z_1512_v6i_VAE5_all/PPO_512z_all_f.pkl modified Mon Jun  4 18:01:35 2018


In [None]:
# Train until interrupted or an error occurs; on the way out, close the
# environments, checkpoint the agent (plus a timestamped backup) and re-raise.
try:
    run_iterations(agent, log_dir=log_dir)
except BaseException:
    # BaseException on purpose: a KeyboardInterrupt (kernel interrupt) must
    # also trigger the checkpoint. The original error is re-raised below.
    try:
        if config.num_workers == 1:
            agent.task.tasks[0].env.close()
        else:
            for worker in agent.task.tasks:
                worker.close()
    except Exception:
        # A failure while closing environments must not cost us the checkpoint.
        logger.exception('failed to close environments; saving checkpoint anyway')

    print("saving", ppo_save_file)
    agent.save(ppo_save_file)

    # Backup since it sometimes gets corrupted
    ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')
    backup_file = ppo_save_file.replace('.pkl', '-%s.pkl' % ts)
    print("saving backup", backup_file)
    agent.save(backup_file)
    raise

rollout extrinsic vs intrinsic reward 0.1781 0.0793


2018-06-04 18:01:55,254 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=33.1850, loss_inv= 655.1586=1.0000 * 655.1586, loss_vae=0.2704=0.0100 * (26.2516 + 0.2500 * 3.1507)
2018-06-04 18:01:55,255 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 64, min/mean/max reward 0.0000/0.0000/0.0000 of 8
2018-06-04 18:01:55,257 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/0.0000/0.0000 of 8 0.4781 s/rollout


rollout extrinsic vs intrinsic reward 0.1910 -0.0824
rollout extrinsic vs intrinsic reward 0.2161 -0.1100
rollout extrinsic vs intrinsic reward 0.0614 0.0666
rollout extrinsic vs intrinsic reward 0.0477 0.0049
rollout extrinsic vs intrinsic reward 0.0769 0.0609
rollout extrinsic vs intrinsic reward 0.0289 0.0085
rollout extrinsic vs intrinsic reward 0.0586 0.0448
rollout extrinsic vs intrinsic reward 0.0282 0.0201
rollout extrinsic vs intrinsic reward 0.0005 0.0431
rollout extrinsic vs intrinsic reward 0.0417 0.1023


2018-06-04 18:02:37,252 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.9487, loss_inv= 655.1586=1.0000 * 655.1586, loss_vae=0.2259=0.0100 * (21.8707 + 0.2500 * 2.8589)
2018-06-04 18:02:37,253 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 704, min/mean/max reward 0.0000/0.8457/6.7660 of 8
2018-06-04 18:02:37,254 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/0.5382/6.7660 of 88 0.5207 s/rollout


rollout extrinsic vs intrinsic reward 0.0216 -0.0462
rollout extrinsic vs intrinsic reward 0.0207 0.0185
rollout extrinsic vs intrinsic reward 0.0000 0.0380
rollout extrinsic vs intrinsic reward 0.0224 0.0332
rollout extrinsic vs intrinsic reward 0.0510 0.1091
rollout extrinsic vs intrinsic reward 0.0165 -0.0014
rollout extrinsic vs intrinsic reward 0.0128 -0.0317
rollout extrinsic vs intrinsic reward 0.0410 -0.0237
rollout extrinsic vs intrinsic reward 0.0001 0.1447
rollout extrinsic vs intrinsic reward 0.0000 -0.0547


2018-06-04 18:03:13,724 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.9340, loss_inv= 655.1586=1.0000 * 655.1586, loss_vae=0.1885=0.0100 * (18.1167 + 0.2500 * 2.9333)
2018-06-04 18:03:13,725 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 1344, min/mean/max reward 0.0000/2.1926/8.5565 of 8
2018-06-04 18:03:13,726 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/1.2468/8.5565 of 168 0.4898 s/rollout


rollout extrinsic vs intrinsic reward 0.0000 0.0897
rollout extrinsic vs intrinsic reward 0.0000 -0.0602
rollout extrinsic vs intrinsic reward 0.0000 0.0735
rollout extrinsic vs intrinsic reward 0.0000 -0.1177
rollout extrinsic vs intrinsic reward 0.0162 0.0064
rollout extrinsic vs intrinsic reward 0.0074 0.0287
rollout extrinsic vs intrinsic reward 0.0125 0.0061
rollout extrinsic vs intrinsic reward 0.0307 0.0062
rollout extrinsic vs intrinsic reward 0.0149 0.0259
rollout extrinsic vs intrinsic reward 0.0316 -0.0685


2018-06-04 18:03:50,146 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.9074, loss_inv= 655.5619=1.0000 * 655.5619, loss_vae=0.1725=0.0100 * (16.5212 + 0.2500 * 2.9069)
2018-06-04 18:03:50,147 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 1984, min/mean/max reward 0.0000/2.1926/8.5565 of 8
2018-06-04 18:03:50,148 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/1.5519/8.5565 of 248 0.4787 s/rollout


rollout extrinsic vs intrinsic reward 0.0959 0.0011
rollout extrinsic vs intrinsic reward 0.2129 0.0523
rollout extrinsic vs intrinsic reward 0.1496 0.0458
rollout extrinsic vs intrinsic reward 0.1207 0.0456
rollout extrinsic vs intrinsic reward 0.0537 -0.0143
rollout extrinsic vs intrinsic reward 0.0635 -0.0415
rollout extrinsic vs intrinsic reward 0.1055 0.0079
rollout extrinsic vs intrinsic reward 0.0891 -0.0883
rollout extrinsic vs intrinsic reward 0.0280 -0.0389
rollout extrinsic vs intrinsic reward 0.0882 -0.0114


2018-06-04 18:04:26,597 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.8198, loss_inv= 656.0733=1.0000 * 656.0733, loss_vae=0.1963=0.0100 * (18.9180 + 0.2500 * 2.8575)
2018-06-04 18:04:26,598 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 2624, min/mean/max reward 0.7998/7.3213/11.3203 of 8
2018-06-04 18:04:26,599 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/3.0283/14.6726 of 328 0.4731 s/rollout


rollout extrinsic vs intrinsic reward 0.0399 -0.0451
rollout extrinsic vs intrinsic reward 0.0077 -0.0221
rollout extrinsic vs intrinsic reward 0.0096 0.0402
rollout extrinsic vs intrinsic reward 0.0002 -0.0944
rollout extrinsic vs intrinsic reward 0.0174 0.0014
rollout extrinsic vs intrinsic reward 0.0215 -0.0189
rollout extrinsic vs intrinsic reward 0.0095 0.0987
rollout extrinsic vs intrinsic reward 0.0329 -0.0219
rollout extrinsic vs intrinsic reward 0.0221 -0.0491
rollout extrinsic vs intrinsic reward 0.0356 -0.0750


2018-06-04 18:05:02,716 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.7739, loss_inv= 656.8743=1.0000 * 656.8743, loss_vae=0.1907=0.0100 * (18.3598 + 0.2500 * 2.8367)
2018-06-04 18:05:02,718 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 3264, min/mean/max reward 0.7998/8.3705/11.3203 of 8
2018-06-04 18:05:02,719 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/3.8906/14.6726 of 408 0.4688 s/rollout


rollout extrinsic vs intrinsic reward 0.0699 -0.0113
rollout extrinsic vs intrinsic reward 0.1102 -0.0695
rollout extrinsic vs intrinsic reward 0.0545 -0.0900
rollout extrinsic vs intrinsic reward 0.1169 -0.0824
rollout extrinsic vs intrinsic reward 0.1053 -0.0786
rollout extrinsic vs intrinsic reward 0.0602 0.0231
rollout extrinsic vs intrinsic reward 0.0693 -0.0833
rollout extrinsic vs intrinsic reward 0.0323 -0.0842
rollout extrinsic vs intrinsic reward 0.0000 0.0313
rollout extrinsic vs intrinsic reward 0.0011 0.0508


2018-06-04 18:05:39,076 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.7399, loss_inv= 656.7980=1.0000 * 656.7980, loss_vae=0.1966=0.0100 * (18.9450 + 0.2500 * 2.8411)
2018-06-04 18:05:39,077 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 3904, min/mean/max reward 0.7998/9.3869/23.2791 of 8
2018-06-04 18:05:39,078 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/4.7583/23.2791 of 488 0.4665 s/rollout


rollout extrinsic vs intrinsic reward 0.0000 -0.0445
rollout extrinsic vs intrinsic reward 0.0253 0.0508
rollout extrinsic vs intrinsic reward 0.1619 -0.0903
rollout extrinsic vs intrinsic reward 0.0883 0.0404
rollout extrinsic vs intrinsic reward 0.0435 0.0508
rollout extrinsic vs intrinsic reward 0.0007 0.0722
rollout extrinsic vs intrinsic reward 0.0254 -0.0134
rollout extrinsic vs intrinsic reward 0.0000 0.0029
rollout extrinsic vs intrinsic reward 0.0000 0.0253
rollout extrinsic vs intrinsic reward 0.0000 -0.0432


2018-06-04 18:06:15,190 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.7161, loss_inv= 656.5671=1.0000 * 656.5671, loss_vae=0.1893=0.0100 * (18.2253 + 0.2500 * 2.8374)
2018-06-04 18:06:15,191 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 4544, min/mean/max reward 0.7998/7.0716/23.2791 of 8
2018-06-04 18:06:15,192 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/5.8020/23.2791 of 500 0.4644 s/rollout


rollout extrinsic vs intrinsic reward 0.0275 0.0601
rollout extrinsic vs intrinsic reward 0.0393 0.0712
rollout extrinsic vs intrinsic reward 0.0246 -0.0393
rollout extrinsic vs intrinsic reward 0.0001 -0.0350
rollout extrinsic vs intrinsic reward 0.0000 -0.1396
rollout extrinsic vs intrinsic reward 0.0000 0.0436
rollout extrinsic vs intrinsic reward 0.0075 -0.0163
rollout extrinsic vs intrinsic reward 0.0161 -0.0190
rollout extrinsic vs intrinsic reward 0.0000 0.0014
rollout extrinsic vs intrinsic reward 0.0000 -0.0443


2018-06-04 18:06:51,593 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.7247, loss_inv= 656.5475=1.0000 * 656.5475, loss_vae=0.1861=0.0100 * (17.8890 + 0.2500 * 2.8884)
2018-06-04 18:06:51,594 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 5184, min/mean/max reward 0.7998/7.0716/23.2791 of 8
2018-06-04 18:06:51,595 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/6.6695/23.2791 of 500 0.4632 s/rollout


rollout extrinsic vs intrinsic reward 0.0000 0.0105
rollout extrinsic vs intrinsic reward 0.0041 0.1017
rollout extrinsic vs intrinsic reward 0.0000 0.0490
rollout extrinsic vs intrinsic reward 0.0014 -0.0166
rollout extrinsic vs intrinsic reward 0.0386 -0.0371
rollout extrinsic vs intrinsic reward 0.0469 -0.0329
rollout extrinsic vs intrinsic reward 0.0091 0.0161
rollout extrinsic vs intrinsic reward 0.0471 -0.0065
rollout extrinsic vs intrinsic reward 0.0916 0.0224
rollout extrinsic vs intrinsic reward 0.0468 0.0619


2018-06-04 18:07:28,200 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.8212, loss_inv= 656.3949=1.0000 * 656.3949, loss_vae=0.1837=0.0100 * (17.5939 + 0.2500 * 3.1146)
2018-06-04 18:07:28,201 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 5824, min/mean/max reward 0.7998/4.4079/11.0287 of 8
2018-06-04 18:07:28,202 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/7.2472/23.2791 of 500 0.4626 s/rollout


rollout extrinsic vs intrinsic reward 0.0152 -0.0422
rollout extrinsic vs intrinsic reward 0.0202 0.0464
rollout extrinsic vs intrinsic reward 0.0627 -0.0157
rollout extrinsic vs intrinsic reward 0.1336 0.0680
rollout extrinsic vs intrinsic reward 0.1401 0.0446
rollout extrinsic vs intrinsic reward 0.1597 -0.0564
rollout extrinsic vs intrinsic reward 0.1912 -0.0182
rollout extrinsic vs intrinsic reward 0.1124 0.1322
rollout extrinsic vs intrinsic reward 0.0739 0.0521
rollout extrinsic vs intrinsic reward 0.1048 0.0680


2018-06-04 18:08:04,928 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.8863, loss_inv= 656.2725=1.0000 * 656.2725, loss_vae=0.1890=0.0100 * (18.0956 + 0.2500 * 3.2368)
2018-06-04 18:08:04,929 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 6464, min/mean/max reward 0.2605/5.5920/11.0287 of 8
2018-06-04 18:08:04,930 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.2605/7.1030/23.2791 of 500 0.4622 s/rollout


rollout extrinsic vs intrinsic reward 0.0690 0.0033
rollout extrinsic vs intrinsic reward 0.0469 0.0125
rollout extrinsic vs intrinsic reward 0.0509 0.0002
rollout extrinsic vs intrinsic reward 0.0259 0.0330
rollout extrinsic vs intrinsic reward 0.0377 0.0801
rollout extrinsic vs intrinsic reward 0.0333 -0.0190
rollout extrinsic vs intrinsic reward 0.0908 0.0021
rollout extrinsic vs intrinsic reward 0.1243 0.0393
rollout extrinsic vs intrinsic reward 0.0712 0.0061
rollout extrinsic vs intrinsic reward 0.0228 -0.0192


2018-06-04 18:08:41,181 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.9369, loss_inv= 656.1721=1.0000 * 656.1721, loss_vae=0.1901=0.0100 * (18.1949 + 0.2500 * 3.2703)
2018-06-04 18:08:41,183 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 7104, min/mean/max reward 0.2605/6.8989/14.3670 of 8
2018-06-04 18:08:41,184 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.2605/6.9450/23.2791 of 500 0.4614 s/rollout


rollout extrinsic vs intrinsic reward 0.0790 -0.0347
rollout extrinsic vs intrinsic reward 0.0200 -0.0572
rollout extrinsic vs intrinsic reward 0.0116 -0.0027
rollout extrinsic vs intrinsic reward 0.0313 -0.0540
rollout extrinsic vs intrinsic reward 0.0551 -0.0900
rollout extrinsic vs intrinsic reward 0.0543 -0.0646
rollout extrinsic vs intrinsic reward 0.0254 -0.0212
rollout extrinsic vs intrinsic reward 0.0100 -0.0370
rollout extrinsic vs intrinsic reward 0.0555 -0.0036
rollout extrinsic vs intrinsic reward 0.0475 0.0200


2018-06-04 18:09:18,147 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.9349, loss_inv= 656.1917=1.0000 * 656.1917, loss_vae=0.1929=0.0100 * (18.4800 + 0.2500 * 3.2447)
2018-06-04 18:09:18,148 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 7744, min/mean/max reward 0.2605/9.5248/17.9207 of 8
2018-06-04 18:09:18,150 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.2605/6.8543/23.2791 of 500 0.4615 s/rollout


rollout extrinsic vs intrinsic reward 0.0567 0.1164
rollout extrinsic vs intrinsic reward 0.0588 0.0156
rollout extrinsic vs intrinsic reward 0.0756 0.0490
rollout extrinsic vs intrinsic reward 0.0082 -0.0455
rollout extrinsic vs intrinsic reward 0.1287 -0.0594
rollout extrinsic vs intrinsic reward 0.0967 -0.0225
rollout extrinsic vs intrinsic reward 0.1121 -0.0504
rollout extrinsic vs intrinsic reward 0.2157 0.0309
rollout extrinsic vs intrinsic reward 0.1022 0.0229
rollout extrinsic vs intrinsic reward 0.0074 0.0318


2018-06-04 18:09:54,564 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.9216, loss_inv= 656.1128=1.0000 * 656.1128, loss_vae=0.1973=0.0100 * (18.9198 + 0.2500 * 3.2311)
2018-06-04 18:09:54,565 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 8384, min/mean/max reward 3.0852/12.3486/21.7394 of 8
2018-06-04 18:09:54,566 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.2605/7.3929/23.2791 of 500 0.4610 s/rollout


rollout extrinsic vs intrinsic reward 0.0324 -0.0144
rollout extrinsic vs intrinsic reward 0.0136 0.1070
rollout extrinsic vs intrinsic reward 0.0222 0.0386
rollout extrinsic vs intrinsic reward 0.0713 -0.0376
rollout extrinsic vs intrinsic reward 0.0480 -0.0222
rollout extrinsic vs intrinsic reward 0.0086 0.0660
rollout extrinsic vs intrinsic reward 0.0020 -0.0271
rollout extrinsic vs intrinsic reward 0.0477 0.0675
rollout extrinsic vs intrinsic reward 0.0010 -0.0250
rollout extrinsic vs intrinsic reward 0.0222 0.0346


2018-06-04 18:10:30,694 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.9083, loss_inv= 656.0452=1.0000 * 656.0452, loss_vae=0.1984=0.0100 * (19.0369 + 0.2500 * 3.2320)
2018-06-04 18:10:30,696 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 9024, min/mean/max reward 0.1048/10.6097/21.7394 of 8
2018-06-04 18:10:30,697 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/8.0933/23.2791 of 500 0.4603 s/rollout


rollout extrinsic vs intrinsic reward 0.0473 0.0492
rollout extrinsic vs intrinsic reward 0.0313 -0.1379
rollout extrinsic vs intrinsic reward 0.0000 -0.0099
rollout extrinsic vs intrinsic reward 0.0028 -0.1112
rollout extrinsic vs intrinsic reward 0.0205 -0.0022
rollout extrinsic vs intrinsic reward 0.0055 0.0496
rollout extrinsic vs intrinsic reward 0.0044 -0.0029
rollout extrinsic vs intrinsic reward 0.0459 0.0173
rollout extrinsic vs intrinsic reward 0.0160 0.0537
rollout extrinsic vs intrinsic reward 0.0446 -0.0277


2018-06-04 18:11:07,308 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.9164, loss_inv= 655.9864=1.0000 * 655.9864, loss_vae=0.1981=0.0100 * (18.9988 + 0.2500 * 3.2396)
2018-06-04 18:11:07,309 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 9664, min/mean/max reward 0.1048/6.9322/15.0103 of 8
2018-06-04 18:11:07,310 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/8.5702/21.7394 of 500 0.4602 s/rollout


rollout extrinsic vs intrinsic reward 0.0802 0.0482
rollout extrinsic vs intrinsic reward 0.0746 0.0751
rollout extrinsic vs intrinsic reward 0.0609 0.0243
rollout extrinsic vs intrinsic reward 0.0229 -0.0424
rollout extrinsic vs intrinsic reward 0.0499 -0.0012
rollout extrinsic vs intrinsic reward 0.0301 0.0725
rollout extrinsic vs intrinsic reward 0.0000 0.0089
rollout extrinsic vs intrinsic reward 0.0003 0.0456
rollout extrinsic vs intrinsic reward 0.0037 0.0031
rollout extrinsic vs intrinsic reward 0.0238 0.0105


2018-06-04 18:11:43,902 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.9184, loss_inv= 656.0903=1.0000 * 656.0903, loss_vae=0.1951=0.0100 * (18.6959 + 0.2500 * 3.2438)
2018-06-04 18:11:43,904 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 10304, min/mean/max reward 0.1048/8.3014/21.5311 of 8
2018-06-04 18:11:43,905 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/8.9505/21.7394 of 500 0.4600 s/rollout


rollout extrinsic vs intrinsic reward 0.0128 0.0689
rollout extrinsic vs intrinsic reward 0.0000 -0.0007
rollout extrinsic vs intrinsic reward 0.0000 0.0499
rollout extrinsic vs intrinsic reward 0.0363 0.1043
rollout extrinsic vs intrinsic reward 0.0510 -0.0681
rollout extrinsic vs intrinsic reward 0.0423 -0.1055
rollout extrinsic vs intrinsic reward 0.0419 0.0621
rollout extrinsic vs intrinsic reward 0.0371 0.0359
rollout extrinsic vs intrinsic reward 0.0076 -0.0412
rollout extrinsic vs intrinsic reward 0.0000 -0.0317


2018-06-04 18:12:20,492 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.9211, loss_inv= 656.0358=1.0000 * 656.0358, loss_vae=0.1909=0.0100 * (18.2792 + 0.2500 * 3.2449)
2018-06-04 18:12:20,494 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 10944, min/mean/max reward 1.8115/7.6996/21.5311 of 8
2018-06-04 18:12:20,495 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/9.1430/21.7394 of 500 0.4598 s/rollout


rollout extrinsic vs intrinsic reward 0.0155 -0.0223
rollout extrinsic vs intrinsic reward 0.0307 0.0038
rollout extrinsic vs intrinsic reward 0.0627 0.0262
rollout extrinsic vs intrinsic reward 0.0677 0.0601
rollout extrinsic vs intrinsic reward 0.0116 -0.0280
rollout extrinsic vs intrinsic reward 0.0323 0.0398
rollout extrinsic vs intrinsic reward 0.0241 -0.0856
rollout extrinsic vs intrinsic reward 0.0012 -0.0728
rollout extrinsic vs intrinsic reward 0.0126 -0.1309
rollout extrinsic vs intrinsic reward 0.0073 0.0541


2018-06-04 18:12:57,290 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.9106, loss_inv= 656.0564=1.0000 * 656.0564, loss_vae=0.1884=0.0100 * (18.0347 + 0.2500 * 3.2238)
2018-06-04 18:12:57,291 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 11584, min/mean/max reward 2.2185/7.5913/21.5311 of 8
2018-06-04 18:12:57,292 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/9.0984/21.7394 of 500 0.4598 s/rollout


rollout extrinsic vs intrinsic reward 0.0838 -0.0474
rollout extrinsic vs intrinsic reward 0.0533 0.0017
rollout extrinsic vs intrinsic reward 0.0706 0.0061
rollout extrinsic vs intrinsic reward 0.0371 0.0704
rollout extrinsic vs intrinsic reward 0.0392 -0.0696
rollout extrinsic vs intrinsic reward 0.0866 -0.0643
rollout extrinsic vs intrinsic reward 0.0139 -0.0694
rollout extrinsic vs intrinsic reward 0.0243 -0.0321
rollout extrinsic vs intrinsic reward 0.0535 -0.0478
rollout extrinsic vs intrinsic reward 0.0539 0.0100


2018-06-04 18:13:34,374 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.8924, loss_inv= 656.0094=1.0000 * 656.0094, loss_vae=0.1899=0.0100 * (18.1864 + 0.2500 * 3.2046)
2018-06-04 18:13:34,375 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 12224, min/mean/max reward 0.7998/6.0550/14.3670 of 8
2018-06-04 18:13:34,376 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/8.5851/21.7394 of 500 0.4600 s/rollout


rollout extrinsic vs intrinsic reward 0.0053 -0.0336
rollout extrinsic vs intrinsic reward 0.0301 0.1491
rollout extrinsic vs intrinsic reward 0.0001 0.0838
rollout extrinsic vs intrinsic reward 0.0000 0.1594
rollout extrinsic vs intrinsic reward 0.0266 -0.0085
rollout extrinsic vs intrinsic reward 0.0862 -0.0366
rollout extrinsic vs intrinsic reward 0.0646 0.0525
rollout extrinsic vs intrinsic reward 0.0266 0.0271
rollout extrinsic vs intrinsic reward 0.0366 0.0076
rollout extrinsic vs intrinsic reward 0.0205 -0.0618


2018-06-04 18:14:11,033 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.8801, loss_inv= 655.9671=1.0000 * 655.9671, loss_vae=0.1895=0.0100 * (18.1491 + 0.2500 * 3.2144)
2018-06-04 18:14:11,034 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 12864, min/mean/max reward 0.7998/6.7513/17.8315 of 8
2018-06-04 18:14:11,035 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/7.7169/21.7394 of 500 0.4599 s/rollout


rollout extrinsic vs intrinsic reward 0.0033 0.0320
rollout extrinsic vs intrinsic reward 0.0000 0.0590
rollout extrinsic vs intrinsic reward 0.0337 -0.0219
rollout extrinsic vs intrinsic reward 0.0022 0.0350
rollout extrinsic vs intrinsic reward 0.0247 0.0493
rollout extrinsic vs intrinsic reward 0.0117 0.0015
rollout extrinsic vs intrinsic reward 0.0346 -0.0454
rollout extrinsic vs intrinsic reward 0.0511 0.0571
rollout extrinsic vs intrinsic reward 0.0915 -0.0077
rollout extrinsic vs intrinsic reward 0.0974 -0.0142


2018-06-04 18:14:48,126 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.8686, loss_inv= 655.9880=1.0000 * 655.9880, loss_vae=0.1863=0.0100 * (17.8278 + 0.2500 * 3.2091)
2018-06-04 18:14:48,127 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 13504, min/mean/max reward 0.7998/7.0126/17.8315 of 8
2018-06-04 18:14:48,128 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/7.1641/21.5311 of 500 0.4601 s/rollout


rollout extrinsic vs intrinsic reward 0.0378 0.0088
rollout extrinsic vs intrinsic reward 0.0245 0.0462
rollout extrinsic vs intrinsic reward 0.0486 0.0684
rollout extrinsic vs intrinsic reward 0.0790 0.0050
rollout extrinsic vs intrinsic reward 0.0727 -0.0117
rollout extrinsic vs intrinsic reward 0.0084 -0.0100
rollout extrinsic vs intrinsic reward 0.0527 0.0429
rollout extrinsic vs intrinsic reward 0.0805 -0.0123
rollout extrinsic vs intrinsic reward 0.0911 -0.0231
rollout extrinsic vs intrinsic reward 0.1386 -0.3744


2018-06-04 18:15:24,338 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.8505, loss_inv= 656.0070=1.0000 * 656.0070, loss_vae=0.1844=0.0100 * (17.6439 + 0.2500 * 3.1854)
2018-06-04 18:15:24,339 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 14144, min/mean/max reward 0.2605/9.4579/21.3794 of 8
2018-06-04 18:15:24,341 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/7.3280/21.5311 of 500 0.4598 s/rollout


rollout extrinsic vs intrinsic reward 0.1331 -0.0126
rollout extrinsic vs intrinsic reward 0.1060 0.0022
rollout extrinsic vs intrinsic reward 0.0441 -0.0155
rollout extrinsic vs intrinsic reward 0.0842 -0.0003
rollout extrinsic vs intrinsic reward 0.1483 0.0132
rollout extrinsic vs intrinsic reward 0.1438 0.0009
rollout extrinsic vs intrinsic reward 0.0930 0.0213
rollout extrinsic vs intrinsic reward 0.0112 0.0470
rollout extrinsic vs intrinsic reward 0.0011 -0.0027
rollout extrinsic vs intrinsic reward 0.0016 0.0108


2018-06-04 18:16:01,168 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=32.9584, loss_inv= 655.9703=1.0000 * 655.9703, loss_vae=0.1907=0.0100 * (18.2495 + 0.2500 * 3.2862)
2018-06-04 18:16:01,170 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 14784, min/mean/max reward 0.2605/6.6419/14.3670 of 8
2018-06-04 18:16:01,171 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/7.2685/21.5311 of 500 0.4598 s/rollout


rollout extrinsic vs intrinsic reward 0.0000 0.0044
rollout extrinsic vs intrinsic reward 0.0209 -0.0070
rollout extrinsic vs intrinsic reward 0.0195 0.0223
rollout extrinsic vs intrinsic reward 0.0211 -0.0054
rollout extrinsic vs intrinsic reward 0.0000 0.0035
rollout extrinsic vs intrinsic reward 0.0120 -0.0165
rollout extrinsic vs intrinsic reward 0.0227 -0.0150
rollout extrinsic vs intrinsic reward 0.0000 0.0266
rollout extrinsic vs intrinsic reward 0.0028 0.0175
rollout extrinsic vs intrinsic reward 0.0354 0.0074


2018-06-04 18:16:37,507 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=33.0592, loss_inv= 655.9366=1.0000 * 655.9366, loss_vae=0.1897=0.0100 * (18.1002 + 0.2500 * 3.4609)
2018-06-04 18:16:37,508 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 15424, min/mean/max reward 0.2605/5.8497/14.3670 of 8
2018-06-04 18:16:37,509 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.2605/7.1359/21.5311 of 500 0.4596 s/rollout


rollout extrinsic vs intrinsic reward 0.0399 -0.0181
rollout extrinsic vs intrinsic reward 0.0081 -0.0195
rollout extrinsic vs intrinsic reward 0.0514 -0.0248
rollout extrinsic vs intrinsic reward 0.0220 0.0063
rollout extrinsic vs intrinsic reward 0.1548 0.0112
rollout extrinsic vs intrinsic reward 0.1445 -0.0364
rollout extrinsic vs intrinsic reward 0.1031 0.1797
rollout extrinsic vs intrinsic reward 0.0656 -0.0546
rollout extrinsic vs intrinsic reward 0.0326 0.0097
rollout extrinsic vs intrinsic reward 0.0053 -0.0736


2018-06-04 18:17:13,540 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=33.1408, loss_inv= 655.9554=1.0000 * 655.9554, loss_vae=0.1943=0.0100 * (18.5334 + 0.2500 * 3.5895)
2018-06-04 18:17:13,541 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 16064, min/mean/max reward 0.1048/5.1592/14.6726 of 8
2018-06-04 18:17:13,542 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/6.8151/21.3794 of 500 0.4592 s/rollout


rollout extrinsic vs intrinsic reward 0.0091 0.0264
rollout extrinsic vs intrinsic reward 0.0000 0.0228
rollout extrinsic vs intrinsic reward 0.0000 0.0276
rollout extrinsic vs intrinsic reward 0.0074 0.0635
rollout extrinsic vs intrinsic reward 0.0358 0.0508
rollout extrinsic vs intrinsic reward 0.0098 0.0324
rollout extrinsic vs intrinsic reward 0.0018 0.0534
rollout extrinsic vs intrinsic reward 0.0137 0.0396
rollout extrinsic vs intrinsic reward 0.0382 0.0189
rollout extrinsic vs intrinsic reward 0.0425 0.0304


2018-06-04 18:17:49,767 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=33.2270, loss_inv= 655.9249=1.0000 * 655.9249, loss_vae=0.1948=0.0100 * (18.4356 + 0.2500 * 4.1948)
2018-06-04 18:17:49,769 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 16704, min/mean/max reward 0.1048/6.1281/15.0103 of 8
2018-06-04 18:17:49,771 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/6.7674/21.3794 of 500 0.4590 s/rollout


rollout extrinsic vs intrinsic reward 0.0428 0.0153
rollout extrinsic vs intrinsic reward 0.0288 0.0569
rollout extrinsic vs intrinsic reward 0.0045 0.0123
rollout extrinsic vs intrinsic reward 0.0030 0.0203
rollout extrinsic vs intrinsic reward 0.0352 0.0507
rollout extrinsic vs intrinsic reward 0.0692 0.0403
rollout extrinsic vs intrinsic reward 0.1099 0.0358
rollout extrinsic vs intrinsic reward 0.1036 0.0516
rollout extrinsic vs intrinsic reward 0.0393 0.0310
rollout extrinsic vs intrinsic reward 0.0089 0.0711


2018-06-04 18:18:26,819 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=33.3035, loss_inv= 655.8966=1.0000 * 655.8966, loss_vae=0.1964=0.0100 * (18.4644 + 0.2500 * 4.7132)
2018-06-04 18:18:26,820 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 17344, min/mean/max reward 0.1048/8.9331/20.1912 of 8
2018-06-04 18:18:26,821 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/6.7578/21.3794 of 500 0.4591 s/rollout


rollout extrinsic vs intrinsic reward 0.0399 0.0278
rollout extrinsic vs intrinsic reward 0.0294 0.0135
rollout extrinsic vs intrinsic reward 0.0272 0.0337
rollout extrinsic vs intrinsic reward 0.0492 0.0532
rollout extrinsic vs intrinsic reward 0.0000 0.0289
rollout extrinsic vs intrinsic reward 0.0121 0.0123
rollout extrinsic vs intrinsic reward 0.0300 0.0321
rollout extrinsic vs intrinsic reward 0.0510 0.0596
rollout extrinsic vs intrinsic reward 0.0596 0.0205
rollout extrinsic vs intrinsic reward 0.0275 0.0181


2018-06-04 18:19:02,812 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=33.3702, loss_inv= 655.8704=1.0000 * 655.8704, loss_vae=0.1967=0.0100 * (18.4014 + 0.2500 * 5.0935)
2018-06-04 18:19:02,813 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 17984, min/mean/max reward 0.8133/10.0483/20.1912 of 8
2018-06-04 18:19:02,814 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.1048/7.0154/21.3794 of 500 0.4588 s/rollout


rollout extrinsic vs intrinsic reward 0.0027 0.0387


In [None]:
# Display the `last_loss_vae` attribute stored on the agent's world model
# (presumably the most recent VAE loss -- confirm in WorldModel).
agent.network.world_model.last_loss_vae


To monitor training with TensorBoard:
```sh
cd ~/Documents/projects/retro_sonic_comp/world-models-pytorch/log 
tensorboard --logdir .
# then open http://localhost:6006/#scalars
```


In [None]:
# Manually checkpoint the agent, then echo the checkpoint path as the cell output.
agent.save(ppo_save_file)
ppo_save_file

# Summarize

In [None]:
from IPython.display import display

# Push one dummy batch through each sub-model and show per-layer summary
# tables. Gradients are disabled because nothing is trained here.
with torch.no_grad():
    # Random frame with the same channel count the VAE was constructed with
    # (`channels`), so the first convolution accepts it.
    img = np.random.randn(image_size, image_size, channels)
    # One random action index, shaped (1, 1) after the two new axes.
    action = np.array(np.random.randint(0, action_dim))[np.newaxis]
    action = torch.from_numpy(action).float()[np.newaxis]
    # (H, W, C) -> (1, C, H, W)
    gpu_img = torch.from_numpy(img[np.newaxis].transpose(0, 3, 1, 2)).float()
    # Only move inputs to the GPU when one exists; the models were moved
    # under the same condition, so inputs and weights stay on one device.
    if cuda:
        action = action.cuda()
        gpu_img = gpu_img.cuda()

    with TorchSummarizeDf(vae) as tdf:
        x, mu_vae, logvar_vae = vae.forward(gpu_img)
        z = vae.sample(mu_vae, logvar_vae)
        df_vae = tdf.make_df()

    display(df_vae[df_vae.level<2])

    with TorchSummarizeDf(mdnrnn) as tdf:
        # Repeat the latent along a new axis 1 to get a length-2 sequence.
        pi, mu, sigma, hidden_state = mdnrnn.forward(z.unsqueeze(1).repeat((1,2,1)))
        z_next = mdnrnn.sample(pi, mu, sigma)
        df_mdnrnn = tdf.make_df()

    display(df_mdnrnn)

    with TorchSummarizeDf(finv) as tdf:
        finv(z.repeat((1,2,1)), z_next)
        df_finv = tdf.make_df()
    display(df_finv)

    with TorchSummarizeDf(world_model) as tdf:
        world_model(gpu_img, action)
        df_world_model = tdf.make_df()
    display(df_world_model[df_world_model.level<2])

    # Drop the temporaries (including the RNN hidden state) so their memory
    # can be reclaimed.
    del img, action, gpu_img, x, mu, z, z_next, mu_vae, pi, sigma, logvar_vae, hidden_state