This experiment uses a higher learning rate and larger rollouts.

In [1]:
import deep_rl

In [2]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [3]:
import torch
from torch.nn import functional as F
from torch.autograd import Variable
from torch import nn, optim
import torch.utils.data

# load as dask array
import time

import logging
import sys
import os
import glob
import numpy as np
import datetime
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

In [4]:
from deep_rl.utils import Config
from deep_rl.utils.logger import get_logger, get_default_log_dir

from deep_rl.network.network_heads import CategoricalActorCriticNet, QuantileNet, OptionCriticNet, DeterministicActorCriticNet, GaussianActorCriticNet
from deep_rl.network.network_bodies import FCBody

from deep_rl.component.task import ParallelizedTask

In [5]:
from world_models_sonic.models.vae import VAE5, loss_function_vae
from world_models_sonic.helpers.summarize import TorchSummarizeDf
from world_models_sonic.helpers.dataset import load_cache_data
from world_models_sonic.models.rnn import MDNRNN2
from world_models_sonic.models.inverse_model import InverseModel
from world_models_sonic.models.world_model import WorldModel
from world_models_sonic.custom_envs.env import make_env
from world_models_sonic.custom_envs.wrappers import RandomGameReset
from world_models_sonic import config
from world_models_sonic.helpers.deep_rl import PPOAgent, run_iterations, SonicWorldModelDeepRL, CategoricalWorldActorCriticNet

Importing 0 potential games...
Imported 0 games


  from ._conv import register_converters as _register_converters


# Init

In [6]:
# TODO: name checkpoint files using a hash of
# agent_name, config.tag, agent.task.name
# and a timestamp

In [7]:
# Experiment-wide settings shared by the cells below.
cuda = torch.cuda.is_available()
env_name = 'sonic256'
z_dim = 512  # latent dimensions
channels = 3*4

# RNN
action_dim = 10
image_size = 128

# NOTE: the training-setup cell further down reassigns `verbose` before the
# environments are created, so the value set here is not the one they see.
verbose = True  # Set this true to render (and make it go slower)

NAME = 'RNN_v3b_128im_512z_1512_v6i_VAE5_all'

# Single source of truth for the output directory: the checkpoint path, the
# log directory and the deep_rl logger all derive from it.
log_dir = './outputs/{NAME}'.format(NAME=NAME)
ppo_save_file = '{log_dir}/PPO_512z_all_f.pkl'.format(log_dir=log_dir)

# exist_ok avoids the check-then-create race of a separate isdir() test.
os.makedirs(log_dir, exist_ok=True)

# Log to file and stream
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(NAME)

print(log_dir)

deep_rl_logger = get_logger(
    NAME,
    file_name='deep_rl_ppo.log',
    level=logging.INFO,
    log_dir=log_dir, )

./outputs/RNN_v3b_128im_512z_1512_v6i_VAE5_all


# World model

In [8]:
# Build the three sub-models and combine them into one WorldModel.

# VAE
# TODO swap z and k dim, since it's inconsistent with other models
vae = VAE5(
    image_size=image_size,
    z_dim=128,
    conv_dim=64,
    code_dim=8,
    k_dim=z_dim,
    channels=channels,
)

# MDN-RNN hyperparameters (action_dim comes from the init cell unchanged)
hidden_size = z_dim*2
n_mixture = 5
temp = 0.0
mdnrnn = MDNRNN2(z_dim, action_dim, hidden_size, n_mixture, temp)

# Inverse model
finv = InverseModel(z_dim, action_dim, hidden_size=z_dim*2)

world_model = WorldModel(
    vae,
    mdnrnn,
    finv,
    logger=deep_rl_logger,
    lambda_vae_kld=1 / 4.,
    C=0,
    lambda_finv=10,
    lambda_vae=1 / 100.,
)

# Put the combined model in training mode and move it to the GPU if present.
world_model = world_model.train()
if cuda:
    world_model = world_model.cuda()

In [9]:
import torch.optim.lr_scheduler

torch.cuda.empty_cache()

# One Adam optimizer over every world-model parameter; it is kept both as the
# notebook-level `optimizer` and as an attribute on the model itself.
world_model.optimizer = optimizer = optim.Adam(world_model.parameters(), lr=1e-4)

# Train

In [10]:
# Input width of the policy body: latent size + RNN hidden size + action size.
z_state_dim = (
    world_model.mdnrnn.z_dim
    + world_model.mdnrnn.hidden_size
    + world_model.mdnrnn.action_dim
)

# `task_fn` reads `verbose` when it is called, not when it is defined, so this
# is the value the environments actually receive.
verbose = False  # Set this true to render (and make it go slower)


def task_fn(log_dir):
    """Build one SonicWorldModelDeepRL task that logs to `log_dir`.

    The environment factory wraps `make_env('sonic', ...)` in RandomGameReset.
    `verbose` is looked up in the notebook namespace at call time.
    """
    def env_fn():
        sonic_env = make_env(
            'sonic', max_episode_steps=1000, to_gray=False, image_size=image_size)
        return RandomGameReset(sonic_env)

    return SonicWorldModelDeepRL(env_fn=env_fn, log_dir=log_dir, verbose=verbose)


# NOTE: this rebinds `config`, which the import cell bound to the
# `world_models_sonic.config` module; from here on it is the deep_rl Config.
config = Config()

# Workers / factories
config.num_workers = 1 if verbose else 8
config.task_fn = lambda: ParallelizedTask(
    task_fn,
    config.num_workers,
    single_process=config.num_workers == 1,
)
config.optimizer_fn = lambda params: torch.optim.Adam(params, 3e-4, eps=1e-5)
config.network_fn = lambda state_dim, action_dim: CategoricalWorldActorCriticNet(
    state_dim,
    action_dim,
    FCBody(z_state_dim, hidden_units=(64, 64), gate=F.relu),
    gpu=0 if cuda else -1,
    world_model_fn=lambda: world_model,
    render=(config.num_workers==1 and verbose),
    z_shape=(32, 16)
)

# PPO hyperparameters
config.discount = 0.99
config.logger = deep_rl_logger
config.use_gae = True
config.gae_tau = 0.95
config.entropy_weight = 0.001
config.gradient_clip = 0.4
# Keep the total rollout (length * workers) at 64 steps regardless of workers.
config.rollout_length = 64//config.num_workers
config.optimization_epochs = 10
config.num_mini_batches = 8
config.ppo_ratio_clip = 0.2
config.iteration_log_interval = 10
config.curiosity_weight = 1
config.curiosity_only = True

agent = PPOAgent(config)

rollout_size = config.rollout_length*config.num_workers
print('rollout of ', rollout_size)
print('mini batch', rollout_size//config.num_mini_batches)


# Resume from the previous checkpoint when one exists.
if not os.path.isfile(ppo_save_file):
    print("couldn't find save")
else:
    print('loading ppo_save_file', ppo_save_file, 'modified', time.ctime(os.path.getmtime(ppo_save_file)))
    agent.load(ppo_save_file)

game: SonicAndKnuckles3-Genesis state: HydrocityZone.Act2
game: SonicTheHedgehog-Genesis state: LabyrinthZone.Act2
game: SonicAndKnuckles3-Genesis state: SandopolisZone.Act2
game: SonicTheHedgehog-Genesis state: StarLightZone.Act2
game: SonicAndKnuckles3-Genesis state: LaunchBaseZone.Act2
game: SonicAndKnuckles3-Genesis state: LaunchBaseZone.Act1
game: SonicTheHedgehog2-Genesis state: MysticCaveZone.Act2
game: SonicAndKnuckles3-Genesis state: IcecapZone.Act1
rollout of  64
mini batch 8
loading ppo_save_file ./outputs/RNN_v3b_128im_512z_1512_v6i_VAE5_all/PPO_512z_all_f.pkl modified Mon Jun  4 15:40:39 2018


Process ProcessWrapper-1:
Process ProcessWrapper-7:
Process ProcessWrapper-3:
Process ProcessWrapper-8:
Process ProcessWrapper-5:
Traceback (most recent call last):
  File "/home/wassname/.pyenv/versions/3.5.3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Process ProcessWrapper-4:
Process ProcessWrapper-6:
Process ProcessWrapper-2:
Traceback (most recent call last):
  File "/home/wassname/.pyenv/versions/3.5.3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/media/oldhome/wassname/Documents/projects/retro_sonic_comp/DeepRL/deep_rl/component/task.py", line 177, in run
    op, data = self.pipe.recv()
Traceback (most recent call last):
  File "/home/wassname/.pyenv/versions/3.5.3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/

In [None]:
# Train until an error or a manual interrupt, then clean up, checkpoint the
# agent and re-raise whatever stopped the run.
try:
    run_iterations(agent, log_dir=log_dir)
except BaseException:
    # BaseException on purpose: interrupting the kernel raises
    # KeyboardInterrupt, which is the usual way this loop is stopped.

    # Closing the environments is best-effort. If it fails (for example
    # because worker processes have already died) we must still reach the
    # save below, otherwise the whole training run is lost.
    try:
        if config.num_workers == 1:
            agent.task.tasks[0].env.close()
        else:
            for worker_task in agent.task.tasks:
                worker_task.close()
    except Exception:
        logger.exception("could not close environments; saving checkpoint anyway")

    print("saving", ppo_save_file)
    agent.save(ppo_save_file)

    # Backup since it sometimes gets corrupted
    ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')
    backup_file = ppo_save_file.replace('.pkl', '-%s.pkl' % ts)
    print("saving backup", backup_file)
    agent.save(backup_file)

    # Re-raise the original exception so the cell still reports why it stopped.
    raise

sampled extristic vs intrinsic reward 0.12853991985321045 8.22068977355957


2018-06-04 15:41:02,074 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=16.8278, loss_inv= 6.6766=10.0000 * 0.6677, loss_vae=185.0249=0.0250 * (7296.5835 + 0.2500 * 417.6427)
2018-06-04 15:41:02,075 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 64, min/mean/max reward 0.0000/0.0000/0.0000 of 8
2018-06-04 15:41:02,076 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/0.0000/0.0000 of 8 0.5988 s/rollout


sampled extristic vs intrinsic reward 0.09623606503009796 4.478583335876465
sampled extristic vs intrinsic reward 0.2128911316394806 2.1207313537597656
sampled extristic vs intrinsic reward 0.1070796400308609 0.8686275482177734
sampled extristic vs intrinsic reward 0.046075884252786636 1.1292343139648438
sampled extristic vs intrinsic reward 0.02629019133746624 1.4219741821289062
sampled extristic vs intrinsic reward 0.1747005581855774 4.262777328491211
sampled extristic vs intrinsic reward -0.030071787536144257 2.2652225494384766
sampled extristic vs intrinsic reward 0.004995410796254873 2.7249183654785156
sampled extristic vs intrinsic reward 0.025924012064933777 7.022390365600586
sampled extristic vs intrinsic reward 0.06552304327487946 0.5104179382324219


2018-06-04 15:41:55,181 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=16.7294, loss_inv= 6.9834=10.0000 * 0.6983, loss_vae=167.1896=0.0250 * (6581.7743 + 0.2500 * 423.2422)
2018-06-04 15:41:55,182 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 704, min/mean/max reward 0.0000/0.7987/5.1114 of 8
2018-06-04 15:41:55,183 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/0.2614/5.1114 of 88 0.6579 s/rollout


sampled extristic vs intrinsic reward 0.012495970353484154 1.9749183654785156
sampled extristic vs intrinsic reward -0.036803942173719406 3.280792236328125
sampled extristic vs intrinsic reward -0.022230029106140137 5.11069393157959
sampled extristic vs intrinsic reward 0.009265306405723095 4.15388298034668
sampled extristic vs intrinsic reward -0.005111893638968468 13.598298072814941
sampled extristic vs intrinsic reward 0.008233952336013317 2.2006874084472656
sampled extristic vs intrinsic reward 0.003833919996395707 0.8826866149902344
sampled extristic vs intrinsic reward 0.0028754400555044413 1.1493453979492188
sampled extristic vs intrinsic reward 0.0028754400555044413 1.0788040161132812
sampled extristic vs intrinsic reward 0.0015974667621776462 1.2197465896606445


2018-06-04 15:42:43,209 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: loss_rnn=16.7737, loss_inv= 6.8846=10.0000 * 0.6885, loss_vae=144.0978=0.0250 * (5653.1576 + 0.2500 * 443.0170)
2018-06-04 15:42:43,210 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: total steps 1344, min/mean/max reward 0.0000/0.7987/5.1114 of 8
2018-06-04 15:42:43,211 - RNN_v3b_128im_512z_1512_v6i_VAE5_all - INFO: running min/mean/max reward 0.0000/0.5172/5.1114 of 168 0.6305 s/rollout


sampled extristic vs intrinsic reward 0.0 1.3730220794677734
sampled extristic vs intrinsic reward 0.0 0.454193115234375
sampled extristic vs intrinsic reward 0.0162394680082798 3.758376121520996
sampled extristic vs intrinsic reward 0.05723147839307785 1.5327644348144531
saving ./outputs/RNN_v3b_128im_512z_1512_v6i_VAE5_all/PPO_512z_all_f.pkl


In [None]:
# Interactive help lookup (IPython `?` syntax) left over from development;
# it only shows the function's docstring and has no effect on the experiment.
F.binary_cross_entropy_with_logits?


To monitor training with TensorBoard, run:
```sh
cd ~/Documents/projects/retro_sonic_comp/world-models-pytorch/log 
tensorboard  --logdir .
#then open http://localhost:6006/#scalars
```


In [None]:
# Manually checkpoint the agent to the main save path, then echo that path as
# the cell's output.
agent.save(ppo_save_file)
ppo_save_file

# summarize

In [None]:
from IPython.display import display

# Push one random image and one random action through each sub-model inside a
# TorchSummarizeDf context and display the resulting summary tables.
with torch.no_grad():
    # NOTE(review): this test image has 3 channels, while the VAE above was
    # built with `channels` (3*4) -- confirm the VAE accepts this input.
    img = np.random.randn(image_size, image_size, 3)
    action = np.array(np.random.randint(0,action_dim))[np.newaxis]
    action = Variable(torch.from_numpy(action)).float()[np.newaxis]
    # Add a batch axis, then reorder (N, H, W, C) -> (N, C, H, W).
    gpu_img = Variable(torch.from_numpy(img[np.newaxis].transpose(0, 3, 1, 2))).float()
    # Move the inputs to the GPU only when one is available, matching where
    # the world model was placed; unconditional .cuda() crashes on CPU-only
    # machines.
    if cuda:
        action = action.cuda()
        gpu_img = gpu_img.cuda()

    with TorchSummarizeDf(vae) as tdf:
        x, mu_vae, logvar_vae = vae.forward(gpu_img)
        z = vae.sample(mu_vae, logvar_vae)
        df_vae = tdf.make_df()

    # Only show the top levels of the VAE summary.
    display(df_vae[df_vae.level<2])

    with TorchSummarizeDf(mdnrnn) as tdf:
        # Repeat z along a new axis 1 (length 2) before feeding the RNN.
        pi, mu, sigma, hidden_state = mdnrnn.forward(z.unsqueeze(1).repeat((1,2,1)))
        z_next = mdnrnn.sample(pi, mu, sigma)
        df_mdnrnn = tdf.make_df()

    display(df_mdnrnn)

    with TorchSummarizeDf(finv) as tdf:
        finv(z.repeat((1,2,1)), z_next)
        df_finv = tdf.make_df()
    display(df_finv)

    with TorchSummarizeDf(world_model) as tdf:
        world_model(gpu_img, action)
        df_world_model = tdf.make_df()
    display(df_world_model[df_world_model.level<2])

    # Drop the temporaries so they do not linger in the kernel namespace.
    del img, action, gpu_img, x, mu, z, z_next, mu_vae, pi, sigma, logvar_vae