In [32]:
import argparse
import functools
import logging
import sys
from distutils.version import LooseVersion

import gym
import gym.wrappers
import numpy as np
import torch
from torch import distributions, nn

import pfrl
from pfrl import experiments, replay_buffers, utils
from pfrl.nn.lmbda import Lambda

In [33]:
# Base seed for the whole run. Every worker environment gets its own seed
# derived from it, so that no two subprocesses share a seed:
# If seed=0 and num_envs=4, subprocess seeds are [0, 1, 2, 3].
# If seed=1 and num_envs=4, subprocess seeds are [4, 5, 6, 7].
seed = 0
num_envs = 4
process_seeds = np.arange(num_envs) + seed * num_envs

# Seed the global NumPy and PyTorch generators as well: the burn-in actions are
# drawn from np.random and the network weights are initialised through torch,
# so without this a "Restart & Run All" is not repeatable.
np.random.seed(seed)
torch.manual_seed(seed)

replay_start_size = 10000
# GPU device id to use. Falls back to -1 (CPU) when CUDA is not available, so
# the notebook also runs on machines without a GPU.
gpu = 0 if torch.cuda.is_available() else -1
batch_size = 256

outdir = './result'
steps = 10 ** 6
eval_n_runs = 10
eval_interval = 5000
log_interval = 1000

env_id = 'Pendulum-v0'


In [34]:
def make_env(process_idx, test):
    """Build one seeded, wrapped environment for a worker.

    Args:
        process_idx: Index into ``process_seeds`` selecting this worker's seed.
        test: When truthy, the seed is mirrored to ``2 ** 32 - 1 - seed`` so an
            evaluation environment does not share the training seed.

    Returns:
        The environment with its outer ``TimeLimit`` wrapper removed, wrapped
        so that observations are cast to float32 and the action space is
        normalised to [-1, 1]^n.
    """
    time_limited = gym.make(env_id)
    # The outer wrapper is expected to be TimeLimit; strip it and keep the inner env
    assert isinstance(time_limited, gym.wrappers.TimeLimit)
    env = time_limited.env

    base_seed = int(process_seeds[process_idx])
    if test:
        env_seed = 2 ** 32 - 1 - base_seed
    else:
        env_seed = base_seed
    env.seed(env_seed)

    # float32 observations (the model uses float32), then actions scaled to [-1, 1]^n
    env = pfrl.wrappers.NormalizeActionSpace(
        pfrl.wrappers.CastObservationToFloat32(env)
    )
    # To record episodes, wrap here: gym.wrappers.Monitor(env, outdir)
    return env

def make_batch_env(test):
    """Create a multiprocess vector env made of ``num_envs`` workers.

    Each worker is built lazily from a ``make_env`` partial bound to its own
    index, so every subprocess picks a distinct entry of ``process_seeds``.

    Args:
        test: Forwarded to ``make_env`` for every worker.
    """
    env_factories = [
        functools.partial(make_env, worker_idx, test)
        for worker_idx in range(num_envs)
    ]
    return pfrl.envs.MultiprocessVectorEnv(env_factories)

In [35]:
# Build a single environment only to inspect its spaces and episode limit.
sample_env = make_env(process_idx=0, test=False)

obs_space, action_space = sample_env.observation_space, sample_env.action_space
# Flattened sizes of the observation and action vectors (network input/output widths)
obs_size, action_size = obs_space.low.size, action_space.low.size
# Episode length cap, read from the environment spec
timestep_limit = sample_env.spec.max_episode_steps

print("Observation space:", obs_space)
print("Action space:", action_space)

Observation space: Box(-8.0, 8.0, (3,), float32)
Action space: Box(-1.0, 1.0, (1,), float32)


In [36]:
def squashed_diagonal_gaussian_head(x):
    """Turn raw network outputs into a tanh-squashed diagonal Gaussian.

    Args:
        x: Tensor whose last dimension has size ``action_size * 2``; the first
            half of that dimension is the mean, the second half the log of the
            standard deviation. Any number of leading dimensions is accepted.

    Returns:
        A ``TransformedDistribution``: an independent Normal over the last
        dimension, passed through ``TanhTransform``.

    Raises:
        AssertionError: If the last dimension of ``x`` is not ``action_size * 2``.
    """
    assert x.shape[-1] == action_size * 2
    # Split along the last axis, the same axis the assertion above validates.
    # (Splitting on axis 1 only happened to work for 2-D inputs.)
    mean, log_scale = torch.chunk(x, 2, dim=-1)
    # Keep the log standard deviation in a numerically safe range
    log_scale = torch.clamp(log_scale, -20.0, 2.0)
    var = torch.exp(log_scale * 2)
    # Treat the last dimension as the event dimension of a diagonal Gaussian
    base_distribution = distributions.Independent(
        distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1
    )
    # cache_size=1 is required for numerical stability
    return distributions.transformed_distribution.TransformedDistribution(
        base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]
    )

In [37]:
# Actor: a two-hidden-layer MLP whose final layer emits 2 * action_size values,
# which the head turns into a squashed Gaussian action distribution.
policy = nn.Sequential(
    nn.Linear(obs_size, 256),
    nn.ReLU(),
    nn.Linear(256, 256),
    nn.ReLU(),
    nn.Linear(256, action_size * 2),
    Lambda(squashed_diagonal_gaussian_head),
)
# Xavier-uniform initialisation (default gain of 1.0) for the three linear layers
nn.init.xavier_uniform_(policy[0].weight)
nn.init.xavier_uniform_(policy[2].weight)
nn.init.xavier_uniform_(policy[4].weight)
policy_optimizer = torch.optim.Adam(policy.parameters(), lr=3e-4)

def make_q_func_with_optimizer():
    """Create one Q-function network together with its Adam optimizer.

    The network concatenates observation and action, then applies a
    two-hidden-layer MLP that outputs a single value.

    Returns:
        Tuple ``(q_func, q_func_optimizer)``.
    """
    hidden_units = 256
    critic = nn.Sequential(
        pfrl.nn.ConcatObsAndAction(),
        nn.Linear(obs_size + action_size, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, 1),
    )
    # Xavier-uniform init for every linear layer, in network order
    for module in critic:
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
    return critic, torch.optim.Adam(critic.parameters(), lr=3e-4)

# Two separately initialised critics, each with its own optimizer.
(q_func1, q_func1_optimizer), (q_func2, q_func2_optimizer) = (
    make_q_func_with_optimizer() for _ in range(2)
)

In [38]:
# Replay buffer with a capacity of one million entries
rbuf = replay_buffers.ReplayBuffer(capacity=10 ** 6)

def burnin_action_func():
    """Draw a uniformly random float32 action within the action-space bounds.

    Passed to the agent as ``burnin_action_func``, i.e. used for action
    selection before the model has been updated.
    """
    low, high = action_space.low, action_space.high
    random_action = np.random.uniform(low, high)
    return random_action.astype(np.float32)

# Soft Actor-Critic agent wiring together the actor, both critics, their
# optimizers and the replay buffer.
agent = pfrl.agents.SoftActorCritic(
    policy=policy,
    q_func1=q_func1,
    q_func2=q_func2,
    policy_optimizer=policy_optimizer,
    q_func1_optimizer=q_func1_optimizer,
    q_func2_optimizer=q_func2_optimizer,
    replay_buffer=rbuf,
    gamma=0.99,
    gpu=gpu,
    replay_start_size=replay_start_size,
    minibatch_size=batch_size,
    burnin_action_func=burnin_action_func,
    # Target entropy is the negated action dimensionality
    entropy_target=-action_size,
    temperature_optimizer_lr=3e-4,
)

In [39]:
# Train the agent on a batch of parallel environments, evaluating periodically.
# Separate vector envs are built for training (test=False) and evaluation
# (test=True), so the two use different seeds.
# NOTE: this call is the last expression of the cell, so its return value is
# what the notebook displays below.
experiments.train_agent_batch_with_evaluation(
    agent=agent,
    env=make_batch_env(test=False),
    eval_env=make_batch_env(test=True),
    outdir=outdir,
    steps=steps,
    # Evaluation length is given in episodes (eval_n_episodes), not in steps
    eval_n_steps=None,
    eval_n_episodes=eval_n_runs,
    eval_interval=eval_interval,
    log_interval=log_interval,
    # make_env strips the TimeLimit wrapper, so the episode cap is passed here
    max_episode_len=timestep_limit,
)

average_q1': -14.144865,
   'average_q2': -14.140223,
   'average_q_func1_loss': 0.24412721380591393,
   'average_q_func2_loss': 0.1845402367040515,
   'n_updates': 650001,
   'average_entropy': -1.0178982,
   'temperature': 0.010784977115690708,
   'eval_score': -95.86305477320238},
  {'average_q1': -13.967822,
   'average_q2': -13.918497,
   'average_q_func1_loss': 0.20213287226855756,
   'average_q_func2_loss': 0.18452599320560695,
   'n_updates': 655001,
   'average_entropy': -1.018843,
   'temperature': 0.012544878758490086,
   'eval_score': -140.47628986383558},
  {'average_q1': -13.549403,
   'average_q2': -13.551678,
   'average_q_func1_loss': 0.21373473547399044,
   'average_q_func2_loss': 0.22528013192117213,
   'n_updates': 660001,
   'average_entropy': -0.92505234,
   'temperature': 0.011618980206549168,
   'eval_score': -172.6230764127267},
  {'average_q1': -13.916123,
   'average_q2': -13.954864,
   'average_q_func1_loss': 0.26638334784656764,
   'average_q_func2_loss': 0