Commit

Merge pull request #171 from zuoxingdong/step_info_trajectory
update PPO
zuoxingdong committed May 8, 2019
2 parents 351b45f + 5e2ca85 commit ba9f9c3
Showing 106 changed files with 175 additions and 129 deletions.
94 changes: 61 additions & 33 deletions baselines/ppo/agent.py
@@ -28,7 +28,7 @@
from baselines.ppo.dataset import Dataset


class MLP(Module):
class Actor(Module):
def __init__(self, config, env, device, **kwargs):
super().__init__(**kwargs)
self.config = config
@@ -39,39 +39,64 @@ def __init__(self, config, env, device, **kwargs):
for layer in self.feature_layers:
ortho_init(layer, nonlinearity='tanh', constant_bias=0.0)

feature_dim = config['nn.sizes'][-1]
if isinstance(env.action_space, Discrete):
self.action_head = CategoricalHead(feature_dim, env.action_space.n, device, **kwargs)
elif isinstance(env.action_space, Box):
self.action_head = DiagGaussianHead(feature_dim, flatdim(env.action_space), device, config['agent.std0'], **kwargs)

self.to(self.device)

def forward(self, x):
for layer in self.feature_layers:
x = torch.tanh(layer(x))
return x
action_dist = self.action_head(x)
return action_dist


class Agent(BaseAgent):
class Critic(Module):
def __init__(self, config, env, device, **kwargs):
super().__init__(config, env, device, **kwargs)
super().__init__(**kwargs)
self.config = config
self.env = env
self.device = device

self.feature_layers = make_fc(flatdim(env.observation_space), config['nn.sizes'])
for layer in self.feature_layers:
ortho_init(layer, nonlinearity='tanh', constant_bias=0.0)

feature_dim = config['nn.sizes'][-1]
self.feature_network = MLP(config, env, device, **kwargs)
if isinstance(env.action_space, Discrete):
self.action_head = CategoricalHead(feature_dim, env.action_space.n, device, **kwargs)
elif isinstance(env.action_space, Box):
self.action_head = DiagGaussianHead(feature_dim, flatdim(env.action_space), device, config['agent.std0'], **kwargs)
self.V_head = nn.Linear(feature_dim, 1).to(device)
ortho_init(self.V_head, weight_scale=1.0, constant_bias=0.0)

self.to(self.device)

def forward(self, x):
for layer in self.feature_layers:
x = torch.tanh(layer(x))
V = self.V_head(x)
return V


class Agent(BaseAgent):
def __init__(self, config, env, device, **kwargs):
super().__init__(config, env, device, **kwargs)

self.policy = Actor(config, env, device, **kwargs)
self.value = Critic(config, env, device, **kwargs)

self.total_timestep = 0

self.optimizer = optim.Adam(self.parameters(), lr=config['agent.lr'])
self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=config['agent.policy_lr'])
self.value_optimizer = optim.Adam(self.value.parameters(), lr=config['agent.value_lr'])
if config['agent.use_lr_scheduler']:
self.lr_scheduler = linear_lr_scheduler(self.optimizer, config['train.timestep'], min_lr=1e-8)
self.policy_lr_scheduler = linear_lr_scheduler(self.policy_optimizer, config['train.timestep'], min_lr=1e-8)

def choose_action(self, obs, **kwargs):
obs = tensorify(obs, self.device)
out = {}
features = self.feature_network(obs)

action_dist = self.action_head(features)
action_dist = self.policy(obs)
out['action_dist'] = action_dist
out['entropy'] = action_dist.entropy()
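
For orientation, here is a minimal standalone sketch of the actor/critic split introduced above, written in plain PyTorch rather than with lagom's make_fc, ortho_init, and distribution-head helpers; the layer sizes, the diagonal-Gaussian head, and the ortho_linear helper are illustrative assumptions, not the repository's code.

import math
import torch
import torch.nn as nn

def ortho_linear(in_dim, out_dim, gain=1.0):
    # Hypothetical stand-in for lagom's ortho_init: orthogonal weights, zero bias.
    layer = nn.Linear(in_dim, out_dim)
    nn.init.orthogonal_(layer.weight, gain=gain)
    nn.init.constant_(layer.bias, 0.0)
    return layer

class TinyActor(nn.Module):
    """Tanh MLP body with a diagonal-Gaussian action head."""
    def __init__(self, obs_dim, action_dim, sizes=(64, 64), std0=0.6):
        super().__init__()
        dims = [obs_dim, *sizes]
        self.body = nn.ModuleList([ortho_linear(i, o, gain=nn.init.calculate_gain('tanh'))
                                   for i, o in zip(dims[:-1], dims[1:])])
        self.mean_head = ortho_linear(sizes[-1], action_dim, gain=0.01)
        self.logstd = nn.Parameter(torch.full((action_dim,), math.log(std0)))

    def forward(self, x):
        for layer in self.body:
            x = torch.tanh(layer(x))
        return torch.distributions.Normal(self.mean_head(x), self.logstd.exp())

class TinyCritic(nn.Module):
    """Same tanh MLP body, ending in a scalar state-value head."""
    def __init__(self, obs_dim, sizes=(64, 64)):
        super().__init__()
        dims = [obs_dim, *sizes]
        self.body = nn.ModuleList([ortho_linear(i, o, gain=nn.init.calculate_gain('tanh'))
                                   for i, o in zip(dims[:-1], dims[1:])])
        self.V_head = ortho_linear(sizes[-1], 1, gain=1.0)

    def forward(self, x):
        for layer in self.body:
            x = torch.tanh(layer(x))
        return self.V_head(x)

Keeping the two bodies separate, as the diff does, is what allows the policy and value function to be trained with different Adam optimizers and learning rates below.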

@@ -80,12 +105,12 @@ def choose_action(self, obs, **kwargs):
out['raw_action'] = numpify(action, 'float')
out['action_logprob'] = action_dist.log_prob(action.detach())

V = self.V_head(features)
V = self.value(obs)
out['V'] = V
return out

def learn_one_update(self, data):
data = [d.to(self.device) for d in data]
data = [d.detach().to(self.device) for d in data]
observations, old_actions, old_logprobs, old_entropies, old_Vs, old_Qs, old_As = data

out = self.choose_action(observations)
@@ -97,26 +122,30 @@ def choose_action(self, obs, **kwargs):
eps = self.config['agent.clip_range']
policy_loss = -torch.min(ratio*old_As,
torch.clamp(ratio, 1.0 - eps, 1.0 + eps)*old_As)
entropy_loss = -entropies
policy_loss = policy_loss.mean(0)

self.policy_optimizer.zero_grad()
policy_loss.backward()
policy_grad_norm = nn.utils.clip_grad_norm_(self.policy.parameters(), self.config['agent.max_grad_norm'])
if self.config['agent.use_lr_scheduler']:
self.policy_lr_scheduler.step(self.total_timestep)
self.policy_optimizer.step()

clipped_Vs = old_Vs + torch.clamp(Vs - old_Vs, -eps, eps)
value_loss = torch.max(F.mse_loss(Vs, old_Qs, reduction='none'),
F.mse_loss(clipped_Vs, old_Qs, reduction='none'))
loss = policy_loss + self.config['agent.value_coef']*value_loss + self.config['agent.entropy_coef']*entropy_loss
loss = loss.mean()
value_loss = value_loss.mean(0)

self.optimizer.zero_grad()
loss.backward()
grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
if self.config['agent.use_lr_scheduler']:
self.lr_scheduler.step(self.total_timestep)
self.optimizer.step()
self.value_optimizer.zero_grad()
value_loss.backward()
value_grad_norm = nn.utils.clip_grad_norm_(self.value.parameters(), self.config['agent.max_grad_norm'])
self.value_optimizer.step()

out = {}
out['loss'] = loss.item()
out['grad_norm'] = grad_norm
out['policy_grad_norm'] = policy_grad_norm
out['value_grad_norm'] = value_grad_norm
out['policy_loss'] = policy_loss.mean().item()
out['entropy_loss'] = entropy_loss.mean().item()
out['policy_entropy'] = -entropy_loss.mean().item()
out['policy_entropy'] = entropies.mean().item()
out['value_loss'] = value_loss.mean().item()
out['explained_variance'] = ev(y_true=numpify(old_Qs, 'float'), y_pred=numpify(Vs, 'float'))
out['approx_kl'] = torch.mean(old_logprobs - logprobs).item()
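
Condensed for reference, the update performed in learn_one_update amounts to the following sketch (plain PyTorch; the eps and max_grad_norm defaults are placeholders, and the batch layout assumes flattened per-sample values, returns, and advantages):

import torch
import torch.nn as nn
import torch.nn.functional as F

def ppo_losses(logprobs, old_logprobs, old_As, Vs, old_Vs, old_Qs, eps=0.2):
    """Clipped surrogate policy loss and clipped value regression loss, batch-averaged."""
    ratio = torch.exp(logprobs - old_logprobs)
    policy_loss = -torch.min(ratio * old_As,
                             torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * old_As).mean()
    clipped_Vs = old_Vs + torch.clamp(Vs - old_Vs, -eps, eps)
    value_loss = torch.max(F.mse_loss(Vs, old_Qs, reduction='none'),
                           F.mse_loss(clipped_Vs, old_Qs, reduction='none')).mean()
    return policy_loss, value_loss

def update_step(policy, value, policy_opt, value_opt, batch, eps=0.2, max_grad_norm=0.5):
    """One minibatch update: each loss drives its own optimizer, mirroring the actor/critic split."""
    obs, old_actions, old_logprobs, old_Vs, old_Qs, old_As = batch
    dist = policy(obs)
    logprobs = dist.log_prob(old_actions)   # assumes one log-probability per sample
    Vs = value(obs).squeeze(-1)
    policy_loss, value_loss = ppo_losses(logprobs, old_logprobs, old_As, Vs, old_Vs, old_Qs, eps)

    policy_opt.zero_grad()
    policy_loss.backward()
    nn.utils.clip_grad_norm_(policy.parameters(), max_grad_norm)
    policy_opt.step()

    value_opt.zero_grad()
    value_loss.backward()
    nn.utils.clip_grad_norm_(value.parameters(), max_grad_norm)
    value_opt.step()
    return policy_loss.item(), value_loss.item()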
@@ -131,7 +160,7 @@ def learn(self, D, **kwargs):

with torch.no_grad():
last_observations = tensorify(np.concatenate([traj.last_observation for traj in D], 0), self.device)
last_Vs = self.V_head(self.feature_network(last_observations)).squeeze(-1)
last_Vs = self.value(last_observations).squeeze(-1)
Qs = [bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
for traj, last_V in zip(D, last_Vs)]
As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj, V, last_V)
@@ -153,11 +182,10 @@ def learn(self, D, **kwargs):
self.total_timestep += sum([len(traj) for traj in D])
out = {}
if self.config['agent.use_lr_scheduler']:
out['current_lr'] = self.lr_scheduler.get_lr()
out['loss'] = np.mean([item['loss'] for item in logs])
out['grad_norm'] = np.mean([item['grad_norm'] for item in logs])
out['current_lr'] = self.policy_lr_scheduler.get_lr()
out['policy_grad_norm'] = np.mean([item['policy_grad_norm'] for item in logs])
out['value_grad_norm'] = np.mean([item['value_grad_norm'] for item in logs])
out['policy_loss'] = np.mean([item['policy_loss'] for item in logs])
out['entropy_loss'] = np.mean([item['entropy_loss'] for item in logs])
out['policy_entropy'] = np.mean([item['policy_entropy'] for item in logs])
out['value_loss'] = np.mean([item['value_loss'] for item in logs])
out['explained_variance'] = np.mean([item['explained_variance'] for item in logs])
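learn() delegates target computation to lagom's bootstrapped_returns and gae. As a minimal per-trajectory NumPy sketch of what those quantities are (the function names and the terminal-flag handling here are local stand-ins, not the library's API):

import numpy as np

def bootstrapped_returns_np(gamma, rewards, last_V, terminal):
    """Discounted returns, bootstrapped with V(s_T) unless the trajectory ended in a terminal state."""
    R = 0.0 if terminal else float(last_V)
    out = np.zeros(len(rewards), dtype=np.float32)
    for t in reversed(range(len(rewards))):
        R = rewards[t] + gamma * R
        out[t] = R
    return out

def gae_np(gamma, lam, rewards, Vs, last_V, terminal):
    """Generalized Advantage Estimation over one trajectory."""
    Vs_next = np.append(Vs[1:], 0.0 if terminal else float(last_V))
    deltas = rewards + gamma * Vs_next - Vs
    A = 0.0
    out = np.zeros(len(rewards), dtype=np.float32)
    for t in reversed(range(len(rewards))):
        A = deltas[t] + gamma * lam * A
        out[t] = A
    return out

# Per trajectory (config values from the diff):
#   Qs = bootstrapped_returns_np(0.99, rewards, last_V, terminal)
#   As = gae_np(0.99, 0.95, rewards, Vs, last_V, terminal)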
11 changes: 5 additions & 6 deletions baselines/ppo/dataset.py
@@ -1,18 +1,17 @@
import numpy as np

from torch.utils import data
from lagom.utils import numpify


class Dataset(data.Dataset):
def __init__(self, D, logprobs, entropies, Vs, Qs, As):
self.observations = np.concatenate([np.concatenate(traj.observations[:-1], 0) for traj in D], 0).astype(np.float32)
self.actions = np.concatenate([traj.numpy_actions for traj in D], 0).astype(np.float32)
self.logprobs = numpify(logprobs, 'float32')
self.entropies = numpify(entropies, 'float32')
self.Vs = numpify(Vs, 'float32')
self.Qs = numpify(Qs, 'float32')
self.As = numpify(As, 'float32')
self.logprobs = logprobs
self.entropies = entropies
self.Vs = Vs
self.Qs = Qs
self.As = As

assert self.actions.shape[0] == len(self)
assert all([item.shape == (len(self),) for item in [self.logprobs, self.entropies,
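For context, a minimal sketch of the pattern this Dataset supports: flatten a batch of trajectories into per-step arrays and draw shuffled minibatches with a DataLoader (the array names mirror the class above; the batch size and epoch count are placeholders, not the repository's settings).

import numpy as np
from torch.utils import data

class FlatDataset(data.Dataset):
    """Flattened (observation, action, logprob, V, Q, A) tuples from a batch of trajectories."""
    def __init__(self, observations, actions, logprobs, Vs, Qs, As):
        self.arrays = [np.asarray(x, dtype=np.float32)
                       for x in (observations, actions, logprobs, Vs, Qs, As)]

    def __len__(self):
        return self.arrays[0].shape[0]

    def __getitem__(self, i):
        return tuple(x[i] for x in self.arrays)

# Usage: several epochs of shuffled minibatches per PPO iteration.
#   loader = data.DataLoader(dataset, batch_size=64, shuffle=True)
#   for epoch in range(10):
#       for minibatch in loader:
#           agent.learn_one_update(minibatch)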
22 changes: 12 additions & 10 deletions baselines/ppo/experiment.py
@@ -17,16 +17,15 @@
from lagom.envs.wrappers import VecMonitor
from lagom.envs.wrappers import VecStandardizeObservation
from lagom.envs.wrappers import VecStandardizeReward
from lagom.envs.wrappers import VecStepInfo
from lagom.runner import EpisodeRunner

from baselines.ppo.agent import Agent
from baselines.ppo.engine import Engine


config = Config(
{'cuda': True,
'log.dir': 'logs/default',
'log.freq': 10,
{'log.freq': 10,
'checkpoint.num': 3,

'env.id': Grid(['HalfCheetah-v3', 'Hopper-v3', 'Walker2d-v3', 'Swimmer-v3']),
@@ -35,19 +34,18 @@

'nn.sizes': [64, 64],

'agent.lr': 3e-4,
'agent.policy_lr': 3e-4,
'agent.use_lr_scheduler': True,
'agent.value_lr': 1e-3,
'agent.gamma': 0.99,
'agent.gae_lambda': 0.95,
'agent.standardize_adv': True, # standardize advantage estimates
'agent.max_grad_norm': 0.5, # grad clipping by norm
'agent.entropy_coef': 0.0, # PPO: no entropy bonus
'agent.value_coef': 0.5,
'agent.clip_range': 0.2, # ratio clipping

# only for continuous control
'env.clip_action': True, # clip action within valid bound before step()
'agent.std0': 0.5, # initial std
'agent.std0': 0.6, # initial std

'train.timestep': int(1e6), # total number of training (environmental) timesteps
'train.timestep_per_iter': 2048, # number of timesteps per iteration
@@ -68,16 +66,16 @@ def _make_env():
return env


def run(config, seed, device):
def run(config, seed, device, logdir):
set_global_seeds(seed)
logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

env = make_env(config, seed)
env = VecMonitor(env)
if config['env.standardize_obs']:
env = VecStandardizeObservation(env, clip=5.)
if config['env.standardize_reward']:
env = VecStandardizeReward(env, clip=10., gamma=config['agent.gamma'])
env = VecStepInfo(env)

agent = Agent(config, env, device)
runner = EpisodeRunner(reset_on_call=False)
@@ -102,4 +100,8 @@ def run(config, seed, device):
run_experiment(run=run,
config=config,
seeds=[1770966829, 1500925526, 2054191100],
num_worker=os.cpu_count())
log_dir='logs/default',
max_workers=os.cpu_count(),
chunksize=1,
use_gpu=False, # CPU a bit faster
gpu_ids=None)
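
The config's 'agent.use_lr_scheduler' flag pairs with linear_lr_scheduler in the agent. lagom's implementation is not reproduced here, but a plausible equivalent that linearly anneals the policy learning rate over 'train.timestep' steps could look like the sketch below; the LambdaLR-based helper and the once-per-step step() call are assumptions (the repository instead calls step(total_timestep) with the running timestep count).

import torch
import torch.optim as optim

def make_linear_lr(optimizer, total_timestep, min_lr=1e-8):
    """Linearly anneal the learning rate from its initial value down to min_lr over total_timestep."""
    base_lr = optimizer.param_groups[0]['lr']
    floor = min_lr / base_lr
    return optim.lr_scheduler.LambdaLR(
        optimizer, lr_lambda=lambda t: max(floor, 1.0 - t / total_timestep))

# Usage with the values from the config above:
policy = torch.nn.Linear(17, 6)                        # placeholder network
policy_optimizer = optim.Adam(policy.parameters(), lr=3e-4)
scheduler = make_linear_lr(policy_optimizer, total_timestep=int(1e6))
# call scheduler.step() once per collected timestep (or per batch of timesteps)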
Binary file modified baselines/ppo/logs/default/0/1500925526/agent_1.pth
Binary file modified baselines/ppo/logs/default/0/1500925526/agent_245.pth
Binary file modified baselines/ppo/logs/default/0/1500925526/agent_489.pth
@@ -0,0 +1 @@
{"initial_reset_timestamp": 1557319611.092186, "timestamps": [1557319622.9522312], "episode_lengths": [1000], "episode_rewards": [4558.386166059976], "episode_types": ["t", "t"]}
@@ -0,0 +1 @@
{"stats": "openaigym.episode_batch.9.2837515.stats.json", "videos": [["openaigym.video.9.2837515.video000000.mp4", "openaigym.video.9.2837515.video000000.meta.json"], ["openaigym.video.9.2837515.video000001.mp4", "openaigym.video.9.2837515.video000001.meta.json"]], "env_info": {"gym_version": "0.12.1", "env_id": "HalfCheetah-v3"}}
@@ -1 +1 @@
{"episode_id": 0, "content_type": "video/mp4", "encoder_version": {"backend": "ffmpeg", "version": "b'ffmpeg version 3.4.4-0ubuntu0.18.04.1 Copyright (c) 2000-2018 the FFmpeg developers\\nbuilt with gcc 7 (Ubuntu 7.3.0-16ubuntu3)\\nconfiguration: --prefix=/usr --extra-version=0ubuntu0.18.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-omx --enable-openal --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libopencv --enable-libx264 --enable-shared\\nlibavutil 55. 78.100 / 55. 78.100\\nlibavcodec 57.107.100 / 57.107.100\\nlibavformat 57. 83.100 / 57. 83.100\\nlibavdevice 57. 10.100 / 57. 10.100\\nlibavfilter 6.107.100 / 6.107.100\\nlibavresample 3. 7. 0 / 3. 7. 0\\nlibswscale 4. 8.100 / 4. 8.100\\nlibswresample 2. 9.100 / 2. 9.100\\nlibpostproc 54. 7.100 / 54. 7.100\\n'", "cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "125", "-f", "rawvideo", "-s:v", "500x500", "-pix_fmt", "rgb24", "-i", "-", "-vf", "scale=trunc(iw/2)*2:trunc(ih/2)*2", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/home/zuo/Code/tmp/lagom/baselines/ppo/logs/default/1/1500925526/anim/openaigym.video.1.347927.video000000.mp4"]}}
{"episode_id": 0, "content_type": "video/mp4", "encoder_version": {"backend": "ffmpeg", "version": "b'ffmpeg version 3.4.4-0ubuntu0.18.04.1 Copyright (c) 2000-2018 the FFmpeg developers\\nbuilt with gcc 7 (Ubuntu 7.3.0-16ubuntu3)\\nconfiguration: --prefix=/usr --extra-version=0ubuntu0.18.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-omx --enable-openal --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libopencv --enable-libx264 --enable-shared\\nlibavutil 55. 78.100 / 55. 78.100\\nlibavcodec 57.107.100 / 57.107.100\\nlibavformat 57. 83.100 / 57. 83.100\\nlibavdevice 57. 10.100 / 57. 10.100\\nlibavfilter 6.107.100 / 6.107.100\\nlibavresample 3. 7. 0 / 3. 7. 0\\nlibswscale 4. 8.100 / 4. 8.100\\nlibswresample 2. 9.100 / 2. 9.100\\nlibpostproc 54. 7.100 / 54. 7.100\\n'", "cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "20", "-f", "rawvideo", "-s:v", "500x500", "-pix_fmt", "rgb24", "-i", "-", "-vf", "scale=trunc(iw/2)*2:trunc(ih/2)*2", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/home/zuo/Code/lagom/baselines/ppo/logs/default/0/1500925526/anim/openaigym.video.9.2837515.video000000.mp4"]}}
