Update baselines: merge DDPG and TD3 into one single folder
zuoxingdong committed Aug 7, 2019
1 parent 4fd4679 commit 23482e6
Showing 7 changed files with 512 additions and 0 deletions.
18 changes: 18 additions & 0 deletions baselines/ddpg_td3/README.md
@@ -0,0 +1,18 @@
# Deep Deterministic Policy Gradients (DDPG) / Twin Delayed DDPG (TD3)

This is an implementation of both [DDPG](https://arxiv.org/abs/1509.02971) and [TD3](https://arxiv.org/abs/1802.09477).

Choose which algorithm to run via the `'agent.use_td3'` flag in [experiment.py](./experiment.py): set it to `True` for TD3 or `False` for DDPG.
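
For reference, the flag selects the agent class inside `run()` in [experiment.py](./experiment.py):

```python
# Excerpt from experiment.py: the flag picks TD3 or DDPG at construction time
if config['agent.use_td3']:
    agent = TD3Agent(config, env, device)
else:
    agent = DDPGAgent(config, env, device)
```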

# Usage

Run the following command to start parallelized training:

```bash
python experiment.py
```

One could modify the `Config` dictionary in [experiment.py](./experiment.py) to quickly set up different configurations, as sketched below.
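
For example, to sweep over several environments, swap the fixed `'env.id'` entry for the `Grid` hinted at in the commented-out line of [experiment.py](./experiment.py). A minimal sketch, showing only the changed entries (the remaining hyperparameters stay as in the original file):

```python
from lagom.experiment import Config
from lagom.experiment import Grid

config = Config(
    {'env.id': Grid(['HalfCheetah-v3', 'Hopper-v3', 'Walker2d-v3', 'Swimmer-v3']),
     'agent.use_td3': False,  # set False for DDPG, True for TD3
     # ... keep the other entries from the original Config ...
    })
```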

# Results
<img src='logs/default/result.png' width='100%'>
Empty file added baselines/ddpg_td3/__init__.py
Empty file.
138 changes: 138 additions & 0 deletions baselines/ddpg_td3/ddpg_agent.py
@@ -0,0 +1,138 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from gym.spaces import flatdim
from lagom import BaseAgent
from lagom.utils import tensorify
from lagom.utils import numpify
from lagom.networks import Module
from lagom.networks import make_fc
from lagom.networks import ortho_init
from lagom.transform import describe


class Actor(Module):
def __init__(self, config, env, device, **kwargs):
super().__init__(**kwargs)
self.config = config
self.env = env
self.device = device

self.feature_layers = make_fc(flatdim(env.observation_space), [400, 300])
self.action_head = nn.Linear(300, flatdim(env.action_space))

assert np.unique(env.action_space.high).size == 1
assert -np.unique(env.action_space.low).item() == np.unique(env.action_space.high).item()
self.max_action = env.action_space.high[0]

self.to(self.device)

def forward(self, x):
for layer in self.feature_layers:
x = F.relu(layer(x))
x = self.max_action*torch.tanh(self.action_head(x))
return x


class Critic(Module):
def __init__(self, config, env, device, **kwargs):
super().__init__(**kwargs)
self.config = config
self.env = env
self.device = device

self.feature_layers = make_fc(flatdim(env.observation_space) + flatdim(env.action_space), [400, 300])
self.Q_head = nn.Linear(300, 1)

self.to(self.device)

def forward(self, x, action):
x = torch.cat([x, action], dim=-1)
for layer in self.feature_layers:
x = F.relu(layer(x))
x = self.Q_head(x)
return x


class Agent(BaseAgent):
def __init__(self, config, env, device, **kwargs):
super().__init__(config, env, device, **kwargs)

self.actor = Actor(config, env, device, **kwargs)
self.actor_target = Actor(config, env, device, **kwargs)
self.actor_target.load_state_dict(self.actor.state_dict())
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=config['agent.actor.lr'])

self.critic = Critic(config, env, device, **kwargs)
self.critic_target = Critic(config, env, device, **kwargs)
self.critic_target.load_state_dict(self.critic.state_dict())
self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=config['agent.critic.lr'])

self.total_timestep = 0

def polyak_update_target(self):
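        # Soft (Polyak) update: target <- p*target + (1 - p)*online; p close to 1 keeps the targets slowly moving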
p = self.config['agent.polyak']
for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
target_param.data.copy_(p*target_param.data + (1 - p)*param.data)
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
target_param.data.copy_(p*target_param.data + (1 - p)*param.data)

def choose_action(self, x, **kwargs):
obs = tensorify(x.observation, self.device).unsqueeze(0)
with torch.no_grad():
action = numpify(self.actor(obs).squeeze(0), 'float')
        if kwargs['mode'] == 'train':
            # Exploration: perturb the deterministic action with Gaussian noise, then clip to the valid range
            eps = np.random.normal(0.0, self.config['agent.action_noise'], size=action.shape)
            action = np.clip(action + eps, self.env.action_space.low, self.env.action_space.high)
out = {}
out['raw_action'] = action
return out

def learn(self, D, **kwargs):
replay = kwargs['replay']
T = kwargs['T']
list_actor_loss = []
list_critic_loss = []
Q_vals = []
for i in range(T):
observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])

            # Critic update: regress Q(s, a) onto the one-step TD target built from the target networks
            Qs = self.critic(observations, actions)
with torch.no_grad():
next_Qs = self.critic_target(next_observations, self.actor_target(next_observations))
targets = rewards + self.config['agent.gamma']*masks*next_Qs
critic_loss = F.mse_loss(Qs, targets.detach())
self.actor_optimizer.zero_grad()
self.critic_optimizer.zero_grad()
critic_loss.backward()
critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
self.critic_optimizer.step()

            # Actor update: deterministic policy gradient, maximizing the critic's value of the actor's actions
            actor_loss = -self.critic(observations, self.actor(observations)).mean()
self.actor_optimizer.zero_grad()
self.critic_optimizer.zero_grad()
actor_loss.backward()
actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
self.actor_optimizer.step()

self.polyak_update_target()

list_actor_loss.append(actor_loss)
list_critic_loss.append(critic_loss)
Q_vals.append(Qs)
self.total_timestep += T

out = {}
out['actor_loss'] = torch.tensor(list_actor_loss).mean(0).item()
out['actor_grad_norm'] = actor_grad_norm
out['critic_loss'] = torch.tensor(list_critic_loss).mean(0).item()
out['critic_grad_norm'] = critic_grad_norm
describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
out['Q'] = describe_it(Q_vals)
return out

def checkpoint(self, logdir, num_iter):
self.save(logdir/f'agent_{num_iter}.pth')
60 changes: 60 additions & 0 deletions baselines/ddpg_td3/engine.py
@@ -0,0 +1,60 @@
import time
from itertools import count

import torch
from lagom import Logger
from lagom import BaseEngine
from lagom.transform import describe
from lagom.utils import color_str


class Engine(BaseEngine):
def train(self, n=None, **kwargs):
train_logs, eval_logs = [], []
checkpoint_count = 0
for iteration in count():
if self.agent.total_timestep >= self.config['train.timestep']:
break
t0 = time.perf_counter()

if iteration < self.config['replay.init_trial']:
[traj] = self.runner(self.random_agent, self.env, 1)
else:
[traj] = self.runner(self.agent, self.env, 1, mode='train')
self.replay.add(traj)
# Number of gradient updates = collected episode length
out_agent = self.agent.learn(D=None, replay=self.replay, T=traj.T)

logger = Logger()
logger('train_iteration', iteration+1)
logger('num_seconds', round(time.perf_counter() - t0, 1))
[logger(key, value) for key, value in out_agent.items()]
logger('episode_return', sum(traj.rewards))
logger('episode_horizon', traj.T)
logger('accumulated_trained_timesteps', self.agent.total_timestep)
train_logs.append(logger.logs)
if iteration == 0 or (iteration+1) % self.config['log.freq'] == 0:
logger.dump(keys=None, index=0, indent=0, border='-'*50)
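            # Spread the 'checkpoint.num' checkpoints and 'eval.num' evaluations evenly over the training timestep budget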
if self.agent.total_timestep >= int(self.config['train.timestep']*(checkpoint_count/(self.config['checkpoint.num'] - 1))):
self.agent.checkpoint(self.logdir, iteration + 1)
checkpoint_count += 1

if self.agent.total_timestep >= int(self.config['train.timestep']*(len(eval_logs)/(self.config['eval.num'] - 1))):
eval_logs.append(self.eval(n=len(eval_logs)))
return train_logs, eval_logs

def eval(self, n=None, **kwargs):
t0 = time.perf_counter()
with torch.no_grad():
D = self.runner(self.agent, self.eval_env, 10, mode='eval')

logger = Logger()
logger('eval_iteration', n+1)
logger('num_seconds', round(time.perf_counter() - t0, 1))
logger('accumulated_trained_timesteps', self.agent.total_timestep)
logger('online_return', describe([sum(traj.rewards) for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
logger('online_horizon', describe([traj.T for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
logger('running_return', describe(self.eval_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
logger('running_horizon', describe(self.eval_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
return logger.logs
90 changes: 90 additions & 0 deletions baselines/ddpg_td3/experiment.py
@@ -0,0 +1,90 @@
import os
import gym

from lagom import EpisodeRunner
from lagom import RandomAgent
from lagom.utils import pickle_dump
from lagom.utils import set_global_seeds
from lagom.experiment import Config
from lagom.experiment import Grid
from lagom.experiment import run_experiment
from lagom.envs import RecordEpisodeStatistics
from lagom.envs import TimeStepEnv

from baselines.ddpg_td3.ddpg_agent import Agent as DDPGAgent
from baselines.ddpg_td3.td3_agent import Agent as TD3Agent
from baselines.ddpg_td3.engine import Engine
from baselines.ddpg_td3.replay_buffer import ReplayBuffer


config = Config(
{'log.freq': 10,
'checkpoint.num': 3,

'env.id': 'Hopper-v3', ###Grid(['HalfCheetah-v3', 'Hopper-v3', 'Walker2d-v3', 'Swimmer-v3']),

'agent.gamma': 0.99,
'agent.polyak': 0.995, # polyak averaging coefficient for targets update
'agent.actor.lr': 1e-3,
'agent.actor.use_lr_scheduler': False,
'agent.critic.lr': 1e-3,
'agent.critic.use_lr_scheduler': False,
'agent.action_noise': 0.1,
'agent.max_grad_norm': 999999, # grad clipping by norm

# TD3 hyperparams
'agent.use_td3': True,
'agent.target_noise': 0.2,
'agent.target_noise_clip': 0.5,
'agent.policy_delay': 2,

'replay.capacity': 1000000,
'replay.init_trial': 10, # number of random rollouts initially
'replay.batch_size': 100,

'train.timestep': int(1e6), # total number of training (environmental) timesteps
'eval.num': 200
})


def make_env(config, seed, mode):
assert mode in ['train', 'eval']
env = gym.make(config['env.id'])
env.seed(seed)
env.observation_space.seed(seed)
env.action_space.seed(seed)
if mode == 'eval':
env = RecordEpisodeStatistics(env, deque_size=100)
env = TimeStepEnv(env)
return env


def run(config, seed, device, logdir):
set_global_seeds(seed)

env = make_env(config, seed, 'train')
eval_env = make_env(config, seed, 'eval')
random_agent = RandomAgent(config, env, device)
if config['agent.use_td3']:
agent = TD3Agent(config, env, device)
else:
agent = DDPGAgent(config, env, device)
runner = EpisodeRunner()
replay = ReplayBuffer(env, config['replay.capacity'], device)
engine = Engine(config, agent=agent, random_agent=random_agent, env=env, eval_env=eval_env, runner=runner, replay=replay, logdir=logdir)

train_logs, eval_logs = engine.train()
pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')
return None


if __name__ == '__main__':
run_experiment(run=run,
config=config,
seeds=[4153361530], ###[4153361530, 3503522377, 2876994566, 172236777, 3949341511],
log_dir='logs/default2',
max_workers=os.cpu_count(),
chunksize=1,
use_gpu=True, # GPU much faster, note that performance differs between CPU/GPU
gpu_ids=None)
44 changes: 44 additions & 0 deletions baselines/ddpg_td3/replay_buffer.py
@@ -0,0 +1,44 @@
import numpy as np
from gym.spaces import flatdim
from lagom.utils import tensorify


class ReplayBuffer(object):
def __init__(self, env, capacity, device):
self.env = env
self.capacity = capacity
self.device = device

self.observations = np.zeros([capacity, flatdim(env.observation_space)], dtype=np.float32)
self.actions = np.zeros([capacity, flatdim(env.action_space)], dtype=np.float32)
self.rewards = np.zeros([capacity, 1], dtype=np.float32)
self.next_observations = np.zeros([capacity, flatdim(env.observation_space)], dtype=np.float32)
self.masks = np.zeros([capacity, 1], dtype=np.float32)

self.size = 0
self.pointer = 0

def __len__(self):
return self.size

def _add(self, observation, action, reward, next_observation, terminal):
self.observations[self.pointer] = observation
self.actions[self.pointer] = action
self.rewards[self.pointer] = reward
self.next_observations[self.pointer] = next_observation
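        # mask = 0 at terminal transitions so the TD target does not bootstrap past the episode end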
self.masks[self.pointer] = 1. - terminal

self.pointer = (self.pointer+1) % self.capacity
self.size = min(self.size + 1, self.capacity)

def add(self, traj):
for t in range(1, traj.T+1):
self._add(traj[t-1].observation, traj.actions[t-1], traj[t].reward, traj[t].observation, traj[t].terminal())

def sample(self, batch_size):
idx = np.random.randint(0, self.size, size=batch_size)
return list(map(lambda x: tensorify(x, self.device), [self.observations[idx],
self.actions[idx],
self.rewards[idx],
self.next_observations[idx],
self.masks[idx]]))
