Commit 23482e6 (parent 4fd4679)

Update baselines: merge DDPG and TD3 into one single folder

Showing 7 changed files with 512 additions and 0 deletions.
baselines/ddpg_td3/README.md (filename inferred)
@@ -0,0 +1,18 @@

# Deep Deterministic Policy Gradients (DDPG) / Twin Delayed DDPG (TD3)

This is an implementation of both [DDPG](https://arxiv.org/abs/1509.02971) and [TD3](https://arxiv.org/abs/1802.09477).

Select between the two in [experiment.py](./experiment.py) via the flag `'agent.use_td3': True` (set it to `False` for plain DDPG).

# Usage

Run the following command to start parallelized training:

```bash
python experiment.py
```

Modify [experiment.py](./experiment.py) to quickly set up different configurations, e.g. sweeping over environments (see the sketch below).
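The commented-out `Grid` option in experiment.py shows the intended way to sweep over environments; here is a minimal sketch with most required config keys omitted for brevity:

```python
from lagom.experiment import Config
from lagom.experiment import Grid

# A Grid value makes run_experiment expand the experiment into one run per
# entry, i.e. one training job per environment (values from experiment.py).
config = Config(
    {'env.id': Grid(['HalfCheetah-v3', 'Hopper-v3', 'Walker2d-v3', 'Swimmer-v3']),
     'agent.use_td3': True,      # False switches back to plain DDPG
     'train.timestep': int(1e6)})
```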
# Results

<img src='logs/default/result.png' width='100%'>
Empty file (presumably `baselines/ddpg_td3/__init__.py`).
baselines/ddpg_td3/ddpg_agent.py (filename inferred from the imports in experiment.py)
@@ -0,0 +1,138 @@

```python
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from gym.spaces import flatdim
from lagom import BaseAgent
from lagom.utils import tensorify
from lagom.utils import numpify
from lagom.networks import Module
from lagom.networks import make_fc
from lagom.networks import ortho_init
from lagom.transform import describe


class Actor(Module):
    """Deterministic policy: maps observations to actions scaled to the action bounds."""
    def __init__(self, config, env, device, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.env = env
        self.device = device

        self.feature_layers = make_fc(flatdim(env.observation_space), [400, 300])
        self.action_head = nn.Linear(300, flatdim(env.action_space))

        # Assume a symmetric action space with identical bounds in every
        # dimension, so a single scalar can rescale the tanh output.
        assert np.unique(env.action_space.high).size == 1
        assert -np.unique(env.action_space.low).item() == np.unique(env.action_space.high).item()
        self.max_action = env.action_space.high[0]

        self.to(self.device)

    def forward(self, x):
        for layer in self.feature_layers:
            x = F.relu(layer(x))
        x = self.max_action*torch.tanh(self.action_head(x))
        return x


class Critic(Module):
    """Q-network: maps an (observation, action) pair to a scalar Q-value."""
    def __init__(self, config, env, device, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.env = env
        self.device = device

        self.feature_layers = make_fc(flatdim(env.observation_space) + flatdim(env.action_space), [400, 300])
        self.Q_head = nn.Linear(300, 1)

        self.to(self.device)

    def forward(self, x, action):
        x = torch.cat([x, action], dim=-1)
        for layer in self.feature_layers:
            x = F.relu(layer(x))
        x = self.Q_head(x)
        return x


class Agent(BaseAgent):
    """DDPG agent: deterministic actor, single critic, and polyak-averaged targets."""
    def __init__(self, config, env, device, **kwargs):
        super().__init__(config, env, device, **kwargs)

        self.actor = Actor(config, env, device, **kwargs)
        self.actor_target = Actor(config, env, device, **kwargs)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=config['agent.actor.lr'])

        self.critic = Critic(config, env, device, **kwargs)
        self.critic_target = Critic(config, env, device, **kwargs)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=config['agent.critic.lr'])

        self.total_timestep = 0

    def polyak_update_target(self):
        # Soft target update: target <- p*target + (1 - p)*online, p = agent.polyak.
        p = self.config['agent.polyak']
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(p*target_param.data + (1 - p)*param.data)
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(p*target_param.data + (1 - p)*param.data)

    def choose_action(self, x, **kwargs):
        obs = tensorify(x.observation, self.device).unsqueeze(0)
        with torch.no_grad():
            action = numpify(self.actor(obs).squeeze(0), 'float')
        if kwargs['mode'] == 'train':
            # Exploration: additive Gaussian noise, clipped to the action bounds.
            eps = np.random.normal(0.0, self.config['agent.action_noise'], size=action.shape)
            action = np.clip(action + eps, self.env.action_space.low, self.env.action_space.high)
        out = {}
        out['raw_action'] = action
        return out

    def learn(self, D, **kwargs):
        replay = kwargs['replay']
        T = kwargs['T']
        list_actor_loss = []
        list_critic_loss = []
        Q_vals = []
        for i in range(T):
            observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])

            # Critic update: regress Q(s, a) onto the one-step bootstrapped target.
            Qs = self.critic(observations, actions)
            with torch.no_grad():
                next_Qs = self.critic_target(next_observations, self.actor_target(next_observations))
                targets = rewards + self.config['agent.gamma']*masks*next_Qs
            critic_loss = F.mse_loss(Qs, targets)
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
            self.critic_optimizer.step()

            # Actor update: ascend the critic's value of the actor's own actions.
            actor_loss = -self.critic(observations, self.actor(observations)).mean()
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
            self.actor_optimizer.step()

            self.polyak_update_target()

            # Detach/extract before logging so no autograd graph is kept alive.
            list_actor_loss.append(actor_loss.item())
            list_critic_loss.append(critic_loss.item())
            Q_vals.append(Qs.detach())
        self.total_timestep += T

        out = {}
        out['actor_loss'] = np.mean(list_actor_loss)
        out['actor_grad_norm'] = actor_grad_norm
        out['critic_loss'] = np.mean(list_critic_loss)
        out['critic_grad_norm'] = critic_grad_norm
        describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
        out['Q'] = describe_it(Q_vals)
        return out

    def checkpoint(self, logdir, num_iter):
        self.save(logdir/f'agent_{num_iter}.pth')
```
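A note on the convention above: with `agent.polyak = 0.995`, each call to `polyak_update_target` moves the targets only 0.5% of the way toward the online networks. A toy check of the update rule:

```python
# Toy check of the polyak rule target <- p*target + (1 - p)*param with p = 0.995,
# the convention used by Agent.polyak_update_target above.
p = 0.995
target, param = 0.0, 1.0
for _ in range(1000):
    target = p * target + (1 - p) * param
print(round(target, 4))  # ~0.9933: the target covers most of the gap after 1000 updates
```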
baselines/ddpg_td3/engine.py (filename inferred from the imports in experiment.py)
@@ -0,0 +1,60 @@

```python
import time
from itertools import count

import torch
from lagom import Logger
from lagom import BaseEngine
from lagom.transform import describe
from lagom.utils import color_str


class Engine(BaseEngine):
    def train(self, n=None, **kwargs):
        train_logs, eval_logs = [], []
        checkpoint_count = 0
        for iteration in count():
            if self.agent.total_timestep >= self.config['train.timestep']:
                break
            t0 = time.perf_counter()

            # Warm up the replay buffer with random rollouts before training.
            if iteration < self.config['replay.init_trial']:
                [traj] = self.runner(self.random_agent, self.env, 1)
            else:
                [traj] = self.runner(self.agent, self.env, 1, mode='train')
            self.replay.add(traj)
            # Number of gradient updates = collected episode length
            out_agent = self.agent.learn(D=None, replay=self.replay, T=traj.T)

            logger = Logger()
            logger('train_iteration', iteration+1)
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            for key, value in out_agent.items():
                logger(key, value)
            logger('episode_return', sum(traj.rewards))
            logger('episode_horizon', traj.T)
            logger('accumulated_trained_timesteps', self.agent.total_timestep)
            train_logs.append(logger.logs)
            if iteration == 0 or (iteration+1) % self.config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-'*50)
            # Checkpoint at evenly spaced fractions of the total training timesteps.
            if self.agent.total_timestep >= int(self.config['train.timestep']*(checkpoint_count/(self.config['checkpoint.num'] - 1))):
                self.agent.checkpoint(self.logdir, iteration + 1)
                checkpoint_count += 1

            # Evaluate at evenly spaced fractions as well, eval.num times in total.
            if self.agent.total_timestep >= int(self.config['train.timestep']*(len(eval_logs)/(self.config['eval.num'] - 1))):
                eval_logs.append(self.eval(n=len(eval_logs)))
        return train_logs, eval_logs

    def eval(self, n=None, **kwargs):
        t0 = time.perf_counter()
        with torch.no_grad():
            # Run 10 evaluation episodes without exploration noise.
            D = self.runner(self.agent, self.eval_env, 10, mode='eval')

        logger = Logger()
        logger('eval_iteration', n+1)
        logger('num_seconds', round(time.perf_counter() - t0, 1))
        logger('accumulated_trained_timesteps', self.agent.total_timestep)
        logger('online_return', describe([sum(traj.rewards) for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
        logger('online_horizon', describe([traj.T for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
        logger('running_return', describe(self.eval_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
        logger('running_horizon', describe(self.eval_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
        logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
        return logger.logs
```
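The two threshold checks in `train` spread checkpoints and evaluations evenly over training. A quick check of the checkpoint schedule, assuming the default values from experiment.py:

```python
# Checkpoint thresholds implied by the Engine schedule, assuming the defaults
# in experiment.py: train.timestep = 1_000_000 and checkpoint.num = 3.
train_timestep = 1_000_000
checkpoint_num = 3
thresholds = [int(train_timestep * k / (checkpoint_num - 1)) for k in range(checkpoint_num)]
print(thresholds)  # [0, 500000, 1000000]: checkpoints at the start, midpoint, and end
```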
baselines/ddpg_td3/experiment.py
@@ -0,0 +1,90 @@

```python
import os
import gym

from lagom import EpisodeRunner
from lagom import RandomAgent
from lagom.utils import pickle_dump
from lagom.utils import set_global_seeds
from lagom.experiment import Config
from lagom.experiment import Grid
from lagom.experiment import run_experiment
from lagom.envs import RecordEpisodeStatistics
from lagom.envs import TimeStepEnv

from baselines.ddpg_td3.ddpg_agent import Agent as DDPGAgent
from baselines.ddpg_td3.td3_agent import Agent as TD3Agent
from baselines.ddpg_td3.engine import Engine
from baselines.ddpg_td3.replay_buffer import ReplayBuffer


config = Config(
    {'log.freq': 10,
     'checkpoint.num': 3,

     'env.id': 'Hopper-v3',  # e.g. Grid(['HalfCheetah-v3', 'Hopper-v3', 'Walker2d-v3', 'Swimmer-v3'])

     'agent.gamma': 0.99,
     'agent.polyak': 0.995,  # polyak averaging coefficient for targets update
     'agent.actor.lr': 1e-3,
     'agent.actor.use_lr_scheduler': False,
     'agent.critic.lr': 1e-3,
     'agent.critic.use_lr_scheduler': False,
     'agent.action_noise': 0.1,
     'agent.max_grad_norm': 999999,  # grad clipping by norm

     # TD3 hyperparameters
     'agent.use_td3': True,
     'agent.target_noise': 0.2,
     'agent.target_noise_clip': 0.5,
     'agent.policy_delay': 2,

     'replay.capacity': 1000000,
     'replay.init_trial': 10,  # number of random rollouts initially
     'replay.batch_size': 100,

     'train.timestep': int(1e6),  # total number of training (environmental) timesteps
     'eval.num': 200
     })


def make_env(config, seed, mode):
    assert mode in ['train', 'eval']
    env = gym.make(config['env.id'])
    env.seed(seed)
    env.observation_space.seed(seed)
    env.action_space.seed(seed)
    if mode == 'eval':
        # Track running episode statistics for the evaluation logs.
        env = RecordEpisodeStatistics(env, deque_size=100)
    env = TimeStepEnv(env)
    return env


def run(config, seed, device, logdir):
    set_global_seeds(seed)

    env = make_env(config, seed, 'train')
    eval_env = make_env(config, seed, 'eval')
    random_agent = RandomAgent(config, env, device)
    if config['agent.use_td3']:
        agent = TD3Agent(config, env, device)
    else:
        agent = DDPGAgent(config, env, device)
    runner = EpisodeRunner()
    replay = ReplayBuffer(env, config['replay.capacity'], device)
    engine = Engine(config, agent=agent, random_agent=random_agent, env=env, eval_env=eval_env, runner=runner, replay=replay, logdir=logdir)

    train_logs, eval_logs = engine.train()
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')
    return None


if __name__ == '__main__':
    run_experiment(run=run,
                   config=config,
                   seeds=[4153361530],  # e.g. [4153361530, 3503522377, 2876994566, 172236777, 3949341511]
                   log_dir='logs/default2',
                   max_workers=os.cpu_count(),
                   chunksize=1,
                   use_gpu=True,  # GPU much faster; note that performance differs between CPU/GPU
                   gpu_ids=None)
```
baselines/ddpg_td3/replay_buffer.py (filename inferred from the imports in experiment.py)
@@ -0,0 +1,44 @@

```python
import numpy as np
from gym.spaces import flatdim
from lagom.utils import tensorify


class ReplayBuffer(object):
    """Fixed-capacity ring buffer storing flat transitions as numpy arrays."""
    def __init__(self, env, capacity, device):
        self.env = env
        self.capacity = capacity
        self.device = device

        self.observations = np.zeros([capacity, flatdim(env.observation_space)], dtype=np.float32)
        self.actions = np.zeros([capacity, flatdim(env.action_space)], dtype=np.float32)
        self.rewards = np.zeros([capacity, 1], dtype=np.float32)
        self.next_observations = np.zeros([capacity, flatdim(env.observation_space)], dtype=np.float32)
        self.masks = np.zeros([capacity, 1], dtype=np.float32)

        self.size = 0
        self.pointer = 0

    def __len__(self):
        return self.size

    def _add(self, observation, action, reward, next_observation, terminal):
        self.observations[self.pointer] = observation
        self.actions[self.pointer] = action
        self.rewards[self.pointer] = reward
        self.next_observations[self.pointer] = next_observation
        # mask = 0 for terminal transitions, so bootstrapping is cut off there.
        self.masks[self.pointer] = 1. - terminal

        # Overwrite the oldest transitions once the buffer is full.
        self.pointer = (self.pointer+1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def add(self, traj):
        # Unpack a trajectory into (s, a, r, s', done) transitions.
        for t in range(1, traj.T+1):
            self._add(traj[t-1].observation, traj.actions[t-1], traj[t].reward, traj[t].observation, traj[t].terminal())

    def sample(self, batch_size):
        # Uniformly sample a batch and move it to the agent's device as tensors.
        idx = np.random.randint(0, self.size, size=batch_size)
        return list(map(lambda x: tensorify(x, self.device), [self.observations[idx],
                                                              self.actions[idx],
                                                              self.rewards[idx],
                                                              self.next_observations[idx],
                                                              self.masks[idx]]))
```
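A hypothetical usage sketch of this buffer, with the capacity and batch size taken from the defaults in experiment.py:

```python
import gym
import torch

# Hypothetical usage of ReplayBuffer (names follow replay_buffer.py above);
# capacity and batch size match the defaults in experiment.py.
env = gym.make('Hopper-v3')
buffer = ReplayBuffer(env, 1000000, torch.device('cpu'))
# ... fill via buffer.add(traj) during rollouts, then:
if len(buffer) >= 100:
    observations, actions, rewards, next_observations, masks = buffer.sample(100)
    # Each is a float32 torch tensor with a leading batch dimension of 100.
```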
(One of the seven changed files is missing from this view, presumably baselines/ddpg_td3/td3_agent.py, which would account for the remaining 162 of the 512 added lines.)
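Since td3_agent.py is not visible here, the following is only a hedged sketch of how TD3's target computation typically differs from the DDPG agent above (per the TD3 paper), wired to the `agent.target_noise`, `agent.target_noise_clip`, and `agent.policy_delay` hyperparameters from experiment.py. All variable names mirror ddpg_agent.py and are assumptions, not the file's actual code:

```python
# Hedged TD3 sketch, not the repository's actual td3_agent.py. Assumes twin
# critics critic1/critic2 with matching targets, plus the actor, batch
# variables, and config names used in ddpg_agent.py above.
with torch.no_grad():
    # Target policy smoothing: clipped Gaussian noise on the target action.
    eps = torch.randn_like(actions) * config['agent.target_noise']
    eps = eps.clamp(-config['agent.target_noise_clip'], config['agent.target_noise_clip'])
    next_actions = (actor_target(next_observations) + eps).clamp(-max_action, max_action)
    # Clipped double-Q learning: bootstrap from the smaller of the two target critics.
    next_Qs = torch.min(critic1_target(next_observations, next_actions),
                        critic2_target(next_observations, next_actions))
    targets = rewards + config['agent.gamma'] * masks * next_Qs

# Both critics regress onto the same bootstrapped target.
critic_loss = F.mse_loss(critic1(observations, actions), targets) + \
              F.mse_loss(critic2(observations, actions), targets)

# Delayed policy updates: the actor and all targets update only every
# config['agent.policy_delay'] critic updates.
if i % config['agent.policy_delay'] == 0:
    actor_loss = -critic1(observations, actor(observations)).mean()
```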