Commit

Merge pull request #162 from zuoxingdong/add_ddpg2
minor update
zuoxingdong committed May 6, 2019
2 parents 736fb1f + e70920f commit ebea0eb
Showing 154 changed files with 2,460 additions and 604 deletions.
21 changes: 14 additions & 7 deletions baselines/bb_functions.ipynb
@@ -22,7 +22,7 @@
{
"data": {
"text/plain": [
"<matplotlib.colorbar.Colorbar at 0x7f30649b0940>"
"<matplotlib.colorbar.Colorbar at 0x7f9a2307d940>"
]
},
"execution_count": 2,
@@ -64,14 +64,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(16_w,32)-aCMA-ES (mu_w=9.2,w_1=19%) in dimension 100 (seed=1, Fri Apr 12 12:10:14 2019)\n",
"(16_w,32)-aCMA-ES (mu_w=9.2,w_1=19%) in dimension 100 (seed=1, Fri Apr 26 17:41:13 2019)\n",
"Generation # 1: 8923.084084029262\n",
"Generation # 100: 1173.329487436927\n",
"Generation # 200: 970.7187856119405\n",
@@ -84,7 +84,7 @@
"Generation # 900: 197.99649019892877\n",
"Generation # 1000: 197.99648923701295\n",
"\n",
"Total time: 0:02:37\n",
"Total time: 0:00:21\n",
"Generation # 1: 8953.041771621152\n",
"Generation # 100: 1415.3618620891684\n",
"Generation # 200: 1367.605103552275\n",
@@ -97,7 +97,7 @@
"Generation # 900: 376.50773327624256\n",
"Generation # 1000: 376.50773327624256\n",
"\n",
"Total time: 0:00:07\n",
"Total time: 0:00:04\n",
"Generation # 1: 8923.0869140625\n",
"Generation # 100: 3107.207275390625\n",
"Generation # 200: 1245.6773681640625\n",
@@ -110,7 +110,7 @@
"Generation # 900: 488.6480407714844\n",
"Generation # 1000: 543.9141845703125\n",
"\n",
"Total time: 0:00:07\n"
"Total time: 0:00:05\n"
]
},
{
@@ -119,7 +119,7 @@
"Text(0.5, 1.0, 'Rastrigin function - 100 dim')"
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
},
@@ -180,6 +180,13 @@
" \n",
"ax.set_title('Rastrigin function - 100 dim')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
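
The console log above is the progress output of an active CMA-ES run from the `cma` package on the 100-dimensional Rastrigin function with population size 32 (hence the "(16_w,32)-aCMA-ES" header); the two later timing blocks belong to the other optimizers benchmarked in the same notebook. Since the notebook source itself is collapsed in this view, the following is only a minimal sketch of how such a run is typically driven with `cma` — the starting point, step size, and logging cadence are assumptions:

import time
from datetime import timedelta

import numpy as np
import cma  # pip install cma


def rastrigin(x):
    # Rastrigin function: f(x) = 10*d + sum_i [x_i^2 - 10*cos(2*pi*x_i)], global minimum 0 at x = 0
    x = np.asarray(x)
    return 10 * x.size + float(np.sum(x**2 - 10 * np.cos(2 * np.pi * x)))


# popsize=32 with the default recombination weights yields the "(16_w,32)-aCMA-ES" header
es = cma.CMAEvolutionStrategy(100 * [3.0], 0.5, {'popsize': 32, 'seed': 1})
start = time.perf_counter()
for generation in range(1, 1001):
    solutions = es.ask()                           # sample 32 candidate solutions
    fitnesses = [rastrigin(x) for x in solutions]
    es.tell(solutions, fitnesses)                  # update mean, covariance and step size
    if generation == 1 or generation % 100 == 0:
        print(f'Generation # {generation}: {min(fitnesses)}')
print('Total time:', timedelta(seconds=round(time.perf_counter() - start)))
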
Binary file added baselines/benchmark_es.png
6 changes: 3 additions & 3 deletions baselines/ddpg/agent.py
@@ -7,6 +7,8 @@
from lagom import BaseAgent
from lagom.transform import describe
from lagom.utils import pickle_dump
from lagom.utils import tensorify
from lagom.utils import numpify
from lagom.envs import flatdim
from lagom.networks import Module
from lagom.networks import make_fc
@@ -82,13 +84,11 @@ def polyak_update_target(self):
target_param.data.copy_(p*target_param.data + (1 - p)*param.data)

def choose_action(self, obs, **kwargs):
mode = kwargs['mode']
assert mode in ['train', 'eval']
if not torch.is_tensor(obs):
obs = torch.from_numpy(np.asarray(obs)).float().to(self.device)
with torch.no_grad():
action = self.actor(obs).detach().cpu().numpy()
if mode == 'train':
if kwargs['mode'] == 'train':
eps = np.random.normal(0.0, self.action_noise, size=action.shape)
action = np.clip(action + eps, self.env.action_space.low, self.env.action_space.high)
out = {}
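
The hunk above removes the explicit `mode` assertion and reads `kwargs['mode']` directly when deciding whether to inject exploration noise (the new `tensorify`/`numpify` imports are added earlier in the file). A self-contained sketch of the post-change logic, with hypothetical stand-ins for the actor, device, noise scale and action bounds, which in the agent live on `self`:

import numpy as np
import torch
import torch.nn as nn

# Hypothetical stand-ins, just to exercise the logic of the updated choose_action.
device = torch.device('cpu')
actor = nn.Sequential(nn.Linear(3, 1), nn.Tanh())      # obs_dim=3 -> action_dim=1
action_low, action_high = np.array([-2.0]), np.array([2.0])
action_noise = 0.1


def choose_action(obs, **kwargs):
    # Convert a numpy observation to a float tensor on the right device.
    if not torch.is_tensor(obs):
        obs = torch.from_numpy(np.asarray(obs)).float().to(device)
    with torch.no_grad():
        action = actor(obs).detach().cpu().numpy()
    # Only training mode adds Gaussian exploration noise and clips to the action bounds;
    # any other mode (e.g. 'eval') returns the deterministic actor output.
    if kwargs['mode'] == 'train':
        eps = np.random.normal(0.0, action_noise, size=action.shape)
        action = np.clip(action + eps, action_low, action_high)
    return {'action': action}   # the engine reads out['action']


print(choose_action(np.zeros(3), mode='train'))
print(choose_action(np.zeros(3), mode='eval'))
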
18 changes: 10 additions & 8 deletions baselines/ddpg/engine.py
@@ -16,20 +16,20 @@ def train(self, n=None, **kwargs):
train_logs = []
eval_logs = []
eval_togo = 0
checkpoint_togo = 0
dump_togo = 0
num_episode = 0
checkpoint_count = 0
observation = self.env.reset()
for i in count():
if i >= self.config['train.timestep']:
break

if i < self.config['replay.init_size']:
action = [self.env.action_space.sample()]
else:
action = self.agent.choose_action(observation, mode='train')['action']
next_observation, reward, done, info = self.env.step(action)
eval_togo += 1
checkpoint_togo += 1
dump_togo += 1
if done[0]: # [0] due to single environment
start_time = perf_counter()
# NOTE: must use latest TimeLimit
@@ -40,10 +40,9 @@
# updates in the end of episode, for each time step
out_agent = self.agent.learn(D=None, replay=self.replay, episode_length=info[0]['episode']['horizon'])
num_episode += 1
if checkpoint_togo >= self.config['checkpoint.freq']:
checkpoint_togo %= self.config['checkpoint.freq']
if (i+1) >= int(self.config['train.timestep']*(checkpoint_count/(self.config['checkpoint.num'] - 1))):
self.agent.checkpoint(self.logdir, num_episode)

checkpoint_count += 1
logger = Logger()
logger('num_seconds', round(perf_counter() - start_time, 1))
logger('accumulated_trained_timesteps', i + 1)
@@ -52,16 +51,19 @@
logger('episode_return', info[0]['episode']['return'])
logger('episode_horizon', info[0]['episode']['horizon'])
train_logs.append(logger.logs)
if num_episode == 1 or num_episode % self.config['log.freq'] == 0:
if dump_togo >= self.config['log.freq']:
dump_togo %= self.config['log.freq']
logger.dump(keys=None, index=0, indent=0, border='-'*50)

if eval_togo >= self.config['eval.freq']:
eval_togo %= self.config['eval.freq']
eval_logs.append(self.eval(accumulated_trained_timesteps=(i+1),
accumulated_trained_episodes=num_episode))
else:
self.replay.add(observation[0], action[0], reward[0], next_observation[0], done[0])
observation = next_observation
if checkpoint_count < self.config['checkpoint.num']:
self.agent.checkpoint(self.logdir, num_episode)
checkpoint_count += 1
return train_logs, eval_logs

def eval(self, n=None, **kwargs):
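
In the engine, checkpointing moves from a `checkpoint.freq` counter to an evenly spaced schedule: at each episode end, a checkpoint is written once `i + 1` has passed the fraction `checkpoint_count / (checkpoint.num - 1)` of `train.timestep`, and a final catch-up checkpoint is taken after the loop if fewer than `checkpoint.num` were written; log dumping likewise switches from an episode count to the timestep counter `dump_togo`. A tiny sketch of the resulting thresholds, with made-up config values:

# Thresholds implied by the new condition
#     (i + 1) >= int(self.config['train.timestep'] * (checkpoint_count / (self.config['checkpoint.num'] - 1)))
train_timestep = 1_000_000   # hypothetical config['train.timestep']
checkpoint_num = 5           # hypothetical config['checkpoint.num']

thresholds = [int(train_timestep * (k / (checkpoint_num - 1))) for k in range(checkpoint_num)]
print(thresholds)  # [0, 250000, 500000, 750000, 1000000]

Each checkpoint fires at the first episode end after its threshold, so at most one checkpoint is written per episode.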
6 changes: 3 additions & 3 deletions baselines/ddpg/experiment.py
@@ -15,9 +15,9 @@
from lagom.envs.wrappers import ClipAction
from lagom.envs.wrappers import VecMonitor

from agent import Agent
from engine import Engine
from replay_buffer import ReplayBuffer
from .agent import Agent
from .engine import Engine
from .replay_buffer import ReplayBuffer
# Test for obs/reward normalization
#from new_engine import Engine
#from new_replay_buffer import ReplayBuffer
58 changes: 29 additions & 29 deletions baselines/ddpg/replay_buffer.py
@@ -1,41 +1,41 @@
from collections import deque

import random
import numpy as np
import torch

from lagom.envs import flatdim
from lagom.utils import tensorify


class ReplayBuffer(object):
r"""A deque-based buffer of bounded size that implements experience replay.
.. note:
Difference with DQN replay buffer: we handle raw observation (no pixel) for continuous control
Thus we do not have transformation to and from 255. and np.uint8
Args:
capacity (int): max capacity of transition storage in the buffer. When the buffer overflows the
old transitions are dropped.
device (Device): PyTorch device
"""
def __init__(self, capacity, device):
def __init__(self, env, capacity, device):
self.env = env
self.capacity = capacity
self.device = device
self.buffer = deque(maxlen=capacity)

self.observations = np.zeros([capacity, flatdim(env.observation_space)], dtype=np.float32)
self.actions = np.zeros([capacity, flatdim(env.action_space)], dtype=np.float32)
self.rewards = np.zeros(capacity, dtype=np.float32)
self.next_observations = np.zeros([capacity, flatdim(env.observation_space)], dtype=np.float32)
self.masks = np.zeros(capacity, dtype=np.float32)

self.size = 0
self.pointer = 0

def __len__(self):
return len(self.buffer)
return self.size

def add(self, observation, action, reward, next_observation, done): # input must be non-batched
to_float = lambda x: np.asarray(x, dtype=np.float32) # save half memory than float64
transition = (to_float(observation), to_float(action), reward, to_float(next_observation), done)
self.buffer.append(transition)
self.observations[self.pointer] = observation
self.actions[self.pointer] = action
self.rewards[self.pointer] = reward
self.next_observations[self.pointer] = next_observation
self.masks[self.pointer] = 1. - done

self.pointer = (self.pointer+1) % self.capacity
self.size = min(self.size + 1, self.capacity)

def sample(self, batch_size):
D = random.choices(self.buffer, k=batch_size)
D = zip(*D)
observations, actions, rewards, next_observations, dones = list(map(lambda x: np.asarray(x), D))
masks = 1. - dones
D = (observations, actions, rewards, next_observations, masks)
D = list(map(lambda x: torch.from_numpy(x).float().to(self.device), D))
return D
idx = np.random.randint(0, self.size, size=batch_size)
return list(map(lambda x: tensorify(x, self.device), [self.observations[idx],
self.actions[idx],
self.rewards[idx],
self.next_observations[idx],
self.masks[idx]]))
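
Interleaved in this hunk are the removed deque-based buffer and its replacement: a preallocated NumPy ring buffer written through a moving pointer. Consolidated from the added lines (docstring shortened), the new class reads roughly as:

import numpy as np

from lagom.envs import flatdim
from lagom.utils import tensorify


class ReplayBuffer(object):
    r"""Preallocated ring buffer for experience replay on raw (non-pixel) observations."""
    def __init__(self, env, capacity, device):
        self.env = env
        self.capacity = capacity
        self.device = device

        self.observations = np.zeros([capacity, flatdim(env.observation_space)], dtype=np.float32)
        self.actions = np.zeros([capacity, flatdim(env.action_space)], dtype=np.float32)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.next_observations = np.zeros([capacity, flatdim(env.observation_space)], dtype=np.float32)
        self.masks = np.zeros(capacity, dtype=np.float32)

        self.size = 0      # number of stored transitions, capped at capacity
        self.pointer = 0   # next write position, wraps around

    def __len__(self):
        return self.size

    def add(self, observation, action, reward, next_observation, done):  # input must be non-batched
        self.observations[self.pointer] = observation
        self.actions[self.pointer] = action
        self.rewards[self.pointer] = reward
        self.next_observations[self.pointer] = next_observation
        self.masks[self.pointer] = 1. - done

        self.pointer = (self.pointer + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return list(map(lambda x: tensorify(x, self.device),
                        [self.observations[idx],
                         self.actions[idx],
                         self.rewards[idx],
                         self.next_observations[idx],
                         self.masks[idx]]))

Compared with the deque version, transitions are stored in flat float32 arrays, the oldest entries are overwritten once capacity is reached, and sample gathers a uniform batch with a single fancy-indexing step before tensorify moves it to the device.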
