Commit

Merge pull request #162 from zuoxingdong/add_ddpg2
minor update
zuoxingdong committed May 6, 2019
2 parents 736fb1f + e70920f commit ebea0eb
Showing 154 changed files with 2,460 additions and 604 deletions.
21 changes: 14 additions & 7 deletions baselines/bb_functions.ipynb
@@ -22,7 +22,7 @@
{
"data": {
"text/plain": [
"<matplotlib.colorbar.Colorbar at 0x7f30649b0940>"
"<matplotlib.colorbar.Colorbar at 0x7f9a2307d940>"
]
},
"execution_count": 2,
@@ -64,14 +64,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(16_w,32)-aCMA-ES (mu_w=9.2,w_1=19%) in dimension 100 (seed=1, Fri Apr 12 12:10:14 2019)\n",
"(16_w,32)-aCMA-ES (mu_w=9.2,w_1=19%) in dimension 100 (seed=1, Fri Apr 26 17:41:13 2019)\n",
"Generation # 1: 8923.084084029262\n",
"Generation # 100: 1173.329487436927\n",
"Generation # 200: 970.7187856119405\n",
@@ -84,7 +84,7 @@
"Generation # 900: 197.99649019892877\n",
"Generation # 1000: 197.99648923701295\n",
"\n",
"Total time: 0:02:37\n",
"Total time: 0:00:21\n",
"Generation # 1: 8953.041771621152\n",
"Generation # 100: 1415.3618620891684\n",
"Generation # 200: 1367.605103552275\n",
@@ -97,7 +97,7 @@
"Generation # 900: 376.50773327624256\n",
"Generation # 1000: 376.50773327624256\n",
"\n",
"Total time: 0:00:07\n",
"Total time: 0:00:04\n",
"Generation # 1: 8923.0869140625\n",
"Generation # 100: 3107.207275390625\n",
"Generation # 200: 1245.6773681640625\n",
@@ -110,7 +110,7 @@
"Generation # 900: 488.6480407714844\n",
"Generation # 1000: 543.9141845703125\n",
"\n",
"Total time: 0:00:07\n"
"Total time: 0:00:05\n"
]
},
{
@@ -119,7 +119,7 @@
"Text(0.5, 1.0, 'Rastrigin function - 100 dim')"
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
},
@@ -180,6 +180,13 @@
" \n",
"ax.set_title('Rastrigin function - 100 dim')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
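
The console log above is the progress output of an active CMA-ES run from the `cma` package on the 100-dimensional Rastrigin function with population size 32 (hence the "(16_w,32)-aCMA-ES" header); the two later timing blocks belong to the other optimizers benchmarked in the same notebook. Since the notebook source itself is collapsed in this view, the following is only a minimal sketch of how such a run is typically driven with `cma` — the starting point, step size, and logging cadence are assumptions:

import time
from datetime import timedelta

import numpy as np
import cma  # pip install cma


def rastrigin(x):
    # Rastrigin function: f(x) = 10*d + sum_i [x_i^2 - 10*cos(2*pi*x_i)], global minimum 0 at x = 0
    x = np.asarray(x)
    return 10 * x.size + float(np.sum(x**2 - 10 * np.cos(2 * np.pi * x)))


# popsize=32 with the default recombination weights yields the "(16_w,32)-aCMA-ES" header
es = cma.CMAEvolutionStrategy(100 * [3.0], 0.5, {'popsize': 32, 'seed': 1})
start = time.perf_counter()
for generation in range(1, 1001):
    solutions = es.ask()                           # sample 32 candidate solutions
    fitnesses = [rastrigin(x) for x in solutions]
    es.tell(solutions, fitnesses)                  # update mean, covariance and step size
    if generation == 1 or generation % 100 == 0:
        print(f'Generation # {generation}: {min(fitnesses)}')
print('Total time:', timedelta(seconds=round(time.perf_counter() - start)))
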
Binary file added baselines/benchmark_es.png
6 changes: 3 additions & 3 deletions baselines/ddpg/agent.py
@@ -7,6 +7,8 @@
from lagom import BaseAgent
from lagom.transform import describe
from lagom.utils import pickle_dump
from lagom.utils import tensorify
from lagom.utils import numpify
from lagom.envs import flatdim
from lagom.networks import Module
from lagom.networks import make_fc
@@ -82,13 +84,11 @@ def polyak_update_target(self):
target_param.data.copy_(p*target_param.data + (1 - p)*param.data)

def choose_action(self, obs, **kwargs):
mode = kwargs['mode']
assert mode in ['train', 'eval']
if not torch.is_tensor(obs):
obs = torch.from_numpy(np.asarray(obs)).float().to(self.device)
with torch.no_grad():
action = self.actor(obs).detach().cpu().numpy()
if mode == 'train':
if kwargs['mode'] == 'train':
eps = np.random.normal(0.0, self.action_noise, size=action.shape)
action = np.clip(action + eps, self.env.action_space.low, self.env.action_space.high)
out = {}
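
The hunk above removes the explicit `mode` assertion and reads `kwargs['mode']` directly when deciding whether to inject exploration noise (the new `tensorify`/`numpify` imports are added earlier in the file). A self-contained sketch of the post-change logic, with hypothetical stand-ins for the actor, device, noise scale and action bounds, which in the agent live on `self`:

import numpy as np
import torch
import torch.nn as nn

# Hypothetical stand-ins, just to exercise the logic of the updated choose_action.
device = torch.device('cpu')
actor = nn.Sequential(nn.Linear(3, 1), nn.Tanh())      # obs_dim=3 -> action_dim=1
action_low, action_high = np.array([-2.0]), np.array([2.0])
action_noise = 0.1


def choose_action(obs, **kwargs):
    # Convert a numpy observation to a float tensor on the right device.
    if not torch.is_tensor(obs):
        obs = torch.from_numpy(np.asarray(obs)).float().to(device)
    with torch.no_grad():
        action = actor(obs).detach().cpu().numpy()
    # Only training mode adds Gaussian exploration noise and clips to the action bounds;
    # any other mode (e.g. 'eval') returns the deterministic actor output.
    if kwargs['mode'] == 'train':
        eps = np.random.normal(0.0, action_noise, size=action.shape)
        action = np.clip(action + eps, action_low, action_high)
    return {'action': action}   # the engine reads out['action']


print(choose_action(np.zeros(3), mode='train'))
print(choose_action(np.zeros(3), mode='eval'))
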
18 changes: 10 additions & 8 deletions baselines/ddpg/engine.py
@@ -16,20 +16,20 @@ def train(self, n=None, **kwargs):
train_logs = []
eval_logs = []
eval_togo = 0
checkpoint_togo = 0
dump_togo = 0
num_episode = 0
checkpoint_count = 0
observation = self.env.reset()
for i in count():
if i >= self.config['train.timestep']:
break

if i < self.config['replay.init_size']:
action = [self.env.action_space.sample()]
else:
action = self.agent.choose_action(observation, mode='train')['action']
next_observation, reward, done, info = self.env.step(action)
eval_togo += 1
checkpoint_togo += 1
dump_togo += 1
if done[0]: # [0] due to single environment
start_time = perf_counter()
# NOTE: must use latest TimeLimit
@@ -40,10 +40,9 @@
# updates in the end of episode, for each time step
out_agent = self.agent.learn(D=None, replay=self.replay, episode_length=info[0]['episode']['horizon'])
num_episode += 1
if checkpoint_togo >= self.config['checkpoint.freq']:
checkpoint_togo %= self.config['checkpoint.freq']
if (i+1) >= int(self.config['train.timestep']*(checkpoint_count/(self.config['checkpoint.num'] - 1))):
self.agent.checkpoint(self.logdir, num_episode)

checkpoint_count += 1
logger = Logger()
logger('num_seconds', round(perf_counter() - start_time, 1))
logger('accumulated_trained_timesteps', i + 1)
@@ -52,16 +51,19 @@
logger('episode_return', info[0]['episode']['return'])
logger('episode_horizon', info[0]['episode']['horizon'])
train_logs.append(logger.logs)
if num_episode == 1 or num_episode % self.config['log.freq'] == 0:
if dump_togo >= self.config['log.freq']:
dump_togo %= self.config['log.freq']
logger.dump(keys=None, index=0, indent=0, border='-'*50)

if eval_togo >= self.config['eval.freq']:
eval_togo %= self.config['eval.freq']
eval_logs.append(self.eval(accumulated_trained_timesteps=(i+1),
accumulated_trained_episodes=num_episode))
else:
self.replay.add(observation[0], action[0], reward[0], next_observation[0], done[0])
observation = next_observation
if checkpoint_count < self.config['checkpoint.num']:
self.agent.checkpoint(self.logdir, num_episode)
checkpoint_count += 1
return train_logs, eval_logs

def eval(self, n=None, **kwargs):
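
In the engine, checkpointing moves from a `checkpoint.freq` counter to an evenly spaced schedule: at each episode end, a checkpoint is written once `i + 1` has passed the fraction `checkpoint_count / (checkpoint.num - 1)` of `train.timestep`, and a final catch-up checkpoint is taken after the loop if fewer than `checkpoint.num` were written; log dumping likewise switches from an episode count to the timestep counter `dump_togo`. A tiny sketch of the resulting thresholds, with made-up config values:

# Thresholds implied by the new condition
#     (i + 1) >= int(self.config['train.timestep'] * (checkpoint_count / (self.config['checkpoint.num'] - 1)))
train_timestep = 1_000_000   # hypothetical config['train.timestep']
checkpoint_num = 5           # hypothetical config['checkpoint.num']

thresholds = [int(train_timestep * (k / (checkpoint_num - 1))) for k in range(checkpoint_num)]
print(thresholds)  # [0, 250000, 500000, 750000, 1000000]

Each checkpoint fires at the first episode end after its threshold, so at most one checkpoint is written per episode.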
6 changes: 3 additions & 3 deletions baselines/ddpg/experiment.py
@@ -15,9 +15,9 @@
from lagom.envs.wrappers import ClipAction
from lagom.envs.wrappers import VecMonitor

from agent import Agent
from engine import Engine
from replay_buffer import ReplayBuffer
from .agent import Agent
from .engine import Engine
from .replay_buffer import ReplayBuffer
# Test for obs/reward normalization
#from new_engine import Engine
#from new_replay_buffer import ReplayBuffer
58 changes: 29 additions & 29 deletions baselines/ddpg/replay_buffer.py
@@ -1,41 +1,41 @@
from collections import deque

import random
import numpy as np
import torch

from lagom.envs import flatdim
from lagom.utils import tensorify


class ReplayBuffer(object):
r"""A deque-based buffer of bounded size that implements experience replay.
.. note:
Difference with DQN replay buffer: we handle raw observation (no pixel) for continuous control
Thus we do not have transformation to and from 255. and np.uint8
Args:
capacity (int): max capacity of transition storage in the buffer. When the buffer overflows the
old transitions are dropped.
device (Device): PyTorch device
"""
def __init__(self, capacity, device):
def __init__(self, env, capacity, device):
self.env = env
self.capacity = capacity
self.device = device
self.buffer = deque(maxlen=capacity)

self.observations = np.zeros([capacity, flatdim(env.observation_space)], dtype=np.float32)
self.actions = np.zeros([capacity, flatdim(env.action_space)], dtype=np.float32)
self.rewards = np.zeros(capacity, dtype=np.float32)
self.next_observations = np.zeros([capacity, flatdim(env.observation_space)], dtype=np.float32)
self.masks = np.zeros(capacity, dtype=np.float32)

self.size = 0
self.pointer = 0

def __len__(self):
return len(self.buffer)
return self.size

def add(self, observation, action, reward, next_observation, done): # input must be non-batched
to_float = lambda x: np.asarray(x, dtype=np.float32) # save half memory than float64
transition = (to_float(observation), to_float(action), reward, to_float(next_observation), done)
self.buffer.append(transition)
self.observations[self.pointer] = observation
self.actions[self.pointer] = action
self.rewards[self.pointer] = reward
self.next_observations[self.pointer] = next_observation
self.masks[self.pointer] = 1. - done

self.pointer = (self.pointer+1) % self.capacity
self.size = min(self.size + 1, self.capacity)

def sample(self, batch_size):
D = random.choices(self.buffer, k=batch_size)
D = zip(*D)
observations, actions, rewards, next_observations, dones = list(map(lambda x: np.asarray(x), D))
masks = 1. - dones
D = (observations, actions, rewards, next_observations, masks)
D = list(map(lambda x: torch.from_numpy(x).float().to(self.device), D))
return D
idx = np.random.randint(0, self.size, size=batch_size)
return list(map(lambda x: tensorify(x, self.device), [self.observations[idx],
self.actions[idx],
self.rewards[idx],
self.next_observations[idx],
self.masks[idx]]))
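
Interleaved in this hunk are the removed deque-based buffer and its replacement: a preallocated NumPy ring buffer written through a moving pointer. Consolidated from the added lines (docstring shortened), the new class reads roughly as:

import numpy as np

from lagom.envs import flatdim
from lagom.utils import tensorify


class ReplayBuffer(object):
    r"""Preallocated ring buffer for experience replay on raw (non-pixel) observations."""
    def __init__(self, env, capacity, device):
        self.env = env
        self.capacity = capacity
        self.device = device

        self.observations = np.zeros([capacity, flatdim(env.observation_space)], dtype=np.float32)
        self.actions = np.zeros([capacity, flatdim(env.action_space)], dtype=np.float32)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.next_observations = np.zeros([capacity, flatdim(env.observation_space)], dtype=np.float32)
        self.masks = np.zeros(capacity, dtype=np.float32)

        self.size = 0      # number of stored transitions, capped at capacity
        self.pointer = 0   # next write position, wraps around

    def __len__(self):
        return self.size

    def add(self, observation, action, reward, next_observation, done):  # input must be non-batched
        self.observations[self.pointer] = observation
        self.actions[self.pointer] = action
        self.rewards[self.pointer] = reward
        self.next_observations[self.pointer] = next_observation
        self.masks[self.pointer] = 1. - done

        self.pointer = (self.pointer + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return list(map(lambda x: tensorify(x, self.device),
                        [self.observations[idx],
                         self.actions[idx],
                         self.rewards[idx],
                         self.next_observations[idx],
                         self.masks[idx]]))

Compared with the deque version, transitions are stored in flat float32 arrays, the oldest entries are overwritten once capacity is reached, and sample gathers a uniform batch with a single fancy-indexing step before tensorify moves it to the device.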
