Add NormalizeObservation and NormalizeReward
zuoxingdong committed Aug 2, 2019
1 parent 214ad21 commit 0bdc4f0
Showing 5 changed files with 142 additions and 0 deletions.
6 changes: 6 additions & 0 deletions docs/source/envs.rst
@@ -6,6 +6,12 @@ lagom.envs

.. autoclass:: RecordEpisodeStatistics
    :members:

.. autoclass:: NormalizeObservation
    :members:

.. autoclass:: NormalizeReward
    :members:

.. autoclass:: VecEnv
    :members:
2 changes: 2 additions & 0 deletions lagom/envs/__init__.py
@@ -3,3 +3,5 @@
from .make_vec_env import make_vec_env

from .record_episode_statistics import RecordEpisodeStatistics
from .normalize_observation import NormalizeObservation
from .normalize_reward import NormalizeReward
27 changes: 27 additions & 0 deletions lagom/envs/normalize_observation.py
@@ -0,0 +1,27 @@
import numpy as np
import gym

from lagom.transform import RunningMeanVar


class NormalizeObservation(gym.ObservationWrapper):
    def __init__(self, env, clip=5., constant_moments=None):
        super().__init__(env)
        self.clip = clip
        self.constant_moments = constant_moments
        self.eps = 1e-8
        if constant_moments is None:
            self.obs_moments = RunningMeanVar(shape=env.observation_space.shape)
        else:
            self.constant_mean, self.constant_var = constant_moments

    def observation(self, observation):
        if self.constant_moments is None:
            self.obs_moments([observation])
            mean = self.obs_moments.mean
            std = np.sqrt(self.obs_moments.var + self.eps)
        else:
            mean = self.constant_mean
            std = np.sqrt(self.constant_var + self.eps)
        observation = np.clip((observation - mean)/std, -self.clip, self.clip)
        return observation
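
Not part of this commit's diff, but a minimal usage sketch of the wrapper above; the environment id is only illustrative:

import gym
from lagom.envs import NormalizeObservation

# Every observation updates the running mean/variance estimate and is then
# standardized and clipped to [-clip, clip] (here the default [-5, 5]).
env = NormalizeObservation(gym.make('Pendulum-v0'))
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())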
42 changes: 42 additions & 0 deletions lagom/envs/normalize_reward.py
@@ -0,0 +1,42 @@
import numpy as np
import gym

from lagom.transform import RunningMeanVar


class NormalizeReward(gym.RewardWrapper):
    def __init__(self, env, clip=10., gamma=0.99, constant_var=None):
        super().__init__(env)
        self.clip = clip
        assert gamma > 0.0 and gamma < 1.0, 'a discount factor of exactly 1.0 is not allowed. See docstring for details.'
        self.gamma = gamma
        self.constant_var = constant_var
        self.eps = 1e-8
        if constant_var is None:
            self.reward_moments = RunningMeanVar(shape=())

        # Buffer to save discounted returns from each environment
        self.all_returns = 0.0

    def reset(self):
        # Reset returns buffer
        self.all_returns = 0.0
        return super().reset()

    def step(self, action):
        observation, reward, done, info = super().step(action)
        # Set the discounted return buffer to zero if the episode terminates
        if done:
            self.all_returns = 0.0
        return observation, reward, done, info

    def reward(self, reward):
        if self.constant_var is None:
            self.all_returns = reward + self.gamma*self.all_returns
            self.reward_moments([self.all_returns])
            std = np.sqrt(self.reward_moments.var + self.eps)
        else:
            std = np.sqrt(self.constant_var + self.eps)
        # Do NOT subtract the mean; only divide by the std
        reward = np.clip(reward/std, -self.clip, self.clip)
        return reward
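
The wrapper above keeps a running discounted return, all_returns = reward + gamma * all_returns, and divides each raw reward by the standard deviation of that return; the mean is deliberately not subtracted, so the sign of the reward is preserved. A minimal sketch (not from the diff) of composing it with the observation wrapper, with an illustrative environment and discount factor:

import gym
from lagom.envs import NormalizeObservation, NormalizeReward

# gamma should match the discount factor used by the agent.
env = NormalizeReward(NormalizeObservation(gym.make('CartPole-v1')), gamma=0.99)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
# reward is now raw_reward / std(running discounted return), clipped to [-10, 10]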
65 changes: 65 additions & 0 deletions test/test_envs.py
@@ -12,6 +12,8 @@
from gym.wrappers import ClipReward

from lagom.envs import RecordEpisodeStatistics
from lagom.envs import NormalizeObservation
from lagom.envs import NormalizeReward
from lagom.envs import make_vec_env
from lagom.envs import VecEnv
from lagom.envs.wrappers import get_wrapper
@@ -76,6 +78,69 @@ def test_record_episode_statistics(env_id, deque_size):
    assert len(env.return_queue) == deque_size
    assert len(env.horizon_queue) == deque_size


@pytest.mark.parametrize('env_id', ['CartPole-v1', 'Pendulum-v0'])
def test_normalize_observation(env_id):
    env = gym.make(env_id)
    wrapped_env = NormalizeObservation(gym.make(env_id))
    unbiased = []

    env.seed(0)
    wrapped_env.seed(0)

    obs = env.reset()
    wrapped_obs = wrapped_env.reset()
    unbiased.append(obs)

    for t in range(env.spec.max_episode_steps):
        action = env.action_space.sample()
        obs, _, done, _ = env.step(action)
        wrapped_obs, _, wrapped_done, _ = wrapped_env.step(action)
        unbiased.append(obs)

        mean = np.mean(unbiased, 0)
        var = np.var(unbiased, 0)
        assert np.allclose(wrapped_env.obs_moments.mean, mean, atol=1e-5)
        assert np.allclose(wrapped_env.obs_moments.var, var, atol=1e-4)

        assert done == wrapped_done
        if done:
            break


@pytest.mark.parametrize('env_id', ['CartPole-v1', 'Pendulum-v0'])
@pytest.mark.parametrize('gamma', [0.5, 0.99])
def test_normalize_reward(env_id, gamma):
    env = gym.make(env_id)
    wrapped_env = NormalizeReward(gym.make(env_id), gamma=gamma)
    unbiased = []

    env.seed(0)
    wrapped_env.seed(0)

    for n in range(10):
        obs = env.reset()
        wrapped_obs = wrapped_env.reset()
        G = 0.0
        for t in range(env.spec.max_episode_steps):
            action = env.action_space.sample()
            _, reward, done, _ = env.step(action)
            _, wrapped_reward, wrapped_done, _ = wrapped_env.step(action)
            assert done == wrapped_done

            G = reward + gamma*G
            unbiased.append(G)

            if done:
                break

            mean = np.mean(unbiased, 0)
            var = np.var(unbiased, 0)
            assert wrapped_env.all_returns == G

            assert np.allclose(wrapped_env.reward_moments.mean, mean, atol=1e-4)
            assert np.allclose(wrapped_env.reward_moments.var, var, atol=1e-3)


@pytest.mark.parametrize('env_id', ['CartPole-v0', 'Pendulum-v0'])
@pytest.mark.parametrize('num_env', [1, 3, 5])
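Not part of this commit, but a usage pattern that the constant_moments / constant_var arguments enable: after training, the running statistics can be read off the wrapper and frozen for evaluation so they no longer update. A sketch, assuming only the attributes visible in the diff above:

import gym
from lagom.envs import NormalizeObservation, NormalizeReward

train_env = NormalizeReward(NormalizeObservation(gym.make('Pendulum-v0')), gamma=0.99)
# ... collect experience with train_env, letting the running moments adapt ...

# train_env.env is the underlying NormalizeObservation wrapper (gym.Wrapper stores it as .env).
obs_wrapper = train_env.env
eval_env = NormalizeObservation(gym.make('Pendulum-v0'),
                                constant_moments=(obs_wrapper.obs_moments.mean,
                                                  obs_wrapper.obs_moments.var))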
