Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
242 lines (199 sloc) 7.58 KB
import numpy as np
from collections import deque
import gym
from gym import spaces
import cv2
from copy import copy
def unwrap(env):
if hasattr(env, "unwrapped"):
return env.unwrapped
elif hasattr(env, "env"):
return unwrap(env.env)
elif hasattr(env, "leg_env"):
return unwrap(env.leg_env)
return env
class MaxAndSkipEnv(gym.Wrapper):
def __init__(self, env, skip=4):
"""Return only every `skip`-th frame"""
gym.Wrapper.__init__(self, env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
self._skip = skip
def step(self, action):
"""Repeat action, sum reward, and max over last observations."""
total_reward = 0.0
done = None
for i in range(self._skip):
obs, reward, done, info = self.env.step(action)
if i == self._skip - 2: self._obs_buffer[0] = obs
if i == self._skip - 1: self._obs_buffer[1] = obs
total_reward += reward
if done:
# Note that the observation on the done=True frame
# doesn't matter
max_frame = self._obs_buffer.max(axis=0)
return max_frame, total_reward, done, info
def reset(self, **kwargs):
return self.env.reset(**kwargs)
class ClipRewardEnv(gym.RewardWrapper):
def __init__(self, env):
gym.RewardWrapper.__init__(self, env)
def reward(self, reward):
"""Bin reward to {+1, 0, -1} by its sign."""
return float(np.sign(reward))
class WarpFrame(gym.ObservationWrapper):
def __init__(self, env):
"""Warp frames to 84x84 as done in the Nature paper and later work."""
gym.ObservationWrapper.__init__(self, env)
self.width = 84
self.height = 84
self.observation_space = spaces.Box(low=0, high=255,
shape=(self.height, self.width, 1), dtype=np.uint8)
def observation(self, frame):
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
return frame[:, :, None]
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
"""Stack k last frames.
Returns lazy array, which is much more memory efficient.
See Also
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)
def reset(self):
ob = self.env.reset()
for _ in range(self.k):
return self._get_ob()
def step(self, action):
ob, reward, done, info = self.env.step(action)
return self._get_ob(), reward, done, info
def _get_ob(self):
assert len(self.frames) == self.k
return LazyFrames(list(self.frames))
class ScaledFloatFrame(gym.ObservationWrapper):
def __init__(self, env):
gym.ObservationWrapper.__init__(self, env)
def observation(self, observation):
# careful! This undoes the memory optimization, use
# with smaller replay buffers only.
return np.array(observation).astype(np.float32) / 255.0
class LazyFrames(object):
def __init__(self, frames):
"""This object ensures that common frames between the observations are only stored once.
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
This object should only be converted to numpy array before being passed to the model.
You'd not believe how complex the previous solution was."""
self._frames = frames
self._out = None
def _force(self):
if self._out is None:
self._out = np.concatenate(self._frames, axis=2)
self._frames = None
return self._out
def __array__(self, dtype=None):
out = self._force()
if dtype is not None:
out = out.astype(dtype)
return out
def __len__(self):
return len(self._force())
def __getitem__(self, i):
return self._force()[i]
class MontezumaInfoWrapper(gym.Wrapper):
def __init__(self, env, room_address):
super(MontezumaInfoWrapper, self).__init__(env)
self.room_address = room_address
self.visited_rooms = set()
def get_current_room(self):
ram = unwrap(self.env).ale.getRAM()
assert len(ram) == 128
return int(ram[self.room_address])
def step(self, action):
obs, rew, done, info = self.env.step(action)
if done:
if 'episode' not in info:
info['episode'] = {}
return obs, rew, done, info
def reset(self):
return self.env.reset()
class DummyMontezumaInfoWrapper(gym.Wrapper):
def __init__(self, env):
super(DummyMontezumaInfoWrapper, self).__init__(env)
def step(self, action):
obs, rew, done, info = self.env.step(action)
if done:
if 'episode' not in info:
info['episode'] = {}
return obs, rew, done, info
def reset(self):
return self.env.reset()
class AddRandomStateToInfo(gym.Wrapper):
def __init__(self, env):
"""Adds the random state to the info field on the first step after reset
gym.Wrapper.__init__(self, env)
def step(self, action):
ob, r, d, info = self.env.step(action)
if d:
if 'episode' not in info:
info['episode'] = {}
info['episode']['rng_at_episode_start'] = self.rng_at_episode_start
return ob, r, d, info
def reset(self, **kwargs):
self.rng_at_episode_start = copy(self.unwrapped.np_random)
return self.env.reset(**kwargs)
def make_atari(env_id, max_episode_steps=4500):
env = gym.make(env_id)
env._max_episode_steps = max_episode_steps*4
assert 'NoFrameskip' in
env = StickyActionEnv(env)
env = MaxAndSkipEnv(env, skip=4)
if "Montezuma" in env_id or "Pitfall" in env_id:
env = MontezumaInfoWrapper(env, room_address=3 if "Montezuma" in env_id else 1)
env = DummyMontezumaInfoWrapper(env)
env = AddRandomStateToInfo(env)
return env
def wrap_deepmind(env, clip_rewards=True, frame_stack=False, scale=False):
"""Configure environment for DeepMind-style Atari.
env = WarpFrame(env)
if scale:
env = ScaledFloatFrame(env)
if clip_rewards:
env = ClipRewardEnv(env)
if frame_stack:
env = FrameStack(env, 4)
# env = NormalizeObservation(env)
return env
class StickyActionEnv(gym.Wrapper):
def __init__(self, env, p=0.25):
super(StickyActionEnv, self).__init__(env)
self.p = p
self.last_action = 0
def reset(self):
self.last_action = 0
return self.env.reset()
def step(self, action):
if self.unwrapped.np_random.uniform() < self.p:
action = self.last_action
self.last_action = action
obs, reward, done, info = self.env.step(action)
return obs, reward, done, info
You can’t perform that action at this time.