Skip to content
Permalink
Browse files

code release

  • Loading branch information...
whyjay committed Jun 11, 2019
0 parents commit 75490184af952b2c2e64ed47738d214bbe1eb8cf
@@ -0,0 +1,111 @@
#**.pyc
#**/*.pyc
# Data
src
results
data
exps
samples
*.zip
*.ckpt*
*.gz
*events.*
log
log_*

web/js/gen_layers.js

# checkpoint
dataset
dataset/*
pretrained

# gym
gym/*
gym

# trash
.dropbox

# Created by https://www.gitignore.io/api/python,vim

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
.cenv

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/


### Vim ###
[._]*.s[a-w][a-z]
[._]s[a-w][a-z]
*.un~
Session.vim
.netrwhist
*~

# Visualization
graphs
graphs/*
*.csv

# other sources
src
*screenshots*
*checkpoints
@@ -0,0 +1,35 @@
## Curiosity-Bottleneck: Exploration by Distilling Task-Specific Novelty ##

Youngjin Kim, Wontae Nam*, Hyunwoo Kim*, Jihoon Kim, Gunhee Kim<br/>
&#42;equal contribution

Vision and Learning Lab., Seoul National University<br/>
Clova, Naver

#### Installation
```
pip install -r requirements
```

#### Run

The following command trains a PPO agent with Curiosity-Bottleneck on Gravitar:
```bash
python run_atari.py
```

#### Acknowledgements
This code is based on the [RND](https://github.com/openai/random-network-distillation) implementation by Yuri Burda.

#### Citation

```
@inproceedings{
kim2019curiositybottleneck,
title={Curiosity-Bottleneck: Exploration by Distilling Task-Specific Novelty},
author={Youngjin Kim and Wontae Nam and Hyunwoo Kim and Jihoon Kim and Gunhee Kim},
booktitle={International Conference on Machine Learning},
year={2019}
}
```

@@ -0,0 +1,242 @@
import numpy as np
from collections import deque
import gym
from gym import spaces
import cv2
from copy import copy

cv2.ocl.setUseOpenCL(False)

def unwrap(env):
    """Peel wrapper layers off `env` and return the innermost object found.

    At each layer the lookup order is:
      1. an `unwrapped` attribute -- its value is returned immediately;
      2. an `env` attribute -- descend into it;
      3. a `leg_env` attribute -- descend into it;
      4. none of the above -- the current object is returned as-is.
    """
    current = env
    while True:
        if hasattr(current, "unwrapped"):
            return current.unwrapped
        for inner_name in ("env", "leg_env"):
            if hasattr(current, inner_name):
                current = getattr(current, inner_name)
                break
        else:
            # No known wrapper attribute left: this is the innermost object.
            return current

class MaxAndSkipEnv(gym.Wrapper):
    """Repeat every action `skip` times and max-pool the last two raw frames."""

    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame"""
        super(MaxAndSkipEnv, self).__init__(env)
        self._skip = skip
        # Two-slot uint8 buffer holding the most recent raw observations
        # (used for max pooling across time steps).
        frame_shape = env.observation_space.shape
        self._obs_buffer = np.zeros((2,) + frame_shape, dtype=np.uint8)

    def step(self, action):
        """Repeat action, sum reward, and max over last observations.

        Returns the usual `(observation, reward, done, info)` tuple where the
        observation is the element-wise max of the two buffered frames, the
        reward is the sum over the repeated steps, and `done`/`info` come
        from the last inner step that was executed.
        """
        reward_sum = 0.0
        done = None
        first_kept = self._skip - 2
        for frame_idx in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            # Only the final two frames of a full repeat are stored:
            # frame skip-2 goes to slot 0, frame skip-1 to slot 1.
            slot = frame_idx - first_kept
            if slot in (0, 1):
                self._obs_buffer[slot] = obs
            reward_sum += reward
            if done:
                break
        # Note that the observation on the done=True frame doesn't matter;
        # on an early break the buffer may still hold older frames.
        pooled = self._obs_buffer.max(axis=0)
        return pooled, reward_sum, done, info

    def reset(self, **kwargs):
        """Reset the wrapped environment, forwarding any keyword arguments."""
        return self.env.reset(**kwargs)

class ClipRewardEnv(gym.RewardWrapper):
    """Reward wrapper that keeps only the sign of each reward."""

    def __init__(self, env):
        super(ClipRewardEnv, self).__init__(env)

    def reward(self, reward):
        """Bin reward to {+1, 0, -1} by its sign."""
        sign = np.sign(reward)
        return float(sign)

class WarpFrame(gym.ObservationWrapper):
    """Convert RGB frames to 84x84 single-channel grayscale observations."""

    def __init__(self, env):
        """Warp frames to 84x84 as done in the Nature paper and later work."""
        super(WarpFrame, self).__init__(env)
        self.width = 84
        self.height = 84
        warped_shape = (self.height, self.width, 1)
        self.observation_space = spaces.Box(
            low=0, high=255, shape=warped_shape, dtype=np.uint8)

    def observation(self, frame):
        """Return `frame` as grayscale, resized, with a trailing channel axis."""
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        target_size = (self.width, self.height)
        resized = cv2.resize(gray, target_size, interpolation=cv2.INTER_AREA)
        # Add the channel axis so the result matches the declared
        # (height, width, 1) observation space.
        return resized[:, :, None]

class FrameStack(gym.Wrapper):
    def __init__(self, env, k):
        """Stack k last frames.

        Observations are returned as `LazyFrames` objects, which concatenate
        the stacked frames along axis 2 only when converted to an array.
        The declared observation space multiplies the wrapped space's third
        dimension by `k`.
        """
        gym.Wrapper.__init__(self, env)
        self.k = k
        # Bounded deque: appending a new frame drops the oldest one.
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)

    def reset(self, **kwargs):
        """Reset the wrapped env and fill the stack with the first frame.

        Keyword arguments are forwarded to the wrapped environment's
        `reset`, matching the other wrappers in this module that accept them.
        """
        ob = self.env.reset(**kwargs)
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        """Step the wrapped env and return the updated frame stack."""
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        """Wrap the current k frames in a `LazyFrames` object."""
        assert len(self.frames) == self.k
        return LazyFrames(list(self.frames))

class ScaledFloatFrame(gym.ObservationWrapper):
    """Convert observations to float32 and divide them by 255."""

    def __init__(self, env):
        gym.ObservationWrapper.__init__(self, env)
        # The emitted observations are float32 values divided by 255, so
        # declare a float32 Box instead of inheriting the wrapped env's
        # space unchanged.
        # NOTE(review): assumes the wrapped observations lie in [0, 255],
        # which makes the scaled range [0, 1] -- confirm for other inputs.
        self.observation_space = spaces.Box(
            low=0.0, high=1.0,
            shape=env.observation_space.shape, dtype=np.float32)

    def observation(self, observation):
        """Return `observation` as a float32 array divided by 255."""
        # careful! This undoes the memory optimization, use
        # with smaller replay buffers only.
        return np.array(observation).astype(np.float32) / 255.0

class LazyFrames(object):
    def __init__(self, frames):
        """This object ensures that common frames between the observations are only stored once.
        It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
        buffers.
        This object should only be converted to numpy array before being passed to the model.

        `frames` is a sequence of arrays that are concatenated along axis 2
        the first time the contents are needed."""
        self._frames = frames
        self._out = None

    def _force(self):
        """Concatenate the frames once, cache the result, and return it."""
        if self._out is None:
            self._out = np.concatenate(self._frames, axis=2)
            # Drop the references to the individual frames; only the
            # concatenated array is kept from now on.
            self._frames = None
        return self._out

    def __array__(self, dtype=None, copy=None):
        """NumPy array-protocol hook.

        dtype: optional target dtype. When given, a freshly converted array
            is returned (as before), unless `copy=False` is requested.
        copy: `None` (default) returns the cached array when no dtype is
            given; `True` always returns a new array; `False` returns the
            cached array and raises ValueError if a dtype conversion would
            be required. NumPy 2 passes this keyword to `__array__`.
        """
        out = self._force()
        needs_cast = dtype is not None and np.dtype(dtype) != out.dtype
        if copy is False:
            if needs_cast:
                raise ValueError(
                    "Unable to avoid copy while converting LazyFrames to the "
                    "requested dtype.")
            return out
        if dtype is not None:
            # astype always allocates, so the cached array is never exposed
            # on this path.
            return out.astype(dtype)
        return out.copy() if copy else out

    def __len__(self):
        """Length of the first axis of the concatenated array."""
        return len(self._force())

    def __getitem__(self, i):
        """Index into the concatenated array."""
        return self._force()[i]

class MontezumaInfoWrapper(gym.Wrapper):
    """Track which rooms were visited in an episode and report them on `done`."""

    def __init__(self, env, room_address):
        """
        env: environment to wrap; its innermost env must expose `ale.getRAM()`.
        room_address: index into the RAM array whose value is used as the
            current room id.
        """
        super(MontezumaInfoWrapper, self).__init__(env)
        self.room_address = room_address
        self.visited_rooms = set()

    def get_current_room(self):
        """Return the RAM value at `room_address` as an int."""
        ram = unwrap(self.env).ale.getRAM()
        assert len(ram) == 128
        return int(ram[self.room_address])

    def step(self, action):
        """Step the wrapped env and record the current room.

        When the episode ends, a copy of the visited-room set is stored under
        `info['episode']['visited_rooms']` and the tracker is cleared.
        """
        obs, rew, done, info = self.env.step(action)
        self.visited_rooms.add(self.get_current_room())
        if done:
            if 'episode' not in info:
                info['episode'] = {}
            # Copy before clearing so the reported set is not emptied too.
            info['episode'].update(visited_rooms=copy(self.visited_rooms))
            self.visited_rooms.clear()
        return obs, rew, done, info

    def reset(self, **kwargs):
        """Reset the wrapped environment, forwarding any keyword arguments.

        Accepting keyword arguments keeps this wrapper usable beneath
        wrappers that forward them from their own `reset`.
        """
        return self.env.reset(**kwargs)

class DummyMontezumaInfoWrapper(gym.Wrapper):
    """Fill in placeholder episode statistics at the end of every episode."""

    def __init__(self, env):
        super(DummyMontezumaInfoWrapper, self).__init__(env)

    def step(self, action):
        """Step the wrapped env; on `done`, add placeholder episode info.

        Sets `info['episode']['pos_count']` to 0 and
        `info['episode']['visited_rooms']` to the set `{0}`.
        """
        obs, rew, done, info = self.env.step(action)
        if done:
            if 'episode' not in info:
                info['episode'] = {}
            info['episode'].update(pos_count=0,
                                   visited_rooms={0})
        return obs, rew, done, info

    def reset(self, **kwargs):
        """Reset the wrapped environment, forwarding any keyword arguments.

        Accepting keyword arguments keeps this wrapper usable beneath
        wrappers that forward them from their own `reset`.
        """
        return self.env.reset(**kwargs)

class AddRandomStateToInfo(gym.Wrapper):
    def __init__(self, env):
        """Report the RNG captured at reset in the info of the final step.

        `reset` stores a copy of the unwrapped env's `np_random`; when a step
        returns done, that copy is placed under
        `info['episode']['rng_at_episode_start']`.
        """
        super(AddRandomStateToInfo, self).__init__(env)

    def step(self, action):
        """Step the wrapped env and attach the saved RNG when the episode ends."""
        ob, r, d, info = self.env.step(action)
        if not d:
            return ob, r, d, info
        episode_info = info.setdefault('episode', {})
        episode_info['rng_at_episode_start'] = self.rng_at_episode_start
        return ob, r, d, info

    def reset(self, **kwargs):
        """Snapshot the RNG, then reset the wrapped env with `kwargs`."""
        # Taken before the inner reset runs, so it reflects the generator as
        # it was when this reset was requested.
        rng_snapshot = copy(self.unwrapped.np_random)
        self.rng_at_episode_start = rng_snapshot
        return self.env.reset(**kwargs)


def make_atari(env_id, max_episode_steps=4500):
    """Build an Atari env wrapped with sticky actions, frame skip and info wrappers.

    env_id: gym environment id; its spec id must contain 'NoFrameskip'.
    max_episode_steps: episode length limit; the created env's
        `_max_episode_steps` is set to four times this value
        (presumably to account for the 4-frame skip -- TODO confirm).

    Wrapper order, innermost first: StickyActionEnv, MaxAndSkipEnv(skip=4),
    MontezumaInfoWrapper (for ids containing "Montezuma" or "Pitfall") or
    DummyMontezumaInfoWrapper otherwise, then AddRandomStateToInfo.
    """
    env = gym.make(env_id)
    env._max_episode_steps = max_episode_steps * 4
    assert 'NoFrameskip' in env.spec.id

    env = MaxAndSkipEnv(StickyActionEnv(env), skip=4)

    is_montezuma = "Montezuma" in env_id
    if is_montezuma or "Pitfall" in env_id:
        # RAM index used as the room id: 3 for Montezuma ids, 1 otherwise.
        room_address = 3 if is_montezuma else 1
        env = MontezumaInfoWrapper(env, room_address=room_address)
    else:
        env = DummyMontezumaInfoWrapper(env)

    return AddRandomStateToInfo(env)

def wrap_deepmind(env, clip_rewards=True, frame_stack=False, scale=False):
    """Configure environment for DeepMind-style Atari.

    WarpFrame is always applied. The remaining wrappers are applied in this
    order when their flag is set: ScaledFloatFrame (`scale`),
    ClipRewardEnv (`clip_rewards`), FrameStack with 4 frames (`frame_stack`).
    """
    env = WarpFrame(env)
    optional_wrappers = (
        (scale, ScaledFloatFrame),
        (clip_rewards, ClipRewardEnv),
        (frame_stack, lambda inner: FrameStack(inner, 4)),
    )
    for enabled, wrap in optional_wrappers:
        if enabled:
            env = wrap(env)
    return env


class StickyActionEnv(gym.Wrapper):
    """With probability `p`, repeat the previous action instead of the new one."""

    def __init__(self, env, p=0.25):
        """
        env: environment to wrap.
        p: probability that the previously executed action replaces the
            requested one on a given step.
        """
        super(StickyActionEnv, self).__init__(env)
        self.p = p
        self.last_action = 0

    def reset(self, **kwargs):
        """Reset the remembered action to 0 and reset the wrapped env.

        Keyword arguments are forwarded to the wrapped environment's
        `reset`, matching the other wrappers in this module that accept them.
        """
        self.last_action = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step with `action`, or with the previous action if the draw is below `p`."""
        # The draw uses the unwrapped env's own random generator.
        if self.unwrapped.np_random.uniform() < self.p:
            action = self.last_action
        # Remember the action actually executed, not the one requested.
        self.last_action = action
        obs, reward, done, info = self.env.step(action)
        return obs, reward, done, info
No changes.

0 comments on commit 7549018

Please sign in to comment.
You can’t perform that action at this time.