In [1]:
from __future__ import division

import os

from PIL import Image

import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from keras.optimizers import Adam

import keras.backend as K

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy, LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor

from rl.callbacks import FileLogger, ModelIntervalCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Size that every observation is resized to in AtariProcessor.process_observation.
INPUT_SHAPE = (84, 84)
# Number of stacked frames: used as the replay memory's window_length and as the
# leading dimension of the network's input_shape.
WINDOW_LENGTH = 4

In [3]:
# Raw observations and rewards are preprocessed before they reach the agent.
class AtariProcessor(Processor):
    """Preprocessing hooks for observations, state batches and rewards."""

    def process_observation(self, observation):
        """Resize a 3-D frame to INPUT_SHAPE, convert it to mode 'L' and return uint8.

        Asserts that the input has three dimensions and that the converted
        array has exactly INPUT_SHAPE.
        """
        assert observation.ndim == 3  # (height, width, channel)
        frame = Image.fromarray(observation).resize(INPUT_SHAPE).convert('L')
        grey = np.array(frame)
        assert grey.shape == INPUT_SHAPE
        # Kept as uint8 here; scaling to float happens per batch in
        # process_state_batch (presumably to keep stored frames small).
        return grey.astype('uint8')

    def process_state_batch(self, batch):
        """Cast a batch to float32 and divide by 255."""
        return batch.astype('float32') / 255.

    def process_reward(self, reward):
        """Clip the reward to the interval [-1, 1]."""
        return np.clip(reward, -1., 1.)

In [4]:
# Gym environment id: passed to gym.make() and embedded in the weight/log file names.
ENV_NAME = 'BreakoutDeterministic-v4'

In [5]:
# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
# Number of discrete actions; createModel() uses it as the width of the output layer.
nb_actions = env.action_space.n

# random seed
# Seed both NumPy's global RNG and the environment with the same fixed value.
np.random.seed(123)
env.seed(123)

[123, 151010689]

In [6]:
# Network input shape: WINDOW_LENGTH frames of INPUT_SHAPE each, i.e. (4, 84, 84).
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE

In [7]:
# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
def createModel():
    """Build the (uncompiled) convolutional Q-network.

    Reads the module-level ``input_shape`` for the input layer and
    ``nb_actions`` for the width of the final linear layer.

    Returns:
        A keras ``Sequential`` model: a Permute layer, three ReLU convolutions,
        a 512-unit ReLU dense layer and a linear output of ``nb_actions`` units.

    Raises:
        RuntimeError: if the backend reports an unrecognised image data format.
    """
    model = Sequential()

    # K.image_data_format() is the supported way to query the layout; the legacy
    # K.common.image_dim_ordering() alias is not exposed by newer Keras releases.
    # 'channels_last' corresponds to the old 'tf' value, 'channels_first' to 'th'.
    data_format = K.image_data_format()
    if data_format == 'channels_last':
        # (width, height, channels)
        model.add(Permute((2, 3, 1), input_shape=input_shape))
    elif data_format == 'channels_first':
        # (channels, width, height)
        model.add(Permute((1, 2, 3), input_shape=input_shape))
    else:
        raise RuntimeError('Unknown image_dim_ordering.')

    model.add(Convolution2D(32, (8, 8), strides=(4, 4)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (4, 4), strides=(2, 2)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (3, 3), strides=(1, 1)))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    # One linear output per action.
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    return model

In [17]:
model = createModel()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute_6 (Permute)          (None, 84, 84, 4)         0         
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 20, 20, 32)        8224      
_________________________________________________________________
activation_26 (Activation)   (None, 20, 20, 32)        0         
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 9, 9, 64)          32832     
_________________________________________________________________
activation_27 (Activation)   (None, 9, 9, 64)          0         
_________________________________________________________________
conv2d_18 (Conv2D)           (None, 7, 7, 64)          36928     
_________________________________________________________________
activation_28 (Activation)   (None, 7, 7, 64)          0         
__________

In [14]:
# Observation/reward preprocessor and the replay memory that stores the experience.
processor = AtariProcessor()
memory = SequentialMemory(
    limit=1000000,
    window_length=WINDOW_LENGTH,
)

In [15]:
# Exploration policy for the agent: an epsilon-greedy policy whose `eps`
# attribute is driven by LinearAnnealedPolicy (max 1.0, min 0.1, 0.05 for
# testing, over 1,000,000 steps).
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000,
)

In [18]:
# Define the agent
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000,
               train_interval=20)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

# Resume from previously saved weights when they exist. The filename is defined
# here so this cell does not depend on a later cell having been run first, and
# the load is skipped on a first run when no weights file has been written yet.
weights_filename = 'dqn_{}_weights.h5f'.format(ENV_NAME)
if os.path.isfile(weights_filename):
    dqn.load_weights(weights_filename)
else:
    print('No saved weights found at {}; starting from freshly initialised weights.'.format(weights_filename))

In [19]:
# File the training cell writes the final weights to via dqn.save_weights().
weights_filename = 'dqn_{}_weights.h5f'.format(ENV_NAME)

In [None]:
# Training part
# The checkpoint name is built by concatenation so the literal '{step}'
# placeholder survives (presumably filled in by ModelIntervalCheckpoint).
checkpoint_weights_filename = 'dqn_' + ENV_NAME + '_weights_{step}.h5f'
log_filename = 'dqn_{}_log.json'.format(ENV_NAME)
callbacks = [
    ModelIntervalCheckpoint(checkpoint_weights_filename, interval=50000),
    FileLogger(log_filename, interval=100),
]

dqn.fit(
    env,
    callbacks=callbacks,
    nb_steps=500000,
    log_interval=10000,
    visualize=True,
)

# Persist the final weights, replacing any existing file of the same name.
dqn.save_weights(weights_filename, overwrite=True)

Training for 1750000 steps ...
Interval 1 (0 steps performed)
56 episodes - episode_reward: 1.107 [0.000, 6.000] - ale.lives: 2.969

Interval 2 (10000 steps performed)
55 episodes - episode_reward: 1.127 [0.000, 5.000] - ale.lives: 2.910

Interval 3 (20000 steps performed)
56 episodes - episode_reward: 1.018 [0.000, 3.000] - ale.lives: 2.949

Interval 4 (30000 steps performed)
58 episodes - episode_reward: 0.931 [0.000, 4.000] - ale.lives: 2.992

Interval 5 (40000 steps performed)
55 episodes - episode_reward: 1.073 [0.000, 4.000] - ale.lives: 2.963

Interval 6 (50000 steps performed)
57 episodes - episode_reward: 0.965 [0.000, 4.000] - loss: 0.003 - mean_absolute_error: 0.011 - mean_q: 0.000 - mean_eps: 0.950 - ale.lives: 2.917

Interval 7 (60000 steps performed)
57 episodes - episode_reward: 1.211 [0.000, 4.000] - loss: 0.003 - mean_absolute_error: 0.010 - mean_q: 0.013 - mean_eps: 0.942 - ale.lives: 2.946

Interval 8 (70000 steps performed)
52 episodes - episode_reward: 1.481 [0.000

In [12]:
# Rebuild a fresh agent with the same configuration as the training agent and
# load the weights file whose name carries the '_500000' suffix.
weights_filename = 'dqn_{}_weights{}.h5f'.format(ENV_NAME, '_500000')

new_model = createModel()
new_policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000,
)

new_memory = SequentialMemory(
    limit=1000000,
    window_length=WINDOW_LENGTH,
)
new_processor = AtariProcessor()

new_dqn = DQNAgent(
    model=new_model,
    nb_actions=nb_actions,
    policy=new_policy,
    memory=new_memory,
    processor=new_processor,
    nb_steps_warmup=50000,
    gamma=.99,
    target_model_update=10000,
    train_interval=20,
)
new_dqn.compile(Adam(lr=.00025), metrics=['mae'])

new_dqn.load_weights(weights_filename)

In [13]:
# Finally, evaluate our algorithm for 10 episodes.
# Testing part
new_dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...


KeyboardInterrupt: 