In [1]:
import os

import numpy as np
import gym
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory


ENV_NAME = 'SpaceInvaders-v0'
SEED = 123  # one seed value, applied to both NumPy and the environment

# Create the Gym environment first, then seed NumPy's global RNG and the
# environment itself with the same value.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# Number of discrete actions; used below as the width of the model's output layer.
nb_actions = env.action_space.n

# Build a plain feed-forward Q-network. No dueling-specific layers are added
# here: when the dueling network is enabled in DQNAgent, the agent builds a
# dueling network based on this model automatically. Alternatively, you can
# build a dueling network yourself and turn the dueling option off in DQNAgent.
#
# NOTE: the raw observation is flattened directly (100800 inputs for this
# environment, per the printed summary), so the first Dense layer holds almost
# all of the model's parameters.
HIDDEN_UNITS = 16
NB_HIDDEN_LAYERS = 3

model = Sequential()
# The leading 1 in the input shape corresponds to the window_length=1 used for
# the replay memory.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for _hidden in range(NB_HIDDEN_LAYERS):
    model.add(Dense(HIDDEN_UNITS))
    model.add(Activation('relu'))
# One linear output per action.
model.add(Dense(nb_actions, activation='linear'))
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" line after the table).
model.summary()
# Configure and compile the agent. Any built-in Keras optimizer, and even
# Keras metrics, can be used here.
# Replay memory holding up to 50000 steps, with a window length of 1.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
# Enable the dueling network; dueling_type can be one of {'avg', 'max', 'naive'}.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    enable_dueling_network=True,
    dueling_type='avg',
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Single source of truth for the weights file, used for both loading and saving.
WEIGHTS_PATH = 'duel_dqn_{}_weights.h5f'.format(ENV_NAME)

# Resume from a previous run when its weights exist. On a fresh checkout there
# is nothing to load, so training starts from the freshly initialised network
# instead of failing before it begins.
if os.path.exists(WEIGHTS_PATH):
    dqn.load_weights(WEIGHTS_PATH)

# Okay, now it's time to learn something! The training is visualized here for
# show, but this slows down training quite a lot. You can always safely abort
# the training prematurely using Ctrl + C.
train_history = dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

# After training is done, save the final weights.
dqn.save_weights(WEIGHTS_PATH, overwrite=True)

# Plot the per-episode training reward and save the figure under a name that
# reflects the algorithm and environment actually used.
train_reward = train_history.history['episode_reward']
fig, ax = plt.subplots()
ax.plot(train_reward)
ax.set_xlabel('Episode')
ax.set_ylabel('Episode reward')
ax.set_title('Dueling DQN training on {}'.format(ENV_NAME))
fig.savefig('duel_dqn_{}_reward.png'.format(ENV_NAME))

# Finally, evaluate the algorithm for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True)



Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 100800)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                1612816   
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
acti



Instructions for updating:
Use tf.cast instead.
   928/50000: episode: 1, duration: 84.244s, episode steps: 928, steps per second: 11, episode reward: 150.000, mean reward: 0.162 [0.000, 30.000], mean action: 2.610 [0.000, 5.000], mean observation: 9.880 [0.000, 181.000], loss: 0.604495, mean_absolute_error: 19.030613, mean_q: 22.943710
  1526/50000: episode: 2, duration: 50.948s, episode steps: 598, steps per second: 12, episode reward: 105.000, mean reward: 0.176 [0.000, 30.000], mean action: 2.589 [0.000, 5.000], mean observation: 10.427 [0.000, 181.000], loss: 1.668927, mean_absolute_error: 19.105341, mean_q: 23.046541
  2241/50000: episode: 3, duration: 64.203s, episode steps: 715, steps per second: 11, episode reward: 210.000, mean reward: 0.294 [0.000, 30.000], mean action: 2.674 [0.000, 5.000], mean observation: 9.862 [0.000, 181.000], loss: 2.359293, mean_absolute_error: 19.705481, mean_q: 23.864161
  2735/50000: episode: 4, duration: 41.007s, episode steps: 494, steps per sec

 19745/50000: episode: 29, duration: 56.665s, episode steps: 561, steps per second: 10, episode reward: 60.000, mean reward: 0.107 [0.000, 25.000], mean action: 2.542 [0.000, 5.000], mean observation: 10.277 [0.000, 181.000], loss: 4.709900, mean_absolute_error: 19.572119, mean_q: 23.536726
 20138/50000: episode: 30, duration: 37.602s, episode steps: 393, steps per second: 10, episode reward: 80.000, mean reward: 0.204 [0.000, 30.000], mean action: 2.575 [0.000, 5.000], mean observation: 10.531 [0.000, 181.000], loss: 3.720822, mean_absolute_error: 19.744579, mean_q: 23.741854
 20701/50000: episode: 31, duration: 50.633s, episode steps: 563, steps per second: 11, episode reward: 125.000, mean reward: 0.222 [0.000, 25.000], mean action: 2.501 [0.000, 5.000], mean observation: 10.131 [0.000, 181.000], loss: 4.837087, mean_absolute_error: 20.098534, mean_q: 24.212934
 21081/50000: episode: 32, duration: 32.877s, episode steps: 380, steps per second: 12, episode reward: 50.000, mean reward

 38846/50000: episode: 58, duration: 53.326s, episode steps: 522, steps per second: 10, episode reward: 155.000, mean reward: 0.297 [0.000, 30.000], mean action: 2.425 [0.000, 5.000], mean observation: 10.151 [0.000, 181.000], loss: 3.351343, mean_absolute_error: 19.306299, mean_q: 23.223066
 39789/50000: episode: 59, duration: 89.951s, episode steps: 943, steps per second: 10, episode reward: 115.000, mean reward: 0.122 [0.000, 30.000], mean action: 2.478 [0.000, 5.000], mean observation: 9.929 [0.000, 181.000], loss: 3.219973, mean_absolute_error: 19.528711, mean_q: 23.489939
 40318/50000: episode: 60, duration: 51.227s, episode steps: 529, steps per second: 10, episode reward: 125.000, mean reward: 0.236 [0.000, 25.000], mean action: 2.512 [0.000, 5.000], mean observation: 10.219 [0.000, 181.000], loss: 3.572593, mean_absolute_error: 19.472437, mean_q: 23.378935
 40840/50000: episode: 61, duration: 50.910s, episode steps: 522, steps per second: 10, episode reward: 55.000, mean rewar

<keras.callbacks.History at 0xb32d94b70>