# Cart-Pole Balancing with a Deep Q-Network (DQN)

In [1]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Gym environment id; passed to gym.make() and embedded in the saved weights filename.
ENV_NAME = 'CartPole-v0'

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Create the Gym environment, then seed both NumPy's global RNG and the
# environment with the same value.
env = gym.make(ENV_NAME)
SEED = 123
np.random.seed(SEED)
env.seed(SEED)

# Problem dimensions: length of the observation vector and number of discrete actions.
obs_dim = env.observation_space.shape[0]
nb_actions = env.action_space.n

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
# Q-network: the flattened observation window goes through three 16-unit ReLU
# layers and ends in one linear output per action.
model = Sequential()
# The leading 1 in input_shape corresponds to the window_length=1 used for the
# agent's replay memory.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for hidden_units in (16, 16, 16):
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so wrapping it in print()
# would append a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________

In [5]:
# Assemble the DQN agent from its parts (replay memory, exploration policy,
# Q-network) and compile it. Any built-in Keras optimizer and metric can be
# passed to compile().
replay_capacity = 50000
memory = SequentialMemory(limit=replay_capacity, window_length=1)
policy = BoltzmannQPolicy()
# NOTE(review): keras-rl appears to treat target_model_update < 1 as a
# soft-update rate rather than a step interval -- confirm for the installed version.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    nb_steps_warmup=10,
    target_model_update=1e-2,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train with the environment rendered on every step. The captured log reports
# about 60 steps per second; presumably rendering is the bottleneck, so
# visualize=False may be preferable for faster or headless runs.
dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

# Store the trained weights, replacing any earlier file with the same name.
weights_path = 'dqn_{}_weights.h5f'.format(ENV_NAME)
dqn.save_weights(weights_path, overwrite=True)

Training for 50000 steps ...




    26/50000: episode: 1, duration: 4.275s, episode steps: 26, steps per second: 6, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.423 [0.000, 1.000], mean observation: 0.073 [-1.313, 1.982], loss: 0.491183, mean_absolute_error: 0.570008, mean_q: 0.180484




    58/50000: episode: 2, duration: 0.532s, episode steps: 32, steps per second: 60, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.344 [0.000, 1.000], mean observation: 0.006 [-1.883, 2.595], loss: 0.292626, mean_absolute_error: 0.598018, mean_q: 0.497574
    71/50000: episode: 3, duration: 0.217s, episode steps: 13, steps per second: 60, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.231 [0.000, 1.000], mean observation: 0.094 [-1.396, 2.260], loss: 0.146170, mean_absolute_error: 0.699270, mean_q: 0.963982
    99/50000: episode: 4, duration: 0.455s, episode steps: 28, steps per second: 62, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.357 [0.000, 1.000], mean observation: 0.050 [-1.531, 2.527], loss: 0.082399, mean_absolute_error: 0.748076, mean_q: 1.243288
   123/50000: episode: 5, duration: 0.400s, episode steps: 24, steps per second: 60, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean

   653/50000: episode: 31, duration: 0.263s, episode steps: 16, steps per second: 61, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.312 [0.000, 1.000], mean observation: 0.086 [-1.232, 2.202], loss: 0.249145, mean_absolute_error: 2.906821, mean_q: 5.621923
   690/50000: episode: 32, duration: 0.619s, episode steps: 37, steps per second: 60, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: -0.054 [-1.314, 0.627], loss: 0.279943, mean_absolute_error: 3.016434, mean_q: 5.814441
   717/50000: episode: 33, duration: 0.440s, episode steps: 27, steps per second: 61, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.040 [-1.483, 1.027], loss: 0.278819, mean_absolute_error: 3.129076, mean_q: 6.035770
   732/50000: episode: 34, duration: 0.251s, episode steps: 15, steps per second: 60, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000]

  1737/50000: episode: 60, duration: 1.756s, episode steps: 106, steps per second: 60, episode reward: 106.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.290 [-1.684, 0.799], loss: 0.588892, mean_absolute_error: 6.701797, mean_q: 13.388493
  1763/50000: episode: 61, duration: 0.434s, episode steps: 26, steps per second: 60, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.049 [-1.459, 0.980], loss: 0.515016, mean_absolute_error: 6.955304, mean_q: 13.981734
  1847/50000: episode: 62, duration: 1.386s, episode steps: 84, steps per second: 61, episode reward: 84.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.231 [-0.942, 0.658], loss: 0.509246, mean_absolute_error: 7.115612, mean_q: 14.307262
  1954/50000: episode: 63, duration: 1.781s, episode steps: 107, steps per second: 60, episode reward: 107.000, mean reward: 1.000 [1.000

In [21]:
# Evaluate the trained agent for 5 rendered episodes.
# dqn.test() returns a Keras History object; binding it to a name keeps its
# repr out of the cell output and leaves the object available for inspection.
test_history = dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200


<keras.callbacks.History at 0x7f62ffb96278>