# Cart-Pole Balancing with a Deep Q-Network (DQN)

In [16]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Gym environment ID; passed to gym.make() and embedded in the saved weights filename.
ENV_NAME = 'CartPole-v0'

In [17]:
# Create the environment and seed both NumPy's global RNG and the environment
# from a single named constant, so the seed only has to be changed in one place.
SEED = 123
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# Problem dimensions read from the environment's spaces:
#   nb_actions - number of discrete actions (size of the network's output layer)
#   obs_dim    - length of the first axis of the observation shape
nb_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [19]:
# Q-network: maps an observation window to one Q-value per action.
model = Sequential()
# The leading 1 in the input shape is the observation-window axis (the replay
# memory in this notebook is created with window_length=1); Flatten collapses
# it so the Dense layers receive a flat observation vector.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# Three hidden layers of 16 ReLU units each.
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
# Linear output layer: one unbounded Q-value estimate per discrete action.
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() -- doing so appends a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________

In [20]:
# Assemble the agent from its parts: an exploration policy, a replay memory,
# and the Q-network built earlier. Any built-in Keras optimizer and metric can
# be handed to compile().
policy = BoltzmannQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)

agent_settings = dict(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)
dqn = DQNAgent(**agent_settings)

optimizer = Adam(lr=1e-3)
dqn.compile(optimizer, metrics=['mae'])

# Train without rendering; verbose=2 produces the per-episode log lines.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

# Persist the final weights once training has finished, replacing any
# existing file of the same name.
weights_path = 'dqn_{}_weights.h5f'.format(ENV_NAME)
dqn.save_weights(weights_path, overwrite=True)

Training for 50000 steps ...




    31/50000: episode: 1, duration: 1.624s, episode steps: 31, steps per second: 19, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.419 [0.000, 1.000], mean observation: 0.013 [-1.185, 1.776], loss: 0.463339, mean_absolute_error: 0.520827, mean_q: 0.095035
    44/50000: episode: 2, duration: 0.058s, episode steps: 13, steps per second: 223, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.069 [-1.867, 1.224], loss: 0.349585, mean_absolute_error: 0.544662, mean_q: 0.295623
    64/50000: episode: 3, duration: 0.092s, episode steps: 20, steps per second: 218, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.550 [0.000, 1.000], mean observation: -0.070 [-1.375, 0.833], loss: 0.230839, mean_absolute_error: 0.553566, mean_q: 0.485147




    83/50000: episode: 4, duration: 0.101s, episode steps: 19, steps per second: 189, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.421 [0.000, 1.000], mean observation: 0.074 [-0.817, 1.505], loss: 0.117445, mean_absolute_error: 0.601478, mean_q: 0.810982
    98/50000: episode: 5, duration: 0.073s, episode steps: 15, steps per second: 206, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.090 [-1.368, 0.825], loss: 0.056746, mean_absolute_error: 0.685648, mean_q: 1.166542
   114/50000: episode: 6, duration: 0.066s, episode steps: 16, steps per second: 242, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.750 [0.000, 1.000], mean observation: -0.069 [-2.711, 1.766], loss: 0.035798, mean_absolute_error: 0.729140, mean_q: 1.335475
   128/50000: episode: 7, duration: 0.058s, episode steps: 14, steps per second: 242, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000]

   700/50000: episode: 34, duration: 0.094s, episode steps: 20, steps per second: 213, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.095 [-0.749, 1.651], loss: 0.233555, mean_absolute_error: 3.132175, mean_q: 6.065133
   755/50000: episode: 35, duration: 0.230s, episode steps: 55, steps per second: 240, episode reward: 55.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.097 [-1.211, 2.213], loss: 0.282160, mean_absolute_error: 3.259107, mean_q: 6.318562
   809/50000: episode: 36, duration: 0.219s, episode steps: 54, steps per second: 247, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.173 [-1.634, 0.532], loss: 0.288126, mean_absolute_error: 3.505757, mean_q: 6.856623
   862/50000: episode: 37, duration: 0.214s, episode steps: 53, steps per second: 248, episode reward: 53.000, mean reward: 1.000 [1.000, 1.0

  4891/50000: episode: 63, duration: 0.721s, episode steps: 170, steps per second: 236, episode reward: 170.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.391 [-2.428, 0.763], loss: 2.105090, mean_absolute_error: 20.913395, mean_q: 42.403671
  5054/50000: episode: 64, duration: 0.706s, episode steps: 163, steps per second: 231, episode reward: 163.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.466 [0.000, 1.000], mean observation: -0.404 [-2.426, 0.864], loss: 2.083196, mean_absolute_error: 21.504913, mean_q: 43.680882
  5209/50000: episode: 65, duration: 0.723s, episode steps: 155, steps per second: 214, episode reward: 155.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.427 [-2.418, 0.656], loss: 2.009528, mean_absolute_error: 22.155777, mean_q: 44.914158
  5381/50000: episode: 66, duration: 0.874s, episode steps: 172, steps per second: 197, episode reward: 172.000, mean reward: 1

 10266/50000: episode: 92, duration: 0.830s, episode steps: 200, steps per second: 241, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.198 [-1.515, 0.647], loss: 3.230336, mean_absolute_error: 34.296108, mean_q: 69.266769
 10457/50000: episode: 93, duration: 0.826s, episode steps: 191, steps per second: 231, episode reward: 191.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.346 [-2.417, 0.784], loss: 3.689327, mean_absolute_error: 34.418411, mean_q: 69.475700
 10636/50000: episode: 94, duration: 0.755s, episode steps: 179, steps per second: 237, episode reward: 179.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.369 [-2.423, 0.973], loss: 2.471273, mean_absolute_error: 34.712635, mean_q: 70.216537
 10836/50000: episode: 95, duration: 0.874s, episode steps: 200, steps per second: 229, episode reward: 200.000, mean reward: 1

 15865/50000: episode: 121, duration: 0.764s, episode steps: 185, steps per second: 242, episode reward: 185.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.382 [-2.400, 0.973], loss: 1.537015, mean_absolute_error: 38.575970, mean_q: 77.801216
 16065/50000: episode: 122, duration: 0.830s, episode steps: 200, steps per second: 241, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.056 [-0.912, 0.781], loss: 3.815359, mean_absolute_error: 38.581142, mean_q: 77.638458
 16248/50000: episode: 123, duration: 0.761s, episode steps: 183, steps per second: 241, episode reward: 183.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.390 [-2.414, 1.107], loss: 3.995685, mean_absolute_error: 38.416828, mean_q: 77.314659
 16448/50000: episode: 124, duration: 0.833s, episode steps: 200, steps per second: 240, episode reward: 200.000, mean rewar

 21617/50000: episode: 150, duration: 0.843s, episode steps: 200, steps per second: 237, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.042 [-0.772, 0.763], loss: 5.097879, mean_absolute_error: 40.026676, mean_q: 80.552597
 21817/50000: episode: 151, duration: 0.855s, episode steps: 200, steps per second: 234, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.249 [-0.807, 1.515], loss: 6.423345, mean_absolute_error: 40.349888, mean_q: 80.917641
 22017/50000: episode: 152, duration: 0.857s, episode steps: 200, steps per second: 233, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.057 [-0.994, 0.958], loss: 7.028378, mean_absolute_error: 40.294884, mean_q: 81.072205
 22217/50000: episode: 153, duration: 0.857s, episode steps: 200, steps per second: 233, episode reward: 200.000, mean reward

 27417/50000: episode: 179, duration: 0.876s, episode steps: 200, steps per second: 228, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.164 [-1.005, 1.012], loss: 11.310450, mean_absolute_error: 43.008614, mean_q: 86.343987
 27617/50000: episode: 180, duration: 0.906s, episode steps: 200, steps per second: 221, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.133 [-1.039, 1.055], loss: 11.946851, mean_absolute_error: 43.028397, mean_q: 86.319023
 27817/50000: episode: 181, duration: 0.878s, episode steps: 200, steps per second: 228, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.198 [-0.973, 1.155], loss: 14.558741, mean_absolute_error: 43.552303, mean_q: 87.156250
 28017/50000: episode: 182, duration: 0.874s, episode steps: 200, steps per second: 229, episode reward: 200.000, mean rewar

 33217/50000: episode: 208, duration: 0.900s, episode steps: 200, steps per second: 222, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.214 [-0.720, 1.207], loss: 5.913559, mean_absolute_error: 44.419140, mean_q: 89.389915
 33417/50000: episode: 209, duration: 0.905s, episode steps: 200, steps per second: 221, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.222 [-0.756, 1.247], loss: 10.174154, mean_absolute_error: 44.421486, mean_q: 89.321671
 33617/50000: episode: 210, duration: 0.899s, episode steps: 200, steps per second: 222, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.220 [-0.652, 1.209], loss: 7.305597, mean_absolute_error: 44.269444, mean_q: 89.116844
 33817/50000: episode: 211, duration: 0.902s, episode steps: 200, steps per second: 222, episode reward: 200.000, mean reward:

 39017/50000: episode: 237, duration: 0.953s, episode steps: 200, steps per second: 210, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.190 [-0.755, 0.994], loss: 7.166111, mean_absolute_error: 44.388760, mean_q: 89.305275
 39217/50000: episode: 238, duration: 0.924s, episode steps: 200, steps per second: 217, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.162 [-0.774, 0.901], loss: 4.585834, mean_absolute_error: 44.625320, mean_q: 89.897285
 39417/50000: episode: 239, duration: 0.924s, episode steps: 200, steps per second: 217, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.165 [-0.628, 0.878], loss: 7.684904, mean_absolute_error: 44.497292, mean_q: 89.498535
 39617/50000: episode: 240, duration: 0.923s, episode steps: 200, steps per second: 217, episode reward: 200.000, mean reward: 

 44817/50000: episode: 266, duration: 1.001s, episode steps: 200, steps per second: 200, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.156 [-0.869, 0.952], loss: 4.752415, mean_absolute_error: 43.869923, mean_q: 88.317093
 45017/50000: episode: 267, duration: 1.034s, episode steps: 200, steps per second: 193, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.167 [-1.347, 1.252], loss: 4.641644, mean_absolute_error: 44.294567, mean_q: 89.175140
 45217/50000: episode: 268, duration: 0.975s, episode steps: 200, steps per second: 205, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.123 [-0.803, 0.749], loss: 6.366070, mean_absolute_error: 44.172874, mean_q: 88.802155
 45417/50000: episode: 269, duration: 1.042s, episode steps: 200, steps per second: 192, episode reward: 200.000, mean reward:

In [21]:
# Evaluate the trained agent for 5 rendered episodes.
# dqn.test() returns a Keras History object; binding it to a name keeps its
# bare repr out of the cell output and leaves the results available for
# inspection in later cells.
test_history = dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200


<keras.callbacks.History at 0x7f62ffb96278>