https://bi3mer.github.io/blog/post_18/index.html

In [9]:
import gym
# Gym environment id used for both training and evaluation in this notebook.
ENV_ID = 'MountainCar-v0'
env = gym.make(ENV_ID)

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

# Q-network: the input shape is the observation shape with a leading axis of
# size 1, flattened before the dense stack. Three ReLU hidden layers
# (128 -> 64 -> 32) feed a linear output layer whose width is
# env.action_space.n.
model = Sequential()
model.add(Flatten(input_shape=(1, *env.observation_space.shape)))
for hidden_units in (128, 64, 32):
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
model.add(Dense(env.action_space.n))
model.add(Activation('linear'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 2)                 0         
_________________________________________________________________
dense (Dense)                (None, 128)               384       
_________________________________________________________________
activation (Activation)      (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
activation_2 (Activation)    (None, 32)                0

In [11]:
from rl.memory import SequentialMemory
from rl.policy import BoltzmannQPolicy
from rl.agents import DQNAgent

# Replay buffer for the agent. window_length=1 presumably pairs with the
# leading (1,) axis of the model's input shape -- confirm against keras-rl docs.
replay_memory = SequentialMemory(limit=50000, window_length=1)

# DQN agent wrapping the Keras model defined above, selecting actions through
# a Boltzmann Q policy.
dqn = DQNAgent(
    model=model,
    nb_actions=env.action_space.n,
    memory=replay_memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=BoltzmannQPolicy(),
)

# `learning_rate` is the supported keyword; the `lr` alias is deprecated in
# tf.keras optimizers and is rejected by newer Keras releases.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])




In [None]:

# Train the agent without rendering; verbose=2 prints one summary line per
# episode. NOTE(review): in the captured output below every logged episode
# ends with reward -200.000, so learning progress should be re-checked.
TRAINING_STEPS = 150000
dqn.fit(env, nb_steps=TRAINING_STEPS, visualize=False, verbose=2)

Training for 150000 steps ...




    200/150000: episode: 1, duration: 1.886s, episode steps: 200, steps per second: 106, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.990 [0.000, 2.000],  loss: 0.069910, mae: 0.804387, mean_q: -0.994688
    400/150000: episode: 2, duration: 1.330s, episode steps: 200, steps per second: 150, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.995 [0.000, 2.000],  loss: 0.012567, mae: 1.743582, mean_q: -2.554291
    600/150000: episode: 3, duration: 1.322s, episode steps: 200, steps per second: 151, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.025 [0.000, 2.000],  loss: 0.025732, mae: 2.846496, mean_q: -4.193378
    800/150000: episode: 4, duration: 1.310s, episode steps: 200, steps per second: 153, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.945 [0.000, 2.000],  loss: 0.057200, mae: 3.982826, mean_q: -5.873297
   1000/150000: episode: 5, duration: 1.316s, ep

   7200/150000: episode: 36, duration: 1.395s, episode steps: 200, steps per second: 143, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.010 [0.000, 2.000],  loss: 4.250932, mae: 26.688129, mean_q: -39.531445
   7400/150000: episode: 37, duration: 1.422s, episode steps: 200, steps per second: 141, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.980 [0.000, 2.000],  loss: 3.937739, mae: 27.099297, mean_q: -40.117085
   7600/150000: episode: 38, duration: 1.374s, episode steps: 200, steps per second: 146, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.090 [0.000, 2.000],  loss: 3.950129, mae: 27.450401, mean_q: -40.555809
   7800/150000: episode: 39, duration: 1.381s, episode steps: 200, steps per second: 145, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.050 [0.000, 2.000],  loss: 3.305066, mae: 27.903048, mean_q: -41.356415
   8000/150000: episode: 40, duratio

  14000/150000: episode: 70, duration: 1.928s, episode steps: 200, steps per second: 104, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.060 [0.000, 2.000],  loss: 4.693427, mae: 35.217049, mean_q: -52.313652
  14200/150000: episode: 71, duration: 1.945s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.090 [0.000, 2.000],  loss: 7.421144, mae: 35.383717, mean_q: -52.398514
  14400/150000: episode: 72, duration: 1.903s, episode steps: 200, steps per second: 105, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.205 [0.000, 2.000],  loss: 5.477354, mae: 35.538548, mean_q: -52.675816
  14600/150000: episode: 73, duration: 1.945s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.055 [0.000, 2.000],  loss: 7.124365, mae: 35.651695, mean_q: -52.805889
  14800/150000: episode: 74, duratio

  20800/150000: episode: 104, duration: 1.874s, episode steps: 200, steps per second: 107, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.935 [0.000, 2.000],  loss: 9.403881, mae: 39.088802, mean_q: -57.776768
  21000/150000: episode: 105, duration: 1.971s, episode steps: 200, steps per second: 101, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.890 [0.000, 2.000],  loss: 7.847235, mae: 39.129921, mean_q: -57.987656
  21200/150000: episode: 106, duration: 1.947s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.015 [0.000, 2.000],  loss: 10.902219, mae: 39.077709, mean_q: -57.683800
  21400/150000: episode: 107, duration: 1.836s, episode steps: 200, steps per second: 109, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.115 [0.000, 2.000],  loss: 7.019603, mae: 39.000702, mean_q: -57.860695
  21600/150000: episode: 108, d

  27600/150000: episode: 138, duration: 1.971s, episode steps: 200, steps per second: 101, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.000 [0.000, 2.000],  loss: 8.565435, mae: 40.452793, mean_q: -60.042198
  27800/150000: episode: 139, duration: 2.011s, episode steps: 200, steps per second:  99, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.980 [0.000, 2.000],  loss: 10.492897, mae: 40.455051, mean_q: -59.954678
  28000/150000: episode: 140, duration: 1.950s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.100 [0.000, 2.000],  loss: 6.526506, mae: 40.610634, mean_q: -60.322617
  28200/150000: episode: 141, duration: 1.960s, episode steps: 200, steps per second: 102, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.085 [0.000, 2.000],  loss: 10.573242, mae: 40.655262, mean_q: -60.119755
  28400/150000: episode: 142, 

  34400/150000: episode: 172, duration: 1.959s, episode steps: 200, steps per second: 102, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.020 [0.000, 2.000],  loss: 6.791001, mae: 41.648418, mean_q: -61.924911
  34600/150000: episode: 173, duration: 1.962s, episode steps: 200, steps per second: 102, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.980 [0.000, 2.000],  loss: 6.834288, mae: 41.872997, mean_q: -62.198589
  34800/150000: episode: 174, duration: 1.953s, episode steps: 200, steps per second: 102, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.905 [0.000, 2.000],  loss: 8.412276, mae: 42.006851, mean_q: -62.392967
  35000/150000: episode: 175, duration: 2.026s, episode steps: 200, steps per second:  99, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.055 [0.000, 2.000],  loss: 13.170584, mae: 41.950665, mean_q: -61.975292
  35200/150000: episode: 176, d

  41200/150000: episode: 206, duration: 2.208s, episode steps: 200, steps per second:  91, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.020 [0.000, 2.000],  loss: 8.284353, mae: 41.906727, mean_q: -62.225292
  41400/150000: episode: 207, duration: 2.123s, episode steps: 200, steps per second:  94, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.155 [0.000, 2.000],  loss: 10.326740, mae: 41.873096, mean_q: -61.965878
  41600/150000: episode: 208, duration: 2.251s, episode steps: 200, steps per second:  89, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.010 [0.000, 2.000],  loss: 9.155169, mae: 41.733654, mean_q: -61.848507
  41800/150000: episode: 209, duration: 2.202s, episode steps: 200, steps per second:  91, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.045 [0.000, 2.000],  loss: 7.180951, mae: 41.730968, mean_q: -61.933090
  42000/150000: episode: 210, d

  48000/150000: episode: 240, duration: 2.219s, episode steps: 200, steps per second:  90, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.105 [0.000, 2.000],  loss: 8.670348, mae: 40.990143, mean_q: -60.771217
  48200/150000: episode: 241, duration: 2.206s, episode steps: 200, steps per second:  91, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.135 [0.000, 2.000],  loss: 11.419734, mae: 40.925201, mean_q: -60.493927
  48400/150000: episode: 242, duration: 2.147s, episode steps: 200, steps per second:  93, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.965 [0.000, 2.000],  loss: 9.410836, mae: 40.762085, mean_q: -60.440289
  48600/150000: episode: 243, duration: 2.212s, episode steps: 200, steps per second:  90, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.060 [0.000, 2.000],  loss: 8.089491, mae: 40.711414, mean_q: -60.367393
  48800/150000: episode: 244, d

  54800/150000: episode: 274, duration: 2.239s, episode steps: 200, steps per second:  89, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.005 [0.000, 2.000],  loss: 8.973886, mae: 39.968472, mean_q: -59.242989
  55000/150000: episode: 275, duration: 2.242s, episode steps: 200, steps per second:  89, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.150 [0.000, 2.000],  loss: 7.536655, mae: 39.868748, mean_q: -59.173634
  55200/150000: episode: 276, duration: 2.242s, episode steps: 200, steps per second:  89, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.065 [0.000, 2.000],  loss: 7.199480, mae: 39.977833, mean_q: -59.383984
  55400/150000: episode: 277, duration: 2.224s, episode steps: 200, steps per second:  90, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.035 [0.000, 2.000],  loss: 8.508230, mae: 40.113705, mean_q: -59.444317
  55600/150000: episode: 278, du

  61600/150000: episode: 308, duration: 2.298s, episode steps: 200, steps per second:  87, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.025 [0.000, 2.000],  loss: 9.530616, mae: 40.186520, mean_q: -59.561241
  61800/150000: episode: 309, duration: 2.342s, episode steps: 200, steps per second:  85, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.935 [0.000, 2.000],  loss: 8.236637, mae: 40.301826, mean_q: -59.792305
  62000/150000: episode: 310, duration: 2.175s, episode steps: 200, steps per second:  92, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.090 [0.000, 2.000],  loss: 10.170484, mae: 40.248016, mean_q: -59.508842
  62200/150000: episode: 311, duration: 1.937s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.035 [0.000, 2.000],  loss: 8.725368, mae: 40.309608, mean_q: -59.717850
  62400/150000: episode: 312, d

  68400/150000: episode: 342, duration: 2.122s, episode steps: 200, steps per second:  94, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.035 [0.000, 2.000],  loss: 5.345106, mae: 38.373234, mean_q: -56.948574
  68600/150000: episode: 343, duration: 2.142s, episode steps: 200, steps per second:  93, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.105 [0.000, 2.000],  loss: 9.482738, mae: 38.301041, mean_q: -56.671459
  68800/150000: episode: 344, duration: 2.532s, episode steps: 200, steps per second:  79, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.075 [0.000, 2.000],  loss: 7.829176, mae: 38.334564, mean_q: -56.855400
  69000/150000: episode: 345, duration: 2.642s, episode steps: 200, steps per second:  76, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.065 [0.000, 2.000],  loss: 7.838324, mae: 38.299213, mean_q: -56.705887
  69200/150000: episode: 346, du

  75200/150000: episode: 376, duration: 2.043s, episode steps: 200, steps per second:  98, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.015 [0.000, 2.000],  loss: 9.519953, mae: 38.989506, mean_q: -57.645351
  75400/150000: episode: 377, duration: 2.189s, episode steps: 200, steps per second:  91, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.875 [0.000, 2.000],  loss: 8.571259, mae: 38.945423, mean_q: -57.572906
  75600/150000: episode: 378, duration: 2.132s, episode steps: 200, steps per second:  94, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.035 [0.000, 2.000],  loss: 6.646763, mae: 38.790565, mean_q: -57.542206
  75800/150000: episode: 379, duration: 2.170s, episode steps: 200, steps per second:  92, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.990 [0.000, 2.000],  loss: 8.209053, mae: 38.788769, mean_q: -57.412811
  76000/150000: episode: 380, du

  82000/150000: episode: 410, duration: 2.081s, episode steps: 200, steps per second:  96, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.975 [0.000, 2.000],  loss: 6.327131, mae: 35.001495, mean_q: -51.826359
  82200/150000: episode: 411, duration: 1.971s, episode steps: 200, steps per second: 101, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.155 [0.000, 2.000],  loss: 5.656868, mae: 35.130318, mean_q: -52.060196
  82400/150000: episode: 412, duration: 2.042s, episode steps: 200, steps per second:  98, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.980 [0.000, 2.000],  loss: 5.990932, mae: 35.476608, mean_q: -52.626591
  82600/150000: episode: 413, duration: 2.126s, episode steps: 200, steps per second:  94, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.715 [0.000, 2.000],  loss: 6.908877, mae: 35.611637, mean_q: -52.602974
  82800/150000: episode: 414, du

  88800/150000: episode: 444, duration: 2.162s, episode steps: 200, steps per second:  93, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.275 [0.000, 2.000],  loss: 5.448565, mae: 32.920799, mean_q: -48.470226
  89000/150000: episode: 445, duration: 2.095s, episode steps: 200, steps per second:  95, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.315 [0.000, 2.000],  loss: 3.746906, mae: 32.873989, mean_q: -48.423981
  89200/150000: episode: 446, duration: 2.247s, episode steps: 200, steps per second:  89, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.240 [0.000, 2.000],  loss: 3.676062, mae: 32.701759, mean_q: -48.169388
  89400/150000: episode: 447, duration: 2.229s, episode steps: 200, steps per second:  90, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.295 [0.000, 2.000],  loss: 3.676231, mae: 32.477772, mean_q: -47.882835
  89600/150000: episode: 448, du

In [None]:
# Persist the trained weights, replacing any existing file of the same name.
dqn.save_weights('model.mdl', overwrite=True)

# Run five rendered evaluation episodes. The render window opened by
# visualize=True is released in `finally` so it does not linger if the
# evaluation is interrupted or raises.
try:
    dqn.test(env, nb_episodes=5, visualize=True)
finally:
    env.close()