https://bi3mer.github.io/blog/post_18/index.html

In [9]:
import gym
# Gym id of the environment the agent in this notebook is trained and tested on.
ENV_NAME = 'MountainCar-v0'
env = gym.make(ENV_NAME)

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

# Q-network: flatten the windowed observation, run it through three ReLU
# hidden layers, and emit one linear output per discrete action.
hidden_layer_sizes = (128, 64, 32)

model = Sequential()
# The leading 1 is the window axis; it lines up with window_length=1 on the
# SequentialMemory used when the agent is built.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for units in hidden_layer_sizes:
    model.add(Dense(units))
    model.add(Activation('relu'))
# Output head: one unit per action, left linear.
model.add(Dense(env.action_space.n))
model.add(Activation('linear'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 2)                 0         
_________________________________________________________________
dense (Dense)                (None, 128)               384       
_________________________________________________________________
activation (Activation)      (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
activation_2 (Activation)    (None, 32)                0

In [11]:
from rl.memory import SequentialMemory
from rl.policy import BoltzmannQPolicy
from rl.agents import DQNAgent

# Build the DQN agent around the Q-network `model` and the gym environment's
# discrete action space.
dqn = DQNAgent(
    model=model,
    nb_actions=env.action_space.n,
    # Replay memory capped at 50000 entries; window_length=1 matches the
    # leading axis of the network's Flatten input shape.
    memory=SequentialMemory(limit=50000, window_length=1),
    nb_steps_warmup=10,
    # NOTE(review): per the keras-rl docs a value below 1 selects a soft target
    # network update at this rate rather than a hard copy every N steps - confirm.
    target_model_update=1e-2,
    policy=BoltzmannQPolicy()
)

# `lr` is a deprecated alias on tf.keras optimizers (it warns on TF 2.x and is
# rejected by newer Keras releases); `learning_rate` is the supported keyword.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])




In [None]:

# Total number of environment steps to train for.
TRAINING_STEPS = 150000

# Rendering is off during training; verbose=2 produces the per-episode log
# lines shown in this cell's output.
dqn.fit(
    env,
    nb_steps=TRAINING_STEPS,
    visualize=False,
    verbose=2,
)

Training for 150000 steps ...




    200/150000: episode: 1, duration: 1.886s, episode steps: 200, steps per second: 106, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.990 [0.000, 2.000],  loss: 0.069910, mae: 0.804387, mean_q: -0.994688
    400/150000: episode: 2, duration: 1.330s, episode steps: 200, steps per second: 150, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.995 [0.000, 2.000],  loss: 0.012567, mae: 1.743582, mean_q: -2.554291
    600/150000: episode: 3, duration: 1.322s, episode steps: 200, steps per second: 151, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.025 [0.000, 2.000],  loss: 0.025732, mae: 2.846496, mean_q: -4.193378
    800/150000: episode: 4, duration: 1.310s, episode steps: 200, steps per second: 153, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.945 [0.000, 2.000],  loss: 0.057200, mae: 3.982826, mean_q: -5.873297
   1000/150000: episode: 5, duration: 1.316s, ep

   7200/150000: episode: 36, duration: 1.395s, episode steps: 200, steps per second: 143, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.010 [0.000, 2.000],  loss: 4.250932, mae: 26.688129, mean_q: -39.531445
   7400/150000: episode: 37, duration: 1.422s, episode steps: 200, steps per second: 141, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.980 [0.000, 2.000],  loss: 3.937739, mae: 27.099297, mean_q: -40.117085
   7600/150000: episode: 38, duration: 1.374s, episode steps: 200, steps per second: 146, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.090 [0.000, 2.000],  loss: 3.950129, mae: 27.450401, mean_q: -40.555809
   7800/150000: episode: 39, duration: 1.381s, episode steps: 200, steps per second: 145, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.050 [0.000, 2.000],  loss: 3.305066, mae: 27.903048, mean_q: -41.356415
   8000/150000: episode: 40, duratio

  14000/150000: episode: 70, duration: 1.928s, episode steps: 200, steps per second: 104, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.060 [0.000, 2.000],  loss: 4.693427, mae: 35.217049, mean_q: -52.313652
  14200/150000: episode: 71, duration: 1.945s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.090 [0.000, 2.000],  loss: 7.421144, mae: 35.383717, mean_q: -52.398514
  14400/150000: episode: 72, duration: 1.903s, episode steps: 200, steps per second: 105, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.205 [0.000, 2.000],  loss: 5.477354, mae: 35.538548, mean_q: -52.675816
  14600/150000: episode: 73, duration: 1.945s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.055 [0.000, 2.000],  loss: 7.124365, mae: 35.651695, mean_q: -52.805889
  14800/150000: episode: 74, duratio

  20800/150000: episode: 104, duration: 1.874s, episode steps: 200, steps per second: 107, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.935 [0.000, 2.000],  loss: 9.403881, mae: 39.088802, mean_q: -57.776768
  21000/150000: episode: 105, duration: 1.971s, episode steps: 200, steps per second: 101, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.890 [0.000, 2.000],  loss: 7.847235, mae: 39.129921, mean_q: -57.987656
  21200/150000: episode: 106, duration: 1.947s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.015 [0.000, 2.000],  loss: 10.902219, mae: 39.077709, mean_q: -57.683800
  21400/150000: episode: 107, duration: 1.836s, episode steps: 200, steps per second: 109, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.115 [0.000, 2.000],  loss: 7.019603, mae: 39.000702, mean_q: -57.860695
  21600/150000: episode: 108, d

  27600/150000: episode: 138, duration: 1.971s, episode steps: 200, steps per second: 101, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.000 [0.000, 2.000],  loss: 8.565435, mae: 40.452793, mean_q: -60.042198
  27800/150000: episode: 139, duration: 2.011s, episode steps: 200, steps per second:  99, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.980 [0.000, 2.000],  loss: 10.492897, mae: 40.455051, mean_q: -59.954678
  28000/150000: episode: 140, duration: 1.950s, episode steps: 200, steps per second: 103, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.100 [0.000, 2.000],  loss: 6.526506, mae: 40.610634, mean_q: -60.322617
  28200/150000: episode: 141, duration: 1.960s, episode steps: 200, steps per second: 102, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.085 [0.000, 2.000],  loss: 10.573242, mae: 40.655262, mean_q: -60.119755
  28400/150000: episode: 142, 

  34400/150000: episode: 172, duration: 1.959s, episode steps: 200, steps per second: 102, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.020 [0.000, 2.000],  loss: 6.791001, mae: 41.648418, mean_q: -61.924911
  34600/150000: episode: 173, duration: 1.962s, episode steps: 200, steps per second: 102, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.980 [0.000, 2.000],  loss: 6.834288, mae: 41.872997, mean_q: -62.198589
  34800/150000: episode: 174, duration: 1.953s, episode steps: 200, steps per second: 102, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.905 [0.000, 2.000],  loss: 8.412276, mae: 42.006851, mean_q: -62.392967
  35000/150000: episode: 175, duration: 2.026s, episode steps: 200, steps per second:  99, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.055 [0.000, 2.000],  loss: 13.170584, mae: 41.950665, mean_q: -61.975292
  35200/150000: episode: 176, d

In [None]:
# Save the trained agent's weights; overwrite=True is passed so an existing
# file at this path does not block the save.
WEIGHTS_PATH = 'model.mdl'
dqn.save_weights(WEIGHTS_PATH, overwrite=True)

# Run five test episodes with rendering turned on.
dqn.test(env, nb_episodes=5, visualize=True)