DQN on MountainCar-v0 (copied from the CartPole example)

In [21]:
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy, BoltzmannQPolicy
from rl.memory import SequentialMemory

In [2]:
# Fixed seed for the stochastic parts of the run. This seeds NumPy's global RNG
# and the environment only; TensorFlow and Python's `random` module are not
# seeded here, so runs are not guaranteed to be bit-identical.
SEED = 123
np.random.seed(SEED)

# Environment the agent is trained and evaluated on.
env = gym.make("MountainCar-v0")
env.seed(SEED);  # trailing ';' keeps the returned seed list out of the cell output

In [14]:
# Shape tuple of the environment's observation space; the model cell appends it
# to a leading (1,) to form the network's input shape.
nb_obs = env.observation_space.shape
nb_obs

(2,)

In [15]:
# Number of discrete actions; used as the width of the model's output layer and
# passed to the agent as nb_actions.
nb_act = env.action_space.n
nb_act

3

In [16]:
# Q-network: flatten the (1,) + observation-shape input, pass it through four
# identical 256-unit ReLU layers, and finish with one linear output per action.
model = Sequential(
    [
        Flatten(input_shape=(1,) + nb_obs),
        # Four hidden layers, created in order so the stack matches a
        # hand-written list of four Dense(256, relu) entries.
        *(Dense(256, activation="relu") for _ in range(4)),
        Dense(nb_act),
        Activation('linear'),
    ]
)

In [22]:
# Exploration policy actually used by the agent. Previously an unused
# EpsGreedyQPolicy was bound to `policy` while a separate BoltzmannQPolicy was
# passed inline; `policy` now names the object the agent really receives.
policy = BoltzmannQPolicy()

# Replay memory holding up to 50000 entries. window_length=1 presumably pairs
# with the leading 1 in the model's input shape -- confirm if either changes.
memory = SequentialMemory(limit=50000, window_length=1)

dqn = DQNAgent(
    model=model,
    nb_actions=nb_act,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)

# `learning_rate` is the current keyword; `lr` is a deprecated alias.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])



In [23]:
# Train the agent for 10,000 steps. Rendering is switched off (visualize=False)
# because visualizing training slows it down quite a lot. With verbose=2 the
# log shows one summary line per episode.
# The returned History object is kept so it can be inspected afterwards,
# instead of being echoed as a bare repr.
train_history = dqn.fit(env, nb_steps=10_000, visualize=False, verbose=2)

Training for 10000 steps ...




  200/10000: episode: 1, duration: 2.218s, episode steps: 200, steps per second:  90, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.125 [0.000, 2.000],  loss: 0.058594, mae: 34.985944, mean_q: -52.278669
  400/10000: episode: 2, duration: 1.703s, episode steps: 200, steps per second: 117, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.205 [0.000, 2.000],  loss: 3.985336, mae: 35.423386, mean_q: -52.336426
  600/10000: episode: 3, duration: 1.679s, episode steps: 200, steps per second: 119, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.120 [0.000, 2.000],  loss: 5.391562, mae: 35.553185, mean_q: -52.532188
  800/10000: episode: 4, duration: 1.701s, episode steps: 200, steps per second: 118, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.070 [0.000, 2.000],  loss: 5.632994, mae: 35.751575, mean_q: -52.863682
 1000/10000: episode: 5, duration: 1.676s, episode s

 7200/10000: episode: 36, duration: 1.770s, episode steps: 200, steps per second: 113, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.020 [0.000, 2.000],  loss: 7.157710, mae: 37.121037, mean_q: -54.925102
 7400/10000: episode: 37, duration: 1.855s, episode steps: 200, steps per second: 108, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.950 [0.000, 2.000],  loss: 6.109393, mae: 37.346012, mean_q: -55.328224
 7600/10000: episode: 38, duration: 1.707s, episode steps: 200, steps per second: 117, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.020 [0.000, 2.000],  loss: 6.390052, mae: 37.558060, mean_q: -55.531738
 7800/10000: episode: 39, duration: 1.696s, episode steps: 200, steps per second: 118, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.110 [0.000, 2.000],  loss: 8.045536, mae: 37.818653, mean_q: -55.899200
 8000/10000: episode: 40, duration: 1.676s, epis

<tensorflow.python.keras.callbacks.History at 0x7fb14eaa41f0>

In [25]:
# Evaluate the trained agent over 100 episodes and report the mean episode
# reward. Rendering is disabled: with visualize=True this cell was interrupted
# before it finished, so the mean below was never computed. Re-enable rendering
# only with a small nb_episodes if you want to watch the agent.
results = dqn.test(env, nb_episodes=100, visualize=False)
np.mean(results.history["episode_reward"])

Testing for 100 episodes ...
Episode 1: reward: -200.000, steps: 200
Episode 2: reward: -200.000, steps: 200
Episode 3: reward: -200.000, steps: 200
Episode 4: reward: -200.000, steps: 200
Episode 5: reward: -200.000, steps: 200
Episode 6: reward: -200.000, steps: 200
Episode 7: reward: -200.000, steps: 200
Episode 8: reward: -200.000, steps: 200
Episode 9: reward: -200.000, steps: 200
Episode 10: reward: -200.000, steps: 200
Episode 11: reward: -200.000, steps: 200
Episode 12: reward: -200.000, steps: 200
Episode 13: reward: -200.000, steps: 200
Episode 14: reward: -200.000, steps: 200
Episode 15: reward: -200.000, steps: 200
Episode 16: reward: -200.000, steps: 200
Episode 17: reward: -200.000, steps: 200
Episode 18: reward: -200.000, steps: 200
Episode 19: reward: -200.000, steps: 200
Episode 20: reward: -200.000, steps: 200
Episode 21: reward: -200.000, steps: 200
Episode 22: reward: -200.000, steps: 200
Episode 23: reward: -200.000, steps: 200
Episode 24: reward: -200.000, steps: 

KeyboardInterrupt: 