In [1]:
import gym
import math
import time
import numpy as np
import random
from sklearn.preprocessing import KBinsDiscretizer
import seaborn as sns
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [2]:
# Seed the RNGs this notebook has in scope so a Restart & Run All is as
# repeatable as possible. Previously only the environment was seeded.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): TensorFlow/Keras weight initialisation uses its own RNG, which is
# not seeded here (tensorflow is not imported in this notebook) — add a TF seed
# if bit-for-bit reproducibility is required.

env = gym.make('FrozenLake-v0')
nb = env.action_space.n  # number of discrete actions available in the environment
env.seed(SEED)

[0]

In [22]:
# Q-network: takes one observation and outputs one value per action.
model = Sequential()
# The leading (1,) axis lines up with window_length=1 used by the replay memory.
# NOTE(review): the recorded summary shows the flattened input is a single scalar
# (None, 1), i.e. the raw state index is fed directly — a one-hot encoding may
# learn better; confirm before changing.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# Dense needs an explicit unit count (calling Dense() raises TypeError).
# 256 matches the recorded summary: hidden layer output (None, 256), 512 params.
model.add(Dense(256))
model.add(Activation('relu'))
# One linear output per action (nb = env.action_space.n).
model.add(Dense(nb))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 1)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               512       
_________________________________________________________________
activation_2 (Activation)    (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 1028      
_________________________________________________________________
activation_3 (Activation)    (None, 4)                 0         
Total params: 1,540
Trainable params: 1,540
Non-trainable params: 0
_________________________________________________________________
None


In [23]:
# Epsilon-greedy action selection with eps = 0.05.
policy = EpsGreedyQPolicy(eps=0.05)
# Replay memory capped at 50,000 entries; window_length=1 matches the (1,) + obs
# input shape the model was built with.
memory = SequentialMemory(limit=50000, window_length=1)

# Assemble the DQN agent around the Q-network built above.
# NOTE(review): nb_steps_warmup=10 is consistent with the training log, where
# loss/mae first appear after step 11 — confirm against keras-rl docs.
dqn = DQNAgent(
    model=model,
    nb_actions=nb,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)
optimizer = Adam(lr=1e-3)
dqn.compile(optimizer, metrics=['mae'])

# Train for 5000 environment steps; verbose=2 logs one line per episode.
# fit() returns a History object, which the notebook displays as the cell result.
dqn.fit(env, nb_steps=5000, visualize=False, verbose=2)

Training for 5000 steps ...
    3/5000: episode: 1, duration: 0.136s, episode steps:   3, steps per second:  22, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.667 [0.000, 1.000],  loss: --, mae: --, mean_q: --




   11/5000: episode: 2, duration: 0.645s, episode steps:   8, steps per second:  12, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.625 [0.000, 1.000],  loss: --, mae: --, mean_q: --
   13/5000: episode: 3, duration: 0.016s, episode steps:   2, steps per second: 126, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.500 [0.000, 1.000],  loss: 0.364427, mae: 0.479813, mean_q: 0.681401
   21/5000: episode: 4, duration: 0.048s, episode steps:   8, steps per second: 165, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.875 [0.000, 2.000],  loss: 0.313392, mae: 0.444113, mean_q: 0.352176
   31/5000: episode: 5, duration: 0.059s, episode steps:  10, steps per second: 169, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.800 [0.000, 3.000],  loss: 0.194617, mae: 0.370866, mean_q: 0.515822
   33/5000: episode: 6, duration: 0.016s, episode steps:   2, steps per second: 126, episode r



   53/5000: episode: 8, duration: 0.098s, episode steps:  18, steps per second: 183, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.278 [0.000, 2.000],  loss: 0.107267, mae: 0.375097, mean_q: 0.645677
   70/5000: episode: 9, duration: 0.091s, episode steps:  17, steps per second: 187, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.118 [0.000, 1.000],  loss: 0.073360, mae: 0.223100, mean_q: 0.470998
   83/5000: episode: 10, duration: 0.071s, episode steps:  13, steps per second: 183, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.692 [0.000, 3.000],  loss: 0.067824, mae: 0.215468, mean_q: 0.420105
   88/5000: episode: 11, duration: 0.033s, episode steps:   5, steps per second: 153, episode reward:  0.000, mean reward:  0.000 [ 0.000,  0.000], mean action: 0.000 [0.000, 0.000],  loss: 0.078125, mae: 0.330706, mean_q: 0.497121
   98/5000: episode: 12, duration: 0.061s, episode steps:  10, steps per s

<tensorflow.python.keras.callbacks.History at 0x27248462850>

In [24]:
# Run the trained agent for 100 evaluation episodes without rendering.
# The returned object exposes per-episode results via `.history`, read in the next cell.
d = dqn.test(
    env,
    nb_episodes=100,
    visualize=False,
)

Testing for 100 episodes ...
Episode 1: reward: 0.000, steps: 44
Episode 2: reward: 0.000, steps: 8
Episode 3: reward: 0.000, steps: 33
Episode 4: reward: 0.000, steps: 15
Episode 5: reward: 0.000, steps: 8
Episode 6: reward: 0.000, steps: 5
Episode 7: reward: 0.000, steps: 16
Episode 8: reward: 0.000, steps: 8
Episode 9: reward: 0.000, steps: 7
Episode 10: reward: 1.000, steps: 25
Episode 11: reward: 0.000, steps: 6
Episode 12: reward: 0.000, steps: 5
Episode 13: reward: 0.000, steps: 25
Episode 14: reward: 0.000, steps: 19
Episode 15: reward: 0.000, steps: 8
Episode 16: reward: 1.000, steps: 17
Episode 17: reward: 0.000, steps: 9
Episode 18: reward: 0.000, steps: 22
Episode 19: reward: 0.000, steps: 6
Episode 20: reward: 0.000, steps: 33
Episode 21: reward: 0.000, steps: 19
Episode 22: reward: 0.000, steps: 9
Episode 23: reward: 0.000, steps: 6
Episode 24: reward: 0.000, steps: 37
Episode 25: reward: 0.000, steps: 11
Episode 26: reward: 0.000, steps: 9
Episode 27: reward: 0.000, step

In [25]:
# Total reward over all test episodes. The test log shows each episode scoring
# 0.000 or 1.000, so this total reads as the number of rewarded episodes.
np.asarray(d.history['episode_reward']).sum()

7.0