In [8]:
import random
import numpy as np
import gym
import sys
import h5py
from tensorflow import keras
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

## Environment

In [28]:
env = gym.make('CartPole-v0')
state = env.reset()
env.seed(100)

states = env.observation_space.shape[0]
actions = env.action_space.n

In [29]:
print(state)
print(env.action_space)
print(env.observation_space)
print(env.observation_space.shape[0])

[ 0.02756547 -0.04249852  0.04271414  0.04248675]
Discrete(2)
Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
4


In [15]:
episodes = 10
for i in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action=random.choice([0,1])
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score{}'.format(i, score))


Episode:1 Score26.0
Episode:2 Score19.0
Episode:3 Score13.0
Episode:4 Score29.0
Episode:5 Score14.0
Episode:6 Score18.0
Episode:7 Score15.0
Episode:8 Score15.0
Episode:9 Score22.0
Episode:10 Score22.0


## Model

In [18]:
model = keras.Sequential()
model.add(keras.Input(shape=(1, states)))
model.add(keras.layers.Dense(24, activation='relu'))
model.add(keras.layers.Dense(12, activation='relu'))
model.add(keras.layers.Dense(actions, activation='linear'))
model.add(keras.layers.Flatten())
print(model.summary())



Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 1, 24)             120       
_________________________________________________________________
dense_10 (Dense)             (None, 1, 12)             300       
_________________________________________________________________
dense_11 (Dense)             (None, 1, 2)              26        
_________________________________________________________________
flatten_3 (Flatten)          (None, 2)                 0         
Total params: 446
Trainable params: 446
Non-trainable params: 0
_________________________________________________________________
None


## Deep Q Training

In [22]:
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model, memory=memory ,policy=policy, nb_steps_warmup=10, 
                nb_actions=actions, target_model_update=1e-2)


dqn.compile(keras.optimizers.Adam(lr=1e-3), metrics=['mae'])

dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)



Training for 50000 steps ...
Interval 1 (0 steps performed)
    1/10000 [..............................] - ETA: 18:28 - reward: 1.0000



57 episodes - episode_reward: 173.526 [121.000, 200.000] - loss: 1.503 - mae: 24.174 - mean_q: 48.404

Interval 2 (10000 steps performed)
57 episodes - episode_reward: 176.509 [10.000, 200.000] - loss: 1.021 - mae: 28.593 - mean_q: 57.111

Interval 3 (20000 steps performed)
52 episodes - episode_reward: 191.385 [150.000, 200.000] - loss: 2.229 - mae: 30.256 - mean_q: 60.513

Interval 4 (30000 steps performed)
51 episodes - episode_reward: 196.647 [151.000, 200.000] - loss: 4.320 - mae: 32.734 - mean_q: 65.444

Interval 5 (40000 steps performed)
done, took 256.962 seconds


<keras.callbacks.History at 0x7fd72a54bd30>

In [23]:
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
Episode 1: reward: 185.000, steps: 185
Episode 2: reward: 179.000, steps: 179
Episode 3: reward: 192.000, steps: 192
Episode 4: reward: 189.000, steps: 189
Episode 5: reward: 179.000, steps: 179
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 189.000, steps: 189
Episode 8: reward: 196.000, steps: 196
Episode 9: reward: 199.000, steps: 199
Episode 10: reward: 185.000, steps: 185


<keras.callbacks.History at 0x7fd72a54b8b0>