In [18]:
import random
import numpy as np
import gym
import sys
import h5py
from tensorflow import keras
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

## Environment

In [19]:
env = gym.make('Blackjack-v1')
state = env.reset()
env.seed(100)

states = env.observation_space
actions = env.action_space.n

In [20]:
print(states)
print(actions)

Tuple(Discrete(32), Discrete(11), Discrete(2))
2


In [21]:
print(state)
print(env.action_space)
print(env.observation_space.shape)
print(env.observation_space.sample())
print(env.action_space.n)

(10, 9, False)
Discrete(2)
None
(23, 9, 1)
2


In [22]:
episodes = 10
for i in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        # env.render()
        action=random.choice([0,1])
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(i, score))


Episode:1 Score:-1.0
Episode:2 Score:-1.0
Episode:3 Score:0.0
Episode:4 Score:-1.0
Episode:5 Score:1.0
Episode:6 Score:-1.0
Episode:7 Score:1.0
Episode:8 Score:-1.0
Episode:9 Score:-1.0
Episode:10 Score:1.0


## Model

In [29]:
model = keras.Sequential()
model.add(keras.Input(shape=(1, 3)))
model.add(keras.layers.Dense(36, activation='relu'))
model.add(keras.layers.Dense(18, activation='relu'))
model.add(keras.layers.Dense(6, activation='relu'))
model.add(keras.layers.Dense(actions, activation='linear'))
model.add(keras.layers.Flatten())
print(model.summary())



Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 1, 36)             144       
_________________________________________________________________
dense_10 (Dense)             (None, 1, 18)             666       
_________________________________________________________________
dense_11 (Dense)             (None, 1, 6)              114       
_________________________________________________________________
dense_12 (Dense)             (None, 1, 2)              14        
_________________________________________________________________
flatten_3 (Flatten)          (None, 2)                 0         
Total params: 938
Trainable params: 938
Non-trainable params: 0
_________________________________________________________________
None


## Deep Q Training

In [30]:
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model, memory=memory ,policy=policy, nb_steps_warmup=10, 
                nb_actions=actions, target_model_update=1e-2)


dqn.compile(keras.optimizers.Adam(learning_rate=1e-3), metrics=['mae'])

dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
6710 episodes - episode_reward: -0.095 [-1.000, 1.000] - loss: 0.257 - mae: 0.553 - mean_q: -0.028

Interval 2 (10000 steps performed)
6666 episodes - episode_reward: -0.110 [-1.000, 1.000] - loss: 0.246 - mae: 0.554 - mean_q: -0.037

Interval 3 (20000 steps performed)
6636 episodes - episode_reward: -0.103 [-1.000, 1.000] - loss: 0.244 - mae: 0.557 - mean_q: -0.040

Interval 4 (30000 steps performed)
6576 episodes - episode_reward: -0.089 [-1.000, 1.000] - loss: 0.241 - mae: 0.557 - mean_q: -0.046

Interval 5 (40000 steps performed)
done, took 495.586 seconds


<keras.callbacks.History at 0x7f96250026d0>

In [31]:
dqn.test(env, nb_episodes=1000, visualize=False)



Testing for 1000 episodes ...
Episode 1: reward: 1.000, steps: 1
Episode 2: reward: -1.000, steps: 1
Episode 3: reward: 1.000, steps: 2
Episode 4: reward: 0.000, steps: 1
Episode 5: reward: 0.000, steps: 2
Episode 6: reward: 0.000, steps: 1
Episode 7: reward: -1.000, steps: 1
Episode 8: reward: 1.000, steps: 1
Episode 9: reward: -1.000, steps: 1
Episode 10: reward: 1.000, steps: 1
Episode 11: reward: -1.000, steps: 1
Episode 12: reward: -1.000, steps: 1
Episode 13: reward: 0.000, steps: 1
Episode 14: reward: -1.000, steps: 2
Episode 15: reward: -1.000, steps: 3
Episode 16: reward: -1.000, steps: 1
Episode 17: reward: 1.000, steps: 1
Episode 18: reward: -1.000, steps: 2
Episode 19: reward: -1.000, steps: 1
Episode 20: reward: 0.000, steps: 3
Episode 21: reward: 1.000, steps: 1
Episode 22: reward: 1.000, steps: 1
Episode 23: reward: 1.000, steps: 1
Episode 24: reward: -1.000, steps: 1
Episode 25: reward: -1.000, steps: 1
Episode 26: reward: -1.000, steps: 1
Episode 27: reward: -1.000, st

<keras.callbacks.History at 0x7f9624fc1670>

In [42]:
episodes = 1000
wincounter = 0
for i in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    action = np.argmax(dqn.forward(state))
    while not done:
        # env.render()
        
        n_state, reward, done, info = env.step(action)
        score += reward
    if score == 1:
        wincounter += 1
    print('Episode:{} Score:{}'.format(i, score))
print('Wins:{} Total: 1000'.format(wincounter))

Episode:1 Score:-1.0
Episode:2 Score:1.0
Episode:3 Score:1.0
Episode:4 Score:1.0
Episode:5 Score:1.0
Episode:6 Score:-1.0
Episode:7 Score:1.0
Episode:8 Score:1.0
Episode:9 Score:0.0
Episode:10 Score:1.0
Episode:11 Score:-1.0
Episode:12 Score:0.0
Episode:13 Score:1.0
Episode:14 Score:-1.0
Episode:15 Score:-1.0
Episode:16 Score:1.0
Episode:17 Score:-1.0
Episode:18 Score:0.0
Episode:19 Score:-1.0
Episode:20 Score:1.0
Episode:21 Score:-1.0
Episode:22 Score:-1.0
Episode:23 Score:1.0
Episode:24 Score:-1.0
Episode:25 Score:1.0
Episode:26 Score:-1.0
Episode:27 Score:-1.0
Episode:28 Score:-1.0
Episode:29 Score:-1.0
Episode:30 Score:-1.0
Episode:31 Score:-1.0
Episode:32 Score:1.0
Episode:33 Score:1.0
Episode:34 Score:-1.0
Episode:35 Score:1.0
Episode:36 Score:-1.0
Episode:37 Score:-1.0
Episode:38 Score:-1.0
Episode:39 Score:-1.0
Episode:40 Score:1.0
Episode:41 Score:-1.0
Episode:42 Score:1.0
Episode:43 Score:1.0
Episode:44 Score:-1.0
Episode:45 Score:1.0
Episode:46 Score:1.0
Episode:47 Score:-1.