# DQN with TensorFlow & gym
<br>
In this notebook we'll build our very first Deep Q-Network using the TensorFlow deep learning framework and the OpenAI Gym <a href='https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py'>CartPole-v0</a> environment. <br><br><br>


In [1]:
import gym
import random
import numpy as np
import tensorflow as tf

In [2]:
# Reset TensorFlow's global default graph before any network is built.
# NOTE(review): presumably here so that re-running the notebook cells does not
# pile new ops/variables on top of a previously built graph -- confirm.
tf.reset_default_graph()

In [3]:
class DQN:
    """Fully connected Deep Q-Network (TF1 graph mode) with a replay memory.

    Each instance builds its own copy of the network inside a TensorFlow
    variable scope, so two instances (an online network and a target network)
    can live in the same graph and be synchronised with `update_graph`.
    """

    def __init__(self, input_shape=None, action_size=None, scope=None) -> None:
        """Build the Q-network graph and an empty replay memory.

        Args:
            input_shape: Placeholder shape for a batch of states, including
                the leading batch dimension. Defaults to ``[None, 4]``.
            action_size: Number of discrete actions, i.e. the width of the
                Q-value output layer. Required.
            scope: Variable-scope name for this network's variables. When
                omitted, TensorFlow generates a unique ``DQN``-based name.

        Raises:
            ValueError: If `action_size` is not provided.
        """
        if input_shape is None:
            input_shape = [None, 4]
        if action_size is None:
            # Fail here with a clear message instead of deep inside tf.one_hot.
            raise ValueError("action_size must be provided")

        # Replay memory: a list used as a ring buffer once it holds max_size
        # transitions (see `add`).
        self.max_size = 1000000
        self.mem = []
        self._write_index = 0

        # default_name lets scope=None (the declared default) work instead of
        # raising inside tf.variable_scope.
        with tf.variable_scope(scope, default_name="DQN") as var_scope:
            self.scope = scope if scope is not None else var_scope.name
            self.input_shape = input_shape
            self.action_size = action_size

            self.states = tf.placeholder(shape=input_shape, dtype=tf.float32)
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
            self.actions_onehot = tf.one_hot(
                self.actions, action_size, dtype=tf.float32)

            # Three ReLU hidden layers followed by a linear Q-value head.
            fc1 = tf.layers.dense(self.states, 256, activation=tf.nn.relu)
            fc2 = tf.layers.dense(fc1, 512, activation=tf.nn.relu)
            fc3 = tf.layers.dense(fc2, 512, activation=tf.nn.relu)
            self.q = tf.layers.dense(fc3, action_size, activation=None)

            self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)
            # Q-value of the action actually taken: the one-hot mask zeroes
            # every other action before summing over the action axis.
            # (The deprecated `keep_dims=False` argument is dropped; not
            # keeping the reduced dimension is the default.)
            self.responsible_output = tf.reduce_sum(
                tf.multiply(self.q, self.actions_onehot), axis=1)
            # Mean squared error between predicted and target Q-values.
            self.loss = tf.reduce_mean(
                tf.square(self.responsible_output - self.target_q))

            self.update_model = tf.train.AdamOptimizer().minimize(self.loss)

    def action(self, sess, state):
        """Return the index of the highest-valued action for one state.

        Args:
            sess: Session in which this network's variables are initialised.
            state: A single state without a batch dimension.
        """
        # The placeholder expects a batch, so add a leading axis of size 1.
        q = sess.run(
            self.q, feed_dict={self.states: np.asarray(state)[np.newaxis, ...]})
        return int(np.argmax(q))

    def train(self, sess, batch, learning_rate, tnet):
        """Run one optimisation step on a batch of transitions.

        Args:
            sess: Session in which both networks' variables are initialised.
            batch: Sequence or object array of rows
                ``[state, action, reward, next_state, done]``.
            learning_rate: Despite the name, this is the discount factor
                applied to the bootstrapped next-state value; the optimizer
                step size is Adam's default.
            tnet: Network used to evaluate the next states (target network).

        Raises:
            ValueError: If `batch` is empty.
        """
        # Rows mix arrays and scalars, so the batch must be an object array.
        batch = np.asarray(batch, dtype=object)
        if len(batch) == 0:
            raise ValueError("batch must contain at least one transition")

        # Cast every column explicitly instead of feeding object arrays to
        # typed placeholders.
        states = np.vstack(batch[:, 0])
        actions = batch[:, 1].astype(np.int32)
        rewards = batch[:, 2].astype(np.float32)
        next_states = np.vstack(batch[:, 3])
        not_done = 1.0 - batch[:, 4].astype(np.float32)

        next_q = sess.run(tnet.q, feed_dict={tnet.states: next_states})

        # Target: r + discount * max_a' Q_target(s', a'), with the bootstrap
        # term zeroed for transitions flagged as done.
        targets = rewards + not_done * learning_rate * np.amax(next_q, axis=1)

        sess.run(self.update_model, feed_dict={self.states: states,
                                               self.actions: actions,
                                               self.target_q: targets})

    def add(self, element):
        """Store one transition, overwriting the oldest once memory is full."""
        if len(self.mem) < self.max_size:
            self.mem.append(element)
            return
        # Ring-buffer overwrite: O(1), unlike list.pop(0) which shifts every
        # remaining element. `_write_index` always points at the oldest entry.
        slot = self._write_index % self.max_size
        self.mem[slot] = element
        self._write_index = (slot + 1) % self.max_size

    def sample(self, size):
        """Return up to `size` distinct transitions drawn uniformly at random."""
        return random.sample(self.mem, min(size, len(self.mem)))

    def update_graph(self, from_graph, to_graph) -> list:
        """Build ops that copy trainable variables between two scopes.

        Args:
            from_graph: Scope name whose trainable variables are the source.
            to_graph: Scope name whose trainable variables are overwritten.

        Returns:
            A list of assign ops; run them in a session to perform the copy.
        """
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_graph)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_graph)

        # Variables are paired by position, so both scopes must have created
        # them in the same order; zip stops at the shorter list.
        return [dst.assign(src) for src, dst in zip(from_vars, to_vars)]

In [4]:
# Hyper-parameters.
target_update = 200     # sync the target network every this many env steps
epsilon_max = 1.0       # exploration rate at the first step
epsilon_min = 0.01      # floor the exploration rate decays towards
epsilon_decay = 0.001   # exponential decay rate per env step
learning_rate = 0.99    # NOTE: DQN.train uses this as the discount factor
batch_size = 64
EPISODES = 50

env = gym.make('CartPole-v0')
action_size = env.action_space.n
input_shape = [None] + list(env.observation_space.shape)

time_step = 0.0
epsilon = epsilon_max

# Q_Network picks actions and is optimised; Train_Network supplies the
# next-state values for the targets and is refreshed via `update_ops`.
Q_Network = DQN(input_shape=input_shape,
                action_size=action_size, scope='Q_Network')
Train_Network = DQN(input_shape=input_shape,
                    action_size=action_size, scope='Train_Network')

update_ops = Q_Network.update_graph('Q_Network', 'Train_Network')

saver = tf.train.Saver()

# try/finally so the environment is closed even if training raises or is
# interrupted.
try:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(EPISODES):
            total_reward = 0
            state = env.reset()
            while True:
                env.render()

                # Epsilon-greedy action selection.
                if np.random.rand() < epsilon:
                    action = np.random.randint(action_size)
                else:
                    action = Q_Network.action(sess, state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward

                Q_Network.add([state, action, reward, next_state, done])

                # Decay epsilon exponentially from epsilon_max to epsilon_min.
                time_step += 1.
                epsilon = epsilon_min + \
                    (epsilon_max - epsilon_min) * \
                    np.exp(-epsilon_decay * time_step)

                # Each transition mixes arrays and scalars; dtype=object is
                # required because NumPy >= 1.24 refuses to build a ragged
                # array implicitly.
                batch = np.array(Q_Network.sample(batch_size), dtype=object)
                Q_Network.train(sess, batch, learning_rate, Train_Network)

                state = np.copy(next_state)

                # Periodically refresh the target network and checkpoint.
                if int(time_step) % target_update == 0:
                    sess.run(update_ops)
                    saver.save(sess, "./checkpoints/CartPole_DQN.ckpt")

                if done:
                    print('Epoch:', epoch, 'Total Rewards:', total_reward)
                    break
finally:
    env.close()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Epoch: 0 Total Rewards: 16.0
Epoch: 1 Total Rewards: 11.0
Epoch: 2 Total Rewards: 13.0
Epoch: 3 Total Rewards: 16.0
Epoch: 4 Total Rewards: 33.0
Epoch: 5 Total Rewards: 23.0
Epoch: 6 Total Rewards: 12.0
Epoch: 7 Total Rewards: 15.0
Epoch: 8 Total Rewards: 21.0
Epoch: 9 Total Rewards: 27.0
Epoch: 10 Total Rewards: 12.0
Epoch: 11 Total Rewards: 25.0
Epoch: 12 Total Rewards: 19.0
Epoch: 13 Total Rewards: 16.0
Epoch: 14 Total Rewards: 19.0
Epoch: 15 Total Rewards: 28.0
Epoch: 16 Total Rewards: 12.0
Epoch: 17 Total Rewards: 16.0
Epoch: 18 Total Rewards: 17.0
Epoch: 19 Total Rewards: 15.0
Epoch: 20 Total Rewards: 27.0
Epoch: 21 Total Rewards: 12.0
Epoch: 22 Total Rewards: 23.0
Epoch: 23 Total Rewards: 17.0
Epoch: 24 Total Rewards: 18.0
Epoch: 25 Total Rewards: 31.0
Epoch: 26 Total Rewards: 32.0
Epoch: 27 Total Rewards: 16.0
Epoch: 28 Total Rewards: 68.0
Epoch: 29 Total Rewards: 27.0
Epoch: 30 Total Rewards: 8.0
Epoch: 3