In [2]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow.contrib.layers import flatten, conv2d, fully_connected
from collections import deque, Counter
import random
from datetime import datetime
import matplotlib.pyplot as plt
import math

In [None]:
# Selects which network variant q_network builds and the training loop drives:
# "Bootstrap" (shared first layer + several heads) or "DQN" (single network).
model_name = "Bootstrap"

In [None]:
# Register a CartPole variant under a custom id with a 2000-step episode cap.
# NOTE(review): reward_threshold=-110.0 looks like it was carried over from a
# different environment; confirm the value intended for this task.
# NOTE(review): some gym versions raise when an id is registered twice, so
# re-running this cell may require a kernel restart — verify for the gym
# version in use.
gym.envs.register(
    id='CartPole-v3',
    entry_point='gym.envs.classic_control:CartPoleEnv',
    max_episode_steps=2000,
    reward_threshold=-110.0,
)

In [None]:
# Instantiate the environment registered above under the id 'CartPole-v3'.
env = gym.make('CartPole-v3')
# Size of the action space; used as the width of every Q-network output layer
# and as the upper bound for random exploratory actions.
n_outputs = env.action_space.n

In [None]:
# Reset TensorFlow's global default graph before the network builder is defined.
tf.reset_default_graph()
def q_network(X, name_scope, model_name, n_heads=10):
    """Build a Q-value network under ``name_scope``.

    Args:
        X: Input tensor fed to the first fully connected layer.
        name_scope: Variable scope the layers are created in. The same prefix
            is stripped from variable names to form the keys of the returned
            dictionary, so two networks built under different scopes share
            identical keys.
        model_name: ``"Bootstrap"`` builds one shared hidden layer followed by
            ``n_heads`` independent two-layer heads; ``"DQN"`` builds a single
            three-layer network and records histogram summaries for each layer.
        n_heads: Number of heads created in ``"Bootstrap"`` mode. Ignored for
            ``"DQN"``. Defaults to 10.

    Returns:
        A ``(variables, output)`` tuple. ``variables`` maps each trainable
        variable name (scope prefix removed) to the variable. ``output`` is a
        list with one output tensor per head for ``"Bootstrap"`` and a single
        output tensor for ``"DQN"``.

    Raises:
        ValueError: If ``model_name`` is not ``"Bootstrap"`` or ``"DQN"``, or
            if ``n_heads`` is smaller than 1 in ``"Bootstrap"`` mode.
    """
    # Validate up front: previously an unknown name fell through both branches
    # and returned None, which only surfaced later as an unpacking error.
    if model_name not in ("Bootstrap", "DQN"):
        raise ValueError(
            "Unknown model_name %r; expected 'Bootstrap' or 'DQN'" % (model_name,)
        )
    if model_name == "Bootstrap" and n_heads < 1:
        raise ValueError("n_heads must be at least 1, got %r" % (n_heads,))

    initializer = tf.contrib.layers.variance_scaling_initializer()

    def dense(inputs, units, activation):
        # Every layer in both variants shares the same initializer.
        return fully_connected(inputs, num_outputs=units, activation_fn=activation,
                               weights_initializer=initializer)

    with tf.variable_scope(name_scope) as scope:
        fc1 = dense(X, 24, tf.nn.relu)

        if model_name == "Bootstrap":
            output = []
            for i in range(n_heads):
                # Each head owns its layers under its own sub-scope.
                with tf.variable_scope("heads_" + str(i)):
                    fc2 = dense(fc1, 24, tf.nn.relu)
                    output.append(dense(fc2, n_outputs, None))
        else:
            tf.summary.histogram('fc1', fc1)

            fc2 = dense(fc1, 24, tf.nn.relu)
            tf.summary.histogram('fc2', fc2)

            output = dense(fc2, n_outputs, None)
            tf.summary.histogram('output', output)

    # Key variables by their name relative to the scope so that networks built
    # under different scopes can be matched variable-by-variable.
    variables = {
        v.name[len(scope.name):]: v
        for v in tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
    }
    return variables, output

In [None]:
# Exploration schedule: the probability of a random action decays
# exponentially from eps_max towards eps_min as steps_done grows.
eps_min = 0.05
eps_max = 1.0
eps_decay = 200
steps_done = 0

def epsilon_greedy(action):
    """Return either a random action or the supplied greedy ``action``.

    Each call advances the module-level ``steps_done`` counter by one. With
    the current exploration probability a uniformly random action index below
    ``n_outputs`` is returned; otherwise ``action`` is returned, unwrapped to
    its single element when it is a length-1 ``numpy.ndarray``.
    """
    global steps_done
    explore_prob = eps_min+(eps_max-eps_min)*math.exp(-1.*steps_done/eps_decay)
    steps_done += 1

    if np.random.rand() >= explore_prob:
        # Exploit: hand back the greedy action, unwrapping a 1-element array.
        is_singleton_array = type(action) == np.ndarray and len(action) == 1
        return action[0] if is_singleton_array else action

    # Explore: pick any action uniformly at random.
    return np.random.randint(n_outputs)

In [None]:
# Replay buffer holding at most buffer_len transitions; being a bounded deque,
# appending to a full buffer discards the entry at the opposite end.
buffer_len = 10000
exp_buffer = deque(maxlen=buffer_len)

In [None]:
def sample_memories(batch_size):
    """Draw a random batch of transitions from ``exp_buffer`` without replacement.

    Args:
        batch_size: Maximum number of transitions to return. When the buffer
            holds fewer entries, every stored transition is returned once.

    Returns:
        A 5-tuple of one-dimensional object arrays, one per field of a stored
        transition, in the order the fields were stored. All five arrays are
        aligned row by row.

    Raises:
        ValueError: If the replay buffer is empty.
    """
    buffer_size = len(exp_buffer)
    if buffer_size == 0:
        raise ValueError("cannot sample from an empty replay buffer")

    picked = np.random.permutation(buffer_size)[:batch_size]

    # Gather only the sampled rows. Converting the whole buffer to an array on
    # every call is wasteful, and recent NumPy versions refuse to build an
    # array from ragged rows unless dtype=object is requested.
    n_fields = 5
    columns = [np.empty(len(picked), dtype=object) for _ in range(n_fields)]
    for row, idx in enumerate(picked):
        transition = exp_buffer[int(idx)]
        for field in range(n_fields):
            columns[field][row] = transition[field]
    return tuple(columns)

In [None]:
# NOTE(review): num_episodes is not referenced by the visible training loop,
# which iterates over range(50000) — confirm which value is intended.
num_episodes = 20
# Number of transitions drawn per training step; the training loop also uses it
# as the number of environment steps to wait before training starts.
batch_size = 128
# NOTE(review): input_shape duplicates X_shape and is not referenced in the
# visible code.
input_shape = (None, 4,)
# Step size passed to the Adam optimizer.
learning_rate = 0.001
# Shape of the state placeholder: a batch of 4-element observations.
X_shape = (None, 4,)
# Weight applied to the bootstrapped next-state value in the training target.
discount_factor = 0.999

# Counter of environment steps taken; advanced once per step by the training loop.
global_step = 0
# Interval, in environment steps, between runs of the weight-copy op.
copy_steps = 100
# A training step is run every steps_train environment steps.
steps_train = 4
# NOTE(review): start_steps is not referenced in the visible code.
start_steps = 2000

In [None]:
# Directory the summary FileWriter writes its event files to.
logdir = 'logs'
# Start from a fresh default graph so re-running this cell does not stack a
# second copy of the ops onto the previous graph.
tf.reset_default_graph()

# Now we define the placeholder for our input i.e game state
X = tf.placeholder(tf.float32, shape=X_shape)

# we define a boolean called in_training_mode to toggle the training
# NOTE(review): this placeholder is fed by the training loop, but no op built
# in the visible code consumes it.
in_training_mode = tf.placeholder(tf.bool)

In [None]:
# we build our Q network, which takes the input X and generates Q values for all the actions in the state
# (for model_name == "Bootstrap" the outputs are a list with one tensor per head;
# for "DQN" they are a single tensor)
mainQ, mainQ_outputs = q_network(X, 'mainQ',model_name)

# similarly we build our target Q network, with the same architecture under its own variable scope
targetQ, targetQ_outputs = q_network(X, 'targetQ',model_name)

In [None]:
# define the placeholder for the indices of the actions that were taken
X_action = tf.placeholder(tf.int32, shape=(None,))
# Q value of the taken action: mask the network output with a one-hot encoding
# of the action index and sum over the action axis (kept as a size-1 axis).
# NOTE(review): this reads targetQ_outputs, so the loss built on Q_action
# depends on the 'targetQ' variables, while acting and bootstrapping in the
# training loop use mainQ_outputs. The roles are swapped relative to the usual
# main/target naming — confirm this is intended.
# NOTE(review): in "Bootstrap" mode targetQ_outputs is a Python list of per-head
# tensors; the product presumably stacks it and broadcasts over the heads —
# TODO confirm the resulting shape.
Q_action = tf.reduce_sum(targetQ_outputs * tf.one_hot(X_action, n_outputs), axis=-1, keep_dims=True)

In [None]:
# One assign op per variable: each 'mainQ' variable receives the current value
# of the 'targetQ' variable stored under the same scope-relative name.
copy_op = [tf.assign(main_name, targetQ[var_name]) for var_name, main_name in mainQ.items()]
# Single op that runs all the assignments together.
copy_target_to_main = tf.group(*copy_op)

In [None]:
# define a placeholder for the training target, i.e. the value Q_action is regressed towards
y = tf.placeholder(tf.float32, shape=(None,1))

# now we calculate the loss: the mean squared difference between the target and the predicted Q value
loss = tf.reduce_mean(tf.square(y - Q_action))

# we use adam optimizer for minimizing the loss
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

# op that initializes every variable created so far (networks and optimizer state)
init = tf.global_variables_initializer()

# scalar summary of the loss; merge_all also picks up the histogram summaries
# that q_network registers in "DQN" mode
loss_summary = tf.summary.scalar('LOSS', loss)
merge_summary = tf.summary.merge_all()
# writes the graph and, later, the merged summaries under logdir
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [None]:
import pandas as pd

# Per-episode statistics collected across the whole run.
number_of_epochs = []
rewards_of_episode = []

with tf.Session() as sess:
    init.run()

    # for each episode
    for i in range(50000):
        done = False
        obs = env.reset()
        epoch = 0              # number of environment steps taken in this episode
        episodic_reward = 0
        actions_counter = Counter()
        episodic_loss = []

        # In Bootstrap mode a single head is drawn per episode and used for
        # both acting and building the training targets. The head count comes
        # from the network itself rather than a hard-coded literal.
        if model_name == "Bootstrap":
            head = int(np.random.uniform() * len(mainQ_outputs))

        # while the state is not the terminal state
        while not done:

            # feed the current observation and get the Q values for each action
            if model_name == "Bootstrap":
                actions = mainQ_outputs[head].eval(feed_dict={X:[obs], in_training_mode:False})
            elif model_name == "DQN":
                actions = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})

            # get the greedy action
            action = np.argmax(actions, axis=-1)
            actions_counter[str(action)] += 1

            # select the action using epsilon greedy policy
            action = epsilon_greedy(action)

            # now perform the action and move to the next state, next_obs, receive reward
            next_obs, reward, done, _ = env.step(action)

            # Store this transition as an experience in the replay buffer
            exp_buffer.append([obs, action, next_obs, reward, done])

            # After certain steps, we train our Q network with samples from the experience replay buffer
            if global_step % steps_train == 0 and global_step > batch_size:

                # sample experience
                o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)

                # states
                o_obs = [x for x in o_obs]

                # next states
                o_next_obs = [x for x in o_next_obs]

                # Q values of the next states
                if model_name == "Bootstrap":
                    next_act = mainQ_outputs[head].eval(feed_dict={X:o_next_obs, in_training_mode:False})
                elif model_name == "DQN":
                    next_act = mainQ_outputs.eval(feed_dict={X:o_next_obs, in_training_mode:False})

                # training target: reward plus the discounted best next-state value,
                # with the bootstrap term dropped where the stored done flag is set
                y_batch = o_rew + discount_factor * np.max(next_act, axis = -1) * (1 - o_done)

                # merge all summaries and write to the file
                mrg_summary = merge_summary.eval(feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:False})
                file_writer.add_summary(mrg_summary, global_step)

                # now we train the network and calculate loss
                train_loss, _ = sess.run([loss, training_op], feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:True})
                episodic_loss.append(train_loss)

            # after some interval we run the weight-copy op between the two networks
            if (global_step + 1) % copy_steps == 0 and global_step > batch_size:
                copy_target_to_main.run()

            obs = next_obs
            epoch += 1
            global_step += 1
            episodic_reward += reward

        rewards_of_episode.append(episodic_reward)
        number_of_epochs.append(epoch)
        print('Epoch', epoch, 'Reward', episodic_reward,'Steps',steps_done)

        # Checkpoint the reward history once per 100 episodes. This used to sit
        # inside the step loop, rewriting the workbook on every single step of
        # each 100th episode. The context manager saves and closes the file.
        if i % 100 == 0:
            result = pd.DataFrame(data = rewards_of_episode, columns = [['Rewards']])
            with pd.ExcelWriter('boot_cartpole_10heads_3.xlsx') as writer:
                result.to_excel(writer, sheet_name='Sheet1')