In [193]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow.contrib.layers import flatten, conv2d, fully_connected
from collections import deque, Counter
import random
from datetime import datetime
import pandas as pd
from sklearn.cluster import KMeans
import cvxpy

In [194]:
# Selects which network variant q_network() builds and which branches the training loop takes.
# Values tested elsewhere in this notebook: "DQN", "Bootstrap" and "MDQN".
model_name = "MDQN"

In [195]:
# Greyscale value (mean of the three channels) of the RGB colour that is blanked out
# during preprocessing.
color = np.array([210, 164, 74]).mean()

def preprocess_observation(obs):
    """Turn a raw RGB game frame into a small single-channel network input.

    Args:
        obs: array indexed as (row, column, channel). Rows 1..175 (every second one)
            and every second column are kept, which yields 88x80 pixels.
            # assumes a 210x160x3 frame with 0..255 values -- TODO confirm

    Returns:
        Float array of shape (88, 80, 1). For 0..255 input the values lie in [-1, 1).
    """

    # Crop and resize the image
    img = obs[1:176:2, ::2]

    # Convert the image to greyscale
    img = img.mean(axis=2)

    # Improve image contrast: pixels whose greyscale value equals `color` become 0
    img[img==color] = 0

    # Normalize from -1 to +1. The previous expression subtracted an extra 1, which
    # shifted the range to [-2, 0) and contradicted the intended normalization.
    img = (img - 128) / 128

    return img.reshape(88,80,1)

In [196]:
# Create the Q*bert environment through gym's registry.
env = gym.make("Qbert-v0")
# Size of the discrete action space; used as the width of every Q-value output layer
# and as the upper bound for random actions in epsilon_greedy().
n_outputs = env.action_space.n

In [197]:
tf.reset_default_graph()
def q_network(X, name_scope, model_name, n_heads=10):
    """Build a convolutional Q network inside the variable scope ``name_scope``.

    Args:
        X: input tensor fed to the first convolution.
        name_scope: name of the variable scope that owns every variable of this network.
        model_name: "DQN" builds a single fully connected output; "Bootstrap" and
            "MDQN" build a shared convolutional trunk followed by ``n_heads``
            independent fully connected heads.
        n_heads: number of heads for "Bootstrap"/"MDQN" (ignored for "DQN").
            Defaults to 10, the value that used to be hard-coded here.

    Returns:
        A tuple ``(variables, output, flat)``:
        ``variables`` maps each trainable variable name, with the leading scope name
        stripped, to the variable; ``output`` is the Q-value tensor for "DQN" or a
        list of per-head Q-value tensors otherwise; ``flat`` is the flattened output
        of the last convolution.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names. Previously
            the function fell through and returned None, which only surfaced later
            as an unpacking error at the call site.
    """
    if model_name not in ("Bootstrap", "MDQN", "DQN"):
        raise ValueError(
            "Unknown model_name %r; expected 'DQN', 'Bootstrap' or 'MDQN'" % (model_name,))

    # Initialize layers
    initializer = tf.contrib.layers.variance_scaling_initializer()

    if model_name == "Bootstrap" or model_name == "MDQN":
        with tf.variable_scope(name_scope, reuse = tf.AUTO_REUSE) as scope:
            # Shared convolutional trunk
            with tf.variable_scope("convert"):
                layer_1 = conv2d(X, num_outputs=32, kernel_size=(8,8), stride=4, padding='SAME', weights_initializer=initializer)

                layer_2 = conv2d(layer_1, num_outputs=64, kernel_size=(4,4), stride=2, padding='SAME', weights_initializer=initializer)

                layer_3 = conv2d(layer_2, num_outputs=64, kernel_size=(3,3), stride=1, padding='SAME', weights_initializer=initializer)

            # Flatten the result of layer_3 before feeding to the fully connected layers
            flat = flatten(layer_3)

            # One independent two-layer head per index, each in its own sub-scope
            output = []
            for i in range(n_heads):
                with tf.variable_scope("heads_" + str(i)):
                    fc = fully_connected(flat, num_outputs=128, weights_initializer=initializer)

                    temp_output = fully_connected(fc, num_outputs=n_outputs, activation_fn=None, weights_initializer=initializer)

                    output.append(temp_output)

            # Collect the trainable variables of this scope, keyed by scope-relative name
            # so that two networks built under different scopes share the same keys.
            variables = {v.name[len(scope.name):]: v for v in tf.get_collection(key = tf.GraphKeys.TRAINABLE_VARIABLES, scope = scope.name)}
            return variables, output, flat

    # model_name == "DQN": single network, with a histogram summary for every layer
    with tf.variable_scope(name_scope) as scope:

        # initialize the convolutional layers
        layer_1 = conv2d(X, num_outputs=32, kernel_size=(8,8), stride=4, padding='SAME', weights_initializer=initializer)
        tf.summary.histogram('layer_1',layer_1)

        layer_2 = conv2d(layer_1, num_outputs=64, kernel_size=(4,4), stride=2, padding='SAME', weights_initializer=initializer)
        tf.summary.histogram('layer_2',layer_2)

        layer_3 = conv2d(layer_2, num_outputs=64, kernel_size=(3,3), stride=1, padding='SAME', weights_initializer=initializer)
        tf.summary.histogram('layer_3',layer_3)

        # Flatten the result of layer_3 before feeding to the fully connected layer
        flat = flatten(layer_3)

        fc = fully_connected(flat, num_outputs=128, weights_initializer=initializer)
        tf.summary.histogram('fc',fc)

        output = fully_connected(fc, num_outputs=n_outputs, activation_fn=None, weights_initializer=initializer)
        tf.summary.histogram('output',output)

        # Collect the trainable variables of this scope, keyed by scope-relative name
        variables = {v.name[len(scope.name):]: v for v in tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)}
        return variables, output, flat

In [243]:
# Exploration-schedule constants.
# NOTE(review): this module-level `epsilon` is not read by epsilon_greedy(), which
# computes its own local `epsilon` from the three constants below.
epsilon = 0.5
eps_min = 0.05  # lower bound of the exploration rate
eps_max = 1.0  # exploration rate at step 0
eps_decay_steps = 500000  # steps over which the rate falls linearly from eps_max to eps_min

def find_first_same_array(list1, data):
    """Return the position of the first entry of ``data`` equal to ``list1``.

    Equality is decided by ``np.array_equal``. When nothing matches, the literal
    ``False`` is returned. A match at position 0 is also falsy, so callers must
    test the result with ``is False`` rather than by truthiness.
    """
    matches = (position for position, candidate in enumerate(data)
               if np.array_equal(list1, candidate))
    return next(matches, False)

def array_mse(list1, list2):
    """Distance between two arrays: square root of the summed squared differences.

    Despite the name, no mean is taken. Each input is converted to an ndarray,
    and an input whose first axis has length 1 is replaced by its only element
    before the subtraction. The summation uses the builtin ``sum`` and therefore
    runs over the first axis of the difference only.
    """
    first = np.array(list1)
    second = np.array(list2)

    # Drop a leading singleton axis, e.g. turn [[a, b]] into [a, b]
    first = first[0] if len(first) == 1 else first
    second = second[0] if len(second) == 1 else second

    squared_diff = (first - second) ** 2
    return sum(squared_diff) ** 0.5

def array_group_mse(list1, group):
    """Return an ndarray holding ``array_mse(list1, member)`` for every member of ``group``.

    The result has one entry per member, in the order of ``group``.
    """
    distances = []
    for member in group:
        distances.append(array_mse(list1, member))
    return np.array(distances)

def kmeans_farest_point(center_point, group_center, group):
    """Pick one row of ``group`` by a dot-product score against two reference points.

    For every row the offset from ``center_point`` is dotted with the offset from
    ``group_center``; the index of the row with the largest absolute dot product
    is returned.
    """
    offset_from_center = group - center_point
    offset_from_group_center = group - group_center
    scores = np.sum(offset_from_center * offset_from_group_center, axis=1)
    return np.argmax(np.abs(scores))

def find_closet(match_list, benchmark, n = 3):
    """Return the indices of the ``n`` benchmark entries closest to ``match_list``.

    Args:
        match_list: the array to match (the current state's features).
        benchmark: sequence of arrays to compare against, one per head.
        n: how many indices to return. Defaults to 3. This argument used to be
            ignored in favour of a hard-coded 3.

    Returns:
        ndarray of at most ``n`` indices into ``benchmark``, ordered from the
        smallest ``array_mse`` distance to the largest.
    """
    mse_list = np.array([array_mse(match_list, candidate) for candidate in benchmark])
    return np.argsort(mse_list)[:n]

def find_benchmark(exp_buffer, n = 10):
    """Choose ``n`` benchmark states from the replay buffer via k-means.

    Every stored state (element 0 of each transition) is passed through the
    ``main_con`` tensor to get its feature vector. The feature vectors are
    clustered into ``n`` groups, and for each cluster one member is selected with
    ``kmeans_farest_point``.

    Args:
        exp_buffer: iterable of transitions whose first element is the state.
        n: number of clusters, and therefore of benchmarks.

    Returns:
        ``(benchmark, flat_benchmark)``: the selected states and their feature
        vectors, both ordered by cluster index.
    """
    # Iterate the buffer directly: indexing a deque by position is O(n) per access,
    # which made the previous index-based loop quadratic in the buffer size.
    state_list = np.array([transition[0] for transition in exp_buffer])
    with sess.as_default():
        # States are evaluated one at a time, exactly as before
        temp_flat_list = np.array([main_con.eval(feed_dict={X:[state], in_training_mode:False})[0] for state in state_list])

    center_point = temp_flat_list.mean(axis = 0)
    kmeans = KMeans(n_clusters = n, random_state = 0).fit(temp_flat_list)
    centers = kmeans.cluster_centers_
    labels = kmeans.labels_

    benchmark = []
    flat_benchmark = []
    for cluster_id in range(n):
        members = labels == cluster_id
        cluster_flat = temp_flat_list[members]
        # Computed once per cluster and reused for both outputs (it used to be
        # evaluated twice with identical arguments).
        pick = kmeans_farest_point(center_point, centers[cluster_id], cluster_flat)
        benchmark.append(state_list[members][pick])
        flat_benchmark.append(cluster_flat[pick])
    return np.array(benchmark), np.array(flat_benchmark)

def find_closet_head(current_state, benchmark, n = 3):
    """Rank the benchmark states by feature distance to ``current_state``.

    Both the current state and every benchmark state are evaluated through the
    ``main_con`` tensor (one state per call, using the default session); the
    ranking itself is delegated to ``find_closet``.
    """
    def features_of(state):
        return main_con.eval(feed_dict={X:[state], in_training_mode:False})[0]

    current = np.array(features_of(current_state))
    bench_list = np.array([features_of(bench_state) for bench_state in benchmark])
    return find_closet(current, bench_list, n)
    
def epsilon_greedy(action, step):
    """Return ``action`` or, with probability epsilon, a uniformly random action.

    Epsilon falls linearly from ``eps_max`` at step 0 to ``eps_min`` at
    ``eps_decay_steps`` and stays at ``eps_min`` afterwards.

    Args:
        action: the greedy action proposed by the network.
        step: current global step, used to position epsilon on the schedule.

    Returns:
        ``action`` unchanged, or an integer drawn from ``[0, n_outputs)``.
    """
    # An unused extra random draw was removed here; exactly one uniform sample is
    # now consumed per decision (plus one integer draw when exploring).
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    else:
        return action
    
def update_benchmark(exp_buffer, old_benchmark, n = 10):
    """Recompute the benchmark states and keep them aligned with the previous ones.

    New benchmarks come from ``find_benchmark``. When previous benchmarks exist,
    a one-to-one assignment between new and old benchmarks that minimizes the total
    feature distance is solved with cvxpy, and the new benchmarks are reordered so
    that position ``j`` holds the new benchmark matched to old benchmark ``j``.

    Args:
        exp_buffer: replay buffer passed through to ``find_benchmark``.
        old_benchmark: the current benchmark states (may be empty).
        n: number of benchmarks to create when ``old_benchmark`` is empty;
            otherwise the length of ``old_benchmark`` is used.

    Returns:
        ndarray of benchmark states.
    """
    if len(old_benchmark) == 0:
        new_benchmark, _ = find_benchmark(exp_buffer, n)
        return new_benchmark

    # Use the argument, not the module-level `benchmark`, so the function works on
    # whatever the caller passes in.
    n = len(old_benchmark)
    new_benchmark, new_flat = find_benchmark(exp_buffer, n)
    old_flat = np.array([main_con.eval(feed_dict={X:[state], in_training_mode:False})[0] for state in old_benchmark])

    # distance_matrix[i][j]: distance between new benchmark i and old benchmark j
    distance_matrix = np.array([array_group_mse(new_flat[i], old_flat) for i in range(n)])

    # selection[i][j] == 1 means "new benchmark i is matched to old benchmark j"
    selection = [cvxpy.Variable(distance_matrix[i].shape, boolean = True) for i in range(n)]
    # every new benchmark is matched to exactly one old benchmark ...
    constraint1 = [cvxpy.sum(selection[i]) == 1 for i in range(n)]
    # ... and every old benchmark receives exactly one new benchmark
    sum_selection = selection[0]
    for i in range(1, n):
        sum_selection += selection[i]
    constraint2 = [sum_selection == np.ones(n)]

    # Maximizing the negated total distance is minimizing the total distance
    total_utility = sum([-cvxpy.sum(cvxpy.multiply(distance_matrix[i], selection[i])) for i in range(n)])
    lp_problem = cvxpy.Problem(cvxpy.Maximize(total_utility), constraint1 + constraint2)
    lp_problem.solve()

    if any(selection[i].value is None for i in range(n)):
        # No solution available (e.g. no mixed-integer solver installed): keep the
        # new benchmarks in k-means order instead of indexing with garbage.
        import warnings
        warnings.warn("update_benchmark: assignment problem not solved (status: %s); "
                      "returning unaligned benchmarks" % (lp_problem.status,))
        return new_benchmark

    # ind[i] is the OLD position matched to NEW benchmark i. Position j of the result
    # must therefore hold the new benchmark i with ind[i] == j, i.e. the inverse
    # permutation, which argsort yields. Indexing with `ind` directly would apply
    # the permutation in the wrong direction.
    ind = np.array([np.argmax(selection[i].value) for i in range(n)])
    return new_benchmark[np.argsort(ind)]

In [226]:
# Maximum number of transitions kept for experience replay.
buffer_len = 20000
# Bounded FIFO: once it holds buffer_len items, appending a new one discards the oldest.
exp_buffer = deque(maxlen=buffer_len)

In [227]:
def sample_memories(batch_size, buffer):
    """Draw up to ``batch_size`` distinct transitions uniformly at random.

    Args:
        batch_size: maximum number of transitions to return; fewer are returned
            when the buffer is smaller.
        buffer: indexable collection (deque, list or 2-D object ndarray) of
            5-element transitions ``[state, action, next_state, reward, done]``.

    Returns:
        Five 1-D object arrays (states, actions, next states, rewards, done flags),
        each with one entry per sampled transition. All are empty when ``buffer``
        is empty.
    """
    perm_batch = np.random.permutation(len(buffer))[:batch_size]

    # Gather only the sampled transitions. The previous version converted the whole
    # buffer to an ndarray on every call, which copies every stored transition per
    # training step and relies on ragged-array construction that newer NumPy rejects.
    picked = [buffer[int(idx)] for idx in perm_batch]

    columns = []
    for field in range(5):
        # Fill element by element so array-valued fields (the states) are stored
        # as single objects instead of being broadcast.
        column = np.empty(len(picked), dtype=object)
        for row, transition in enumerate(picked):
            column[row] = transition[field]
        columns.append(column)
    return tuple(columns)

In [228]:
# NOTE(review): num_episodes is not read by the training loop in this notebook,
# which iterates over range(1000000) instead.
num_episodes = 20
batch_size = 48  # maximum number of transitions drawn per training step
# NOTE(review): input_shape is not referenced elsewhere in the visible notebook;
# the input placeholder is built from X_shape.
input_shape = (None, 88, 80, 1)
learning_rate = 0.001  # passed to the Adam optimizer
X_shape = (None, 88, 80, 1)  # (batch, height, width, channels) of preprocessed frames
discount_factor = 0.97  # weight of the next-state value in the training target

global_step = 0  # environment steps taken so far, counted across episodes
copy_steps = 100  # interval, in steps, between runs of the weight-copy op
steps_train = 4  # a training step runs every steps_train environment steps
start_steps = 2000  # training and weight copying only start after this many steps

benchmark = np.array([])  # MDQN benchmark states; empty until first computed
mask_matrix = np.zeros((buffer_len, 10))  # per-transition 0/1 flags, one column per head
# Benchmarks are recomputed whenever global_step is a multiple of this value.
# "/" makes it a float (10000.0); it is only ever used as a modulus.
update_benchmark_step = buffer_len / 2

In [229]:
logdir = 'logs'  # directory the summary FileWriter writes to
# Start from an empty default graph so re-running this cell does not stack
# duplicate ops on top of a previously built graph.
tf.reset_default_graph()

# Placeholder for the network input, i.e. a batch of preprocessed game states
X = tf.placeholder(tf.float32, shape=X_shape)

# Boolean placeholder that is fed on every eval/run call in the training loop.
# NOTE(review): nothing in q_network() reads it, so it does not appear to change
# the computation -- confirm whether it is still needed.
in_training_mode = tf.placeholder(tf.bool)

In [230]:
# Build the first network under the 'mainQ' scope. q_network returns
# (variables dict, Q-value output(s), flattened convolutional features).
# mainQ_outputs is what the training loop evaluates to pick actions and to compute
# the next-state values; main_con supplies the features used for benchmark matching.
mainQ, mainQ_outputs, main_con = q_network(X, 'mainQ',model_name)

# Build a second network with the same architecture under the 'targetQ' scope.
# NOTE(review): despite the names, the loss defined later in this notebook is built
# on targetQ_outputs, and the copy op writes targetQ's values into mainQ.
targetQ, targetQ_outputs, target_con = q_network(X, 'targetQ',model_name)

In [231]:
# Placeholder for the action taken in each sampled transition
X_action = tf.placeholder(tf.int32, shape=(None,))
# Q-value of the taken action: mask the outputs with a one-hot encoding of the action
# and sum over the action axis (keep_dims keeps that axis with size 1).
# NOTE(review): this reads targetQ_outputs, so the loss built on it depends on the
# 'targetQ' variables rather than the 'mainQ' ones -- confirm this is intended.
# NOTE(review): for "Bootstrap"/"MDQN" targetQ_outputs is a Python list of per-head
# tensors; the product presumably relies on TensorFlow converting the list into one
# stacked tensor -- verify the resulting shape against y.
Q_action = tf.reduce_sum(targetQ_outputs * tf.one_hot(X_action, n_outputs), axis=-1, keep_dims=True)

In [232]:
# For every variable of the 'mainQ' network, build an op that overwrites it with the
# variable stored under the same scope-relative name in the 'targetQ' network.
copy_op = [tf.assign(main_name, targetQ[var_name]) for var_name, main_name in mainQ.items()]
# One op that runs all of those assignments together
copy_target_to_main = tf.group(*copy_op)

In [233]:
# Placeholder for the training target of each sampled transition (one value per row)
y = tf.placeholder(tf.float32, shape=(None,1))

# Loss: mean squared difference between the target and the Q-value of the taken action
loss = tf.reduce_mean(tf.square(y - Q_action))

# Adam optimizer with the configured learning rate; training_op performs one update step
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

# Created after the optimizer so that the optimizer's own variables are initialized too
init = tf.global_variables_initializer()

# Scalar summary of the loss; merge_all also picks up every other summary already
# registered in the graph (e.g. the histograms q_network adds for "DQN").
loss_summary = tf.summary.scalar('LOSS', loss)
merge_summary = tf.summary.merge_all()
# Writes the graph and, later, the merged summaries under logdir
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [None]:
# Training run: play episodes, store every transition in the replay buffer and fit
# the Q network on sampled transitions.
number_of_epochs = []  # number of environment steps of each finished episode
rewards_of_episode = []  # summed reward of each finished episode
sess = tf.Session()
with sess.as_default():
    init.run()
    # for each episode
    for i in range(1000000):
        done = False
        obs = env.reset()
        epoch = 0
        episodic_reward = 0
        actions_counter = Counter() 
        episodic_loss = []
        # Bootstrap: one uniformly drawn head acts for the whole episode
        if model_name == "Bootstrap":
            head = int(np.random.uniform() * 10)
        # MDQN: same, while the buffer is at most half full; after that the head is
        # chosen per step from the benchmark states (see below)
        if (model_name == "MDQN") and (len(exp_buffer) <= buffer_len / 2):
            head = int(np.random.uniform() * 10)
        # while the state is not the terminal state
        while not done:

            # get the preprocessed game screen
            obs = preprocess_observation(obs)

            # feed the game screen and get the Q values for each action
            if model_name == "MDQN":
                if len(exp_buffer) < buffer_len / 2:
                    # Warm-up phase: act with the episode's random head and flag the
                    # transition as usable by every head
                    actions = mainQ_outputs[head].eval(feed_dict={X:[obs], in_training_mode:False})
                    mask_array = np.array([1,]*10)
                else:
                    if len(benchmark) == 0:
                        benchmark = update_benchmark(exp_buffer, benchmark)
                    # Act with the head whose benchmark is closest to this state and flag
                    # the transition only for the closest heads returned
                    closest_head = find_closet_head(obs, benchmark)
                    head = closest_head[0]
                    actions = mainQ_outputs[head].eval(feed_dict={X:[obs], in_training_mode:False})
                    mask_array = np.array([0,]*10)
                    mask_array[closest_head] = 1
                    
            if model_name == "Bootstrap":
                actions = mainQ_outputs[head].eval(feed_dict={X:[obs], in_training_mode:False})
            elif model_name == "DQN":
                actions = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})
            
            # get the action
            action = np.argmax(actions, axis=-1)
            actions_counter[str(action)] += 1 

            # select the action using epsilon greedy policy
            action = epsilon_greedy(action, global_step)
            
            # now perform the action and move to the next state, next_obs, receive reward
            next_obs, reward, done, _ = env.step(action)

            # Store this transition as an experience in the replay buffer
            exp_buffer.append([obs, action, preprocess_observation(next_obs), reward, done])
            if model_name == "MDQN":
                # Keep mask_matrix in the same oldest-to-newest order as exp_buffer:
                # drop the oldest row from the top and add the new row at the bottom,
                # mirroring deque.append with maxlen. Prepending the new row (as was done
                # before) reversed the order relative to the buffer, so the per-head masks
                # were applied to the wrong transitions.
                mask_matrix = np.delete(mask_matrix, 0, 0)
                mask_matrix = np.vstack((mask_matrix, mask_array))
            
            # After certain steps, we train our Q network with samples from the experience replay buffer
            if global_step % steps_train == 0 and global_step > start_steps:
                
                # sample experience
                if model_name == "MDQN":
                    # The last len(exp_buffer) mask rows line up one-to-one with exp_buffer;
                    # keep only the transitions flagged for the current head
                    select_buffer = mask_matrix[-len(exp_buffer):, head]
                    select_buffer = np.array(exp_buffer)[select_buffer == 1]
                    o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size, select_buffer)
                
                else:
                    o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size, exp_buffer)

                # states
                o_obs = [x for x in o_obs]

                # next states
                o_next_obs = [x for x in o_next_obs]

                # Q values of the next states
                if model_name == "Bootstrap":
                    next_act = mainQ_outputs[head].eval(feed_dict={X:o_next_obs, in_training_mode:False})
                elif model_name == "DQN":
                    next_act = mainQ_outputs.eval(feed_dict={X:o_next_obs, in_training_mode:False})
                elif model_name == "MDQN":
                    next_act = mainQ_outputs[head].eval(feed_dict={X:o_next_obs, in_training_mode:False})

                # training target: reward plus the discounted best next-state value,
                # with the bootstrap term zeroed for terminal transitions
                y_batch = o_rew + discount_factor * np.max(next_act, axis = -1) * (1 - o_done) 

                # merge all summaries and write to the file
                mrg_summary = merge_summary.eval(feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:False})
                file_writer.add_summary(mrg_summary, global_step)

                # now we train the network and calculate loss
                train_loss, _ = sess.run([loss, training_op], feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:True})
                episodic_loss.append(train_loss)
            
            # after some interval run the weight-copy op (it assigns the 'targetQ'
            # variables' values to the 'mainQ' variables)
            if (global_step+1) % copy_steps == 0 and global_step > start_steps:
                copy_target_to_main.run()
                
            obs = next_obs
            epoch += 1
            global_step += 1
            episodic_reward += reward
            
            # periodically recompute the benchmark states from the current buffer
            if model_name == "MDQN" and global_step % update_benchmark_step == 0:
                benchmark = update_benchmark(exp_buffer, benchmark)
            
        rewards_of_episode.append(episodic_reward)
        number_of_epochs.append(epoch)
        
        # every 100 episodes, dump all episode rewards so far to an Excel file
        # NOTE(review): ExcelWriter.save() is not available in recent pandas releases;
        # confirm the pandas version this notebook is pinned to.
        if i%100==0:
            result = pd.DataFrame(data = rewards_of_episode, columns = [['Rewards']])
            writer = pd.ExcelWriter('Qbert_MDQN.xlsx')
            result.to_excel(writer,'Sheet1')
            writer.save()
        print('Epoch', epoch, 'Reward', episodic_reward,)

In [None]:
# Release the TensorFlow session created by the training cell. The session object
# is named `sess`; `session` is never defined in this notebook and raised NameError.
sess.close()