### This notebook is ready to run. By default it restores the saved models for the three games, plays 100 test episodes of each game, and reports the mean score and the mean discounted return. To watch the agent play, set `render_flag` below to `True`; each game is then rendered and played exactly once.

In [1]:
# If this is True, each of the three games is played exactly once, with rendering.
# If this is False, each of the three games is played 100 times without rendering,
# in order to compute the mean score and the mean discounted return.
render_flag = False

# Recover models for games sequentially

In [2]:
import gym
import random
import numpy as np
import collections
import tensorflow as tf
from collections import deque
import matplotlib.pyplot as plt
import os,sys,time
import os
from PIL import Image

# a small home-made progress tracker: prints a progress bar with the elapsed
# time and an estimate of the time remaining (ETA)
class time_est():
    """Console progress bar with elapsed-time and ETA estimates.

    The ETA is a linear extrapolation: the time used so far is scaled by
    ``total_len / count``.

    Args:
        total_len: total number of work items expected.
    """

    # number of character cells inside the progress bar
    _BAR_CELLS = 40

    def __init__(self, total_len):
        self.t_start = time.time()   # when tracking started
        self.total_len = total_len   # number of items that make up 100%
        self.count = 0               # items completed so far
        self.t_ref = time.time()     # when the progress line was last printed

    @staticmethod
    def _format_hms(seconds):
        """Format a duration in seconds as HH:MM:SS.

        Fractions are truncated (not rounded) so the seconds field can never
        show "60"; negative durations are clamped to zero.
        """
        whole = max(0, int(seconds))
        return "{:02d}:{:02d}:{:02d}".format(whole // 3600, (whole // 60) % 60, whole % 60)

    def _progress_bar(self):
        """Return the textual bar for the current ``count`` / ``total_len``."""
        # an empty workload is treated as complete instead of dividing by zero
        fraction = self.count / self.total_len if self.total_len else 1.0
        filled = sum(1 for i in range(self._BAR_CELLS) if (i / self._BAR_CELLS) < fraction)
        return "|" + "█" * filled + " " * (self._BAR_CELLS - filled) + "|"

    def _remaining(self, t_used):
        """Estimate the seconds left given ``t_used`` seconds spent so far."""
        if self.count <= 0:
            # nothing finished yet, so there is nothing to extrapolate from
            return 0.0
        return t_used * self.total_len / self.count - t_used

    def check(self, no_of_check=1, info=""):
        """Record finished items and refresh the progress line on stdout.

        The line is rewritten in place (carriage return, no newline) at most
        about once per second. When ``count`` becomes exactly ``total_len`` a
        final "Finished in ..." line is printed.

        Args:
            no_of_check: number of items completed since the previous call.
            info: optional label printed in front of the progress line.
        """
        self.count += no_of_check
        # build the label once so it is not suffixed twice when both the
        # progress line and the final line are printed by the same call
        prefix = str(info) + "  " if info != "" else ""
        now = time.time()
        if now - self.t_ref > 1 and self.count > 0:
            t_used = now - self.t_start
            t_remain = self._remaining(t_used)
            percent = self.count * 100 / self.total_len if self.total_len else 100.0
            print("\r" + prefix + "{:.2f}% ({}/{})  ".format(percent, self.count, self.total_len)
                  + self._progress_bar().ljust(45)
                  + ("Used: " + self._format_hms(t_used)).ljust(16)
                  + "ETA: " + self._format_hms(t_remain), end="")
            self.t_ref = time.time()
        # NOTE(review): exact equality means the final line is skipped if
        # count jumps past total_len (no_of_check > 1); kept as in the original
        if self.count == self.total_len:
            t_used = time.time() - self.t_start
            print("\r" + prefix + "Finished in " + self._format_hms(t_used).ljust(100))

    def get(self, no_of_check=1):
        """Record finished items and return "<bar> ETA: HH:MM:SS" as a string.

        The bar shows the progress before this call's increment, while the ETA
        is computed from the updated count.

        Args:
            no_of_check: number of items completed since the previous call.
        """
        process_bar = self._progress_bar()
        self.count += no_of_check
        t_used = time.time() - self.t_start
        return "{} ETA: {}".format(process_bar, self._format_hms(self._remaining(t_used)))
# reduce TensorFlow's native (C++) log output via the TF_CPP_MIN_LOG_LEVEL environment variable
# NOTE(review): tensorflow has already been imported at this point; this variable is
# usually set before the import -- confirm it still takes effect here
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

def testing(game_name, render_flag):
    """Restore the saved model of one game and report its test performance.

    Builds the Q-network graph (TensorFlow 1.x), restores the checkpoint found
    under ``../models/B_4_3/<game_name>/``, plays the game with an
    epsilon-greedy policy and prints the mean clipped score and the mean
    discounted return over the test episodes.

    Args:
        game_name: one of 'Pong', 'MsPacgirl' or 'Boxing'.
        render_flag: if True, play a single rendered episode; otherwise play
            100 episodes without rendering.

    Raises:
        ValueError: if ``game_name`` is not a supported game, or if the
            configured optimizer name is unknown.
    """
    tf.reset_default_graph()

    # print the game_name
    print(game_name)
    file_name = "B_4_3"
    path = "../models/{}/{}".format(file_name, game_name)

    optimizer = "RMS"

    # per-game settings: (gym environment id, number of actions,
    # score tag that is part of the checkpoint file name)
    game_settings = {
        'Pong': ("Pong-v4", 6, -18.6),
        'MsPacgirl': ("MsPacman-v4", 9, 36.51),
        'Boxing': ("Boxing-v4", 18, 4.2),
    }
    if game_name not in game_settings:
        raise ValueError('Unidentified game mode')
    env_id, num_of_actions, checkpoint_tag = game_settings[game_name]
    env = gym.make(env_id)

    # hyperparameters shared by all three games
    Learning_rate = 1e-4 # 1e-7
    final_greedy_rate = 0.1   # probability of a random action while testing
    discount = .99

    #hyperparameters for CNN
    width = 6
    height = 6
    layer_1_unit = 16
    layer_2_unit = 32

    #size of the image
    image_size = 28

    # define tensorflow place holders
    reward_place = tf.placeholder(tf.float32, [None])
    obs_new = tf.placeholder(tf.float32, [None, image_size, image_size, 4])
    obs = tf.placeholder(tf.float32, [None, image_size, image_size, 4])
    act = tf.placeholder(tf.int32, [None])

    # two dimensional convolution with stride 2 and 'SAME' padding
    def convolution2D(DataIn, Weight):
        return tf.nn.conv2d(DataIn, Weight, strides=[1, 2, 2, 1], padding='SAME')

    def build_q_network():
        """Create one set of weights and apply it to both ``obs`` and ``obs_new``.

        Returns:
            (variables, q, q_new): the eight weight/bias variables in creation
            order, and the network outputs for ``obs`` and ``obs_new``.
        """
        # NOTE: the tf.Variable creation order below determines the default
        # variable names the Saver matches against the checkpoint -- keep it
        W1 = tf.Variable(tf.random_normal([width, height, 4, layer_1_unit]))
        b1 = tf.Variable(tf.random_normal([layer_1_unit]))

        y1 = tf.nn.relu(convolution2D(obs, W1) + b1)
        y1_new = tf.nn.relu(convolution2D(obs_new, W1) + b1)

        W2 = tf.Variable(tf.random_normal([width, height, layer_1_unit, layer_2_unit]))
        b2 = tf.Variable(tf.random_normal([layer_2_unit]))

        y2 = tf.nn.relu(convolution2D(y1, W2) + b2)
        y2_new = tf.nn.relu(convolution2D(y1_new, W2) + b2)

        # flatten the tensor before put into the fully connected layer
        # (two stride-2 convolutions shrink each side by a factor of 4)
        flatten_shape = int(image_size/4)*int(image_size/4)*layer_2_unit
        W3 = tf.Variable(tf.random_normal([flatten_shape, 256]))
        b3 = tf.Variable(tf.random_normal([256]))

        y3 = tf.nn.relu(tf.matmul(tf.reshape(y2, [-1, flatten_shape]), W3) + b3)
        y3_new = tf.nn.relu(tf.matmul(tf.reshape(y2_new, [-1, flatten_shape]), W3) + b3)

        W4 = tf.Variable(tf.random_normal([256, num_of_actions]))
        b4 = tf.Variable(tf.random_normal([num_of_actions]))

        y4 = tf.matmul(y3, W4) + b4
        y4_new = tf.matmul(y3_new, W4) + b4

        return (W1, b1, W2, b2, W3, b3, W4, b4), y4, y4_new

    # a function to define the neural network model
    def define_model(seed_num):
        """Build the online network and a structurally identical target network.

        Returns:
            (q, q_new, q_target, q_new_target): the outputs of the online and
            the target network for the ``obs`` and ``obs_new`` placeholders.
        """
        tf.set_random_seed(seed_num)
        online_vars, y4, y4_new = build_q_network()
        # target network which has the same structure, to copy the previous network
        target_vars, y4_target, y4_new_target = build_q_network()

        # global same variables for updating the target network
        global update_1, update_2, update_3, update_4, update_5, update_6, update_7, update_8

        # ops that copy each online variable into its target counterpart
        (update_1, update_2, update_3, update_4,
         update_5, update_6, update_7, update_8) = [
            target.assign(source) for target, source in zip(target_vars, online_vars)]

        return y4, y4_new, y4_target, y4_new_target

    # a function to update the target network (not called during testing)
    def update(sess):
        for update_op in (update_1, update_2, update_3, update_4,
                          update_5, update_6, update_7, update_8):
            sess.run(update_op)

    def image_preprocess(image):
        """Turn one RGB frame into an image_size x image_size uint8 grey image."""
        # if the image is Pong, cut the image so that the noisy part do not affect the network
        if game_name == 'Pong':
            image = image[34:194, :]
        # process image to 28 by 28 grey image
        img = Image.fromarray(image, 'RGB').convert('L')
        img = img.resize((image_size, image_size), resample=Image.BILINEAR)
        # np.array makes an owned, writable copy; np.asarray may hand back a
        # read-only view of the PIL buffer
        image_trans = np.array(img, dtype=np.uint8)
        # binarize the image of pong to just two level for network to better understand the image
        if game_name == 'Pong':
            image_trans = np.where(image_trans > 90, 255, 0).astype(np.uint8)

        return image_trans

    # a function to run the game with the current model and average the performance
    def test_process(sess, action_max):
        """Play the test episodes and return (mean score, mean discounted return).

        Plays 1 episode when rendering, 100 otherwise. Rewards are clipped to
        [-1, 1] before they are accumulated.
        """
        print("\nchecking performace")
        num_test = 1 if render_flag else 100
        est = time_est(num_test)
        # lists to store the score and the discounted return of every episode
        score_list = []
        dis_rtn_list = []
        for _ in range(num_test):
            env.reset()
            frame_buffer = deque(maxlen=4)
            # stack of the last 4 processed frames; None until 4 frames were seen
            state = None
            total_reward = 0
            discount_factor = 1
            discounted_value = 0
            while True:
                # epsilon-greedy policy; random while fewer than 4 frames exist
                if state is not None and random.random() > final_greedy_rate:
                    # action_max depends on the obs placeholder only
                    action = sess.run(action_max, {obs: [state]})[0]
                else:
                    # uniform over all actions (round(uniform(...)) picked the
                    # first and last action only half as often as the others)
                    action = random.randrange(num_of_actions)
                if render_flag:
                    env.render()
                # take action and get the response from the action
                observation, reward, done, info = env.step(action)
                frame_buffer.append(image_preprocess(observation))

                if len(frame_buffer) == 4:
                    # (4, H, W) -> (H, W, 4) to match the obs placeholder
                    state = np.stack(list(frame_buffer), axis=0).transpose([1, 2, 0])
                # reward either -1, 1 or 0
                reward = np.clip(reward, -1, 1)
                total_reward += reward
                # calculate the discounted returns: step k is weighted by discount**k
                discounted_value += discount_factor * reward
                discount_factor *= discount

                if done:
                    # saving the reward and discounted rewards into list
                    score_list.append(total_reward)
                    dis_rtn_list.append(discounted_value)
                    break
            est.check()
        # return the mean of score and discounted rewards
        return np.mean(score_list), np.mean(dis_rtn_list)

    def restore_and_evaluate(seed_num):
        """Build the graph, restore the checkpoint and print the test results."""
        q_values, q_values_new, _, q_target_new = define_model(seed_num)
        # get action by Q values
        action_max = tf.cast(tf.argmax(q_values, axis=1), tf.int32)

        # The loss and optimizer are rebuilt as well although no training step
        # is run: the optimizer adds its own variables to the graph, and the
        # default tf.train.Saver() below restores every variable in the graph.
        # NOTE(review): presumably the checkpoint was written from this same
        # graph layout -- confirm before removing this part.
        data_amount = tf.shape(q_values_new)[0]
        row_index = tf.reshape(tf.range(0, limit=data_amount), [data_amount, 1])
        Q_old_index = tf.concat([row_index, tf.reshape(act, [data_amount, 1])], axis=1)
        old_q = tf.gather_nd(q_values, Q_old_index)

        # next action chosen by the online network, valued by the target network
        action_max_primary = tf.cast(tf.argmax(q_values_new, axis=1), tf.int32)
        Q_new_index = tf.concat([tf.reshape(tf.range(0, limit=data_amount), [data_amount, 1]),
                                 tf.reshape(action_max_primary, [data_amount, 1])], axis=1)
        max_q_value_next = tf.gather_nd(q_target_new, Q_new_index)

        change_in_q = (reward_place + discount * tf.stop_gradient(max_q_value_next) - old_q)
        loss = tf.reduce_mean(tf.square(change_in_q)/2)

        # optimizer can be changed by the setting
        if optimizer == "SGD":
            tf.train.GradientDescentOptimizer(learning_rate=Learning_rate).minimize(loss)
        elif optimizer == "ADAM":
            tf.train.AdamOptimizer(learning_rate=Learning_rate).minimize(loss)
        elif optimizer == "RMS":
            tf.train.RMSPropOptimizer(learning_rate=Learning_rate, decay=0.9, momentum=0.2, centered=True).minimize(loss)
        else:
            raise ValueError("Unknown optimizer: {}".format(optimizer))

        with tf.Session() as sess:
            print()
            # name of the last checkpoint model
            model_name = "{}/{}-{}-*{}*{}".format(path, file_name, game_name, checkpoint_tag, '.checkpoint')
            print(model_name)
            # restore the saved weights, then measure the performance
            saver = tf.train.Saver()
            saver.restore(sess, model_name)
            score_mean, dis_rtn_mean = test_process(sess, action_max)
            print("{}, score mean = {}, dis return mean = {}".format(game_name, score_mean, dis_rtn_mean))

    try:
        # the seed only affects the initial values, which the restore overwrites
        restore_and_evaluate(random.randint(0, 3000))
    finally:
        # release the emulator (and the render window, if any) even on errors
        env.close()
if __name__ == "__main__":
    # Evaluate the restored model of every game in turn. render_flag (set in
    # the first cell) selects one rendered episode or 100 unrendered ones.
    for game_name in ("MsPacgirl", "Boxing", "Pong"):
        testing(game_name, render_flag)



MsPacgirl

../models/B_4_3/MsPacgirl/B_4_3-MsPacgirl-*36.51*.checkpoint
INFO:tensorflow:Restoring parameters from ../models/B_4_3/MsPacgirl/B_4_3-MsPacgirl-*36.51*.checkpoint

checking performace
Finished in 00:01:47                                                                                            
MsPacgirl, score mean = 35.45, dis return mean = 4.716168055875034
Boxing

../models/B_4_3/Boxing/B_4_3-Boxing-*4.2*.checkpoint
INFO:tensorflow:Restoring parameters from ../models/B_4_3/Boxing/B_4_3-Boxing-*4.2*.checkpoint

checking performace
Finished in 00:05:37                                                                                            
Boxing, score mean = 4.69, dis return mean = -0.39701310973007975
Pong

../models/B_4_3/Pong/B_4_3-Pong-*-18.6*.checkpoint
INFO:tensorflow:Restoring parameters from ../models/B_4_3/Pong/B_4_3-Pong-*-18.6*.checkpoint

checking performace
Finished in 00:03:12                                                                             