In [None]:
#!/usr/bin/env python
# coding: utf-8

# # This module implements the DQN "brain" of the agent.
# All decisions are made here.

# The DeepQNetwork class creates two networks, eval_net and target_net. eval_net outputs the current Q estimate, and target_net outputs the Q values used to form the training target ("Q reality"). The difference between these two values is used to construct the loss function, which updates only the eval_net parameters. Actions are selected with an epsilon-greedy policy over the eval_net Q values, following the Q-learning algorithm.

# In[1]:

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
# Seed NumPy's global RNG; DeepQNetwork draws from it for replay sampling and
# for epsilon-greedy action selection.
np.random.seed(2)
# Set the graph-level random seed on the *current* default TensorFlow graph.
# NOTE(review): DeepQNetwork.__init__ calls tf.reset_default_graph() before it
# builds its networks, so this seed presumably does not reach the graph the
# agent actually uses -- confirm, and re-seed after the reset if reproducible
# weight initialisation matters.
tf.set_random_seed(2)

In [None]:
class DeepQNetwork:
    """Deep Q-Network agent built on the TensorFlow 1.x graph API.

    Two structurally identical two-layer networks are created. ``eval_net`` is
    trained on every :meth:`learn` call; ``target_net`` is never trained
    directly and is overwritten with a copy of the ``eval_net`` weights every
    ``replace_target_iter`` learning steps. ``target_net`` supplies the
    bootstrap values used to build the training target.

    Note that ``epsilon`` is the probability of taking the *greedy* action
    (see :meth:`choose_action`), so a larger epsilon means less exploration.

    Keyword Args:
        n_actions: Number of discrete actions (width of the Q-value output).
        n_features: Length of a single state vector.
        learning_rate: RMSProp learning rate. Defaults to 0.00025.
        reward_decay: Discount factor gamma. Defaults to 0.99.
        e_greedy: Upper bound for epsilon. Defaults to 0.9.
        replace_target_iter: Number of learning steps between copies of the
            eval weights into the target network. Defaults to 300.
        memory_size: Capacity of the replay memory. Defaults to 500.
        batch_size: Number of transitions sampled per learning step.
            Defaults to 64.
        e_greedy_increment: Amount added to epsilon after every learning
            step. ``None`` (the default) leaves epsilon unchanged.
        epsilon: Initial epsilon. Defaults to 0.5 when ``e_greedy_increment``
            is given, otherwise to ``e_greedy``.
        output_graph: When truthy, the graph is written to ``logs/`` for
            TensorBoard.
        log: Callable used for status messages. Defaults to ``print``.
        statusPeriod: Stored on the instance; not used by this class itself.
        seed: Optional graph-level TensorFlow seed, applied *after* the
            default graph is reset so that it affects weight initialisation.
            Defaults to ``None`` (no seed is set).
    """

    def __init__(self, **kwargs):
        self.n_actions = kwargs.get('n_actions')
        self.n_features = kwargs.get('n_features')
        self.lr = kwargs.get('learning_rate', 0.00025)
        self.gamma = kwargs.get('reward_decay', 0.99)
        self.epsilon_max = kwargs.get('e_greedy', 0.9)
        self.replace_target_iter = kwargs.get('replace_target_iter', 300)
        self.memory_size = kwargs.get('memory_size', 500)
        self.batch_size = kwargs.get('batch_size', 64)
        self.epsilon_increment = kwargs.get('e_greedy_increment')
        # Without an increment there is nothing to anneal, so start at the cap.
        default_epsilon = 0.5 if self.epsilon_increment is not None else self.epsilon_max
        self.epsilon = kwargs.get('epsilon', default_epsilon)
        self.output_graph = kwargs.get('output_graph')
        self.log = kwargs.get('log', print)
        self.statusPeriod = kwargs.get('statusPeriod', 1)
        self.seed = kwargs.get('seed')
        self.flag = 0

        # Number of completed learn() calls; drives target replacement and
        # checkpointing.
        self.learn_step_counter = 0
        # Total number of transitions ever stored (not capped at memory_size).
        self.memory_counter = 0
        # Replay memory, one row per transition laid out as [s, a, r, s_].
        self.memory = np.zeros((self.memory_size, self.n_features * 2 + 2))
        # Loss recorded at every learning step.
        self.cost_hist = []

        tf.reset_default_graph()
        if self.seed is not None:
            # Must come after the reset: the seed is a property of the graph.
            tf.set_random_seed(self.seed)
        self._build_net()

        # Build the copy ops and the checkpoint saver exactly once. Creating
        # them on demand would add new nodes to the graph on every use.
        target_params = tf.get_collection('target_net_params')
        eval_params = tf.get_collection('eval_net_params')
        self._replace_target_op = [
            tf.assign(t, e) for t, e in zip(target_params, eval_params)
        ]
        self._target_saver = tf.train.Saver(target_params, max_to_keep=1)

        self.sess = tf.Session()
        if self.output_graph:
            # View with: $ tensorboard --logdir=logs
            tf.summary.FileWriter("logs/", self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

    def _build_net(self):
        """Create placeholders, both networks, the loss and the train op."""
        n_l1 = 10  # hidden layer width
        w_initializer = tf.random_normal_initializer(0., 0.3)
        b_initializer = tf.constant_initializer(0.1)

        def two_layer_net(inputs, collection_name):
            # tanh hidden layer followed by a linear Q-value layer. Variables
            # are also registered under `collection_name` so the two networks'
            # parameters can be fetched separately later.
            c_names = [collection_name, tf.GraphKeys.GLOBAL_VARIABLES]
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1],
                                     initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1],
                                     initializer=b_initializer, collections=c_names)
                hidden = tf.nn.tanh(tf.matmul(inputs, w1) + b1)
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions],
                                     initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_actions],
                                     initializer=b_initializer, collections=c_names)
                return tf.matmul(hidden, w2) + b2

        # ------------------ build evaluate_net ------------------
        # Current state s (the placeholder's graph name is kept as 'action1').
        self.env_state = tf.placeholder(tf.float32, [None, self.n_features], name='action1')
        # Training target for the Q values, computed in learn().
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
        with tf.variable_scope('eval_net'):
            self.q_eval = two_layer_net(self.env_state, 'eval_net_params')

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        # Keep this ahead of the target_net construction: minimize() is given
        # no var_list, so it optimizes the trainable variables that exist at
        # the time it is called.
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ------------------ build target_net ------------------
        # Next state s_, fed to the target network to obtain q_next.
        self.env_state_ = tf.placeholder(tf.float32, [None, self.n_features], name='env_state_')
        with tf.variable_scope('target_net'):
            self.q_next = two_layer_net(self.env_state_, 'target_net_params')

    def store_transition(self, env_state, a, r, env_state_):
        """Store one transition, overwriting the oldest once memory is full.

        Args:
            env_state: State before the action (length ``n_features``).
            a: Index of the action taken.
            r: Reward received.
            env_state_: State after the action (length ``n_features``).
        """
        transition = np.hstack((env_state, [a, r], env_state_))
        # Ring buffer: wrap around and replace the oldest row.
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, env_state):
        """Pick an action for a single state.

        With probability ``epsilon`` the action with the highest ``eval_net``
        Q value is returned; otherwise a uniformly random action is returned.

        Args:
            env_state: 1-D NumPy array holding the current state.

        Returns:
            The index of the chosen action.
        """
        # Add the batch dimension expected by the placeholder.
        env_state = env_state[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            actions_value = self.sess.run(self.q_eval, feed_dict={self.env_state: env_state})
            return np.argmax(actions_value)
        return np.random.randint(0, self.n_actions)

    def _replace_target_params(self, flag=None):
        """Copy the eval_net weights into target_net.

        Args:
            flag: Unused; accepted for backward compatibility with callers
                that pass the current step.
        """
        self.sess.run(self._replace_target_op)

    def _sample_batch(self):
        """Return ``batch_size`` rows drawn from the filled part of memory.

        Rows are drawn with ``np.random.choice`` (sampling with replacement),
        so a batch may be larger than the number of stored transitions.

        Raises:
            ValueError: If no transition has been stored yet.
        """
        filled = min(self.memory_counter, self.memory_size)
        if filled == 0:
            raise ValueError(
                "replay memory is empty; call store_transition() before learn()")
        sample_index = np.random.choice(filled, size=self.batch_size)
        return self.memory[sample_index, :]

    def _next_epsilon(self):
        """Return epsilon after one annealing step, capped at ``epsilon_max``."""
        if self.epsilon >= self.epsilon_max:
            return self.epsilon_max
        if self.epsilon_increment is None:
            # Annealing disabled: keep the current value.
            return self.epsilon
        return min(self.epsilon + self.epsilon_increment, self.epsilon_max)

    def learn(self):
        """Run one training step of eval_net on a sampled batch.

        Every ``replace_target_iter`` steps the target network is refreshed
        first, and every 1000 steps the target network weights are saved to
        ``./my_net/target_net.ckpt`` (only the latest checkpoint is retained).
        The step's loss is stored in ``self.cost`` and appended to
        ``self.cost_hist``.

        Raises:
            ValueError: If the replay memory is empty.
        """
        if self.learn_step_counter % self.replace_target_iter == 0:
            self._replace_target_params(self.learn_step_counter)
            self.log(str(self.learn_step_counter))
            self.log('\ntarget_params_replaced\n')

        batch_memory = self._sample_batch()
        states = batch_memory[:, :self.n_features]
        next_states = batch_memory[:, -self.n_features:]

        # q_next comes from the (frozen) target network, q_eval from the
        # network being trained.
        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.env_state_: next_states,
                self.env_state: states,
            })

        # Save model parameters.
        if self.learn_step_counter % 1000 == 0:
            self._target_saver.save(self.sess, "./my_net/target_net.ckpt",
                                    global_step=self.learn_step_counter,
                                    write_meta_graph=False)

        # Start from q_eval so that only the entry of the action actually
        # taken contributes a non-zero error.
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        reward = batch_memory[:, self.n_features + 1]
        # The stored transitions carry no terminal flag, so every sample is
        # bootstrapped from the next state.
        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        # Train the eval network.
        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={self.env_state: states,
                                                self.q_target: q_target})
        self.cost_hist.append(self.cost)

        self.epsilon = self._next_epsilon()
        self.learn_step_counter += 1

    def plot_cost(self):
        """Plot the recorded loss per training step and show the figure.

        Returns:
            The matplotlib figure, so callers can modify or save the plot.
        """
        fig = plt.figure()
        plt.plot(self.cost_hist)

        print("cost_hist's Length is {}".format(len(self.cost_hist)))
        plt.ylabel('Cost')
        plt.xlabel('Training steps')
        plt.show()

        return fig