In [1]:
import random
import math
import matplotlib.pyplot as plt
import time
from ENV import Environment
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
MAX_EPISODES = 200
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32


# Deep Q Network off-policy
class DQN(object):
    """Deep Q-Network agent built on the TensorFlow 1.x graph API.

    The agent owns an evaluation network (``eval_net``), a target network
    (``target_net``) with the same layer sizes, a fixed-size replay memory
    and the ``tf.Session`` used to run both networks.

    Each replay-memory row is laid out as
    ``[s (n_features), a, r, s_ (n_features)]``.

    Args:
        n_actions: Number of discrete actions (width of the Q output layer).
        n_features: Length of one observation vector.
        learning_rate: Learning rate handed to the Adam optimizer.
        reward_decay: Factor applied to the max next-state Q value in the
            TD target; stored as ``self.gamma``.
        e_greedy: Stored as ``self.epsilon_max``. It is not read anywhere
            else in this class; ``self.epsilon`` is fixed at 0.9.
        replace_target_iter: Number of ``learn()`` calls between copies of
            the evaluation weights into the target network.
        memory_size: Number of transitions the replay memory can hold.
        batch_size: Number of transitions sampled per ``learn()`` call.
        output_graph: Accepted for backward compatibility; currently unused.
    """

    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.1,
            reward_decay=0.001,
            e_greedy=0.99,
            replace_target_iter=200,
            memory_size=MEMORY_CAPACITY,
            batch_size=BATCH_SIZE,
            output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        # Probability of taking the greedy action in choose_action().
        self.epsilon = 0.9

        # total learning step
        self.learn_step_counter = 0

        # Number of transitions stored so far. It keeps growing past
        # memory_size; the write position wraps around instead.
        self.memory_counter = 0

        # initialize zero memory [s, a, r, s_]
        # The memory stores the current and next state, the action and the
        # reward. It is sized by memory_size so that the write index in
        # store_transition() and the sampling range in learn() always agree
        # with the buffer that was actually allocated.
        self.memory = np.zeros(
            (self.memory_size, n_features * 2 + 2), dtype=np.float32)

        # consist of [target_net, evaluate_net]
        self._build_net()

        t_params = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
        e_params = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')

        # Hard update: overwrite every target variable with the matching
        # evaluation variable (paired by collection order).
        with tf.variable_scope('hard_replacement'):
            self.target_replace_op = [
                tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()

        self.sess.run(tf.global_variables_initializer())
        self.cost_his = []

    def _build_net(self):
        """Build placeholders, both networks, the TD loss and the train op."""
        # ------------------ all inputs ------------------------
        self.s = tf.placeholder(
            tf.float32, [None, self.n_features], name='s')  # input State
        self.s_ = tf.placeholder(
            tf.float32, [None, self.n_features], name='s_')  # input Next State
        self.r = tf.placeholder(tf.float32, [None, ], name='r')  # input Reward
        self.a = tf.placeholder(tf.int32, [None, ], name='a')  # input Action

        w_initializer, b_initializer = tf.random_normal_initializer(
            0., 0.3), tf.constant_initializer(0.1)

        # NOTE(review): both output layers use softmax, so every Q estimate
        # lies in (0, 1) and the estimates of one state sum to 1, while the
        # TD target is r + gamma * max Q'. This looks unintended for a value
        # network -- confirm before changing, since it alters training.

        # ------------------ build evaluate_net ------------------
        with tf.variable_scope('eval_net'):
            e1 = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_initializer,
                                 bias_initializer=b_initializer, name='e1')
            e3 = tf.layers.dense(e1, 20, tf.nn.relu, kernel_initializer=w_initializer,
                                 bias_initializer=b_initializer, name='e3')
            self.q_eval = tf.layers.dense(e3, self.n_actions, tf.nn.softmax, kernel_initializer=w_initializer,
                                          bias_initializer=b_initializer, name='q')

        # ------------------ build target_net ------------------
        with tf.variable_scope('target_net'):
            t1 = tf.layers.dense(self.s_, 100, tf.nn.relu6, kernel_initializer=w_initializer,
                                 bias_initializer=b_initializer, name='t1')
            t3 = tf.layers.dense(t1, 20, tf.nn.relu, kernel_initializer=w_initializer,
                                 bias_initializer=b_initializer, name='t3')
            self.q_next = tf.layers.dense(t3, self.n_actions, tf.nn.softmax, kernel_initializer=w_initializer,
                                          bias_initializer=b_initializer, name='t4')

        with tf.variable_scope('q_target'):
            q_target = self.r + self.gamma * \
                tf.reduce_max(self.q_next, axis=1,
                              name='Qmax_s_')  # shape=(None, )
            # The target is treated as a constant: no gradient flows into
            # the target network through the loss.
            self.q_target = tf.stop_gradient(q_target)
        with tf.variable_scope('q_eval'):
            # Pair each row index with its action index to pick Q(s, a).
            a_indices = tf.stack(
                [tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            self.q_eval_wrt_a = tf.gather_nd(
                params=self.q_eval, indices=a_indices)  # shape=(None, )
        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(
                self.q_target, self.q_eval_wrt_a, name='TD_error'))
        with tf.variable_scope('train'):
            self._train_op = tf.train.AdamOptimizer(
                self.lr).minimize(self.loss)

    def store_transition(self, s, a, r, s_):
        """Write one transition into the replay memory.

        Args:
            s: Observation before the action (length ``n_features``).
            a: Index of the action taken.
            r: Reward received.
            s_: Observation after the action (length ``n_features``).
        """
        transition = np.hstack((s, a, [r], s_))
        # Once the buffer is full, overwrite the oldest entry. The modulus is
        # the instance's own capacity, not the module-level default.
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        """Pick an action for a single observation.

        With probability ``self.epsilon`` the action with the highest
        evaluation-network output is returned; otherwise an action is drawn
        uniformly from ``range(n_actions)``.

        Args:
            observation: 1-D observation of length ``n_features``.

        Returns:
            The chosen action index.
        """
        if np.random.uniform() < self.epsilon:
            # to have batch dimension when feed into tf placeholder
            observation = np.asarray(observation)[np.newaxis, :]
            # forward feed the observation and get q value for every actions
            actions_value = self.sess.run(
                self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        else:
            # Explore over the whole action set instead of a hard-coded 2.
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        """Run one training step on a random batch from the replay memory.

        Every ``replace_target_iter`` calls the target network is first
        overwritten with the evaluation network's weights. The batch loss is
        appended to ``self.cost_his``. Does nothing when no transition has
        been stored yet.
        """
        if self.memory_counter == 0:
            # Nothing to sample from.
            return

        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.target_replace_op)

        # Sample (with replacement) only from rows that have been written.
        n_stored = min(self.memory_counter, self.memory_size)
        sample_index = np.random.choice(n_stored, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        _, cost = self.sess.run(
            [self._train_op, self.loss],
            feed_dict={
                self.s: batch_memory[:, :self.n_features],
                # Actions are stored as float32 in the buffer; the
                # placeholder is int32.
                self.a: batch_memory[:, self.n_features].astype(np.int32),
                self.r: batch_memory[:, self.n_features + 1],
                self.s_: batch_memory[:, -self.n_features:],
            })

        self.cost_his.append(cost)
        self.learn_step_counter += 1


# ---------------------------------------------------------------------------
# Training driver: run the DQN agent against the environment, record the
# per-episode reward reported by the environment, then plot it.
# ---------------------------------------------------------------------------
env = Environment()
action_space = ['0', '1']
n_actions, n_features = len(action_space), 7
RL = DQN(n_actions, n_features, output_graph=False)

MAX_EPISODES, T = 3000, 100  # episodes to run / steps per episode
var = 0.1  # control exploration
t1 = time.time()

episode_list, delay_list, actions = [], [], []
offload = local = 0  # running totals of each decision across all episodes
penalty, delay, r = 5, 0, 0

for episode in range(MAX_EPISODES):

    # Short pause every tenth episode.
    if not episode % 10:
        time.sleep(0.1)

    i = 0
    for i in range(T):
        obs = env.observed()
        a = RL.choose_action(obs)

        w_n, f_n = 0.2, 0.3  # 0.1 or 0.99
        actions = [0.3, 0.4]

        if a != 1:
            # Local execution: cost is a weighted sum of time and energy.
            actions[:] = [0, 0]
            T_local = obs[3] / obs[6]
            E_local = env.kn * obs[6] ** 2 * obs[3]
            reward = env.alpha * T_local + env.beta * E_local

            s_ = obs.copy()
            local += 1
        else:
            # Offloading: transmission time plus remote computation time.
            bandwidth = w_n * obs[0] * env.Wmax
            R_n = bandwidth * math.log(1 + (obs[5] / (bandwidth * env.n0)), 2)
            T_trans = obs[2] / R_n
            T_MEC = obs[3] / (f_n * obs[1] * env.Fmax)
            T_offload = round(T_trans + T_MEC, 5)
            reward = env.step(actions, round(T_trans, 5), round(T_MEC, 5))

            # The next state is copied only after env.step() has run, then
            # the first two entries are reduced by the allocated fractions.
            s_ = obs.copy()
            s_[0] -= s_[0] * w_n
            s_[1] -= s_[1] * f_n
            offload += 1

        # The agent is trained on the negated value; the environment's log
        # keeps the value as computed.
        RL.store_transition(obs, a, -reward, s_)
        env.reward_list.append(reward)

        # Training starts once more transitions than MEMORY_CAPACITY exist.
        if RL.memory_counter > MEMORY_CAPACITY:
            RL.learn()

    episode_reward = env.show_reward(T)
    episode_list.append(episode_reward)
    print('Episode:', episode, ' Steps: %2d' % i, 'Reward',
          episode_list[-1], "local", local, "offload", offload)

elapsed = time.time() - t1
print('Running time: ', elapsed)

# Reward curve over episodes, saved to disk and shown.
plt.plot(episode_list)
plt.xlabel("Episode")
plt.ylabel("reward")
plt.savefig("dqn.png")
plt.show()

Instructions for updating:
non-resource variables are not supported in the long term


  e1 = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_initializer,
  e3 = tf.layers.dense(e1, 20, tf.nn.relu, kernel_initializer=w_initializer,
  self.q_eval = tf.layers.dense(e3, self.n_actions, tf.nn.softmax, kernel_initializer=w_initializer,
  t1 = tf.layers.dense(self.s_, 100, tf.nn.relu6, kernel_initializer=w_initializer,
  t3 = tf.layers.dense(t1, 20, tf.nn.relu, kernel_initializer=w_initializer,
  self.q_next = tf.layers.dense(t3, self.n_actions, tf.nn.softmax, kernel_initializer=w_initializer,


Episode: 0  Steps: 99 Reward 0.46355525 local 0 offload 100
Episode: 1  Steps: 99 Reward 1.9737175850292936 local 5 offload 195
Episode: 2  Steps: 99 Reward 8.763026332086739 local 11 offload 289
Episode: 3  Steps: 99 Reward 6.158417261520123 local 18 offload 382
Episode: 4  Steps: 99 Reward 4.088424435587828 local 20 offload 480
Episode: 5  Steps: 99 Reward 3.323014116827647 local 24 offload 576
Episode: 6  Steps: 99 Reward 2.3937714716974487 local 27 offload 673
Episode: 7  Steps: 99 Reward 12.9063069398007 local 31 offload 769
Episode: 8  Steps: 99 Reward 3.8032680467385545 local 35 offload 865
Episode: 9  Steps: 99 Reward 77.58274604637234 local 38 offload 962
Episode: 10  Steps: 99 Reward 14.64728377715127 local 43 offload 1057
Episode: 11  Steps: 99 Reward 7.584631846356225 local 49 offload 1151
Episode: 12  Steps: 99 Reward 40483.88335602 local 53 offload 1247
Episode: 13  Steps: 99 Reward 39.29084542305123 local 62 offload 1338
Episode: 14  Steps: 99 Reward 7.7866934497160845 l

Episode: 115  Steps: 99 Reward 0.018502163744459718 local 2031 offload 9569
Episode: 116  Steps: 99 Reward 0.018001337097743722 local 2128 offload 9572
Episode: 117  Steps: 99 Reward 0.018782679768585612 local 2222 offload 9578
Episode: 118  Steps: 99 Reward 0.018742657435201503 local 2317 offload 9583
Episode: 119  Steps: 99 Reward 0.018195250182811266 local 2412 offload 9588
Episode: 120  Steps: 99 Reward 0.017480436093457636 local 2508 offload 9592
Episode: 121  Steps: 99 Reward 0.017789505599827306 local 2603 offload 9597
Episode: 122  Steps: 99 Reward 0.018795523938097924 local 2698 offload 9602
Episode: 123  Steps: 99 Reward 0.01746276362503889 local 2794 offload 9606
Episode: 124  Steps: 99 Reward 0.018319126716313056 local 2892 offload 9608
Episode: 125  Steps: 99 Reward 0.017728050780106367 local 2988 offload 9612
Episode: 126  Steps: 99 Reward 0.017772502717643114 local 3085 offload 9615
Episode: 127  Steps: 99 Reward 0.017539990513032396 local 3181 offload 9619
Episode: 128 

Episode: 223  Steps: 99 Reward 0.017404135391362496 local 12311 offload 10089
Episode: 224  Steps: 99 Reward 0.0179108910671618 local 12404 offload 10096
Episode: 225  Steps: 99 Reward 0.016487584508420042 local 12500 offload 10100
Episode: 226  Steps: 99 Reward 0.01671321356518037 local 12599 offload 10101
Episode: 227  Steps: 99 Reward 0.017552192634570484 local 12697 offload 10103
Episode: 228  Steps: 99 Reward 0.01835752205400875 local 12791 offload 10109
Episode: 229  Steps: 99 Reward 0.01677219235794019 local 12888 offload 10112
Episode: 230  Steps: 99 Reward 0.017391761605966304 local 12985 offload 10115
Episode: 231  Steps: 99 Reward 0.018255154842020074 local 13078 offload 10122
Episode: 232  Steps: 99 Reward 0.017683942326586568 local 13176 offload 10124
Episode: 233  Steps: 99 Reward 0.01708199253038206 local 13271 offload 10129
Episode: 234  Steps: 99 Reward 0.016019738981907995 local 13363 offload 10137
Episode: 235  Steps: 99 Reward 0.016986751544350413 local 13457 offloa

Episode: 330  Steps: 99 Reward 0.015786612083463124 local 22450 offload 10650
Episode: 331  Steps: 99 Reward 0.018486965115590025 local 22543 offload 10657
Episode: 332  Steps: 99 Reward 0.01651496110762643 local 22639 offload 10661
Episode: 333  Steps: 99 Reward 0.0164070831603837 local 22735 offload 10665
Episode: 334  Steps: 99 Reward 0.01658643560128214 local 22830 offload 10670
Episode: 335  Steps: 99 Reward 0.017842642701747343 local 22923 offload 10677
Episode: 336  Steps: 99 Reward 0.017031391663998564 local 23020 offload 10680
Episode: 337  Steps: 99 Reward 0.017765172082902757 local 23116 offload 10684
Episode: 338  Steps: 99 Reward 0.01882932090846909 local 23210 offload 10690
Episode: 339  Steps: 99 Reward 0.01863561786645883 local 23305 offload 10695
Episode: 340  Steps: 99 Reward 0.017432240741680403 local 23397 offload 10703
Episode: 341  Steps: 99 Reward 0.01714859647727466 local 23489 offload 10711
Episode: 342  Steps: 99 Reward 0.017398647638900887 local 23584 offload

Episode: 436  Steps: 99 Reward 0.017438984642514237 local 32525 offload 11175
Episode: 437  Steps: 99 Reward 0.01754242527195866 local 32622 offload 11178
Episode: 438  Steps: 99 Reward 0.017611951012866646 local 32714 offload 11186
Episode: 439  Steps: 99 Reward 0.017033652633528486 local 32807 offload 11193
Episode: 440  Steps: 99 Reward 0.016756982466773616 local 32899 offload 11201
Episode: 441  Steps: 99 Reward 0.016395659474473773 local 32994 offload 11206
Episode: 442  Steps: 99 Reward 0.017361852885567818 local 33089 offload 11211
Episode: 443  Steps: 99 Reward 0.017514852100517654 local 33184 offload 11216
Episode: 444  Steps: 99 Reward 0.017707734632256146 local 33281 offload 11219
Episode: 445  Steps: 99 Reward 0.016667986873411162 local 33375 offload 11225
Episode: 446  Steps: 99 Reward 0.017051772826754474 local 33469 offload 11231
Episode: 447  Steps: 99 Reward 0.018562549748374898 local 33561 offload 11239
Episode: 448  Steps: 99 Reward 0.017960293042451225 local 33654 o

Episode: 542  Steps: 99 Reward 0.017315678151866264 local 42568 offload 11732
Episode: 543  Steps: 99 Reward 0.017552819156417414 local 42662 offload 11738
Episode: 544  Steps: 99 Reward 0.017387809866557195 local 42757 offload 11743
Episode: 545  Steps: 99 Reward 0.01729238872478411 local 42852 offload 11748
Episode: 546  Steps: 99 Reward 0.017016338021999 local 42946 offload 11754
Episode: 547  Steps: 99 Reward 0.018461452703526192 local 43040 offload 11760
Episode: 548  Steps: 99 Reward 0.017008045638669672 local 43135 offload 11765
Episode: 549  Steps: 99 Reward 0.01832741374087381 local 43229 offload 11771
Episode: 550  Steps: 99 Reward 0.017686784114214572 local 43322 offload 11778
Episode: 551  Steps: 99 Reward 0.01744526376555301 local 43417 offload 11783
Episode: 552  Steps: 99 Reward 0.01743850685218146 local 43510 offload 11790
Episode: 553  Steps: 99 Reward 0.016439439101998278 local 43603 offload 11797
Episode: 554  Steps: 99 Reward 0.017991650669860505 local 43696 offload

KeyboardInterrupt: 