In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import gym
import argparse
import numpy as np
from ns3gym import ns3env
from DQN_model import DeepQNetwork
from DQN_model import Eval_Model
from DQN_model import Target_Model
import tensorflow as tf

import matplotlib.pyplot as plt

In [2]:
# --- Agent hyper-parameters (all forwarded to DeepQNetwork when the agent is built) ---
learning_rate = 0.01
reward_decay = 0.9
e_greedy = 0.9
replace_target_iter = 100
memory_size = 1000
batch_size = 32

# --- Experiment schedule ---
training_episodes = 50
episode_time = 300  # presumably seconds of simulated time per episode -- TODO confirm
max_choice = 3      # upper bound on the number of slots selected in one step

# --- Metric accumulators filled by the training loop and plotted afterwards ---
training_repeat_choice, testing_repeat_choice = [], []
optimized_choice, throughput_list = [], []

# --- Checkpoints to restore; an empty string means no checkpoint is configured ---
eval_model_weights_path = ''    # e.g. '/workspace/model/eval_model_weights'
target_model_weights_path = ''  # e.g. '/workspace/model/target_model_weights'


In [3]:
# Create the ns3-gym environment (this starts the ns-3 simulation process).
# Alternative kept for reference: env = gym.make('ns3-v0')
env = ns3env.Ns3Env(debug=True)

ob_space, ac_space = env.observation_space, env.action_space

# Number of entries in the slot-usage table.
n_slotUsedTable = ob_space['slotUsedTable'].shape[0]
# Length of the flattened feature vector: every slotUsedTable entry plus
# all pktBytes entries except one.
ob_space_n = n_slotUsedTable + ob_space['pktBytes'].shape[0] - 1

print("Observation space: ", ob_space, ob_space.dtype)
print("Action space: ", ac_space, ac_space.dtype)
print("n_action: ", ac_space.shape[0])

Got new port for ns3gm interface:  9934
Start command:  /tf/ns3-gym/waf --run "tdma-rl --openGymPort=9934 --simSeed=3795307641"
Started ns3 simulation script, Process Id:  17705
Observation space:  Dict(pktBytes:Box(4,), slotUsedTable:Box(32,)) None
Action space:  Box(32,) int64
n_action:  32


In [4]:
# Build the eval and target networks consumed by DeepQNetwork; both get one
# output per entry of the action space.
num_actions = ac_space.shape[0]
eval_model = Eval_Model(num_actions=num_actions)
target_model = Target_Model(num_actions=num_actions)

# Restoring a checkpoint is best-effort: on any failure training starts from
# freshly initialised networks. Only attempt it when a path is configured.
if eval_model_weights_path or target_model_weights_path:
    try:
        eval_model.load_weights(eval_model_weights_path)
        target_model.load_weights(target_model_weights_path)
        print('Load weights from ',eval_model_weights_path," and ",target_model_weights_path)
    except Exception as err:
        # `Exception` (not a bare `except`) so that KeyboardInterrupt/SystemExit
        # still propagate; report the reason so a broken checkpoint is visible.
        print('Create new model')
        print('Could not load weights: ', repr(err))
else:
    print('Create new model')

RL = DeepQNetwork(num_actions, max_choice, ob_space_n,
                  eval_model, target_model, learning_rate, reward_decay, e_greedy,
                  replace_target_iter, memory_size, batch_size)

Create new model


In [5]:
def slot_select(q_values, choosable_slotNum, queue_bytes=None, max_slots=None):
    """Choose which slot indices to request for the current step.

    Parameters
    ----------
    q_values : 1-D array-like
        One score per slot index; higher means more preferred.
    choosable_slotNum : int
        Upper bound on how many slots may be requested in this step.
    queue_bytes : number, optional
        Bytes currently queued. Defaults to the notebook-level ``queueBytes``
        so existing two-argument calls keep working.
    max_slots : int, optional
        Length of the returned action vector. Defaults to the notebook-level
        ``max_choice``.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``max_slots``. The first ``k`` entries are the
        ``k`` highest-scoring slot indices in ascending index order; the
        remaining entries are ``-1`` (no slot requested).
    """
    if queue_bytes is None:
        queue_bytes = queueBytes
    if max_slots is None:
        max_slots = max_choice

    q_values = np.asarray(q_values)

    # Indices of the `max_slots` best scores, ranked best-first. The ranking
    # must be descending: the truncation below keeps the *front* of this
    # array, and an ascending order would keep the worst of the candidates.
    candidates = np.argpartition(q_values, -max_slots)[-max_slots:]
    ranked = candidates[np.argsort(q_values[candidates])[::-1]]

    # The amount of queued data decides how many slots are requested:
    # none for an empty queue, otherwise at least one, growing with
    # queue_bytes / 6250 (presumably the per-slot capacity -- TODO confirm).
    slot_cap = min(choosable_slotNum, max_slots)
    if queue_bytes == 0:
        n_selected = 0
    else:
        n_selected = min(max(0, int(queue_bytes / 6250 - 0.3)) + 1, slot_cap)
    n_selected = max(0, n_selected)

    # Unused positions are marked with -1; chosen indices are sorted ascending.
    action = np.full(max_slots, -1, dtype=ranked.dtype)
    action[:n_selected] = np.sort(ranked[:n_selected])
    return action

In [None]:
def _free_slot_budget(raw_obs):
    """Return how many slots may be requested, given a raw observation.

    Counts the zero entries of ``raw_obs[0]`` (the slot-usage table; nonzero
    entries are treated as taken) and caps the count at ``max_choice``.
    """
    return min(n_slotUsedTable - np.nonzero(raw_obs[0])[0].size, max_choice)


def _flatten_observation(raw_obs):
    """Convert a raw environment observation into the agent's feature vector.

    ``raw_obs[0]`` is the slot-usage table and ``raw_obs[1]`` the pktBytes
    vector whose first element is the queued byte count. The result is the
    slot table followed by the remaining pktBytes entries, zero-padded to
    ``ob_space_n`` and with the pktBytes part divided by the queued byte count
    (all zeros when the queue is empty).

    Returns ``(features, queue_bytes)``.
    """
    queue_bytes = raw_obs[1][0]
    features = np.array(list(raw_obs[0]) + list(raw_obs[1][1:]), dtype=float)
    features = np.pad(features, (0, ob_space_n - features.size), constant_values=0)
    if queue_bytes != 0:
        features[n_slotUsedTable:] = features[n_slotUsedTable:] / queue_bytes
    else:
        features[n_slotUsedTable:] = 0
    return features, queue_bytes


def _record_episode(step_idx, is_training, repeats, total, optimized,
                    non_repeat, throughput, finished):
    """Print the episode summary and append its metrics to the global lists.

    ``finished`` is True when the episode ended through ``done`` or the step
    cap and False when the environment reported "TimeOut". After a time-out
    the repeat ratio is only recorded if at least one choice was made.
    """
    print("Step: ", step_idx)
    if finished:
        print ("done")

    ratio = repeats / total if total != 0 else 0
    print ("Repeat_choice_ratio :", ratio)
    print ("Repeat_choice_counter :", repeats)
    print ("Throughput :", throughput)

    if is_training:
        RL.learn()
    if finished or total != 0:
        history = training_repeat_choice if is_training else testing_repeat_choice
        history.append(ratio)

    if non_repeat != 0:
        optimized_choice.append(optimized / non_repeat)

    throughput_list.append(throughput)


# Hard cap on steps per episode, derived from `episode_time`. The 0.04 and 16
# factors are carried over unchanged from the original literal expression
# (presumably slot duration and steps per slot -- TODO confirm).
max_steps_per_episode = int(episode_time/0.04*16)

for episode in range(training_episodes):

    # Every 10th episode is evaluation-only: nothing is stored or learned.
    isTraining = (episode + 1) % 10 != 0
    if isTraining:
        print ("-----------------------episodes: ", episode, " (Training)----------------------")
    else:
        print ("-----------------------episodes: ", episode, " (Testing)-----------------------")

    stepIdx = 0
    repeat_choice_counter = 0
    total_choice_counter = 0
    optimized_choice_counter = 0
    nonRepeat_choice_counter = 0
    throughput = 0

    raw_obs = env.reset()
    choosable_slotNum = _free_slot_budget(raw_obs)
    # The first observation goes through the same normalisation as every later
    # one, so `_obs` is always on the same scale when it is fed to the network
    # and when its trailing entries are used as reward weights.
    # `queueBytes` stays a notebook-level name because slot_select reads it.
    _obs, queueBytes = _flatten_observation(raw_obs)

    while True:
        stepIdx += 1

        action = np.array(slot_select(RL.choose_action(_obs, isTraining), choosable_slotNum))

        raw_obs, reward, done, info = env.step(action)

        if info == "TimeOut":
            _record_episode(stepIdx, isTraining, repeat_choice_counter,
                            total_choice_counter, optimized_choice_counter,
                            nonRepeat_choice_counter, throughput, finished=False)
            break

        choosable_slotNum = _free_slot_budget(raw_obs)
        obs, queueBytes = _flatten_observation(raw_obs)

        # One step carries several actions and therefore several rewards, but
        # the ns3gym reward is a single float. The per-action rewards are sent
        # through `info` as comma-separated numbers; the last value is the
        # throughput.
        info_values = [float(v) for v in info.split(',')]
        throughput = info_values[-1]
        reward_all = info_values[:-1]

        # Slot indices grouped by the value (1, 2, 3) stored in the previous
        # observation's slot table, and all indices that were nonzero.
        top_index = [np.ravel(np.argwhere(_obs[:n_slotUsedTable] == level + 1))
                     for level in range(3)]
        occupied = np.nonzero(_obs[:n_slotUsedTable])[0]

        for act, r in zip(action, reward_all):
            if act == -1:
                # Placeholder entry: no slot was requested at this position.
                continue

            total_choice_counter += 1

            if act not in occupied:
                # Bonus for picking a slot that was free in the previous observation.
                r += 0.5

                counted_as_optimized = False

                # Extra reward for sitting just before a slot of group i,
                # weighted by the matching trailing feature of `_obs`.
                for i in range(3):
                    if any(act < idx for idx in top_index[i]):
                        distance_map = top_index[i] - act
                        closest_distance = min(np.where(distance_map > 0, distance_map, np.inf))
                        distance_reward = (1.2 - pow((closest_distance/10), 0.1))
                        size_weight = _obs[-(3 - i)]
                        r += (distance_reward * size_weight)

                        if not counted_as_optimized:
                            counted_as_optimized = True
                            optimized_choice_counter += 1

                if all(idx.size == 0 for idx in top_index):
                    # No slot holds 1, 2 or 3: measure distance to the table end instead.
                    closest_distance = n_slotUsedTable - act
                    distance_reward = (1.2 - pow((closest_distance/10), 0.1))
                    r += distance_reward
                else:
                    nonRepeat_choice_counter += 1

            if isTraining:
                RL.store_transition(_obs, act, r, obs)

            # A non-positive shaped reward is counted as a repeated choice.
            if r <= 0:
                repeat_choice_counter += 1

        # Learn periodically once more than 1000 choices have been made.
        if (total_choice_counter > 1000) and (total_choice_counter % 10 == 0) and isTraining:
            RL.learn()

        # swap observation
        _obs = obs

        if done or stepIdx == max_steps_per_episode:
            _record_episode(stepIdx, isTraining, repeat_choice_counter,
                            total_choice_counter, optimized_choice_counter,
                            nonRepeat_choice_counter, throughput, finished=True)
            break

-----------------------episodes:  0  (Training)----------------------
Step:  120000
done
Repeat_choice_ratio : 0.21190965581041166
Repeat_choice_counter : 71240
Throughput : 81333.0
-----------------------episodes:  1  (Training)----------------------
Got new port for ns3gm interface:  8379
Start command:  /tf/ns3-gym/waf --run "tdma-rl --openGymPort=8379 --simSeed=3599096750"
Started ns3 simulation script, Process Id:  11267
Step:  120000
done
Repeat_choice_ratio : 0.21524537218442394
Repeat_choice_counter : 72511
Throughput : 80361.0
-----------------------episodes:  2  (Training)----------------------
Got new port for ns3gm interface:  8782
Start command:  /tf/ns3-gym/waf --run "tdma-rl --openGymPort=8782 --simSeed=1578943919"
Started ns3 simulation script, Process Id:  3071
Step:  120000
done
Repeat_choice_ratio : 0.22040221094244244
Repeat_choice_counter : 74207
Throughput : 81046.0
-----------------------episodes:  3  (Training)----------------------
Got new port for ns3gm interf

In [None]:
# One figure per metric list filled by the training loop. Each list receives
# at most one value per episode, so the x axis is the order of recording.
metric_series = [
    ("optimized_choice", optimized_choice),
    ("training_repeat_choice", training_repeat_choice),
    ("testing_repeat_choice", testing_repeat_choice),
    ("throughput_list", throughput_list),
]

for metric_name, metric_values in metric_series:
    fig, ax = plt.subplots()
    ax.plot(metric_values)
    ax.set(title=metric_name, xlabel="recorded episode", ylabel=metric_name)
    plt.show()

In [None]:
# Plot the agent's cost history. plot_cost is defined in DQN_model (not shown
# here); presumably it draws the loss recorded by RL.learn() -- TODO confirm.
RL.plot_cost()

In [None]:
# Close the ns3-gym environment once training and plotting are finished.
env.close()

In [None]:
# Save the weights of both networks.
# NOTE(review): the destination is hard-coded here, while the load step reads
# eval_model_weights_path / target_model_weights_path from the config cell
# (both empty by default) -- set those to these paths to resume from this save.
eval_model.save_weights('/workspace/model/eval_model_weights')
target_model.save_weights('/workspace/model/target_model_weights')