In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import gym
import argparse
import numpy as np
from ns3gym import ns3env
from DQN_model import DeepQNetwork
from DQN_model import Eval_Model
from DQN_model import Target_Model
import tensorflow as tf

import matplotlib.pyplot as plt

In [None]:
# --- Agent hyper-parameters (handed positionally to DeepQNetwork further down) ---
learning_rate = 0.01
reward_decay = 0.9
e_greedy = 0.9
replace_target_iter = 100
memory_size = 500
batch_size = 32

# --- Experiment length ---
training_episodes = 20
testing_episodes = 10
episode_time = 300

# Upper bound on the number of entries in one action vector.
max_chosen = 3

# Per-episode repeat-chosen counters, filled in by the training / testing loops.
training_repeat_chosen, testing_repeat_chosen = [], []

# Locations of previously saved models; an empty string means nothing is
# configured, in which case loading fails and new models are created.
eval_model_path = ''  # '/workspace/model/eval_model'
target_model_path = ''  # '/workspace/model/target_model'

In [None]:
# Create the ns3gym environment with debug output switched on.
env = ns3env.Ns3Env(debug=True)

ob_space, ac_space = env.observation_space, env.action_space

# Length of the flat observation vector fed to the agent: every entry of
# 'slotUsedTable' plus all but one entry of 'pktBytes' (the loops below strip
# the first 'pktBytes' entry and use it separately as the queued byte count).
slot_table_len = ob_space['slotUsedTable'].shape[0]
pkt_bytes_len = ob_space['pktBytes'].shape[0]
ob_space_n = slot_table_len + pkt_bytes_len - 1

print("Observation space: ", ob_space, ob_space.dtype)
print("Action space: ", ac_space, ac_space.dtype)
print("n_action: ", ac_space.shape[0])

In [None]:
# Restore previously saved networks when the configured paths point at them;
# otherwise fall back to freshly created networks.
n_actions = ac_space.shape[0]

try:
    eval_model = tf.keras.models.load_model(eval_model_path)
    target_model = tf.keras.models.load_model(target_model_path)
    print('load model')
except Exception as load_error:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate, and the reason for the fallback is visible.
    # If only one of the two loads succeeded, both networks are re-created so
    # that they stay a matching pair.
    print('Could not load saved models ({}: {})'.format(type(load_error).__name__, load_error))
    eval_model = Eval_Model(num_actions=n_actions)
    target_model = Target_Model(num_actions=n_actions)
    print('Create new model')

# Argument order follows the DeepQNetwork constructor (defined in DQN_model).
RL = DeepQNetwork(n_actions, max_chosen, ob_space_n,
                  eval_model, target_model, learning_rate, reward_decay, e_greedy,
                  replace_target_iter, memory_size, batch_size)

In [None]:
# Train the agent for `training_episodes` episodes against the ns-3 environment.

# Maximum number of steps per episode. Derived from `episode_time` so the limit
# follows the configuration cell (equals int(300/0.132*64) with the default).
# NOTE(review): 0.132 and 64 presumably describe the frame duration and the
# number of steps per frame in the simulation -- confirm against the ns-3 script.
max_steps = int(episode_time / 0.132 * 64)

for episode in range(training_episodes):
    print ("-----------------------episodes: ", episode, " -----------------------")
    stepIdx = 0
    repeat_chosen_counter = 0

    # A raw observation has two parts: part 0 is used as-is, while the first
    # entry of part 1 is the queued byte count and is kept out of the state.
    _obs = env.reset()
    queueBytes = _obs[1][0]

    # Flatten to one vector and zero-pad it up to the fixed length ob_space_n.
    _obs = np.array(list(_obs[0]) + list(_obs[1][1:]))
    _obs = np.pad(_obs, (0, ob_space_n - _obs.size), constant_values=0)

    while True:
        stepIdx += 1

        # Second argument is True while training and False while testing
        # (presumably toggles exploration -- see DQN_model).
        action = np.array(RL.choose_action(_obs, True))

        # Use the queued bytes to decide how many data slots are chosen:
        # none for an empty queue, otherwise between 1 and max_chosen.
        # NOTE(review): 6250 and 0.3 look like a per-slot capacity in bytes and
        # a rounding offset -- confirm against the simulation parameters.
        action_num = 0 if queueBytes == 0 else min(max(0, int(queueBytes/6250 - 0.3)) + 1, max_chosen)

        # Mark the unused trailing entries with -1. Scalar assignment works for
        # any slice length, unlike assigning a list of a precomputed length.
        action[action_num:] = -1

        obs, reward, done, info = env.step(action)

        if info == "TimeOut":
            print("Step: ", stepIdx)
            print ("Repeat_chosen_counter :", repeat_chosen_counter)
            RL.learn()

            training_repeat_chosen.append(repeat_chosen_counter)
            break

        # Queued bytes reported with the new observation.
        queueBytes = obs[1][0]

        # One step carries several actions, each with its own reward. The
        # ns3gym reward is a single float, so the per-action rewards are sent
        # as a comma-separated string in `info` instead.
        reward_all = [float(r) for r in info.split(',')]

        obs = np.array(list(obs[0]) + list(obs[1][1:]))
        obs = np.pad(obs, (0, ob_space_n - obs.size), constant_values=0)

        # Store one transition per (action, reward) pair; rewards of -100 or
        # less are counted as repeated choices.
        for act, r in zip(action, reward_all):
            RL.store_transition(_obs, act, r, obs)

            if r <= -100:
                repeat_chosen_counter += 1

        # Learn on every 5th step once more than 64*50 steps have passed.
        if (stepIdx > 64*50) and (stepIdx % 5 == 0):
            RL.learn()

        # The new observation becomes the current state of the next step.
        _obs = obs

        if done or stepIdx >= max_steps:
            print("Step: ", stepIdx)
            print ("done")
            print ("Repeat_chosen_counter :", repeat_chosen_counter)
            RL.learn()

            training_repeat_chosen.append(repeat_chosen_counter)
            break
        


In [None]:
# Evaluate the trained agent for `testing_episodes` episodes. Nothing is stored
# in the replay memory and the networks are never updated in this cell.

# Maximum number of steps per episode. Derived from `episode_time` so the limit
# follows the configuration cell (equals int(300/0.132*64) with the default).
# NOTE(review): 0.132 and 64 presumably describe the frame duration and the
# number of steps per frame in the simulation -- confirm against the ns-3 script.
max_steps = int(episode_time / 0.132 * 64)

for episode in range(testing_episodes):
    print ("-----------------------episodes: ", episode, " -----------------------")
    stepIdx = 0
    repeat_chosen_counter = 0

    # A raw observation has two parts: part 0 is used as-is, while the first
    # entry of part 1 is the queued byte count and is kept out of the state.
    _obs = env.reset()
    queueBytes = _obs[1][0]

    # Flatten to one vector and zero-pad it up to the fixed length ob_space_n.
    _obs = np.array(list(_obs[0]) + list(_obs[1][1:]))
    _obs = np.pad(_obs, (0, ob_space_n - _obs.size), constant_values=0)

    while True:
        stepIdx += 1

        # Second argument is False while testing and True while training
        # (presumably disables exploration -- see DQN_model).
        action = np.array(RL.choose_action(_obs, False))

        # Use the queued bytes to decide how many data slots are chosen:
        # none for an empty queue, otherwise between 1 and max_chosen.
        # NOTE(review): 6250 and 0.3 look like a per-slot capacity in bytes and
        # a rounding offset -- confirm against the simulation parameters.
        action_num = 0 if queueBytes == 0 else min(max(0, int(queueBytes/6250 - 0.3)) + 1, max_chosen)

        # Mark the unused trailing entries with -1. Scalar assignment works for
        # any slice length, unlike assigning a list of a precomputed length.
        action[action_num:] = -1

        obs, reward, done, info = env.step(action)

        if info == "TimeOut":
            print("Step: ", stepIdx)
            print ("Repeat_chosen_counter :", repeat_chosen_counter)
            # No RL.learn() here: evaluation must not change the networks,
            # matching the other disabled learn/store calls of this cell.

            testing_repeat_chosen.append(repeat_chosen_counter)
            break

        # Queued bytes reported with the new observation.
        queueBytes = obs[1][0]

        # One step carries several actions, each with its own reward. The
        # ns3gym reward is a single float, so the per-action rewards are sent
        # as a comma-separated string in `info` instead.
        reward_all = [float(r) for r in info.split(',')]

        obs = np.array(list(obs[0]) + list(obs[1][1:]))
        obs = np.pad(obs, (0, ob_space_n - obs.size), constant_values=0)

        # Count rewards of -100 or less as repeated choices; zip() limits the
        # count to rewards that have a matching action entry.
        repeat_chosen_counter += sum(1 for _, r in zip(action, reward_all) if r <= -100)

        # The new observation becomes the current state of the next step.
        _obs = obs

        if done or stepIdx >= max_steps:
            print("Step: ", stepIdx)
            print ("done")
            print ("Repeat_chosen_counter :", repeat_chosen_counter)

            testing_repeat_chosen.append(repeat_chosen_counter)
            break
        


In [None]:
# Repeat-chosen counter of each training episode, i.e. how many per-action
# rewards of -100 or less were received during that episode.
fig, ax = plt.subplots()
ax.plot(training_repeat_chosen)
ax.set_xlabel("Training episode")
ax.set_ylabel("Repeat-chosen count")
ax.set_title("Repeated choices per training episode")
plt.show()

In [None]:
# Plot the agent's cost history. plot_cost() is implemented in DQN_model, which
# is not shown here -- presumably it draws the loss recorded by RL.learn().
RL.plot_cost()

In [None]:
# Close the ns3gym environment once training and testing are finished.
env.close()

In [None]:
# Write both networks to disk in TensorFlow's 'tf' save format. The target
# directories are fixed here and are not read from the *_model_path settings.
for net, out_dir in ((eval_model, '/workspace/model/eval_model'),
                     (target_model, '/workspace/model/target_model')):
    net.save(out_dir, save_format='tf')