In [None]:
import os
import numpy as np
import mxnet as mx
from model.simple_stack import SimpleStack
from utils import check_dir
from memory import Memory
from environments.SimpleEnv import SimpleEnv
from utils import create_input, translate_state
from evaluation import evaluate
from mxnet import gluon, nd, autograd

In [3]:
# Experiment configuration: every tunable value used by the cells below.
# experiment tag, used to name the log, statistics and checkpoint files
order = "TEST"
# agent view size (passed to the model constructor together with map_size)
agent_view = 5
map_size = 10
# action max (passed to the DQN algorithm)
action_max = 3
# directory that holds the temporary checkpoint and replay-pool files
model_save = "./model_save/"
# learning rate
lr = 0.005
num_episode = 1000000
# step threshold the training loop compares against before it starts
# annealing and learning
replay_start = 10000
# number of annealing steps between parameter copies / checkpoint saves
update_step = 1000
# gamma in q-loss calculation
gamma = 0.99
# memory pool size
memory_length = 100000
# files to save the training log and the evaluation statistics
summary = "./{}_Reward.csv".format(order)
eval_statistics = "./{}_CSV.csv".format(order)
# number of steps it takes to linearly anneal epsilon to its minimum value
annealing_end = 200000
# minimum stochasticity (epsilon) of the epsilon-greedy policy
epsilon_min = 0.2
# temporary files; os.path.join avoids the doubled separators
# ("././model_save//TEST.params") that the previous string template produced
temporary_model = os.path.join(model_save, "{}.params".format(order))
temporary_pool = os.path.join(model_save, "{}.pool".format(order))

In [4]:
# Start every run with a fresh training log.
if os.path.exists(summary):
    os.remove(summary)
# Prefer the GPU, but fall back to the CPU when no GPU is visible so the
# notebook still runs on CPU-only machines.
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()
# presumably check_dir creates the directory when it is missing -- confirm in utils
for i in ["model_save", "data_save"]:
    check_dir(i)

In [5]:
# Build the two networks with identical architecture and give each one
# Normal(0.02) initialised parameters on the selected context.
online_model = SimpleStack(agent_view, map_size)
offline_model = SimpleStack(agent_view, map_size)
for _net in (online_model, offline_model):
    _net.collect_params().initialize(mx.init.Normal(0.02), ctx=ctx)
# clear the gradient buffers of the offline network
offline_model.collect_params().zero_grad()



In [6]:
# create env
# The environment is built with display disabled and reset once so that
# env.map.state() can be read by the cells below.
env = SimpleEnv(display=False)
env.reset_env()
# Replay memory shared with the DQN algorithm; sized by memory_length and
# bound to the same context as the models.
memory_pool = Memory(memory_length, ctx=ctx)

In [7]:
# Episode counter kept outside the training cell: the training loop starts its
# range at _epoch and increments it once per episode, so re-running the
# training cell continues from the last episode instead of restarting at 0.
_epoch = 0

In [8]:
# One-shot flag: the training loop sets it to True after printing the
# "annealing and learning are started" message so the message appears once.
_print = False

In [9]:
# Single forward pass of the offline model on the current environment state.
# NOTE(review): presumably a smoke test / shape-inference pass -- confirm.
initial_state = translate_state(env.map.state())
data = create_input([initial_state], ctx)
action = offline_model(data)

In [10]:
# workflow
# Training loop: play episodes with an epsilon-greedy policy, store every
# transition in the replay memory, periodically copy/save parameters and
# train the DQN algorithm.
# NOTE(review): DQN and copy_params are not imported by the import cell of
# this notebook; they must be brought into scope for a fresh-kernel run.
algorithm = DQN([online_model, offline_model], ctx, lr, gamma, memory_pool,
                action_max, temporary_model, bz=1024)
# number of steps taken since the step total first exceeded replay_start
annealing = 0
# per-episode reward, indexed by episode number
total_reward = np.zeros(num_episode)
eval_result = []
for epoch in range(_epoch, num_episode):
    # keep the resume counter in sync with the episodes already played
    _epoch += 1
    env.reset_env()
    finish = 0
    cum_clipped_dr = 0
    if epoch == 100:
        print("Model Structure: ")
        print(offline_model)
    if sum(env.step_count) > replay_start and not _print:
        print('annealing and learning are started')
        _print = True
    while not finish:
        # step total so far; computed once per step and reused below
        total_steps = sum(env.step_count)
        if total_steps > replay_start:
            annealing += 1
        # linear decay from 1 towards epsilon_min over annealing_end steps
        eps = np.maximum(1 - total_steps / annealing_end, epsilon_min)
        action, by = algorithm.get_action(env.map.state(), eps)
        old, new, reward_get, finish = env.step(action)
        memory_pool.add(old, new, action, reward_get, finish)
        if finish and epoch > 50:
            cum_clipped_dr += env.detect_rate[-1]
            dr_50 = float(np.mean(env.detect_rate[-50:]))
            dr_all = float(np.mean(env.detect_rate))
            if epoch % 50 == 0:
                text = "DR: %f(50), %f(all), eps: %f" % (dr_50, dr_all, eps)
                print(text)
                with open(summary, "a") as f:
                    # write() is the right call for a single string
                    f.write(text + "\n")
            if epoch % 100 == 0 and annealing > replay_start:
                eval_result.extend(evaluate(offline_model, 5, ctx))
        # copy parameters between the models and save the offline model
        # every update_step annealing steps
        if annealing > replay_start and annealing % update_step == 0:
            copy_params(offline_model, online_model)
            offline_model.save_parameters(temporary_model)
    # train on every second episode once learning has started
    if annealing > replay_start and epoch % 2 == 0:
        algorithm.train()
    # index by the episode itself; "epoch - 1" wrote episode 0 into the last
    # slot (index -1) and shifted every other entry by one
    total_reward[epoch] = cum_clipped_dr

Model Structure: 
SimpleStack(
  (view_decode): Sequential(
    (0): Dense(588 -> 1024, linear)
    (1): Dense(1024 -> 1024, linear)
    (2): Dense(1024 -> 128, linear)
    (3): Dense(128 -> 128, linear)
  )
  (map_decode): Sequential(
    (0): Dense(4800 -> 1024, linear)
    (1): Dense(1024 -> 1024, linear)
    (2): Dense(1024 -> 128, linear)
    (3): Dense(128 -> 128, linear)
  )
  (decision_making): Sequential(
    (0): Dense(260 -> 2048, linear)
    (1): Dense(2048 -> 2048, linear)
    (2): Dense(2048 -> 512, linear)
    (3): Dense(512 -> 128, linear)
    (4): Dense(128 -> 64, linear)
    (5): Dense(64 -> 3, Activation(relu))
  )
)
DR: 0.051664(50), 0.053894(all), eps: 0.950000
annealing and learning are started
DR: 0.073603(50), 0.060420(all), eps: 0.925000
DR: 0.035858(50), 0.054310(all), eps: 0.900000
DR: 0.100320(50), 0.063475(all), eps: 0.875000
DR: 0.065033(50), 0.063734(all), eps: 0.850000
DR: 0.064739(50), 0.063877(all), eps: 0.825000
DR: 0.091645(50), 0.067340(all), eps: 0

KeyboardInterrupt: 