In [1]:
import os
from PIL import Image
import numpy as np
import mxnet as mx
import tqdm
from utils import check_dir
from memory import Memory
from utils import create_input, translate_state
from evaluation_mxnet import evaluate
from mxnet import gluon, nd, autograd
from environments.SimpleEnv import SimpleEnv



In [2]:
# ---- run identity ---------------------------------------------------------
# name of this training run; used as the stem of every log/checkpoint file
order = "model_test"

# ---- environment ----------------------------------------------------------
# agent view (forwarded to SimpleEnv as `agent_view`)
agent_view = 5
# map size (forwarded to SimpleEnv as `map_size`)
map_size = 20
# exclusive upper bound used when drawing a random action id
action_max = 3

# ---- optimisation ---------------------------------------------------------
# mini-batch size sampled from the replay memory for each update
batch_size = 16
# learning rate of the optimiser
lr = 0.001
# number of episodes to run
num_episode = 1000000
# discount factor (gamma) in the Q-loss target
gamma = 0.99

# ---- replay / target-network schedule -------------------------------------
# number of environment steps collected before annealing and learning start
replay_start = 2000
# number of steps between checkpointing the trained network and reloading it
# into the second network
update_step = 1000
# capacity handed to the replay memory
memory_length = 20000

# ---- exploration ----------------------------------------------------------
# number of steps over which epsilon is annealed linearly from 1 towards 0;
# epsilon stops decreasing once it reaches `epsilon_min`
annealing_end = 100000
# lower bound of epsilon for the epsilon-greedy policy
epsilon_min = 0.2

# ---- files ----------------------------------------------------------------
# directory that holds checkpoints
model_save = "./model_save/"
# file the training log is appended to
summary = "./{}_Reward.csv".format(order)
eval_statistics = "./{}_CSV.csv".format(order)
# temporary files; joined with os.path.join so the result has no doubled
# "./" prefix or "//" separator
temporary_model = os.path.join(model_save, "{}.params".format(order))
temporary_pool = os.path.join(model_save, "{}.pool".format(order))

In [3]:
# Start every run with a fresh training log: the loop appends to `summary`.
if os.path.exists(summary):
    os.remove(summary)

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without one instead of failing at the first GPU allocation.
ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()

# Output directories used by this notebook (check_dir presumably creates
# them when missing -- TODO confirm against utils.check_dir).
for directory in ("model_save", "data_save"):
    check_dir(directory)

In [4]:
# build models
from model.simple_stack import SimpleStack

# Two networks with the same architecture. `offline_model` is the one handed
# to the trainer; `online_model` only ever receives weights by loading a
# checkpoint written from `offline_model`.
online_model, offline_model = SimpleStack(), SimpleStack()

# Each network gets its own, independently drawn MSRAPrelu initialisation.
for net in (online_model, offline_model):
    net.collect_params().initialize(mx.init.MSRAPrelu(), ctx=ctx)

# Clear gradient buffers of the network that will be optimised.
offline_model.collect_params().zero_grad()

In [5]:
# create env (headless) and the replay memory that stores its transitions
env = SimpleEnv(map_size=map_size, agent_view=agent_view, display=False)
memory_pool = Memory(memory_length)

# bookkeeping: steps counted since learning started, and one slot per episode
annealing = 0
total_reward = np.zeros(num_episode)

# optimisation: L2 loss on the Q-values, Adam on the parameters of `offline_model`
loss_func = gluon.loss.L2Loss()
trainer = gluon.Trainer(
    params=offline_model.collect_params(),
    optimizer='adam',
    optimizer_params={'learning_rate': lr},
)

In [6]:
# one-shot flag: True until the "learning started" message has been printed
_print = True
# best       -> highest overall detect rate seen so far (drives the ".best" checkpoint)
# _all       -> total number of loop iterations taken across all episodes
# _update    -> number of training updates performed
best = _all = _update = 0

In [None]:
def _batch_hidden(samples):
    """Turn a list of stored per-transition hidden states into a batched pair.

    Every sample is moved to `ctx`, the samples are concatenated along axis 2
    and the two leading slices are returned as a list, which is the form the
    models are called with below.
    Assumes each stored sample has a leading axis of size 2 (the two state
    tensors) and that axis 2 is the batch axis -- TODO confirm against Memory.
    """
    stacked = nd.concat(*[nd.array(sample, ctx=ctx) for sample in samples], dim=2)
    return [stacked[0], stacked[1]]


# value of `annealing` at the most recent checkpoint/reload of the networks
_last_sync = annealing

for epoch in range(num_episode):
    env.reset_env()
    finish = 0
    cum_clipped_dr = 0
    if epoch == 51:
        print("Model Structure: ")
        print(offline_model)
    if sum(env.step_count) > replay_start and _print:
        print('annealing and learning are started')
        _print = False

    # epsilon decays linearly with the total number of environment steps and
    # is clamped at epsilon_min; it is drawn once per episode, so a whole
    # episode is played either randomly or by the model.
    eps = np.maximum(1 - sum(env.step_count) / annealing_end, epsilon_min)
    if np.random.random() < eps:
        by = "Random"
        hidden = offline_model.random_state()
    else:
        by = "Model"
        hidden = offline_model.begin_state()
    hidden_history = [hidden]

    while not finish:
        _all += 1
        if sum(env.step_count) > replay_start:
            annealing += 1
        if by == "Random":
            # Only choose the action here. The environment is stepped exactly
            # once below; stepping it in this branch as well advanced the
            # environment twice per iteration and dropped the first transition.
            action = np.random.randint(0, action_max)
            hidden_history.append(offline_model.random_state())
        else:
            data = create_input([translate_state(env.map.state())])
            data = [nd.array(i, ctx=ctx) for i in data]
            hidden = [i.as_in_context(ctx) for i in hidden]
            action, hidden = offline_model(data, hidden)
            action = int(nd.argmax(action, axis=1).asnumpy()[0])
            hidden_history.append(hidden)
        old, new, reward_get, finish = env.step(action)
        # NOTE(review): `old` is stored with the hidden state produced *after*
        # this step and `new` with a freshly drawn random state. Kept as in the
        # original; confirm this is the intended recurrent-replay scheme.
        old["hidden"] = [i.asnumpy() for i in hidden_history[-1]]
        new["hidden"] = [i.asnumpy() for i in offline_model.random_state()]
        memory_pool.add(old, new, action, reward_get, finish)

        if finish and epoch > 50:
            cum_clipped_dr += sum(env.detect_rate[-1]) / 2
            dr_50 = float(np.mean(env.detect_rate[-50:]))
            dr_all = float(np.mean(env.detect_rate))
            ar_50 = float(np.mean(env.rewards[-50:]))
            ar_all = float(np.mean(env.rewards))
            if epoch % 50 == 0:
                text = "DR: %f(50), %f(all), AR: %f(50), %f(all) eps: %f" % (dr_50, dr_all, ar_50, ar_all, eps)
                print(text)
                with open(summary, "a") as f:
                    f.write(text + "\n")
            # Save the trained network and reload it into the second network
            # roughly every `update_step` steps. This check only runs at the
            # end of an episode, so it compares against the step count of the
            # previous sync; an exact `annealing % update_step == 0` test would
            # only fire if an episode happened to end on a multiple.
            if annealing > replay_start and annealing - _last_sync >= update_step:
                _last_sync = annealing
                offline_model.save_parameters(temporary_model)
                online_model.load_parameters(temporary_model, ctx)
                if best < dr_all:
                    best = dr_all
                    offline_model.save_parameters(temporary_model + ".best")

    # one training update per episode once learning has started
    if annealing > replay_start:
        _update += 1
        # Sample a random mini-batch of transitions (smaller while the memory
        # still holds fewer than batch_size entries).
        bz = min(batch_size, len(memory_pool.memory))
        for_train = memory_pool.next_batch(bz)

        _state = [nd.array(i, ctx=ctx) for i in for_train["state"]]
        _state_next = [nd.array(i, ctx=ctx) for i in for_train["state_next"]]
        _hidden = _batch_hidden(for_train["hidden_state"])
        _hidden_next = _batch_hidden(for_train["hidden_state_next"])
        _finish = nd.array(for_train["finish"], ctx=ctx)
        _action = nd.array(for_train["action"], ctx=ctx)
        _reward = nd.array(for_train["reward"], ctx=ctx)

        with autograd.record(train_mode=True):
            # bootstrap value of the next state, zeroed for terminal transitions
            q_sp = nd.max(online_model(_state_next, _hidden_next)[0], axis=1)
            q_sp = q_sp * (nd.ones(bz, ctx=ctx) - _finish)
            # Q-value of the action that was actually taken
            q_s_array = offline_model(_state, _hidden)[0]
            q_s = nd.pick(q_s_array, _action, 1)
            loss = nd.mean(loss_func(q_s, (_reward + gamma * q_sp)))
        loss.backward()
        # The loss is already averaged over the batch, so the gradients must
        # not be divided by the batch size a second time.
        trainer.step(1)

    # `epoch` is already a 0-based index; `epoch - 1` wrote episode 0 into the
    # last slot and shifted every other entry by one.
    total_reward[epoch] = cum_clipped_dr

annealing and learning are started
Model Structure: 
SimpleStack(
  (map): HybridSequential(
    (0): Conv2D(None -> 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=None)
    (2): HardSwish(
      (act): HardSigmoid(
        (act): ReLU6(
        
        )
      )
    )
    (3): _ResUnit(
      (conv1): _Unit(
        (conv): Conv2D(None -> 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=16, bias=False)
        (bn): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=None)
        (act): Activation(
          (act): Activation(relu)
        )
      )
      (se): _SE(
        (pool): GlobalAvgPool2D(size=(1, 1), stride=(1, 1), padding=(0, 0), ceil_mode=True, global_pool=True, pool_type=avg, layout=NCHW)
        (conv1): Conv2D(None -> 8, kernel_size=(1, 1), stride=(1, 1))
        (act1): Activation(
      

DR: 0.032936(50), 0.029818(all), AR: -0.019950(50), -0.014073(all) eps: 0.900000
DR: 0.037959(50), 0.032514(all), AR: -0.022931(50), -0.017006(all) eps: 0.850000
DR: 0.051157(50), 0.037151(all), AR: -0.025080(50), -0.019014(all) eps: 0.800000
DR: 0.013405(50), 0.032421(all), AR: -0.031596(50), -0.021521(all) eps: 0.750000
DR: 0.021253(50), 0.030566(all), AR: -0.031853(50), -0.023237(all) eps: 0.700000
DR: 0.009452(50), 0.027558(all), AR: -0.048897(50), -0.026892(all) eps: 0.650000
DR: 0.024854(50), 0.027221(all), AR: -0.041771(50), -0.028748(all) eps: 0.600000
DR: 0.009797(50), 0.025289(all), AR: -0.058536(50), -0.032050(all) eps: 0.550000
DR: 0.024570(50), 0.025218(all), AR: -0.051684(50), -0.034010(all) eps: 0.500000
DR: 0.042838(50), 0.026817(all), AR: -0.048643(50), -0.035337(all) eps: 0.450000
DR: 0.019205(50), 0.026183(all), AR: -0.059481(50), -0.037346(all) eps: 0.400000
DR: 0.011948(50), 0.025090(all), AR: -0.067472(50), -0.039660(all) eps: 0.350000
DR: 0.020775(50), 0.024782(a

DR: 0.044408(50), 0.022657(all), AR: -0.060851(50), -0.067746(all) eps: 0.200000
DR: 0.028100(50), 0.022708(all), AR: -0.067153(50), -0.067740(all) eps: 0.200000
DR: 0.017378(50), 0.022658(all), AR: -0.063795(50), -0.067703(all) eps: 0.200000
DR: 0.034922(50), 0.022773(all), AR: -0.071041(50), -0.067734(all) eps: 0.200000
DR: 0.053204(50), 0.023054(all), AR: -0.071883(50), -0.067772(all) eps: 0.200000
DR: 0.010740(50), 0.022941(all), AR: -0.065009(50), -0.067747(all) eps: 0.200000
DR: 0.021848(50), 0.022931(all), AR: -0.078678(50), -0.067846(all) eps: 0.200000
DR: 0.041845(50), 0.023102(all), AR: -0.070070(50), -0.067866(all) eps: 0.200000
DR: 0.053302(50), 0.023371(all), AR: -0.055417(50), -0.067755(all) eps: 0.200000
DR: 0.010534(50), 0.023258(all), AR: -0.069850(50), -0.067774(all) eps: 0.200000
DR: 0.020788(50), 0.023236(all), AR: -0.075914(50), -0.067845(all) eps: 0.200000
DR: 0.043981(50), 0.023416(all), AR: -0.074295(50), -0.067901(all) eps: 0.200000
DR: 0.008948(50), 0.023292(a

In [None]:
import seaborn

In [None]:
# Running mean of the detect rate: point k is the mean of entries 0..k.
# Computed with one cumulative sum instead of re-averaging every prefix, which
# was quadratic in the number of entries, and the final entry is now included.
detect_rate = np.asarray(env.detect_rate, dtype=float)
if detect_rate.ndim > 1:
    # collapse the components of each entry so every entry contributes one
    # value (equal-sized entries keep the prefix means unchanged)
    detect_rate = detect_rate.mean(axis=tuple(range(1, detect_rate.ndim)))
y = np.cumsum(detect_rate) / np.arange(1, len(detect_rate) + 1)
ax = seaborn.lineplot(x=np.arange(len(y)), y=y)
# assumes env.detect_rate holds one entry per finished episode -- TODO confirm
ax.set(xlabel="episode", ylabel="running mean detect rate");

In [None]:
# Running mean of the reward: point k is the mean of entries 0..k.
# Computed with one cumulative sum instead of re-averaging every prefix, which
# was quadratic in the number of entries, and the final entry is now included.
rewards = np.asarray(env.rewards, dtype=float)
if rewards.ndim > 1:
    # collapse the components of each entry so every entry contributes one
    # value (equal-sized entries keep the prefix means unchanged)
    rewards = rewards.mean(axis=tuple(range(1, rewards.ndim)))
y = np.cumsum(rewards) / np.arange(1, len(rewards) + 1)
ax = seaborn.lineplot(x=np.arange(len(y)), y=y)
# assumes env.rewards holds one entry per finished episode -- TODO confirm
ax.set(xlabel="episode", ylabel="running mean reward");

In [None]:
# Scratch check: mean loss of the most recent training batch, rebuilt from the
# tensors the training loop left in the kernel namespace. Only works after at
# least one training update has run.
td_target = _reward + gamma * q_sp
nd.mean(loss_func(q_s, td_target))