In [1]:
import torch
import numpy as np
from utils import check_dir
from memory import Memory
from environments.SimpleEnv import SimpleEnv
from utils import create_input, translate_state
from evaluation_torch import evaluate
from model.simple_stack_torch import SimpleStack


In [2]:
# ---- run configuration -------------------------------------------------
# tag for this training run; used to name the log and checkpoint files
order = "Torch"
# number of transitions sampled from the memory pool per training step
batch_size = 512
# agent view (passed to SimpleEnv as `agent_view`)
agent_view = 5
# map size (passed to SimpleEnv as `map_size`)
map_size = 20
# exclusive upper bound of the action ids drawn during random exploration
action_max = 3
# directory that holds checkpoints
model_save = "./model_save/"
# learning rate of the Adam optimizer
lr = 0.001
# number of episodes to run
num_episode = 1000000
# environment steps to take before the annealing counter starts advancing
replay_start = 10000
# the checkpoint / target-network refresh is attempted when the annealing
# counter is a multiple of this value
update_step = 1000
# gamma (discount factor) in q-loss calculation
gamma = 0.99
# memory pool size
memory_length = 100000
# files to save the training log and evaluation statistics
summary = "./{}_Reward.csv".format(order)
eval_statistics = "./{}_CSV.csv".format(order)
# number of steps it takes to linearly anneal epsilon to its min value
annealing_end = 200000
# min level of stochasticity of the epsilon-greedy policy
epsilon_min = 0.2
# temporary files; `model_save` already starts with "./" and ends with "/",
# so strip the trailing separator instead of wrapping it in another "./…/"
# (the old format produced "././model_save//Torch.params")
temporary_model = "{}/{}.params".format(model_save.rstrip("/"), order)
temporary_pool = "{}/{}.pool".format(model_save.rstrip("/"), order)

In [3]:
# Pick the compute device: the current CUDA device when one is available,
# otherwise fall back to the CPU.
ctx = (
    torch.device("cuda:%s" % torch.cuda.current_device())
    if torch.cuda.is_available()
    else torch.device("cpu")
)
# Run check_dir on each output directory name
# (presumably it creates the directory when missing; confirm in utils).
for out_dir in ("model_save", "data_save"):
    check_dir(out_dir)

In [4]:
# build models
# NOTE on naming: `offline_model` is the network that is optimised and used
# to choose actions; `online_model` is only evaluated (detached) to produce
# the bootstrap targets, i.e. it plays the role of the DQN target network.
online_model = SimpleStack()
offline_model = SimpleStack()
online_model.to(ctx)
offline_model.to(ctx)
# Start the target network from the same weights as the trained network.
# Without this the two networks carry independent random initialisations and
# the first targets are computed from unrelated parameters.
online_model.load_state_dict(offline_model.state_dict())
offline_model.zero_grad(True)

In [5]:
# Build the environment and put it into its initial state.
env = SimpleEnv(agent_view=agent_view, map_size=map_size, display=False)
env.reset_env()

# Replay memory holding at most `memory_length` entries
# (capacity semantics assumed from the argument name; confirm in memory.py).
memory_pool = Memory(memory_length)

# Bookkeeping shared with the training cell.
annealing = 0                              # step counter advanced by the training loop
eval_result = list()                       # collects the results returned by evaluate()
total_reward = np.zeros(shape=num_episode)  # one slot per episode

# Optimisation objects: MSE loss and Adam over the trained network's weights.
loss_func = torch.nn.MSELoss()
trainer = torch.optim.Adam(params=offline_model.parameters(), lr=lr)

In [None]:
# Training loop.
#
# For every episode: reset the environment, act epsilon-greedily with
# `offline_model` until the episode finishes, store each transition in the
# memory pool, and (every second episode, once `annealing > replay_start`)
# take one gradient step on a sampled mini batch.  `online_model` supplies
# the bootstrap targets and is refreshed from `offline_model` at checkpoints.
_print = True
_last_dr_50 = 0
for epoch in range(num_episode):
    env.reset_env()
    finish = 0
    cum_clipped_dr = 0
    if epoch == 51:
        print("Model Structure: ")
        print(offline_model)
    if sum(env.step_count) > replay_start and _print:
        print('annealing and learning are started')
        _print = False
    while not finish:
        # total environment steps so far (env.step_count is summed as-is)
        total_steps = sum(env.step_count)
        if total_steps > replay_start:
            annealing += 1
        # linear decay of epsilon over `annealing_end` total steps, floored
        # at `epsilon_min`
        eps = np.maximum(1 - total_steps / annealing_end, epsilon_min)
        if np.random.random() < eps:
            # explore: uniformly random action id in [0, action_max)
            action = np.random.randint(0, action_max)
        else:
            # exploit: greedy action of the trained network; no graph is
            # needed for action selection, so run without autograd
            data = create_input([translate_state(env.map.state())])
            data = [torch.FloatTensor(i).to(ctx) for i in data]
            with torch.no_grad():
                q_values = offline_model(data)
            action = int(torch.argmax(q_values).item())
        old, new, reward_get, finish = env.step(action)
        memory_pool.add(old, new, action, reward_get, finish)
        if finish and epoch > 50:
            cum_clipped_dr += env.detect_rate[-1]
            dr_50 = float(np.mean(env.detect_rate[-50:]))
            dr_all = float(np.mean(env.detect_rate))
            if epoch % 50 == 0:
                text = "DR: %f(50), %f(all), eps: %f" % (dr_50, dr_all, eps)
                print(text)
                with open(summary, "a") as f:
                    f.write(text + "\n")
            if epoch % 100 == 0 and annealing > replay_start:
                eval_result.extend(evaluate(ctx, offline_model, env, 5))
            # save model and refresh the target network each update_step
            # NOTE(review): this check only runs on the step that ends an
            # episode, so it fires only when that step happens to land on a
            # multiple of `update_step` -- confirm this is intended.
            if annealing > replay_start and annealing % update_step == 0:
                if dr_50 >= _last_dr_50:
                    _last_dr_50 = dr_50
                    torch.save(offline_model.state_dict(), temporary_model)
                    # Copy the weights INTO the existing module.  The file
                    # holds a state_dict, so rebinding `online_model` to the
                    # result of torch.load() would replace the network with
                    # a plain dict and break the next forward pass.
                    online_model.load_state_dict(offline_model.state_dict())
    #  train every 2 epoch
    # NOTE(review): learning is gated on `annealing > replay_start`, while
    # the "started" message above is gated on total steps -- confirm.
    if annealing > replay_start and epoch % 2 == 0:
        # Sample random mini batch of transitions
        bz = min(batch_size, len(memory_pool.memory))
        for_train = memory_pool.next_batch(bz)
        state = [torch.FloatTensor(i).to(ctx) for i in for_train["state"]]
        state_next = [torch.FloatTensor(i).to(ctx) for i in for_train["state_next"]]
        # batch tensors get their own names so they do not shadow the
        # per-step `finish` / `action` variables of the episode loop
        done_batch = torch.FloatTensor(for_train["finish"]).to(ctx)
        action_batch = torch.LongTensor(for_train["action"]).to(ctx)
        reward_batch = torch.FloatTensor(for_train["reward"]).to(ctx)
        # bootstrap value of the next state, zeroed for finished transitions
        with torch.no_grad():
            q_n = torch.max(online_model(state_next), dim=1).values
        q_n = q_n * (1 - done_batch)
        # Q-value of the action that was actually taken; squeeze only the
        # gathered column so a batch of one keeps its batch dimension
        q_e = offline_model(state).gather(1, action_batch.unsqueeze(-1)).squeeze(-1)
        q_t = reward_batch + gamma * q_n
        # MSELoss already averages over the batch
        loss = loss_func(q_e, q_t)
        trainer.zero_grad()
        loss.backward()
        trainer.step()
    # index by the episode itself; `epoch - 1` stored episode 0 in the last
    # slot and shifted every other entry by one
    total_reward[epoch] = cum_clipped_dr

Model Structure: 
SimpleStack(
  (view): Sequential(
    (0): Conv2d(2, 256, kernel_size=(1, 1), stride=(1, 1))
    (1): Conv2d(256, 128, kernel_size=(2, 2), stride=(1, 1))
    (2): Conv2d(128, 128, kernel_size=(2, 2), stride=(1, 1))
  )
  (map): Sequential(
    (0): Conv2d(3, 256, kernel_size=(1, 1), stride=(1, 1))
    (1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1))
    (2): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1))
    (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1))
    (4): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1))
    (5): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1))
  )
  (decision_making): Sequential(
    (0): Linear(in_features=13953, out_features=1024, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=1024, out_features=64, bias=True)
    (3): Sigmoid()
    (4): Linear(in_features=64, out_features=3, bias=True)
    (5): Sigmoid()
  )
)
DR: 0.062128(50), 0.054079(all), eps: 0.964525
annealing and learning are started
DR: 0.049