In [1]:
import os, os.path
import random
import numpy as np
import math
import matplotlib.pyplot as plt
import time
import tensorflow as tf
from collections import deque
import itertools
from sklearn.utils import shuffle
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense
import tensorflow.keras.losses as kls
from libs.utils import *
from libs.generate_boxes import *
from libs.dqn import *

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="0"
tf_device='/gpu:0'
tf.get_logger().setLevel('INFO')
tf.keras.backend.floatx()

'float32'

In [3]:
num_episode = 1500
global_step = 0
tr_l, h_fill, tr_r, avg_loss_l,history_eps,used_boxes_eps  = [],[],[],[],[],[]
N_MDD = 7
K = 4
n_candidates =4

In [4]:
boxes_multi1 = [np.array([[20, 20,  4],
         [20,  4,  4],
         [20,  4,  4],
         [20,  4,  4],
         [20,  4,  4],
         [20,  4,  4],
         [20, 20,  4],
         [20, 20,  4],
         [20, 20,  4]])]
gt_pos1 = [np.array([[ 0,  0,  0],
         [ 0,  0,  4],
         [ 0,  4,  4],
         [ 0,  8,  4],
         [ 0, 12,  4],
         [ 0, 16,  4],
         [ 0,  0,  8],
         [ 0,  0, 12],
         [ 0,  0, 16]])]

boxes_multi2 = [np.array([[20, 20,  5],
        [ 4, 20,  5],
        [ 4, 20,  5],
        [ 4, 20,  5],
        [ 4, 20,  5],
        [ 4, 20,  5],
        [10, 20,  5],
        [10, 20,  5],
        [20, 20,  5]])]

gt_pos2 = [np.array([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 4,  0,  5],
        [ 8,  0,  5],
        [12,  0,  5],
        [16,  0,  5],
        [ 0,  0, 10],
        [10,  0, 10],
        [ 0,  0, 15]])]

In [5]:
num_max_boxes = max(len(boxes_multi1[0]), len(boxes_multi2[0]))
num_max_remain = num_max_boxes
print('num_max_boxes',num_max_boxes,'num_max_remain',num_max_remain)

num_max_boxes 9 num_max_remain 9


In [6]:
env=Bpp3DEnv()

In [7]:
class PPO_Actor(tf.keras.Model):
    def __init__(self, state_size, selected_size, remain_size, loading_size, output_size):
        super(PPO_Actor, self).__init__()
        
        l1, b1, k1 = state_size
        self.state_size = (l1*b1*k1,)
        self.case_dnn1 = Dense(64, activation='relu', input_shape=self.state_size)
        self.case_dnn2 = Dense(64, activation='relu')
        
        l2, b2, k2 = selected_size
        self.selected_size = (l2*b2*k2,)
        self.select_dnn1 = Dense(64, activation='relu', input_shape=self.selected_size)
        self.select_dnn2 = Dense(64, activation='relu')
        
        l3, b3, k3 = remain_size
        self.remain_size = (l3*b3*k3,)
        self.remain_dnn1 = Dense(128, activation='relu', input_shape=self.remain_size)
        self.remain_dnn2 = Dense(128, activation='relu')
        
        l4, b4, k4 = loading_size
        self.loading_size = (l4*b4*k4,)
        self.loading_dnn1 = Dense(128, activation='relu', input_shape=self.loading_size)
        self.loading_dnn2 = Dense(128, activation='relu')
        
        self.fc1 = Dense(256, activation='relu')
        self.fc2 = Dense(128, activation='relu')
        self.fc_out = Dense(output_size)
        
    def call(self, cb_list):
        c,s,r,l = cb_list[0], cb_list[1], cb_list[2], cb_list[3]
        c = tf.reshape(c, [-1, self.state_size[0]])
        s = tf.reshape(s, [-1, self.selected_size[0]])
        r = tf.reshape(r, [-1, self.remain_size[0]])
        l = tf.reshape(l, [-1, self.loading_size[0]])
        
        c = self.case_dnn1(c)
        c = self.case_dnn2(c)
        
        s = self.select_dnn1(s)
        s = self.select_dnn2(s)
        
        r = self.remain_dnn1(r)
        r = self.remain_dnn2(r)
        
        l = self.loading_dnn1(l)
        l = self.loading_dnn2(l)
        
        x = concatenate([c,s,r,l])
        x = self.fc1(x)
        x = self.fc2(x)
        q = self.fc_out(x)
        return q

In [8]:
class PPO_Critic(tf.keras.Model):
    def __init__(self, state_size, selected_size, remain_size, loading_size, output_size):
        super(PPO_Critic, self).__init__()
        
        l1, b1, k1 = state_size
        self.state_size = (l1*b1*k1,)
        self.case_dnn1 = Dense(64, activation='relu', input_shape=self.state_size)
        self.case_dnn2 = Dense(64, activation='relu')
        
        l2, b2, k2 = selected_size
        self.selected_size = (l2*b2*k2,)
        self.select_dnn1 = Dense(64, activation='relu', input_shape=self.selected_size)
        self.select_dnn2 = Dense(64, activation='relu')
        
        l3, b3, k3 = remain_size
        self.remain_size = (l3*b3*k3,)
        self.remain_dnn1 = Dense(128, activation='relu', input_shape=self.remain_size)
        self.remain_dnn2 = Dense(128, activation='relu')
        
        l4, b4, k4 = loading_size
        self.loading_size = (l4*b4*k4,)
        self.loading_dnn1 = Dense(128, activation='relu', input_shape=self.loading_size)
        self.loading_dnn2 = Dense(128, activation='relu')
        
        self.fc1 = Dense(256, activation='relu')
        self.fc2 = Dense(128, activation='relu')
        self.fc_out = Dense(output_size)
        
    def call(self, cb_list):
        c,s,r,l = cb_list[0], cb_list[1], cb_list[2], cb_list[3]
        c = tf.reshape(c, [-1, self.state_size[0]])
        s = tf.reshape(s, [-1, self.selected_size[0]])
        r = tf.reshape(r, [-1, self.remain_size[0]])
        l = tf.reshape(l, [-1, self.loading_size[0]])
        
        c = self.case_dnn1(c)
        c = self.case_dnn2(c)
        
        s = self.select_dnn1(s)
        s = self.select_dnn2(s)
        
        r = self.remain_dnn1(r)
        r = self.remain_dnn2(r)
        
        l = self.loading_dnn1(l)
        l = self.loading_dnn2(l)
        
        x = concatenate([c,s,r,l])
        x = self.fc1(x)
        x = self.fc2(x)
        q = self.fc_out(x)
        return q

In [9]:
class PPOAgent():
    def __init__(self, L=20, B=20, H=20, n_remains=5, n_loading=3, lr=1e-8):
        self.state_size = (L,B,1)
        self.selected_size = (L,B,2)
        self.remain_size=(L,B,n_remains)
        self.loading_size=(L,B,n_loading)
        self.output_size=1
        
        self.discount_factor=0.99
        self.gamma = 0.9
        self.clip_pram = 0.2
        self.learning_rate=lr
        
        self.actor = PPO_Actor(self.state_size, self.selected_size, self.remain_size, self.loading_size, self.output_size)
        self.critic = PPO_Critic(self.state_size, self.selected_size, self.remain_size, self.loading_size, self.output_size)
        
        self.actor_optimizer = Adam(self.learning_rate)
        self.critic_optimizer = Adam(self.learning_rate)
        
        self.avg_actor_loss = 0
        self.avg_critic_loss = 0
        
    def get_action(self, state, loaded_mh_c, r_boxes, loading):
        probs = self.actor([state, loaded_mh_c, r_boxes, loading])
        argmax_idx = np.where(probs == tf.math.reduce_max(probs))
        action_idx = argmax_idx[0][0]
        return probs[action_idx], action_idx
        
    def get_value(self, state, loaded_mh_c, r_boxes, loading):
        value = self.critic([state, loaded_mh_c, r_boxes, loading])
        argmax_idx = np.where(value == tf.math.reduce_max(value))
        value = argmax_idx[0][0]
        return value
    
    def get_actor_loss(self, probs, advantage, old_probs, critic_loss):
        probability = probs
        entropy = tf.reduce_mean(tf.math.negative(tf.math.multiply(probability, tf.math.log(probability))))
        entropy = tf.math.abs(entropy) #삭제 가능하도록
        sr1, sr2 = [], []

        for prob, adv, old_prob in zip(probability, advantage, old_probs):
            adv = tf.constant(adv)
            old_prob = tf.constant(old_prob)
            ratio = tf.math.divide(prob, old_prob)
            adv = tf.cast(adv, dtype='float32')
            s1 = tf.math.multiply(ratio, adv)
            s2 = tf.math.multiply(tf.clip_by_value(ratio, 1.0-self.clip_pram, 1.0 + self.clip_pram), adv)
            sr1.append(s1)
            sr2.append(s2)
        sr1 = tf.stack(sr1)
        sr2 = tf.stack(sr2)
        act_loss = tf.math.negative(tf.reduce_mean(tf.math.minimum(sr1, sr2)) - critic_loss + 0.001 * entropy)
        return act_loss
    
    def train(self, state, loaded_mh_c, r_boxes, loaded, returns, adv, probs):
        discnt_rewards = tf.reshape(returns, (len(returns),))
        adv = tf.reshape(adv, (len(adv),))
        old_probs = probs
        old_probs = tf.reshape(old_probs, (len(old_probs),1))
        old_probs = tf.math.abs(old_probs) #삭제 가능하도록
        
        state = np.array(state)
        loaded_mh_c = np.array(loaded_mh_c)
        r_boxes = np.array(r_boxes)
        loaded = np.array(loaded)
        
        with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
            prob = self.actor([state, loaded_mh_c, r_boxes, loaded])
            prob = tf.math.abs(prob) #삭제 가능하도록
            value = self.critic([state, loaded_mh_c, r_boxes, loaded])
            value = tf.reshape(value, (len(value),))
            critic_loss = 0.5 * kls.mean_squared_error(discnt_rewards, value)
            actor_loss = self.get_actor_loss(prob, adv, old_probs, critic_loss)
            self.avg_actor_loss += actor_loss.numpy()
            self.avg_critic_loss += critic_loss.numpy()
                
        actor_grads = actor_tape.gradient(actor_loss, self.actor.trainable_variables)
        critic_grads = critic_tape.gradient(critic_loss, self.critic.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
        self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

In [10]:
agent = PPOAgent(L=20, B=20, H=20, n_remains=num_max_remain, n_loading=K, lr=1e-4)

In [11]:
 def terminal_rewards(state, gamma, reward):
    terminal_rewards = []
    for i in range(len(state)):
        rewards = (gamma ** (len(state) - i - 1)) * reward
        terminal_rewards.append(rewards)
    return terminal_rewards

In [12]:
def get_discnt_reward(rewards, values, next_values, done):
    g = 0
    lmbda = 0.95
    gamma = 0.99
    returns = []
    
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + gamma * next_values[i] * dones[i] - values[i]
        g = delta + gamma * lmbda * dones[i] * g
        returns.append(g + values[i])
    returns.reverse()
    return returns

In [13]:
def get_advantage(returns, values):
    advantage = np.array(returns) - values
    advantage = (advantage - np.mean(advantage)) / (np.std(advantage))
    return advantage

In [15]:
st = time.time()
    
#if e%2 == 1: boxes_multi,gt_pos = boxes_multi1.copy(), gt_pos1.copy()
boxes_multi,gt_pos = boxes_multi2.copy(), gt_pos2.copy()

env.reset()
done = False
step = 0

history, h_load, h_remain_size, h_load_size = [],[],[],[]
values, prob_l, dones, next_values = [],[],[],[]
used_boxes, pred_pos = [],[]

boxes_all = np.array(boxes_multi)[0].copy()
r_boxes = boxes_all.copy()

In [16]:
state = env.container_s.copy()
state_h = env.container_h.copy()
step += 1

k = min(K, len(r_boxes))
selected = cbn_select_boxes(r_boxes[:n_candidates], k)
s_order = get_selected_order(selected, k)

num_loaded_c, loading_size_c, loading_pos_c, next_state_c, next_h_c, loaded_mh_c = get_selected_location(s_order, state, state_h, env.height)
in_state, in_r_boxes, in_loading = raw2input(state_h, len(num_loaded_c), r_boxes, num_max_remain, K, loading_size_c, env.height)

In [17]:
num_loaded_c

[4, 4, 4, 4]

In [18]:
prob, action_idx = agent.get_action(in_state, np.array(loaded_mh_c), in_r_boxes, in_loading)
value = agent.get_value(in_state, loaded_mh_c, in_r_boxes, in_loading)
prob_l.append(prob)
values.append(value)

In [19]:
value

0

In [20]:
prob

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.01892245], dtype=float32)>

In [21]:
action_idx

0