In [1]:
import os
import random
import math
import time
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from collections import deque
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, concatenate, Conv2D, MaxPooling2D
import tensorflow.keras.losses as kls
#import tensorflow_probability as tfp

from libs.utils import *
from libs.generate_boxes import  *

In [6]:
class Actor(tf.keras.Model):
    def __init__(self, state_size, selected_size, remain_size, output_size):
        super(Actor, self).__init__()
        
        l1, b1, k1 = state_size
        self.state_size = (l1*b1*k1,)
        self.case_dnn1 = Dense(64, activation='relu', input_shape=self.state_size)
        self.case_dnn2 = Dense(64, activation='relu')
        
        l2, b2, k2 = selected_size
        self.selected_size = (l2*b2*k2,)
        self.select_dnn1 = Dense(64, activation='relu', input_shape=self.selected_size)
        self.select_dnn2 = Dense(64, activation='relu')
        
        l3, b3, k3 = remain_size
        self.remain_size = (l3*b3*k3,)
        self.remain_dnn1 = Dense(128, activation='relu', input_shape=self.remain_size)
        self.remain_dnn2 = Dense(128, activation='relu')
        
        self.d1 = Dense(256, activation='relu')
        self.d2 = Dense(256, activation='relu')
        self.d3 = Dense(128, activation='relu')
        self.out = Dense(output_size, activation='softmax')
        
    def call(self, cb_list):
        c, s, r = cb_list[0], cb_list[1], cb_list[2]
        c = tf.reshape(c, [-1, self.state_size[0]])
        s = tf.reshape(s, [-1, self.selected_size[0]])
        r = tf.reshape(r, [-1, self.remain_size[0]])
        
        c = self.case_dnn1(c)
        c = self.case_dnn2(c)
        
        s = self.select_dnn1(s)
        s = self.select_dnn2(s)
        
        r = self.remain_dnn1(r)
        r = self.remain_dnn2(r)
        
        x = concatenate([c,s,r])
        x = self.d1(x)
        x = self.d2(x)
        x = self.d3(x)
        q = self.out(x)
        return q

In [7]:
class Critic(tf.keras.Model):
    def __init__(self, state_size, selected_size, remain_size, output_size):
        super(Critic, self).__init__()
        
        l1, b1, k1 = state_size
        self.state_size = (l1*b1*k1,)
        self.case_dnn1 = Dense(64, activation='relu', input_shape=self.state_size)
        self.case_dnn2 = Dense(64, activation='relu')
        
        l2, b2, k2 = selected_size
        self.selected_size = (l2*b2*k2,)
        self.select_dnn1 = Dense(64, activation='relu', input_shape=self.selected_size)
        self.select_dnn2 = Dense(64, activation='relu')
        
        l3, b3, k3 = remain_size
        self.remain_size = (l3*b3*k3,)
        self.remain_dnn1 = Dense(128, activation='relu', input_shape=self.remain_size)
        self.remain_dnn2 = Dense(128, activation='relu')
        
        self.d1 = Dense(256, activation='relu')
        self.d2 = Dense(256, activation='relu')
        self.d3 = Dense(128, activation='relu')
        self.out = Dense(output_size, activation='softmax')
        
    def call(self, cb_list):
        c, s, r = cb_list[0], cb_list[1], cb_list[2]
        c = tf.reshape(c, [-1, self.state_size[0]])
        s = tf.reshape(s, [-1, self.selected_size[0]])
        r = tf.reshape(r, [-1, self.remain_size[0]])
        
        c = self.case_dnn1(c)
        c = self.case_dnn2(c)
        
        s = self.select_dnn1(s)
        s = self.select_dnn2(s)
        
        r = self.remain_dnn1(r)
        r = self.remain_dnn2(r)
        
        x = concatenate([c,s,r])
        x = self.d1(x)
        x = self.d2(x)
        x = self.d3(x)
        q = self.out(x)
        return q

In [43]:
N_MDD = 7
K = 4
n_candidates = 4

In [44]:
boxes_multi1 = [np.array([[20, 20,  4],
         [20,  4,  4],
         [20,  4,  4],
         [20,  4,  4],
         [20,  4,  4],
         [20,  4,  4],
         [20, 20,  4],
         [20, 20,  4],
         [20, 20,  4]])]

In [87]:
num_max_boxes = len(boxes_multi1[0])
num_max_remain = num_max_boxes
print('num_max_boxes',num_max_boxes,'num_max_remain',num_max_remain)

num_max_boxes 9 num_max_remain 9


In [88]:
env = Bpp3DEnv()

In [89]:
state_size = (20, 20, 1)
selected_size = (20, 20, 2)
remain_size = (20, 20, num_max_remain)
output_size = 1

In [90]:
actor = Actor(state_size, selected_size, remain_size, output_size)
critic = Critic(state_size, selected_size, remain_size, output_size)

In [91]:
actor_opt = Adam(1e-4)
critic_opt = Adam(1e-4)

In [92]:
avg_actor_loss, avg_critic_loss = 0, 0

In [93]:
env.reset()
done = False
step = 0

In [94]:
history, h_load, h_remain_size, h_load_size = [],[],[],[]
next_history, next_load, next_remain_size, next_load_size = [],[],[],[]
used_boxes, pred_pos = [],[]
rewards, values, prob_l, dones = [], [], [], []

In [95]:
boxes_all = np.array(boxes_multi1[0]).copy()
r_boxes = boxes_all.copy()
type(r_boxes), r_boxes, r_boxes.shape

(numpy.ndarray,
 array([[20, 20,  4],
        [20,  4,  4],
        [20,  4,  4],
        [20,  4,  4],
        [20,  4,  4],
        [20,  4,  4],
        [20, 20,  4],
        [20, 20,  4],
        [20, 20,  4]]),
 (9, 3))

In [96]:
while not done:
    state = env.container.copy()
    state_h = env.update_h().copy()
    step += 1
    
    k = min(K, len(r_boxes))
    selected = cbn_select_boxes(r_boxes[:n_candidates], k)
    s_order = get_selected_order(selected, k)
    s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c = get_selected_location(s_order, state) 
    loaded_mh_c = np.array([get_loaded_mh(s_loc, env.length, env.breadth, env.height) for s_loc in s_loc_c] ) # 3D -> 2D
    in_state, in_r_boxes, in_loading = raw2input(state_h, len(s_loc_c), r_boxes, num_max_remain, K, loading_size_c, env.height)

    s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_state, in_r_boxes, in_loading =\
        get_unique(s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_state, in_r_boxes, in_loading)
    
    if len(s_loc_c) == 1:
        action_idx = 0
    else:
        probs = actor([in_state, loaded_mh_c, in_r_boxes])
        argmax_idx = np.where(probs == tf.math.reduce_max(probs))
        action_idx = argmax_idx[0][0]
        prob_l.append(probs[action_idx])

        value = critic([in_state, loaded_mh_c, in_r_boxes])
        argmax_idx = np.where(value == tf.math.reduce_max(value))
        value = argmax_idx[0][0]
        values.append(value)
        
    env.convert_state(next_cube_c[action_idx]) 
    num_loaded_box = num_loaded_box_c[action_idx]
    if num_loaded_box != 0:
        new_used_boxes = loading_size_c[action_idx]
        r_boxes = get_remain(new_used_boxes, r_boxes)
    else:
        r_boxes = get_remain(s_order[action_idx], r_boxes)
    used_boxes = used_boxes + loading_size_c[action_idx]
    pred_pos = pred_pos + loading_pos_c[action_idx]  
    if len(r_boxes) == 0 or np.sum(env.container_h != env.height) == 0:
        done = True
    dones.append(done)
        
    if len(s_loc_c) != 1 or done:
        history.append(in_state[action_idx])
        h_load.append(loaded_mh_c[action_idx])
        h_remain_size.append(in_r_boxes[action_idx])
        h_load_size.append(in_loading[action_idx])

        next_state = env.container.copy()
        next_state_h = env.container_h.copy() 
        if done:
            in_next_history = next_state_h.reshape((1, env.length, env.breadth, 1))
            loaded_mh_c = np.zeros((1, env.length, env.breadth, 2))
            in_next_remains = np.zeros((1, env.length, env.breadth, num_max_remain))
            in_next_loading = np.zeros((1, env.length, env.breadth, K))
        else:
            k = min(K, len(r_boxes))
            selected = cbn_select_boxes(r_boxes[:n_candidates], k)
            s_order = get_selected_order(selected, k)
            s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c  =\
                get_selected_location(s_order, next_state)
            loaded_mh_c = np.array( [get_loaded_mh(s_loc, env.length, env.breadth, env.height) for s_loc in s_loc_c] )
            in_next_history, in_next_remains, in_next_loading =\
                raw2input(next_state_h, len(s_loc_c), r_boxes, num_max_remain,  K, loading_size_c, env.height)

        s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_next_history, in_next_remains, in_next_loading =\
            get_unique(s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_next_history, in_next_remains, in_next_loading)

        next_history.append(in_next_history)
        next_load.append(loaded_mh_c)
        next_remain_size.append(in_next_remains)
        next_load_size.append(in_next_loading)

In [104]:
if done:
    N = len(history)
    for i in range(N):
        reward = (0.99 ** (N-i-1)) * env.terminal_reward()
        rewards.append(reward)
rewards

[0.792, 0.8]

In [98]:
next_value = critic([in_next_history, loaded_mh_c, in_next_remains])
argmax_idx = np.where(next_value == tf.math.reduce_max(next_value))
next_value = argmax_idx[0][0]
values.append(next_value)

In [101]:
g = 0
lmbda = 0.95
gamma = 0.99
returns = []

In [108]:
for i in reversed(range(len(rewards))):
    delta = rewards[i] + gamma * values[i + 1] * dones[i] - values[i]
    g = delta + gamma * lmbda * dones[i] * g
    print(delta, g, g+values[i])
    returns.append(g + values[i])

0.8 0.8 0.8
0.792 0.792 0.792


In [109]:
returns.reverse()

In [110]:
adv = np.array(returns, dtype=np.float32) - values[:-1]
adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-10)
adv

array([-0.99999998,  0.99999998])

In [114]:
prob_l

[<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>]

In [115]:
np.reshape(prob_l, (len(prob_l), 1))
prob_l

[<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>]

In [116]:
probs = np.stack(prob_l, axis=0)
probs

array([[1.],
       [1.]], dtype=float32)

In [117]:
discnt_rewards = tf.reshape(returns, (len(returns),))
discnt_rewards

<tf.Tensor: shape=(2,), dtype=float64, numpy=array([0.792, 0.8  ])>

In [118]:
adv  = tf.reshape(adv, (len(adv),))
adv

<tf.Tensor: shape=(2,), dtype=float64, numpy=array([-0.99999998,  0.99999998])>

In [119]:
old_p = probs
old_p = tf.reshape(old_p, (len(old_p),1))
old_p

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[1.],
       [1.]], dtype=float32)>

In [141]:
def actor_loss(p, adv, old_p, c_loss):
    probability = p
    entropy = tf.reduce_mean(tf.math.negative(tf.math.multiply(probability, tf.math.log(probability))))
    sur1, sur2 = [], []
    
    for pb, t, op in zip(probability, adv, old_p):
        t = tf.constant(t)
        op = tf.constant(op)
        ratio = tf.math.divide(pb, op)
        t = tf.cast(t, dtype='float32')
        s1 = tf.math.multiply(ratio, t)
        s2 = tf.math.multiply(tf.clip_by_value(ratio, 1.0-0.2, 1.0 + 0.2), t)
        sur1.append(s1)
        sur2.append(s2)
    sr1 = tf.stack(sur1)
    sr2 = tf.stack(sur2)
    loss = tf.math.negative(tf.reduce_mean(tf.math.minimum(sr1, sr2)) - c_loss + 0.001 * entropy)
    return loss

In [142]:
with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
    p = actor([history, h_load, h_remain_size])
    v = critic([history, h_load, h_remain_size])
    v = tf.reshape(v, (len(v),))
    c_loss = 0.5 * kls.mean_squared_error(discnt_rewards, v)
    a_loss = actor_loss(p, adv, old_p, c_loss)

In [None]:
next_history[i], next_load[i], next_remain_size[i]

In [83]:
state = env.container.copy()
state_h = env.update_h().copy()

In [None]:
in_state, in_r_boxes, in_loading = raw2input(state_h, len(s_loc_c), r_boxes, num_max_remain, K, loading_size_c, env.height)

In [18]:
state = env.container.copy()
state_h = env.update_h().copy()
type(state), type(state_h), state.shape, state_h.shape

(numpy.ndarray, numpy.ndarray, (20, 20, 20), (20, 20))

In [19]:
k = min(K, len(r_boxes))
selected = cbn_select_boxes(r_boxes[:n_candidates], k)
s_order = get_selected_order(selected, k)
s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c = get_selected_location(s_order, state) 
loaded_mh_c = np.array([get_loaded_mh(s_loc, env.length, env.breadth, env.height) for s_loc in s_loc_c] ) # 3D -> 2D
in_state, in_r_boxes, in_loading = raw2input(state_h, len(s_loc_c), r_boxes, num_max_remain, K, loading_size_c, env.height)

s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_state, in_r_boxes, in_loading =\
    get_unique(s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_state, in_r_boxes, in_loading)

In [21]:
len(s_loc_c)

4

In [23]:
probs = actor([in_state, loaded_mh_c, in_r_boxes])
probs

<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[1.],
       [1.],
       [1.],
       [1.]], dtype=float32)>

In [25]:
argmax_idx = np.where(probs == tf.math.reduce_max(probs))
argmax_idx

(array([0, 1, 2, 3], dtype=int64), array([0, 0, 0, 0], dtype=int64))

In [26]:
action_idx = argmax_idx[0][0]
action_idx

0

In [28]:
value = critic([in_state, loaded_mh_c, in_r_boxes])
argmax_idx = np.where(value == tf.math.reduce_max(value))
value = argmax_idx[0][0]
value

0

### probs

In [32]:
probs = probs[action_idx]

In [33]:
probs_l = []
probs_l.append(probs)

In [34]:
values = []
values.append(value)

In [35]:
env.convert_state(next_cube_c[action_idx]) 
num_loaded_box = num_loaded_box_c[action_idx]
if num_loaded_box != 0:
    new_used_boxes = loading_size_c[action_idx]
    r_boxes = get_remain(new_used_boxes, r_boxes)
else:
    r_boxes = get_remain(s_order[action_idx], r_boxes)
used_boxes = used_boxes + loading_size_c[action_idx]
pred_pos = pred_pos + loading_pos_c[action_idx]  
if len(r_boxes) == 0 or np.sum(env.container_h != env.height) == 0:
    done = True

In [36]:
dones = []
dones.append(done)

In [38]:
if len(s_loc_c) != 1 or done:
    history.append(in_state[action_idx])
    h_load.append(loaded_mh_c[action_idx])
    h_remain_size.append(in_r_boxes[action_idx])
    h_load_size.append(in_loading[action_idx])

    next_state = env.container.copy()
    next_state_h = env.container_h.copy() 
    if done:
        in_next_history = next_state_h.reshape((1, env.length, env.breadth, 1))
        loaded_mh_c = np.zeros((1, env.length, env.breadth, 2))
        in_next_remains = np.zeros((1, env.length, env.breadth, num_max_remain))
        in_next_loading = np.zeros((1, env.length, env.breadth, K))
    else:
        k = min(K, len(r_boxes))
        selected = cbn_select_boxes(r_boxes[:n_candidates], k)
        s_order = get_selected_order(selected, k)
        s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c  =\
            get_selected_location(s_order, next_state)
        loaded_mh_c = np.array( [get_loaded_mh(s_loc, env.length, env.breadth, env.height) for s_loc in s_loc_c] )
        in_next_history, in_next_remains, in_next_loading =\
            raw2input(next_state_h, len(s_loc_c), r_boxes, num_max_remain,  K, loading_size_c, env.height)

    s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_next_history, in_next_remains, in_next_loading =\
        get_unique(s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_next_history, in_next_remains, in_next_loading)

    next_history.append(in_next_history)
    next_load.append(loaded_mh_c)
    next_remain_size.append(in_next_remains)
    next_load_size.append(in_next_loading)

In [40]:
rewards.append(env.terminal_reward())

In [None]:
for e in range(num_episode):
    st=time.time()
    
    boxes_multi,gt_pos = generation_3dbox(case_size=[[20,20,20]], min_s = 1, N_mdd=N_MDD)
    
    env.reset()
    done = False
    step = 0
    
    history, h_load, h_remain_size, h_load_size = [],[],[],[]
    next_history, next_load, next_remain_size, next_load_size = [],[],[],[]
    used_boxes, pred_pos = [],[]
    rewards, values, prob_l, dones = [], [], [], []
    
    boxes_all = np.array(boxes_multi)[0].copy()
    r_boxes = boxes_all.copy()

    while not done:
        state = env.container.copy()
        state_h = env.update_h().copy()
        step += 1
        
        k = min(K, len(r_boxes))
        selected = cbn_select_boxes(r_boxes[:n_candidates], k)
        s_order = get_selected_order(selected, k)
        s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c = get_selected_location(s_order, state) 
        loaded_mh_c = np.array([get_loaded_mh(s_loc, env.length, env.breadth, env.height) for s_loc in s_loc_c] ) # 3D -> 2D
        in_state, in_r_boxes, in_loading = raw2input(state_h, len(s_loc_c), r_boxes, num_max_remain, K, loading_size_c, env.height)
        
        s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_state, in_r_boxes, in_loading =\
            get_unique(s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_state, in_r_boxes, in_loading)
        
        if len(s_loc_c) == 1:
            action_idx = 0
        else:
            prob, action_idx = agent.get_action(in_state, loaded_mh_c, in_r_boxes)
            value = agent.get_value(in_state, loaded_mh_c, in_r_boxes)
            prob_l.append(prob)
            values.append(value)
        
        env.convert_state(next_cube_c[action_idx]) 
        num_loaded_box = num_loaded_box_c[action_idx]
        if num_loaded_box != 0:
            new_used_boxes = loading_size_c[action_idx]
            r_boxes = get_remain(new_used_boxes, r_boxes)
        else:
            r_boxes = get_remain(s_order[action_idx], r_boxes)
        used_boxes = used_boxes + loading_size_c[action_idx]
        pred_pos = pred_pos + loading_pos_c[action_idx]  
        if len(r_boxes) == 0 or np.sum(env.container_h != env.height) == 0:
            done = True

        if len(s_loc_c) != 1 or done:
            dones.append(done)
            history.append(in_state[action_idx])
            h_load.append(loaded_mh_c[action_idx])
            h_remain_size.append(in_r_boxes[action_idx])
            h_load_size.append(in_loading[action_idx])
            
            next_state = env.container.copy()
            next_state_h = env.container_h.copy() 
            if done:
                in_next_history = next_state_h.reshape((1, env.length, env.breadth, 1))
                loaded_mh_c = np.zeros((1, env.length, env.breadth, 2))
                in_next_remains = np.zeros((1, env.length, env.breadth, num_max_remain))
                in_next_loading = np.zeros((1, env.length, env.breadth, K))
            else:
                k = min(K, len(r_boxes))
                selected = cbn_select_boxes(r_boxes[:n_candidates], k)
                s_order = get_selected_order(selected, k)
                s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c  =\
                    get_selected_location(s_order, next_state)
                loaded_mh_c = np.array( [get_loaded_mh(s_loc, env.length, env.breadth, env.height) for s_loc in s_loc_c] )
                in_next_history, in_next_remains, in_next_loading =\
                    raw2input(next_state_h, len(s_loc_c), r_boxes, num_max_remain,  K, loading_size_c, env.height)

            s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_next_history, in_next_remains, in_next_loading =\
                get_unique(s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_next_history, in_next_remains, in_next_loading)

            next_history.append(in_next_history)
            next_load.append(loaded_mh_c)
            next_remain_size.append(in_next_remains)
            next_load_size.append(in_next_loading)
                
        if done:
            N = len(history)
            for i in range(N):
                reward=(agent.gamma**(N-i-1))*env.terminal_reward()
                rewards.append(reward)
                
            next_value = agent.get_value(in_next_history, loaded_mh_c, in_next_remains)
            values.append(next_value)
            print(len(rewards), len(values), len(dones), len(s_loc_c))
            
            g = 0
            lmbda = 0.95
            gamma = agent.gamma
            returns = []
            
            for i in reversed(range(len(rewards))):
                delta = rewards[i] + gamma * values[i+1] * dones[i] - values[i]
                g = delta + gamma * lmbda * dones[i] * g
                returns.append(g + values[i])
            returns.reverse()
            print(len(returns), len(values))
            adv = np.array(returns, dtype=np.float64) - values[:-1]
            adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-10)
            probs = np.stack(prob_l, axis=0)
            
            agent.train(history, h_load, h_remain_size, returns, adv, probs)
            avg_actor_loss_l.append(agent.avg_actor_loss)
            avg_critic_loss_l.append(agent.avg_critic_loss)

    log = "=====episode: {:5d} | ".format(e)
    log += "reward(): {:.3f}| ".format(env.terminal_reward())
    log += "avg actor loss : {:6f} ".format(agent.avg_actor_loss / float(step))
    log += "avg critic loss : {:6f} ".format(agent.avg_critic_loss / float(step))
    log += "time: {:.3f}".format(time.time()-st)
    print(log)
    agent.avg_actor_loss, agent.avg_critic_loss = 0, 0