In [1]:
import os
import random
import math
import time
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from collections import deque
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, concatenate, Conv2D, MaxPooling2D
import tensorflow.keras.losses as kls
#import tensorflow_probability as tfp

from libs.utils import *
from libs.generate_boxes import  *

In [2]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
tf.get_logger().setLevel('INFO')
tf.keras.backend.floatx()

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (20,10)

In [3]:
class Actor(tf.keras.Model):
    def __init__(self, state_size, selected_size, remain_size, output_size):
        super(Actor, self).__init__()
        
        l1, b1, k1 = state_size
        self.state_size = (l1*b1*k1,)
        self.case_dnn1 = Dense(64, activation='relu', input_shape=self.state_size)
        self.case_dnn2 = Dense(64, activation='relu')
        
        l2, b2, k2 = selected_size
        self.selected_size = (l2*b2*k2,)
        self.select_dnn1 = Dense(64, activation='relu', input_shape=self.selected_size)
        self.select_dnn2 = Dense(64, activation='relu')
        
        l3, b3, k3 = remain_size
        self.remain_size = (l3*b3*k3,)
        self.remain_dnn1 = Dense(128, activation='relu', input_shape=self.remain_size)
        self.remain_dnn2 = Dense(128, activation='relu')
        
        self.d1 = Dense(256, activation='relu')
        self.d2 = Dense(256, activation='relu')
        self.d3 = Dense(128, activation='relu')
        self.out = Dense(output_size, activation='softmax')
        
    def call(self, cb_list):
        c, s, r = cb_list[0], cb_list[1], cb_list[2]
        c = tf.reshape(c, [-1, self.state_size[0]])
        s = tf.reshape(s, [-1, self.selected_size[0]])
        r = tf.reshape(r, [-1, self.remain_size[0]])
        
        c = self.case_dnn1(c)
        c = self.case_dnn2(c)
        
        s = self.select_dnn1(s)
        s = self.select_dnn2(s)
        
        r = self.remain_dnn1(r)
        r = self.remain_dnn2(r)
        
        x = concatenate([c,s,r])
        x = self.d1(x)
        x = self.d2(x)
        x = self.d3(x)
        q = self.out(x)
        return q

In [4]:
class Critic(tf.keras.Model):
    def __init__(self, state_size, selected_size, remain_size, output_size):
        super(Critic, self).__init__()
        
        l1, b1, k1 = state_size
        self.state_size = (l1*b1*k1,)
        self.case_dnn1 = Dense(64, activation='relu', input_shape=self.state_size)
        self.case_dnn2 = Dense(64, activation='relu')
        
        l2, b2, k2 = selected_size
        self.selected_size = (l2*b2*k2,)
        self.select_dnn1 = Dense(64, activation='relu', input_shape=self.selected_size)
        self.select_dnn2 = Dense(64, activation='relu')
        
        l3, b3, k3 = remain_size
        self.remain_size = (l3*b3*k3,)
        self.remain_dnn1 = Dense(128, activation='relu', input_shape=self.remain_size)
        self.remain_dnn2 = Dense(128, activation='relu')
        
        self.d1 = Dense(256, activation='relu')
        self.d2 = Dense(256, activation='relu')
        self.d3 = Dense(128, activation='relu')
        self.out = Dense(output_size, activation='softmax')
        
    def call(self, cb_list):
        c, s, r = cb_list[0], cb_list[1], cb_list[2]
        c = tf.reshape(c, [-1, self.state_size[0]])
        s = tf.reshape(s, [-1, self.selected_size[0]])
        r = tf.reshape(r, [-1, self.remain_size[0]])
        
        c = self.case_dnn1(c)
        c = self.case_dnn2(c)
        
        s = self.select_dnn1(s)
        s = self.select_dnn2(s)
        
        r = self.remain_dnn1(r)
        r = self.remain_dnn2(r)
        
        x = concatenate([c,s,r])
        x = self.d1(x)
        x = self.d2(x)
        x = self.d3(x)
        q = self.out(x)
        return q

In [44]:
class PPO_Agent():
    def __init__(self, L=20, B=20, H=20, n_remains=5, lr=1e-8, exp_steps=500,
                train_st = 200, memory_len = 500):
        self.state_size = (L,B,1)
        self.selected_size = (L,B,2)
        self.remain_size = (L,B,n_remains)
        self.output_size = 1
        
        self.discount_factor = 0.99
        self.learning_rate = lr #1e-8 #1e-4
        self.exploration_steps = exp_steps
        self.batch_size = 32
        self.train_start = train_st
        self.beta = 0.2
        self.clip_pram = 0.2
        self.memory = deque(maxlen=memory_len)
        self.gamma = 0.9
        self.actor = Actor(self.state_size, self.selected_size, self.remain_size, self.output_size)
        self.critic = Critic(self.state_size, self.selected_size, self.remain_size, self.output_size)
        self.actor_optimizer = Adam(self.learning_rate)
        self.critic_optimizer = Adam(self.learning_rate)
        self.avg_actor_loss = 0
        self.avg_critic_loss = 0
        
    def get_action(self, state, loaded_mh_c, r_boxes):
        q_values = self.actor([state, loaded_mh_c, r_boxes])
        argmax_idx = np.where(q_values == tf.math.reduce_max(q_values))
        action_idx = argmax_idx[0][0]
        return q_values, argmax_idx, action_idx
        
    def append_sample(self, history, load, remain_size, load_size, reward, last, next_history, next_load, next_remain_size, next_load_size):
        self.memory.append(( history, load, remain_size, load_size, reward, last, next_history, next_load, next_remain_size, next_load_size))
        
    def actor_loss_temp(self, probs, actions, adv, old_probs, closs):
        probability = probs
        entropy = tf.reduce_mean(tf.math.negative(tf.math.multiply(probability, tf.math.log(probability))))
        sur1 = []
        sur2 = []
        
        for pb, t, op in zip(probability, adv, old_probs):
            t = tf.constant(t)
            op = tf.constant(op)
            ratio = tf.math.divide(pb, op)
            s1 = tf.math.multiply(ratio, t)
            s2 = tf.math.multiply(tf.clip_by_value(ratio, 1.0-self.clip_pram, 1.0 + self.clip_pram), t)
            sur1.append(s1)
            sur2.append(s2)
        
        sr1 = tf.stack(sur1)
        sr2 = tf.stack(sur2)
        
        loss = tf.math.negative(tf.reduce_mean(tf.math.minimum(sr1, sr2)) - closs + 0.001 * entropy)
        return loss
    
    def get_actor_loss(self, discnt_rewards, a):
        return 0.5 * kls.mean_squared_error(discnt_rewards, a)
    
    def get_critic_loss(self, discnt_rewards, v):
        return 0.5 * kls.mean_squared_error(discnt_rewards, v)
    
    def train_model(self):
        batch = random.sample(self.memory, self.batch_size)
        
        history = np.array([sample[0] for sample in batch])
        load = np.array([sample[1] for sample in batch])
        remain_size = np.array([sample[2] for sample in batch])
        load_size = np.array([sample[3] for sample in batch])
        reward = np.array([sample[4] for sample in batch])
        dones = np.array([sample[5] for sample in batch])
        next_history = [sample[6] for sample in batch] 
        next_load = [sample[7] for sample in batch]
        next_remain_size = [sample[8] for sample in batch] 
        next_load_size = [sample[9] for sample in batch] 
        
        with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
            actor = self.actor([history, load, remain_size])
            critic = self.critic([history, load, remain_size])
            targets = []
            for i in range(self.batch_size):
                next_value = self.critic([next_history[i], next_load[i], next_remain_size[i]])
                targets.append(next_value)
            targets = np.array(targets)
            targets = targets.reshape(-1, 1)
            print(actor.shape, critic.shape, targets.shape)
            actor_loss = self.get_actor_loss(targets, actor)
            critic_loss = self.get_critic_loss(targets, value)
                
        actor_grads = actor_tape.gradient(actor_loss, self.actor.trainable_variables)
        critic_grads = critic_tape.gradient(critic_loss, self.critic.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
        self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

In [117]:
num_episode = 1500
global_step = 0
allow_skip = False
tr_l, h_fill, tr_r, avg_actor_loss_l, avg_critic_loss_l, history_eps, used_boxes_eps = [],[],[],[],[],[],[]
N_MDD = 7
K = 4
n_candidates = 4

boxes_multi1 = [np.array([[20, 20,  4],
         [20,  4,  4],
         [20,  4,  4],
         [20,  4,  4],
         [20,  4,  4],
         [20,  4,  4],
         [20, 20,  4],
         [20, 20,  4],
         [20, 20,  4]])]
gt_pos1 = [np.array([[ 0,  0,  0],
         [ 0,  0,  4],
         [ 0,  4,  4],
         [ 0,  8,  4],
         [ 0, 12,  4],
         [ 0, 16,  4],
         [ 0,  0,  8],
         [ 0,  0, 12],
         [ 0,  0, 16]])]

boxes_multi2 = [np.array([[20, 20,  5],
        [ 4, 20,  5],
        [ 4, 20,  5],
        [ 4, 20,  5],
        [ 4, 20,  5],
        [ 4, 20,  5],
        [10, 20,  5],
        [10, 20,  5],
        [20, 20,  5]])]

gt_pos2 = [np.array([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 4,  0,  5],
        [ 8,  0,  5],
        [12,  0,  5],
        [16,  0,  5],
        [ 0,  0, 10],
        [10,  0, 10],
        [ 0,  0, 15]])]

In [125]:
num_max_boxes = max(len(boxes_multi1[0]), len(boxes_multi2[0]))
num_max_remain = num_max_boxes
print('num_max_boxes',num_max_boxes,'num_max_remain',num_max_remain)

num_max_boxes 9 num_max_remain 9


In [126]:
env=Bpp3DEnv()

In [127]:
agent = PPO_Agent(L=20, B=20, H=20, n_remains=num_max_remain, lr=1e-4, exp_steps=900,
                train_st=500, memory_len=1000)

In [128]:
boxes_multi, gt_pos = boxes_multi2.copy(), gt_pos2.copy()
env.reset()
done = False
step = 0

history, h_load, h_remain_size, h_load_size = [],[],[],[]
next_history, next_load, next_remain_size, next_load_size = [],[],[],[]
used_boxes, pred_pos = [],[]

boxes_all = np.array(boxes_multi)[0].copy()
r_boxes = boxes_all.copy()
r_boxes

array([[20, 20,  5],
       [ 4, 20,  5],
       [ 4, 20,  5],
       [ 4, 20,  5],
       [ 4, 20,  5],
       [ 4, 20,  5],
       [10, 20,  5],
       [10, 20,  5],
       [20, 20,  5]])

In [129]:
q_list, arg_list, action_list = [], [], []

In [130]:
while not done:
        state = env.container.copy()
        state_h = env.update_h().copy()
        step += 1
        
        k = min(K, len(r_boxes))
        selected = cbn_select_boxes(r_boxes[:n_candidates], k)
        s_order = get_selected_order(selected, k)
        s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c = get_selected_location(s_order, state) 
        loaded_mh_c = np.array([get_loaded_mh(s_loc, env.length, env.breadth, env.height) for s_loc in s_loc_c] ) # 3D -> 2D
        in_state, in_r_boxes, in_loading = raw2input(state_h, len(s_loc_c), r_boxes, num_max_remain, K, loading_size_c, env.height)
        
        s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_state, in_r_boxes, in_loading =\
            get_unique(s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_state, in_r_boxes, in_loading)
        
        if len(s_loc_c) == 1:
            action_idx = 0
        else:
            q, arg, action_idx = agent.get_action(in_state, loaded_mh_c, in_r_boxes)
        print('Action')
        print(q, arg, action_idx)
        q_list.append(q)
        arg_list.append(arg)
        action_list.append(action_idx)
        
        env.convert_state(next_cube_c[action_idx]) 
        num_loaded_box = num_loaded_box_c[action_idx]
        if num_loaded_box != 0:
            new_used_boxes = loading_size_c[action_idx]
            r_boxes = get_remain(new_used_boxes, r_boxes)
        else:
            r_boxes = get_remain(s_order[action_idx], r_boxes)
        used_boxes = used_boxes + loading_size_c[action_idx]
        pred_pos = pred_pos + loading_pos_c[action_idx]  
        if len(r_boxes) == 0 or np.sum(env.container_h != env.height) == 0:
            done = True

        if len(s_loc_c) != 1 or done:
            history.append(in_state[action_idx])
            h_load.append(loaded_mh_c[action_idx])
            h_remain_size.append(in_r_boxes[action_idx])
            h_load_size.append(in_loading[action_idx])
            
            next_state = env.container.copy()
            next_state_h = env.container_h.copy() 
            if done:
                in_next_history = next_state_h.reshape((1, env.length, env.breadth, 1))
                loaded_mh_c = np.zeros((1, env.length, env.breadth, 2))
                in_next_remains = np.zeros((1, env.length, env.breadth, num_max_remain))
                in_next_loading = np.zeros((1, env.length, env.breadth, K))
            else:
                k = min(K, len(r_boxes))
                selected = cbn_select_boxes(r_boxes[:n_candidates], k)
                s_order = get_selected_order(selected, k)
                s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c  =\
                    get_selected_location(s_order, next_state)
                loaded_mh_c = np.array( [get_loaded_mh(s_loc, env.length, env.breadth, env.height) for s_loc in s_loc_c] )
                in_next_history, in_next_remains, in_next_loading =\
                    raw2input(next_state_h, len(s_loc_c), r_boxes, num_max_remain,  K, loading_size_c, env.height)

            s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_next_history, in_next_remains, in_next_loading =\
                get_unique(s_order, s_loc_c, num_loaded_box_c, loading_size_c, loading_pos_c, next_cube_c , next_state_c, loaded_mh_c, in_next_history, in_next_remains, in_next_loading)

            next_history.append(in_next_history)
            next_load.append(loaded_mh_c)
            next_remain_size.append(in_next_remains)
            next_load_size.append(in_next_loading)

Action
tf.Tensor(
[[1.]
 [1.]
 [1.]
 [1.]], shape=(4, 1), dtype=float32) (array([0, 1, 2, 3], dtype=int64), array([0, 0, 0, 0], dtype=int64)) 0
Action
tf.Tensor(
[[1.]
 [1.]], shape=(2, 1), dtype=float32) (array([0, 1], dtype=int64), array([0, 0], dtype=int64)) 0
Action
tf.Tensor(
[[1.]
 [1.]], shape=(2, 1), dtype=float32) (array([0, 1], dtype=int64), array([0, 0], dtype=int64)) 0


In [133]:
done

True

In [134]:
avg_tr = 0 if len(tr_r)==0 else np.mean(tr_r)
terminal_reward = env.terminal_reward()
tr_l.append(terminal_reward)
h_fill.append(env.terminal_reward())
tr_r.append(env.terminal_reward())

In [135]:
avg_tr

0

In [136]:
terminal_reward

0.75

In [137]:
a_repeate = 6 if env.terminal_reward() ==1.0 else 1
is_last = False
N = len(history)
for i in range(N):
    if i == N-1: is_last=True
    reward=(0.99**(N-i-1))*terminal_reward
    print(i, ':', reward, ':', a_repeate)
    for a in range(a_repeate):
        agent.append_sample(history[i], h_load[i], h_remain_size[i], h_load_size[i], reward, is_last,
                            next_history[i], next_load[i], next_remain_size[i], next_load_size[i])

0 : 0.7350749999999999 : 1
1 : 0.7424999999999999 : 1
2 : 0.75 : 1


In [138]:
len(agent.memory)

3

In [173]:
batch = random.sample(agent.memory, 3)
        
history = np.array([sample[0] for sample in batch])
load = np.array([sample[1] for sample in batch])
remain_size = np.array([sample[2] for sample in batch])
load_size = np.array([sample[3] for sample in batch])
reward = np.array([sample[4] for sample in batch])
dones = np.array([sample[5] for sample in batch])
next_history = [sample[6] for sample in batch] 
next_load = [sample[7] for sample in batch]
next_remain_size = [sample[8] for sample in batch] 
next_load_size = [sample[9] for sample in batch] 

In [174]:
history.shape

(3, 20, 20, 1)

In [175]:
vmin = 0
vmax = 1
nsup = 51

In [176]:
z = np.linspace(vmin,vmax,nsup)
z

array([0.  , 0.02, 0.04, 0.06, 0.08, 0.1 , 0.12, 0.14, 0.16, 0.18, 0.2 ,
       0.22, 0.24, 0.26, 0.28, 0.3 , 0.32, 0.34, 0.36, 0.38, 0.4 , 0.42,
       0.44, 0.46, 0.48, 0.5 , 0.52, 0.54, 0.56, 0.58, 0.6 , 0.62, 0.64,
       0.66, 0.68, 0.7 , 0.72, 0.74, 0.76, 0.78, 0.8 , 0.82, 0.84, 0.86,
       0.88, 0.9 , 0.92, 0.94, 0.96, 0.98, 1.  ])

In [177]:
dz = (vmax - vmin) / (nsup - 1.)
dz

0.02

In [181]:
with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
    actor = agent.actor([history, load, remain_size])
    value = agent.critic([history, load, remain_size])
    #print(value.shape)
    print('Actor')
    print(actor, '\n')
    print('Critic')
    print(value, '\n')
    targets = []
    print('Next Critic')
    for i in range(3):
        #print(next_history[i].shape, next_load[i].shape, next_remain_size[i].shape)
        next_value = agent.critic([next_history[i], next_load[i], next_remain_size[i]])
        t_max_q = tf.math.reduce_max(next_value)
        t = [(1- 0.75)*reward[i] + (1 - dones[i]) *0.75*t_max_q]
        targets.append(t)
    #targets = np.array(targets)
    #targets = targets.reshape(-1, 1)
    print('\ntargets')
    print(targets)
    #print(actor.shape, value.shape, targets.shape)
    actor_loss = agent.get_actor_loss(targets, actor)
    critic_loss = agent.get_critic_loss(targets, value)
    print('Act Loss:', actor_loss)
    print('Crt Loss:', critic_loss)
    print('')

Actor
tf.Tensor(
[[1.]
 [1.]
 [1.]], shape=(3, 1), dtype=float32) 

Critic
tf.Tensor(
[[1.]
 [1.]
 [1.]], shape=(3, 1), dtype=float32) 

Next Critic

targets
[[<tf.Tensor: shape=(), dtype=float32, numpy=0.93376875>], [<tf.Tensor: shape=(), dtype=float32, numpy=0.1875>], [<tf.Tensor: shape=(), dtype=float32, numpy=0.935625>]]
Act Loss: tf.Tensor([0.00219329 0.33007812 0.00207207], shape=(3,), dtype=float32)
Crt Loss: tf.Tensor([0.00219329 0.33007812 0.00207207], shape=(3,), dtype=float32)



In [182]:
actor_grads = actor_tape.gradient(actor_loss, agent.actor.trainable_variables)
critic_grads = critic_tape.gradient(critic_loss, agent.critic.trainable_variables)

In [183]:
agent.actor_optimizer.apply_gradients(zip(actor_grads, agent.actor.trainable_variables))
agent.critic_optimizer.apply_gradients(zip(critic_grads, agent.critic.trainable_variables))

<tf.Variable 'UnreadVariable' shape=() dtype=int64, numpy=1>

In [184]:
with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
    actor = agent.actor([history, load, remain_size])
    value = agent.critic([history, load, remain_size])
    print('Actor')
    print(actor)
    print('Critic')
    print(critic)
    targets = []
    print('Next Critic')
    for i in range(1):
        next_value = agent.critic([next_history[i], next_load[i], next_remain_size[i]])
        print(i, next_value)
        targets.append(next_value)
    targets = np.array(targets)
    targets = targets.reshape(-1, 1)
    print('targets')
    print(targets)
    actor_loss = agent.get_actor_loss(targets, actor)
    critic_loss = agent.get_critic_loss(targets, value)
    print('Act Loss:', actor_loss)
    print('Crt Loss:', critic_loss)

Actor
tf.Tensor(
[[1.]
 [1.]
 [1.]], shape=(3, 1), dtype=float32)
Critic
tf.Tensor([[1.]], shape=(1, 1), dtype=float32)
Next Critic
0 tf.Tensor(
[[1.]
 [1.]], shape=(2, 1), dtype=float32)
targets
[[1.]
 [1.]]


InvalidArgumentError: Incompatible shapes: [3,1] vs. [2,1] [Op:SquaredDifference]