In [12]:
# Enumerate the devices TensorFlow can see; the bare call on the last line is
# what produces the cell's displayed output (CPU and GPU entries).
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 13585792589863284389,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 4949437312
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 15461685689681602294
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5"]

In [13]:
import tensorflow as tf
# Display the installed TensorFlow version string as the cell output.
tf.__version__

'2.4.1'

In [14]:
import gym
import numpy as np
import tqdm
import collections
import statistics

import matplotlib.pyplot as plt
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow import math

In [15]:
# float32 machine epsilon as a plain Python float; ActorCritic.expect_reward
# adds it to the standard deviation so the normalisation never divides by zero.
epsilon = np.finfo(np.float32).eps.item()

# Global environment read by ActorCritic.env_step.
# NOTE(review): later cells rebind `env` to a Bpp3DEnv instance, so the
# CartPole training cell only works if this cell was the last one to set `env`.
env = gym.make('CartPole-v0')

In [16]:
class ActorCriticModel(tf.keras.Model):
    """Actor-critic network with one shared hidden layer and two output heads.

    Args:
        action_n: Number of units in the actor head (one score per action).
        hidden_n: Number of units in the shared ReLU hidden layer.
    """

    def __init__(self, action_n, hidden_n):
        super().__init__()
        # Shared trunk followed by the two linear heads.
        self.x = layers.Dense(units=hidden_n, activation='relu')
        self.actor = layers.Dense(units=action_n)
        self.critic = layers.Dense(units=1)

    def call(self, inputs):
        """Return ``(actor_output, critic_output)`` for a batch of inputs."""
        hidden = self.x(inputs)
        action_scores = self.actor(hidden)
        state_value = self.critic(hidden)
        return action_scores, state_value

In [17]:
class ActorCritic():
    """Episode runner and trainer for an actor-critic agent.

    The class keeps no state of its own: the environment is the notebook-level
    ``env`` object, and the network and optimizer are passed into each call.

    TODO (author's notes, originally left as free text inside ``run``):
      * decide whether the list of boxes should be passed into ``run``;
      * remove the model that was declared earlier;
      * rename ``AC`` to ``Agent``.
    """

    def __init__(self):
        pass

    def env_step(self, action):
        """Apply ``action`` to the global ``env`` and return NumPy results.

        Accepts the 3-, 4- and 5-element tuples an environment's ``step`` may
        return; with 5 elements the third and fourth (terminated, truncated)
        are OR-ed into a single ``done`` flag.

        Returns:
            Tuple ``(state, reward, done)`` as float32, int32 and int32 arrays,
            matching the dtypes that ``step`` declares to ``tf.numpy_function``.
        """
        result = env.step(action)
        state, reward = result[0], result[1]
        if len(result) == 5:
            done = bool(result[2]) or bool(result[3])
        else:
            done = result[2]
        # float32 (not int32): ``step`` declares tf.float32 for the state, and
        # an integer cast would also truncate continuous observations.
        return (np.asarray(state, dtype=np.float32),
                np.array(reward, np.int32),
                np.array(done, np.int32))

    def step(self, action):
        """Wrap ``env_step`` as a TensorFlow op returning (state, reward, done)."""
        return tf.numpy_function(self.env_step, [action],
                                 [tf.float32, tf.int32, tf.int32])

    def expect_reward(self, rewards, gamma, is_standard):
        """Compute the discounted return for every time step.

        Iterates over the rewards in reverse so that each entry equals
        ``reward + gamma * (return of the following step)``.

        Args:
            rewards: 1-D tensor of per-step rewards.
            gamma: Discount factor.
            is_standard: When truthy, standardise the returns (subtract the
                mean, divide by the standard deviation plus ``epsilon``).

        Returns:
            float32 tensor of returns, in the same order as ``rewards``.
        """
        shape = tf.shape(rewards)[0]
        expect_rewards = tf.TensorArray(dtype=tf.float32, size=shape)

        rewards = tf.cast(rewards[::-1], dtype=tf.float32)
        discount_sum = tf.constant(0.0)
        sum_shape = discount_sum.shape

        for i in tf.range(shape):
            reward = rewards[i]
            discount_sum = reward + gamma * discount_sum
            discount_sum.set_shape(sum_shape)
            expect_rewards = expect_rewards.write(i, discount_sum)
        # Written back-to-front, so reverse again to restore time order.
        expect_rewards = expect_rewards.stack()[::-1]

        if is_standard:
            expect_rewards = ((expect_rewards - math.reduce_mean(expect_rewards))
                              / (math.reduce_std(expect_rewards) + epsilon))

        return expect_rewards

    def run(self, init_state, model, max_step):
        """Roll out one episode with ``model`` and collect training data.

        Args:
            init_state: Initial observation tensor.
            model: Callable returning ``(actor_output, critic_output)`` for a
                batch of states; the same object whose variables ``fit_RL``
                differentiates.
            max_step: Maximum number of environment steps.

        Returns:
            Tuple ``(actions, values, rewards)`` of stacked per-step tensors:
            probability of the sampled action, critic value, and reward.
        """
        actions = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        rewards = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)

        shape = init_state.shape
        state = init_state

        # TODO(bin-packing): the unfinished sketch that used to live here took
        # box = boxes[step], computed f_upleft = feasible_location(state,
        # w_upleft, box, ...), ended the episode when f_upleft was empty, and
        # sized the actor head with len(f_upleft). It was not runnable
        # (undefined `boxes`, dangling assignment), so the rollout below is the
        # environment-driven version the training cell expects.
        for step in range(max_step):
            state = tf.expand_dims(state, 0)
            # Use the model passed in: fit_RL takes gradients with respect to
            # model.trainable_variables, so a model built here would not train.
            actor, critic = model(state)
            action = tf.random.categorical(actor, 1)[0, 0]
            actor_by_actions = tf.nn.softmax(actor)
            values = values.write(step, tf.squeeze(critic))
            actions = actions.write(step, actor_by_actions[0, action])

            state, reward, done = self.step(action)
            state.set_shape(shape)
            rewards = rewards.write(step, reward)

            if tf.cast(done, tf.bool):
                break

        actions = actions.stack()
        values = values.stack()
        rewards = rewards.stack()

        return actions, values, rewards

    def get_loss(self, actions, values, expect_reward):
        """Return the combined actor and critic loss.

        The actor term is ``-sum(log(actions) * (expect_reward - values))`` and
        the critic term is the summed Huber loss between ``values`` and
        ``expect_reward``.
        """
        loss = losses.Huber(reduction=losses.Reduction.SUM)
        error = expect_reward - values
        action_log = math.log(actions)

        actor_loss = -math.reduce_sum(action_log * error)
        critic_loss = loss(values, expect_reward)

        return actor_loss + critic_loss

    def fit_RL(self, init_state, model, optimizer, gamma, max_step):
        """Run one episode and apply a single gradient update to ``model``.

        Returns:
            Tuple ``(loss, total_reward)`` where ``total_reward`` is the sum of
            the episode's rewards converted to a Python int.
        """
        with tf.GradientTape() as tape:
            actions, values, rewards = self.run(init_state, model, max_step)
            expect_rewards = self.expect_reward(rewards, gamma, True)

            # Add a trailing axis so all three tensors have the same rank.
            actions, values, expect_rewards = [
                tf.expand_dims(x, 1) for x in [actions, values, expect_rewards]]

            loss = self.get_loss(actions, values, expect_rewards)
        gradient = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradient, model.trainable_variables))
        reward = math.reduce_sum(rewards)

        return loss, int(reward)

In [8]:
class Bpp3DEnv():
    """Container for the bin-packing experiments, stored as a 2-D height map.

    ``container[i, j]`` accumulates the heights of the boxes placed over cell
    ``(i, j)``; ``height`` is only used to normalise the terminal reward.
    """

    def __init__(self,length=20, breadth=20, height=20):
        super().__init__()
        self.length, self.breadth, self.height = length, breadth, height
        self.container = np.zeros((self.length, self.breadth))

    def convert_state(self, new_container):
        """Replace the current height map with ``new_container``."""
        self.container = new_container

    def next_state(self, upleft,bxl,bxb,bxh):
        """Return a copy of the height map with the box added; no mutation."""
        row, col = upleft[0], upleft[1]
        preview = self.container.copy()
        preview[row:row + bxl, col:col + bxb] += bxh
        return preview

    def step(self, upleft,bxl,bxb,bxh):
        """Add the box to the height map in place and return that same array."""
        row, col = upleft[0], upleft[1]
        footprint = (slice(row, row + bxl), slice(col, col + bxb))
        self.container[footprint] += bxh
        return self.container

    def reset(self):
        """Replace the height map with a fresh all-zero array."""
        self.container = np.zeros((self.length, self.breadth))

    def terminal_reward(self):
        """Return the summed heights divided by length * breadth * height."""
        capacity = self.length * self.breadth * self.height
        return np.sum(self.container) / capacity

In [5]:
from ActorCritic.libs.utils import *

In [17]:
# Display w_upleft, the list built with whole_upleft(...) in the cell that
# assigns it. NOTE(review): that cell appears further down the notebook, so
# this one fails on a fresh Restart & Run All.
w_upleft

[[0, 0],
 [0, -10],
 [-10, 0],
 [-10, -10],
 [0, 10],
 [0, 20],
 [-10, 10],
 [-10, 20],
 [10, 0],
 [10, -10],
 [20, 0],
 [20, -10],
 [10, 10],
 [10, 20],
 [20, 10],
 [20, 20]]

In [18]:
# Display f_upleft, the array returned by feasible_location(...) in the cell
# that assigns it. NOTE(review): depends on that cell having been run first.
f_upleft

array([[ 0,  0],
       [ 0, 10],
       [10,  0],
       [10, 10]])

In [16]:
# Build an environment with the default 20x20x20 dimensions and snapshot its
# empty height map.
env = Bpp3DEnv()
env.reset()
history = env.container.copy()
# One box whose three entries are passed below as box[0], box[1], box[2]
# (presumably length, breadth, height — TODO confirm), plus a position triple.
box, pos = [10,10,20], [0,0,0]
upleft_l, used_boxes = [], []

# NOTE(review): whole_upleft, box_cornel, feasible_location and action_options
# come from the project's utils module; their exact semantics are not visible
# here. The outputs shown in the notebook are a list of [row, col] pairs for
# w_upleft and a smaller array of pairs for f_upleft.
w_upleft = whole_upleft(*box_cornel([0,0],20,20), box[0], box[1])
f_upleft = feasible_location(history, w_upleft, box[0], box[1], box[2], state_H = env.height, is2d=False)

# Last expression is displayed as the cell output.
a_ops = action_options(f_upleft)
a_ops

[array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0

In [None]:
# Build one flattened row per candidate action in a_ops so that the state, the
# action and `remains` can be evaluated as a batch.
q_value_list = []
# NOTE(review): this overwrites `history` with its batched form, so re-running
# the cell without re-running the one that defines `history` changes the result.
history = np.array([history.flatten()]*len(a_ops))
actions = np.array([sample.flatten() for sample in a_ops])
# NOTE(review): `remains` is not assigned anywhere in the visible notebook; it
# must be defined before this cell runs.
remains = np.array([remains]*len(a_ops))
# TODO: compute the Q-values for (history, actions, remains) and store them in
# q_value_list. The original cell ended with a dangling `q_value_list =`, which
# was a SyntaxError; the list is already initialised above.

In [7]:
%%time

# Training hyperparameters: episode cap and discount factor.
max_episode = 10000
gamma = 0.99

# Rolling window of the most recent 100 episode rewards, used for the running mean.
episode_rewards:collections.deque = collections.deque(maxlen=100)
optimizer = optimizers.Adam(learning_rate=0.01)

rewards = 0
loss_list = []
reward_list = []
AC = ActorCritic()
# NOTE(review): this cell needs `env` to be the Gym environment (it reads
# env.action_space.n and feeds env.reset() into tf.constant). Other cells rebind
# `env` to Bpp3DEnv, whose reset() returns None, so execution order matters.
ACModel = ActorCriticModel(env.action_space.n, 128)
with tqdm.trange(max_episode) as t:
    for i in t:
        init_state = tf.constant(env.reset(), dtype=tf.float32)
        # One episode plus one gradient update; 1000 is the per-episode step cap.
        loss, reward = AC.fit_RL(init_state, ACModel, optimizer, gamma, 1000)
        episode_rewards.append(reward)
        rewards = statistics.mean(episode_rewards)
        
        t.set_description(f'Episode {i}')
        t.set_postfix(episode_reward = reward, running_reward = rewards)
        
        # reward_list stores the running mean, not the raw episode reward.
        loss_list.append(loss)
        reward_list.append(rewards)
        
        # Stop early once the running mean over the window exceeds 190.
        if rewards > 190:
            break
            
print(i, rewards)

Episode 120:   1%|▎                       | 121/10000 [00:38<52:03,  3.16it/s, episode_reward=101, running_reward=73.9]


KeyboardInterrupt: 

In [18]:
class Bpp3DEnv():
    """Container for the bin-packing experiments, stored as a 2-D height map.

    ``container[i, j]`` accumulates the heights of the boxes placed over cell
    ``(i, j)``. This definition replaces any earlier ``Bpp3DEnv`` in the
    kernel, so it keeps the configurable dimensions and the ``convert_state``
    and ``next_state`` helpers; ``Bpp3DEnv()`` still yields a 20x20x20 container.

    Args:
        length: Number of rows in the height map.
        breadth: Number of columns in the height map.
        height: Container height, used to normalise ``terminal_reward``.
    """

    def __init__(self, length=20, breadth=20, height=20):
        super(Bpp3DEnv, self).__init__()
        self.length=length
        self.breadth=breadth
        self.height=height
        self.container=np.zeros((self.length,self.breadth))

    def convert_state(self, new_container):
        """Replace the current height map with ``new_container``."""
        self.container = new_container

    def next_state(self, upleft,bxl,bxb,bxh):
        """Return a copy of the height map with the box added; no mutation."""
        next_container = self.container.copy()
        next_container[upleft[0]:upleft[0]+bxl,upleft[1]:upleft[1]+bxb] += bxh
        return next_container

    def step(self, upleft,bxl,bxb,bxh):
        """Add the box to the height map in place and return that same array."""
        self.container[upleft[0]:upleft[0]+bxl,upleft[1]:upleft[1]+bxb] += bxh
        return self.container

    def reset(self):
        """Replace the height map with a fresh all-zero array."""
        self.container=np.zeros((self.length,self.breadth))

    def terminal_reward(self):
        """Return the summed heights divided by length * breadth * height."""
        return np.sum(self.container)/(self.length*self.breadth*self.height)

In [8]:
from utils import generation_2dbox, whole_upleft, box_cornel, feasible_location, action_options_list

In [20]:
# Set-up for the bin-packing training loop.
# NOTE(review): this rebinds the global `env` that ActorCritic.env_step reads.
env = Bpp3DEnv()
agent = ActorCritic()
# Counter of environment steps across all episodes, and the number of episodes to run.
global_step = 0
num_episode = 2000

In [None]:
# Per-run accumulators, one entry per episode.
frac_list = []
avg_loss_list = []
history_eps = []
used_boxes_eps =[]

# NOTE(review): this cell is unfinished work-in-progress. The assignment to
# `action, action_idx` has no right-hand side (SyntaxError), `done` is never
# set to True inside the while loop, and nothing is appended to the per-episode
# lists yet.
for e in range(num_episode):
    done = False
    step = 0
    
    # Fresh container and a newly generated box sequence for this episode.
    env.reset()
    boxes, gt_tmp = generation_2dbox(N_epi=1, c_l=20, c_b=20)
    boxes = boxes[0]
    
    # Per-episode buffers.
    history = np.zeros((20,20))
    history_list=[]
    action_list=[]
    reward_list=[]
    next_history_list=[]
    upleft_list=[]
    next_action_list=[]
    used_boxes=[]
    
    while not done:
        global_step += 1
        step += 1
        # `step` is 1-based, so the current box is boxes[step-1].
        box = boxes[step-1]
        # Candidates derived from the container at [0,0] with size 20x20 ...
        w_upleft = whole_upleft(*box_cornel([0,0],20,20),box[0],box[1])
        
        # ... extended with candidates derived from every box placed so far.
        for i,ul in enumerate(upleft_list):
            w_upleft += whole_upleft(*box_cornel([ul[0],ul[1]], used_boxes[i][0], used_boxes[i][1]),box[0],box[1])
        f_upleft = feasible_location(history,w_upleft,box[0],box[1])
        a_ops=action_options_list(f_upleft)
        # TODO: choose an action from a_ops; the right-hand side is missing.
        action, action_idx = 
        
        # Presumably action_idx == -1 signals "no action chosen" — TODO confirm;
        # in that case move on to the next box unless this was the last one.
        if action_idx == -1 and step != len(boxes):
            continue

In [None]:
# TODO: print the TensorArray contents.