In [None]:
import os, os.path
os.environ["CUDA_VISIBLE_DEVICES"]="1"
import random
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from collections import deque
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense
from utils import generation_2dbox,whole_upleft, box_cornel, feasible_location,action_options_list
# Build a TF1-compat session config whose GPU device count is set to 1, then
# open a session with that config.
# NOTE(review): the models in this file are called directly (eager style); it
# is unclear whether this session has any effect on them — confirm before
# relying on it for device selection. CUDA_VISIBLE_DEVICES is set above.
config = tf.compat.v1.ConfigProto(device_count={'GPU':1})
sess = tf.compat.v1.Session(config=config) 

# Neural network that maps a flattened (state, action) vector to a single Q-value
class DQN(tf.keras.Model):
    """Fully connected Q-network with a single tanh output unit.

    The network stacks four ReLU dense layers (512, 256, 256, 256 units)
    followed by a one-unit tanh layer, so every input row yields one value
    in [-1, 1].
    """

    def __init__(self, state_size):
        """Create the layers.

        Args:
            state_size: forwarded to the first dense layer as ``input_shape``.
        """
        super().__init__()
        relu = 'relu'
        # Attribute names are kept as-is: object-based checkpoints are keyed
        # on them.
        self.dnn1 = Dense(512, activation=relu, input_shape=state_size)
        self.dnn2 = Dense(256, activation=relu)
        self.dnn3 = Dense(256, activation=relu)
        self.fc = Dense(256, activation=relu)
        self.fc_out = Dense(1, activation='tanh')

    def call(self, x):
        """Run ``x`` through the hidden stack and return the output layer's value."""
        hidden_stack = (self.dnn1, self.dnn2, self.dnn3, self.fc)
        for layer in hidden_stack:
            x = layer(x)
        return self.fc_out(x)


# DQN agent (adapted from a Breakout example) that scores candidate actions for the 'bpp-v0' environment
class DQNAgent:
    """Epsilon-greedy DQN agent that scores (state, candidate action) pairs.

    Instead of emitting one Q-value per discrete action, the network is fed
    the flattened state concatenated with one flattened candidate action and
    returns a single score; the greedy action is the best-scoring candidate.
    """

    # Weight of the bootstrapped max-Q term in the training target; the
    # stored reward receives the complementary weight (1 - TARGET_BLEND).
    TARGET_BLEND = 0.75
    # Flattened length of one candidate action stored in the replay memory.
    ACTION_DIM = 20 * 20

    def __init__(self, state_size=(20*20+20*20,)):
        """Create the online/target networks, optimizer and replay memory.

        Args:
            state_size: shape tuple of one network input row (flattened state
                plus flattened action).
        """
        self.state_size = state_size

        # DQN hyper-parameters.
        # NOTE: discount_factor and train_start are not read by this class.
        self.discount_factor = 0.99
        self.learning_rate = 1e-4
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0.02
        self.exploration_steps = 500
        self.epsilon_decay_step = self.epsilon_start - self.epsilon_end
        self.epsilon_decay_step /= self.exploration_steps
        self.batch_size = 32
        self.train_start = 1000
        self.update_target_rate = 10

        # Replay memory (bounded; oldest samples are dropped first).
        self.memory = deque(maxlen=2500)

        # Online model and target model.
        self.model = DQN(state_size)
        self.target_model = DQN(state_size)
        self.optimizer = Adam(self.learning_rate)

        # Subclassed Keras models create their variables on the first call.
        # Without this warm-up both models have no weights yet, so the
        # initial target sync below would silently copy nothing.
        warmup_input = np.zeros((1,) + tuple(state_size), dtype=np.float32)
        self.model(warmup_input)
        self.target_model(warmup_input)
        self.update_target_model()

        self.avg_q_max, self.avg_loss = 0, 0
        self.writer = tf.summary.create_file_writer('summary')
        self.model_path = os.path.join(os.getcwd(), 'save_model', 'model')

    def update_target_model(self):
        """Copy the online model's weights into the target model."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, history, a_ops):
        """Pick one candidate action with an epsilon-greedy policy.

        Args:
            history: current state as a NumPy array (flattened internally).
            a_ops: sequence of candidate actions, each a NumPy array.

        Returns:
            ``(action, index)`` where ``action`` is ``a_ops[index]``, or
            ``(0, -1)`` when there are no candidates.
        """
        # len() rather than truthiness: a_ops may be a NumPy array.
        if len(a_ops) == 0:
            return 0, -1

        # Explore: no forward pass is needed for a random pick.
        if np.random.rand() <= self.epsilon:
            random_action_idx = random.randrange(len(a_ops))
            return a_ops[random_action_idx], random_action_idx

        # Exploit: score every candidate against the same state in one batch.
        states = np.tile(history.reshape(1, -1), (len(a_ops), 1))
        actions = np.array([sample.flatten() for sample in a_ops])
        q_values = self.model(np.concatenate([states, actions], axis=1))
        amax_action_idx = np.argmax(q_values)
        return a_ops[amax_action_idx], amax_action_idx

    def append_sample(self, history, action, reward, next_history, next_action):
        """Store one ``(s, a, r, s', candidate next actions)`` sample."""
        self.memory.append((history, action, reward, next_history, next_action))

    def train_model(self):
        """Run one gradient step on a random mini-batch from the replay memory.

        Does nothing while the memory holds fewer than ``batch_size`` samples
        (``random.sample`` would raise ``ValueError`` otherwise). The batch
        loss is accumulated into ``self.avg_loss``.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)

        history = np.array([sample[0].flatten() for sample in batch])
        actions = np.array([sample[1].flatten() for sample in batch])
        rewards = np.array([sample[2] for sample in batch])
        next_history = np.array([sample[3].flatten() for sample in batch])
        # One (n_candidates, ACTION_DIM) matrix per sample; n varies by sample.
        next_actions = [np.array(sample[4]).reshape((-1, self.ACTION_DIM))
                        for sample in batch]

        # Targets are constants w.r.t. the online model, so they are computed
        # outside the gradient tape.
        blend = self.TARGET_BLEND
        targets = []
        for i, candidates in enumerate(next_actions):
            next_states = np.tile(next_history[i], (len(candidates), 1))
            target_q = self.target_model(
                np.concatenate([next_states, candidates], axis=1))
            max_q = np.amax(target_q)
            targets.append([(1 - blend) * rewards[i] + blend * max_q])
        # Match the network's float32 output explicitly.
        targets = np.array(targets, dtype=np.float32)

        model_params = self.model.trainable_variables
        with tf.GradientTape() as tape:
            # Online model's Q-value for the (state, action) actually taken.
            predicts = self.model(np.concatenate([history, actions], axis=1))
            # Mean squared error between target and prediction.
            loss = tf.reduce_mean(tf.square(targets - predicts))

        self.avg_loss += loss.numpy()

        # Update the online model in the direction that lowers the loss.
        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))
        
        

In [None]:
import gym
import gym_bpp
# Create the environment and the DQN agent.
env = gym.make('bpp-v0')
agent = DQNAgent()
global_step = 0
num_episode = 2000
# np.random.seed(seed=100)

frac_list=[]
avg_loss_list=[]
history_eps=[]
used_boxes_eps=[]
for e in range(num_episode):
    done = False
    step = 0
    # Reset the environment and draw a fresh box sequence for this episode.
    env.reset()
    boxes,gt_tmp=generation_2dbox(N_epi=1,c_l=20,c_b=20)#N,2
    boxes=boxes[0]

    history = np.zeros((20,20))# container
    history_list=[]
    action_list=[]
    reward_list=[]
    next_history_list=[]
    upleft_list=[]
    next_action_list=[]
    used_boxes=[]

    # Epsilon is decayed once per episode during the first 500 episodes.
    if e < 500 and agent.epsilon > agent.epsilon_end:
        agent.epsilon -= agent.epsilon_decay_step

    while not done:# one iteration per box
        global_step += 1
        step += 1
        box=boxes[step-1]#[L,B]

        # Candidate upper-left positions for the current box: derived from the
        # container itself plus every box placed so far.
        w_upleft = whole_upleft(*box_cornel([0,0],20,20),box[0],box[1])
        for ul,placed in zip(upleft_list,used_boxes):
            w_upleft += whole_upleft(*box_cornel([ul[0],ul[1]],placed[0],placed[1]),box[0],box[1])
        f_upleft = feasible_location(history,w_upleft,box[0],box[1])
        a_ops=action_options_list(f_upleft)
        action,action_idx = agent.get_action(history,a_ops)

        # A box with no feasible position is skipped unless it is the last one.
        if action_idx==-1 and step != len(boxes):
            continue

        # Episode end: no action is available or the last box has been reached.
        if action_idx==-1 or step==len(boxes):
            done=True
            print('action = 0 or all boxes are used',action_idx,'|',step)
            # Terminal reward, baselined by the mean of all earlier episodes.
            if len(frac_list)==0:
                t_reward=env.terminal_reward()
            else:
                t_reward=env.terminal_reward()- np.mean(frac_list)
            frac_list.append(env.terminal_reward())

            # Spread the terminal reward back over the episode's steps and
            # store only transitions that have at least one next action.
            n_steps=len(history_list)
            for i,next_ops in enumerate(next_action_list):
                if len(next_ops)>0:
                    reward=(0.99**(n_steps-i-1))*t_reward
                    agent.append_sample(history_list[i], action_list[i], reward, next_history_list[i],next_ops)
        else:
            # Place the box at the chosen position and advance the environment.
            used_boxes.append(box)
            next_box=boxes[step]
            next_w_upleft=whole_upleft(*box_cornel([0,0],20,20),next_box[0],next_box[1])
            upleft=f_upleft[action_idx]
            upleft_list.append(upleft)
            next_history = env.step(upleft,box[0],box[1])
            history_list.append(history)
            action_list.append(action)
            next_history_list.append(next_history)

            # Candidate positions for the NEXT box. The i-th placed position
            # belongs to used_boxes[i] (not boxes[i]: boxes may have been
            # skipped), and feasibility is checked with the next box's size.
            for ul,placed in zip(upleft_list,used_boxes):
                next_w_upleft += whole_upleft(*box_cornel([ul[0],ul[1]],placed[0],placed[1]),next_box[0],next_box[1])
            next_f_upleft_list = feasible_location(next_history,next_w_upleft,next_box[0],next_box[1])
            next_action_list.append(action_options_list(next_f_upleft_list))
            history = next_history

        # Train from episode 5 on, once the memory can supply a full batch
        # (sampling more items than are stored would raise ValueError).
        if e >= 5 and len(agent.memory) >= agent.batch_size:
            agent.train_model()
            # Periodically sync the target model with the online model.
            if global_step % agent.update_target_rate == 0:
                agent.update_target_model()

        if done:
            # Log per-episode training information.
            log = "============episode: {:5d} | ".format(e)
            log += "memory length: {:5d} | ".format(len(agent.memory))
            log += "epsilon: {:.3f} | ".format(agent.epsilon)
            log += "env.terminal_reward(): {:.3f} | ".format(env.terminal_reward())
            log += "avg loss : {:6f}".format(agent.avg_loss / float(step))
            print(log)
            avg_loss_list.append(agent.avg_loss/float(step))
            agent.avg_loss = 0

    # Save the model weights every 1000 episodes.
    if e % 1000 == 0:
        agent.model.save_weights("save_model/model", save_format="tf")


In [None]:
# Plot the per-episode terminal reward and the per-episode average loss.
# The first `warmup_episodes` episodes (no training happens before episode 5
# in the loop above) are drawn in blue, the rest in green.
warmup_episodes = 5
warmup_fracs = frac_list[:warmup_episodes]
trained_fracs = frac_list[warmup_episodes:]

plt.figure(figsize=(20,5))
# x ranges are derived from the actual slice lengths so the plot also works
# when fewer than `warmup_episodes` episodes were recorded.
plt.plot(list(range(len(warmup_fracs))),warmup_fracs,'b.-')
plt.plot(list(range(len(warmup_fracs),len(warmup_fracs)+len(trained_fracs))),trained_fracs,'g.-')
plt.grid()

plt.figure(figsize=(20,5))
plt.plot(avg_loss_list[warmup_episodes:],'b.-')
plt.grid()