In [16]:
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, concatenate, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam

In [9]:
class DQN(tf.keras.Model):
    """Five-branch convolutional network that outputs one scalar Q-value per sample.

    Each of the five inputs is processed by its own branch:
    Conv2D(5 filters, 3x3, 'valid', relu) -> 2x2 max-pool -> flatten -> Dense(32).
    The five 32-unit embeddings are concatenated and passed through
    Dense(256) -> Dense(128) -> Dense(1).

    Args:
        state_size, current_size, mask_size, furniture_size, next_size:
            Declared sizes of the five inputs. They are accepted so existing
            callers keep working, but they are not forwarded to the layers:
            in a subclassed model the layers create their weights from the
            tensors they receive on the first call.
    """

    def __init__(self, state_size, current_size, mask_size, furniture_size, next_size):
        super(DQN, self).__init__()

        self.grid_cnn = Conv2D(filters=5, kernel_size=3, activation='relu', padding='valid')
        self.grid_dnn = Dense(32, activation='relu')

        self.cur_grid_cnn = Conv2D(filters=5, kernel_size=3, activation='relu', padding='valid')
        self.cur_grid_dnn = Dense(32, activation='relu')

        self.mask_grid_cnn = Conv2D(filters=5, kernel_size=3, activation='relu', padding='valid')
        self.mask_grid_dnn = Dense(32, activation='relu')

        self.size_grid_cnn = Conv2D(filters=5, kernel_size=3, activation='relu', padding='valid')
        self.size_grid_dnn = Dense(32, activation='relu')

        self.next_grid_cnn = Conv2D(filters=5, kernel_size=3, activation='relu', padding='valid')
        self.next_grid_dnn = Dense(32, activation='relu')

        # Weight-free layers: create them once here instead of on every
        # forward pass, and share them between the five branches.
        self.pool = MaxPooling2D(pool_size=(2, 2))
        self.flatten = Flatten()

        self.f1 = Dense(256, activation='relu')
        self.f2 = Dense(128, activation='relu')
        self.f = Dense(1)

    def _embed(self, grid, conv, dense):
        """Run one input through its conv -> pool -> flatten -> dense branch."""
        grid = tf.convert_to_tensor(grid)
        if not grid.dtype.is_floating:
            # Convolutions need a floating-point input.
            grid = tf.cast(grid, tf.float32)
        if len(grid.shape) == 3:
            # Conv2D needs (batch, height, width, channels); a rank-3 input is
            # treated as single-channel (assumes (batch, L, B) -- TODO confirm).
            grid = tf.expand_dims(grid, axis=-1)
        features = conv(grid)
        features = self.pool(features)
        features = self.flatten(features)
        return dense(features)

    def call(self, datas):
        """Return the network output for a batch.

        Args:
            datas: indexable collection of five batched inputs, in the order
                grid, current grid, mask grid, size grid, next grid.

        Returns:
            Tensor with one value per batch row (last dimension 1).
        """
        branches = (
            (self.grid_cnn, self.grid_dnn),
            (self.cur_grid_cnn, self.cur_grid_dnn),
            (self.mask_grid_cnn, self.mask_grid_dnn),
            (self.size_grid_cnn, self.size_grid_dnn),
            (self.next_grid_cnn, self.next_grid_dnn),
        )
        embeddings = [
            self._embed(datas[index], conv, dense)
            for index, (conv, dense) in enumerate(branches)
        ]

        x = concatenate(embeddings)
        x = self.f1(x)
        x = self.f2(x)
        q = self.f(x)

        return q

In [10]:
import random
from collections import deque

In [14]:
class Agent:
    """Epsilon-greedy agent that trains a `DQN` from a replay memory.

    The agent owns an online network (`model`), a target network
    (`target_model`), an Adam optimizer and a bounded deque of transitions.
    """

    # Weight of the bootstrapped target-network value in the training target;
    # the reward is weighted by (1 - TARGET_MIX). Note that `discount_factor`
    # and `gamma` are stored by __init__ but are not used by this target.
    TARGET_MIX = 0.75

    def __init__(self, L=30, B=30, learning_rate=1e-8, exp_steps=500, train_st=200, batch_size=32, memory_len=500,
                 update_target_rate=30):
        """Create the networks, optimizer and replay memory.

        Args:
            L, B: grid dimensions; all five network inputs are declared as (L, B).
            learning_rate: learning rate passed to Adam.
            exp_steps: number of decrements over which epsilon goes from
                epsilon_start (1.0) to epsilon_end (0.01).
            train_st: stored as `train_start`; not read inside this class.
            batch_size: number of transitions sampled per training step.
            memory_len: maximum number of transitions kept in the replay memory.
            update_target_rate: stored; not read inside this class.
        """
        self.state_size = (L, B)
        self.current_size = (L, B)
        self.mask_size = (L, B)
        self.furniture_size = (L, B)
        self.next_size = (L, B)

        self.discount_factor = 0.99
        self.learning_rate = learning_rate
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0.01
        self.exploration_steps = exp_steps
        self.epsilon_decay_step = self.epsilon_start - self.epsilon_end
        self.epsilon_decay_step /= self.exploration_steps
        self.batch_size = batch_size
        self.train_start = train_st
        self.update_target_rate = update_target_rate

        self.memory = deque(maxlen=memory_len)
        self.gamma = 0.9

        self.model = DQN(self.state_size, self.current_size, self.mask_size, self.furniture_size, self.next_size)
        self.target_model = DQN(self.state_size, self.current_size, self.mask_size, self.furniture_size, self.next_size)

        self.optimizer = Adam(self.learning_rate)
        self.update_target_model()
        self.avg_q_max, self.avg_loss = 0, 0
        self.model_path = os.path.join(os.getcwd(), 'save_mode', 'model_3d')

    def update_target_model(self):
        """Copy the online network's weights into the target network.

        NOTE(review): subclassed Keras models normally create their variables
        on the first forward pass, so the call made from __init__ presumably
        copies an empty weight list -- confirm the target is synced again once
        both networks have been called.
        """
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state, current, mask, f_size, next_state=None):
        """Pick an action index with an epsilon-greedy policy.

        Args:
            state, current, mask, f_size: batched network inputs with one row
                per candidate action; `len(state)` is the number of candidates.
            next_state: fifth network input. Optional for backward
                compatibility, but required whenever the greedy branch runs
                because the network reads five inputs.

        Returns:
            With probability `epsilon`, a uniformly random index in
            `range(len(state))`; otherwise the index of the first row whose
            Q-value equals the maximum.

        Raises:
            ValueError: if the greedy branch runs and `next_state` is missing.
        """
        if np.random.rand() < self.epsilon:
            return random.randrange(len(state))

        if next_state is None:
            raise ValueError(
                "get_action needs next_state for the greedy branch: "
                "the network reads five inputs"
            )
        q_values = np.asarray(self.model([state, current, mask, f_size, next_state]))
        # One score per candidate row; argmax returns the first maximal row.
        per_candidate = q_values.reshape(q_values.shape[0], -1).max(axis=1)
        return int(np.argmax(per_candidate))

    def append_sample(self, history, current, mask, f_size, next_state, reward, last,
                      t_history, t_current, t_mask, t_f_size, t_next_state):
        """Store one transition (12 fields, in argument order) in the replay memory."""
        self.memory.append((history, current, mask, f_size, next_state, reward, last,
                            t_history, t_current, t_mask, t_f_size, t_next_state))

    def train_model(self):
        """Run one gradient step on a random mini-batch from the replay memory.

        Does nothing when the memory holds fewer than `batch_size` transitions.
        The training target per sample is
        `(1 - TARGET_MIX) * reward + (1 - last) * TARGET_MIX * max(target_q)`
        and the loss is a Huber loss with threshold 1. The loss value is
        accumulated into `avg_loss`.
        """
        if len(self.memory) < self.batch_size:
            # random.sample would raise ValueError on an under-filled memory.
            return

        batch = random.sample(self.memory, self.batch_size)

        def stack(field):
            return np.asarray([sample[field] for sample in batch], dtype=np.float32)

        history, current, mask, f_size, next_state = (stack(field) for field in range(5))
        reward = stack(5)  # assumes one scalar reward per sample -- TODO confirm
        last = stack(6)

        # Targets come from the frozen target network, so they are computed
        # outside the tape. Each sample's target inputs are converted on their
        # own, which also allows a different number of candidate rows per sample.
        mix = self.TARGET_MIX
        targets = np.empty((self.batch_size, 1), dtype=np.float32)
        for i, sample in enumerate(batch):
            t_inputs = [np.asarray(part, dtype=np.float32) for part in sample[7:12]]
            t_max_q = float(tf.math.reduce_max(self.target_model(t_inputs)))
            targets[i, 0] = (1 - mix) * reward[i] + (1 - last[i]) * mix * t_max_q
        targets = tf.convert_to_tensor(targets)

        with tf.GradientTape() as tape:
            pred = self.model([history, current, mask, f_size, next_state])
            error = tf.abs(targets - pred)
            quadratic_part = tf.clip_by_value(error, 0.0, 1.0)
            linear_part = error - quadratic_part
            loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)

        # Read the variables after the forward pass: a model that has never
        # been called has no variables yet.
        model_params = self.model.trainable_variables
        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))
        self.avg_loss += float(loss)

In [18]:
import time
import json

## 데이터 로드

In [None]:
def _load_json(path):
    """Read the UTF-8 JSON file at `path` and return the decoded value."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


ikea_python = _load_json('data/ikea.json')
make_python = _load_json('data/make.json')

# The original cell always decoded a second time, which only works when the
# file stores a JSON document wrapped in a JSON string. Decode again only in
# that case, so an ordinary JSON file (already a dict or list) loads as well.
ikea = json.loads(ikea_python) if isinstance(ikea_python, str) else ikea_python
make = json.loads(make_python) if isinstance(make_python, str) else make_python

## 환경 생성

In [17]:
# Build the agent for this run; every argument overrides an Agent default
# except L, B and batch_size, which repeat the defaults explicitly.
agent = Agent(
    L=30,
    B=30,
    learning_rate=1e-4,
    exp_steps=9000,
    train_st=1000,
    memory_len=1000,
    batch_size=32,
    update_target_rate=1000,
)

## 반복

In [None]:
# Training-loop sketch (unfinished). Several lines below are placeholders or
# pseudo-code rather than runnable Python, so this cell cannot execute as written.
# NOTE(review): `num_episode` is not defined in any visible cell -- confirm where it is set.
for e in range(num_episode):
    # Wall-clock start time and step counter for this episode.
    st = time.time()
    step = 0 
    
    # After episode 9000, stop exploring altogether.
    if e > 9000:
        agent.epsilon = 0
    # Decay epsilon once per episode, but only after the replay memory holds
    # at least `train_start` transitions.
    if agent.epsilon > agent.epsilon_end and len(agent.memory) >= agent.train_start:
        agent.epsilon -= agent.epsilon_decay_step
        
    # Placeholder, not valid Python.
    [] [] [] [] []
    
    # Placeholder loop header: what is iterated over is not written yet.
    for ~
        done = False
        
        # Placeholder for an append call that is not written yet.
        append ~
        while not done:
            # The right-hand sides of the assignments below are still missing.
            state = 
            current_state = 
            step += 1
            
            mask = 
            
            furniture = 
            furniture_size = 
            
            action_list = []
            # Pseudo-code (Korean): "build a list by applying each action to `state` one at a time".
            next_states = state에 action을 하나씩 넣어서 리스트로 만들기
            
            # Pseudo-code: presumably repeats each input len(action_list) times,
            # giving one row per candidate action -- TODO confirm.
            state, current_state, mask, furniture_size = *len(action_list)
            
            # NOTE(review): four inputs are passed here, while DQN.call reads five
            # (datas[4] is the next-state grid); `next_states` presumably has to
            # reach the model as well -- TODO confirm.
            action_idx = agent.get_action(state, current_state, mask, furniture_size)
            