# 1. Simulation

In [1]:
from string import ascii_uppercase
from draw_utils import *
from pyglet.gl import *
import numpy as np
import pandas as pd
import os



# reward
move_reward = -0.01
obs_reward = -0.2
goal_reward = 2
print('reward:' , move_reward, obs_reward, goal_reward)

local_path = '/home/zlxlekta924/YC' #os.path.abspath(os.path.join(os.path.dirname(__file__)))


class Simulator:
    def __init__(self):
        '''
        height : 그리드 높이
        width : 그리드 너비 
        inds : A ~ Q alphabet list
        '''
        # Load train data
        self.files = pd.read_csv(os.path.join(local_path, "./data/factory_order_train.csv"))
        self.height = 10
        self.width = 9
        self.inds = list(ascii_uppercase)[:17]
        self.clear_item = False

    def set_box(self):
        '''
        아이템들이 있을 위치를 미리 정해놓고 그 위치 좌표들에 아이템이 들어올 수 있으므로 그리드에 100으로 표시한다.
        데이터 파일에서 이번 에피소드 아이템 정보를 받아 가져와야 할 아이템이 있는 좌표만 -100으로 표시한다.
        self.local_target에 에이전트가 이번에 방문해야할 좌표들을 저장한다.
        따라서 가져와야하는 아이템 좌표와 end point 좌표(처음 시작했던 좌표로 돌아와야하므로)가 들어가게 된다.
        '''
        box_data = pd.read_csv(os.path.join(local_path, "./data/box.csv"))

        # 물건이 들어있을 수 있는 경우
        for box in box_data.itertuples(index = True, name ='Pandas'):
            self.grid[getattr(box, "row")][getattr(box, "col")] = 0     #####################수정 100 -> 0

        # 물건이 실제 들어있는 경우
        order_item = list(set(self.inds) & set(self.items))
        order_csv = box_data[box_data['item'].isin(order_item)]
    
        #print(order_csv) ######################################## 수정
        
        for order_box in order_csv.itertuples(index = True, name ='Pandas'):
            self.grid[getattr(order_box, "row")][getattr(order_box, "col")] = -100   ###################### 수정 -100 -> 0
            # local target에 가야 할 위치 좌표 넣기
            self.local_target.append(
                [getattr(order_box, "row"),
                 getattr(order_box, "col")]
                )
        self.local_target.append([9,4]) 
#         self.grid[self.local_target[0][0]][self.local_target[0][1]] = -100 #############################수정
        # 알파벳을 Grid에 넣어서 -> grid에 2Dconv 적용 가능

    def set_obstacle(self):
        '''
        장애물이 있어야하는 위치는 미리 obstacles.csv에 정의되어 있다. 이 좌표들을 0으로 표시한다.
        '''
        obstacles_data = pd.read_csv(os.path.join(local_path, "./data/obstacles.csv"))
        for obstacle in obstacles_data.itertuples(index = True, name ='Pandas'):
            self.grid[getattr(obstacle, "row")][getattr(obstacle, "col")] = 0

    def reset(self, epi):
        '''
        reset()은 첫 스텝에서 사용되며 그리드에서 에이전트 위치가 start point에 있게 한다.

        :param epi: episode, 에피소드 마다 가져와야 할 아이템 리스트를 불러올 때 사용
        :return: 초기셋팅 된 그리드
        :rtype: numpy.ndarray
        _____________________________________________________________________________________
        items : 이번 에피소드에서 가져와야하는 아이템들
        terminal_location : 현재 에이전트가 찾아가야하는 목적지
        local_target : 한 에피소드에서 찾아가야하는 아이템 좌표, 마지막 엔드 포인트 등의 위치좌표들
        actions: visualization을 위해 에이전트 action을 저장하는 리스트
        curloc : 현재 위치
        '''

        # initial episode parameter setting
        self.epi = epi
        self.items = list(self.files.iloc[self.epi])[0]
        self.cumulative_reward = 0
        self.terminal_location = None
        self.local_target = []
        self.actions = []


        # initial grid setting
        self.grid = np.ones((self.height, self.width), dtype="float16")

        # set information about the gridworld
        self.set_box()
        self.set_obstacle()

        # start point를 grid에 표시
        self.curloc = [9, 4]
        self.grid[int(self.curloc[0])][int(self.curloc[1])] = -5


        self.done = False
        
        #print('###########################')  ###################################수정한 부분
        #print(f'items loc : {self.local_target}')
        
        
        return self.grid

    def apply_action(self, action, cur_x, cur_y):
        '''
        에이전트가 행한 action대로 현 에이전트의 위치좌표를 바꾼다.
        action은 discrete하며 4가지 up,down,left,right으로 정의된다.
        
        :param x: 에이전트의 현재 x 좌표
        :param y: 에이전트의 현재 y 좌표
        :return: action에 따라 변한 에이전트의 x 좌표, y 좌표
        :rtype: int, int
        '''
        new_x = cur_x
        new_y = cur_y
        # up
        if action == 0:
            new_x = cur_x - 1
        # down
        elif action == 1:
            new_x = cur_x + 1
        # left
        elif action == 2:
            new_y = cur_y - 1
        # right
        else:
            new_y = cur_y + 1

        return int(new_x), int(new_y)


    def get_reward(self, new_x, new_y, out_of_boundary):
        '''
        get_reward함수는 리워드를 계산하는 함수이며, 상황에 따라 에이전트가 action을 옳게 했는지 판단하는 지표가 된다.

        :param new_x: action에 따른 에이전트 새로운 위치좌표 x
        :param new_y: action에 따른 에이전트 새로운 위치좌표 y
        :param out_of_boundary: 에이전트 위치가 그리드 밖이 되지 않도록 제한
        :return: action에 따른 리워드
        :rtype: float
        '''

        # 바깥으로 나가는 경우
        if any(out_of_boundary):
            reward = obs_reward
                       
        else:
            # 장애물에 부딪히는 경우 
            if self.grid[new_x][new_y] == 0:
                reward = obs_reward  

            # 현재 목표에 도달한 경우
            elif new_x == self.terminal_location[0] and new_y == self.terminal_location[1]:
                reward = goal_reward

            # 그냥 움직이는 경우 
            else:
                reward = move_reward

        return reward

    def step(self, action):
        ''' 
        에이전트의 action에 따라 step을 진행한다.
        action에 따라 에이전트 위치를 변환하고, action에 대해 리워드를 받고, 어느 상황에 에피소드가 종료되어야 하는지 등을 판단한다.
        에이전트가 endpoint에 도착하면 gif로 에피소드에서 에이전트의 행동이 저장된다.

        :param action: 에이전트 행동
        :return:
            grid, 그리드
            reward, 리워드
            cumulative_reward, 누적 리워드
            done, 종료 여부
            goal_ob_reward, goal까지 아이템을 모두 가지고 돌아오는 finish율 계산을 위한 파라미터

        :rtype: numpy.ndarray, float, float, bool, bool/str

        (Hint : 시작 위치 (9,4)에서 up말고 다른 action은 전부 장애물이므로 action을 고정하는 것이 좋음)
        '''
        if self.local_target:
            self.terminal_location = self.local_target[0]
            
        cur_x,cur_y = self.curloc
        self.actions.append((cur_x, cur_y))

        goal_ob_reward = False
        
        new_x, new_y = self.apply_action(action, cur_x, cur_y)

        out_of_boundary = [new_x < 0, new_x >= self.height, new_y < 0, new_y >= self.width]

        # 바깥으로 나가는 경우 종료
        if any(out_of_boundary):
            reward = self.get_reward(new_x, new_y, out_of_boundary) ######################수정
            self.done = True
            goal_ob_reward = True
            
        else:
            # 장애물에 부딪히는 경우 종료
            if self.grid[new_x][new_y] == 0:
                self.done = True
                goal_ob_reward = True
                reward = self.get_reward(new_x, new_y, out_of_boundary) ###############################수정

            # 현재 목표에 도달한 경우, 다음 목표설정
            elif new_x == self.terminal_location[0] and new_y == self.terminal_location[1]:


                # end point 일 때
                if [new_x, new_y] == [9,4]:
                    self.done = True
                
                # item 일 때
#                 self.clear_item = True
                
                self.local_target.remove(self.local_target[0])
                
#                 if self.local_target:
#                     self.grid[self.local_target[0][0]][self.local_target[0][1]] = -100 ######### 수정 가야할 곳은 -100 으로 표시
                
                self.grid[cur_x][cur_y] = 1
                self.grid[new_x][new_y] = -5
                goal_ob_reward = True
                self.curloc = [new_x, new_y]
                
                reward = self.get_reward(new_x, new_y, out_of_boundary)
            else:
                # 그냥 움직이는 경우 
                
                self.grid[cur_x][cur_y] = 1    
                self.grid[new_x][new_y] = -5
                    
                
                self.curloc = [new_x,new_y]
                
                reward = self.get_reward(new_x, new_y, out_of_boundary)
                
        #reward = self.get_reward(new_x, new_y, out_of_boundary)  ################################# 수정
        #print('reward : ' ,reward) ####################################수정한 부분
        
        self.cumulative_reward += reward

        if self.done == True:
            if [new_x, new_y] == [9, 4]:
                if self.terminal_location == [9, 4]:
                    # 완료되면 GIFS 저장
                    goal_ob_reward = 'finish'
                    height = 10
                    width = 9 
                    display = Display(visible=False, size=(width, height))
                    display.start()

                    start_point = (9, 4)
                    unit = 50
                    screen_height = height * unit
                    screen_width = width * unit
                    log_path = "./logs"
                    data_path = "./data"
                    render_cls = Render(screen_width, screen_height, unit, start_point, data_path, log_path)
                    for idx, new_pos in enumerate(self.actions):
                        render_cls.update_movement(new_pos, idx+1)
                    
                    render_cls.save_gif(self.epi)
                    render_cls.viewer.close()
                    display.stop()
        
        
        return self.grid, reward, self.cumulative_reward, self.done, goal_ob_reward


reward: -0.01 -0.2 2


## 1. Agent 구성

Agent 구성에 필요한 요소

1. Replay Buffer
2. Qnet

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np
import time
from tqdm import tqdm
#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
n_rollout     = 200


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.data = []
        
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2))
        
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2))
        
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(256,128)
        self.fc2 = nn.Linear(128,64)
        self.fc_pi = nn.Linear(64,4)
        self.fc_v = nn.Linear(64,1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        
    def pi(self, x, softmax_dim = 0):
        x = torch.reshape(x, (-1, 1, 10, 9)).to('cuda')
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim) #, dim=softmax_dim
        
        return prob
    
    def v(self, x):
        x = torch.reshape(x, (-1, 1, 10, 9)).to('cuda')
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        v = self.fc_v(x)
        return v
    
    def put_data(self, transition):
        self.data.append(transition)
        
    def make_batch(self):
        s_lst, a_lst, r_lst, s_prime_lst, done_lst = [], [], [], [], []
        for transition in self.data:
            s,a,r,s_prime,done = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask = 0.0 if done else 1.0
            done_lst.append([done_mask])
        
        s_batch, a_batch, r_batch, s_prime_batch, done_batch = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
                                                               torch.tensor(r_lst, dtype=torch.float), torch.tensor(s_prime_lst, dtype=torch.float), \
                                                               torch.tensor(done_lst, dtype=torch.float)
        self.data = []
        return s_batch, a_batch, r_batch, s_prime_batch, done_batch
  
    def train_net(self):
        s, a, r, s_prime, done = self.make_batch()
        td_target = torch.tensor(r,device='cuda') + torch.tensor(gamma, device='cuda') * self.v(s_prime) * torch.tensor(done, device='cuda')
        delta = td_target - self.v(s)
        pi = self.pi(s, softmax_dim=1)
        pi_a = pi.gather(1,a.to('cuda'))
        loss = -(torch.log(pi_a)+1e9) * (delta.detach()+1e9) + F.smooth_l1_loss(self.v(s), td_target.detach())
        
        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step() 
        torch.autograd.set_detect_anomaly(True)

#### 전체적인 구성

In [4]:
def main():  
    env = Simulator()
    model = ActorCritic()
    model = model.to('cuda')
    files = pd.read_csv("./data/factory_order_train.csv")
    print_interval = 20
    
    for epi in tqdm(range(40000)):
        time.sleep(0.1)
        done = False
        s = env.reset(epi)
#         items = list(files.iloc[epi%40000])[0]
#         s = env.reset(epi)
        score = 0.0
        s = np.asarray(s, dtype=np.float32)
#         a_step= 0
        first = True

        #for i in range(50):
        while not done:
            for t in range(n_rollout): # n_rollout을 통해 for loop 진행 후 학습 진행
                # 어디로 갈지 확률 예측 부분
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()

                if first:
                    a = 0
                    first = False
                    

                s_prime, r, cumul, done, goal_reward = env.step(a)
                
                view = s_prime
                
                s_prime = np.asarray(s_prime, dtype=np.float32)
                
                model.put_data((s,a,r,s_prime,done))
                
                s = s_prime
                score += r
                if done:
                    break
#                 a_step +=1


            if done:
                break                      

#             if a_step == 1000:
#                 break
            model.train_net() # 학습 진행 코드
          
        if epi%print_interval==0 and epi!=0:
            print("# of episode :{}, avg score : {:.1f}".format(epi, score/print_interval))
            print(view)
            score = 0.0


if __name__ == '__main__':
    main()

  0%|                                                                     | 21/40000 [00:08<4:28:05,  2.49it/s]

# of episode :20, avg score : -0.6
[[   0.    0.    0. -100.    0. -100.    0. -100.    0.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1. -100.]
 [-100.    1.    0.    1.    0.    1.    0.    1. -100.]
 [-100.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.   -5.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  0%|                                                                     | 41/40000 [00:16<4:33:10,  2.44it/s]

# of episode :40, avg score : -0.5
[[   1.    0. -100.    0. -100. -100.    0. -100.    0.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1. -100.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.   -5.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  0%|                                                                     | 61/40000 [00:25<4:33:36,  2.43it/s]

# of episode :60, avg score : -0.5
[[-100.    0.    0.    0.    0.    0.    0.    0. -100.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1. -100.]
 [   0.    1.    0.    1.    0.   -5.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  0%|▏                                                                    | 81/40000 [00:33<4:30:27,  2.46it/s]

# of episode :80, avg score : -0.7
[[   0. -100.    0. -100.    0.    0. -100.    0.    0.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.   -5.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  0%|▏                                                                   | 101/40000 [00:41<4:32:55,  2.44it/s]

# of episode :100, avg score : -0.5
[[   0.    0.    0.    0. -100.    0.    0.    0. -100.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.   -5.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1. -100.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  0%|▏                                                                   | 121/40000 [00:49<4:34:19,  2.42it/s]

# of episode :120, avg score : -0.8
[[   0.    0.    0.    0.    0.    0.    0.    0.    0.]
 [   1.    1.    1.   -5.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1. -100.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  0%|▏                                                                   | 141/40000 [00:58<4:32:47,  2.44it/s]

# of episode :140, avg score : -0.5
[[-100. -100. -100.    0.    0.    0.    0.    0. -100.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [-100.    1.    1.    1.    1.    1.   -5.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [-100.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  0%|▎                                                                   | 161/40000 [01:06<4:40:18,  2.37it/s]

# of episode :160, avg score : -0.8
[[-100.    0. -100.    0.    0.    0.    0.    0. -100.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    0.]
 [-100.    1.    0.    1.    0.    1.    0.    1. -100.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.   -5.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  0%|▎                                                                   | 181/40000 [01:12<3:26:49,  3.21it/s]

# of episode :180, avg score : -0.8
[[   0.    0.    0.    0. -100.    0.    0.    0.    0.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1. -100.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1. -100.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [  -5.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▎                                                                   | 201/40000 [01:19<3:29:03,  3.17it/s]

# of episode :200, avg score : -0.5
[[   0. -100.    0. -100. -100.    0. -100.    0.    0.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.   -5.    1.    1.    1.    1.    1.    1. -100.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▍                                                                   | 221/40000 [01:25<3:29:24,  3.17it/s]

# of episode :220, avg score : -0.7
[[   0. -100. -100.    0.    0.    0. -100.    0.    0.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [-100.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.   -5.    0.    1.    0.    1. -100.]
 [-100.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▍                                                                   | 241/40000 [01:31<3:33:28,  3.10it/s]

# of episode :240, avg score : -0.8
[[-100.    0.    0.    0.    0.    0.    0.    0.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.   -5.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▍                                                                   | 261/40000 [01:38<4:15:34,  2.59it/s]

# of episode :260, avg score : -0.7
[[   0.    0.    0.    0.    0.    0.    0.    0. -100.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    0.    1.    0.    1.    0.    1.   -5.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▍                                                                   | 281/40000 [01:45<3:28:31,  3.17it/s]

# of episode :280, avg score : -0.8
[[   0. -100. -100.    0.    0.    1.    0.    0.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [-100.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.   -5.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▌                                                                   | 301/40000 [01:51<3:48:43,  2.89it/s]

# of episode :300, avg score : -0.7
[[-100.    0. -100.    0.    0.    0.    0.    0. -100.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [-100.    1.    0.    1.    0.    1.    0.    1. -100.]
 [-100.    1.    0.    1.    0.    1.    0.    1. -100.]
 [   0.    1.    0.    1.    0.    1.    0.   -5.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▌                                                                   | 321/40000 [02:00<4:36:02,  2.40it/s]

# of episode :320, avg score : -0.7
[[   0. -100.    0.    0. -100.    0.    0.    0.    0.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [-100.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.   -5.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▌                                                                   | 341/40000 [02:08<4:45:01,  2.32it/s]

# of episode :340, avg score : -0.5
[[   0.    0.    1.    1.    0.    0. -100.    0.    0.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.   -5.    0.    1.    0.    1. -100.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1. -100.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▌                                                                   | 361/40000 [02:16<4:33:59,  2.41it/s]

# of episode :360, avg score : -0.7
[[   0.    0.    1.    0.    0.    0.    1.    0.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [-100.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.   -5.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▋                                                                   | 381/40000 [02:25<4:35:53,  2.39it/s]

# of episode :380, avg score : -0.8
[[   0.    0.    0.    0.    1.    0.    1.    1.    0.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.   -5.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [-100.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1. -100.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▋                                                                   | 401/40000 [02:33<4:34:49,  2.40it/s]

# of episode :400, avg score : -0.6
[[   0.    0.    0.    0.    0.    0.    0.    0.    0.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    0.]
 [-100.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.   -5.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▋                                                                   | 421/40000 [02:41<4:28:41,  2.46it/s]

# of episode :420, avg score : -0.5
[[-100.    0.    0.    0.    0.    0.    0.    0.    0.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.   -5.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1. -100.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▋                                                                   | 441/40000 [02:49<4:38:03,  2.37it/s]

# of episode :440, avg score : -0.6
[[-100.    0.    0.    0.    0. -100.    0.    0. -100.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [  -5.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▊                                                                   | 461/40000 [02:58<4:35:25,  2.39it/s]

# of episode :460, avg score : -0.6
[[   0. -100.    0.    0.    0.    0.    0.    0.    0.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    1.    1.    1.    1.    1.    1.    1.    0.]
 [   0.    1.    0.    1.    0.    1.    0.    1. -100.]
 [   0.    1.    0.    1.    0.    1.    0.    1.    0.]
 [   1.   -5.    0.    1.    0.    1.    0.    1.    0.]
 [   1.    1.    0.    1.    0.    1.    0.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   1.    1.    1.    1.    1.    1.    1.    1.    1.]
 [   0.    0.    0.    0.    1.    0.    0.    0.    0.]]


  1%|▊                                                                   | 463/40000 [03:08<4:28:08,  2.46it/s]


IndexError: list index out of range