# 1. Simulation

In [1]:
from string import ascii_uppercase
from draw_utils import *
from pyglet.gl import *
import numpy as np
import pandas as pd
import os



# reward
move_reward = -0.1
obs_reward = -0.2
goal_reward = 15
print('reward:' , move_reward, obs_reward, goal_reward)

local_path = '/home/zlxlekta924/YC' #os.path.abspath(os.path.join(os.path.dirname(__file__)))


class Simulator:
    def __init__(self):
        '''
        height : 그리드 높이
        width : 그리드 너비 
        inds : A ~ Q alphabet list
        '''

        
        # Load train data
        self.files = pd.read_csv(os.path.join(local_path, "./data/factory_order_train.csv"))
        self.height = 10
        self.width = 9
        self.inds = list(ascii_uppercase)[:17]
        self.clear_item = False

    def set_box(self):
        '''
        아이템들이 있을 위치를 미리 정해놓고 그 위치 좌표들에 아이템이 들어올 수 있으므로 그리드에 100으로 표시한다.
        데이터 파일에서 이번 에피소드 아이템 정보를 받아 가져와야 할 아이템이 있는 좌표만 -100으로 표시한다.
        self.local_target에 에이전트가 이번에 방문해야할 좌표들을 저장한다.
        따라서 가져와야하는 아이템 좌표와 end point 좌표(처음 시작했던 좌표로 돌아와야하므로)가 들어가게 된다.
        '''
        box_data = pd.read_csv(os.path.join(local_path, "./data/box.csv"))

        # 물건이 들어있을 수 있는 경우
        for box in box_data.itertuples(index = True, name ='Pandas'):
            self.grid[getattr(box, "row")][getattr(box, "col")] = 0    #####################수정 100 -> 0

        # 물건이 실제 들어있는 경우
        order_item = list(set(self.inds) & set(self.items))
        order_csv = box_data[box_data['item'].isin(order_item)]
    
        #print(order_csv) ######################################## 수정
        
        for order_box in order_csv.itertuples(index = True, name ='Pandas'):
            self.grid[getattr(order_box, "row")][getattr(order_box, "col")] = 2   ###################### 수정 -100 -> 0
            # local target에 가야 할 위치 좌표 넣기
            self.local_target.append(
                [getattr(order_box, "row"),
                 getattr(order_box, "col")]
                )
        self.local_target.append([9,4]) 
        #self.grid[self.local_target[0][0]][self.local_target[0][1]] = -100 #############################수정
        # 알파벳을 Grid에 넣어서 -> grid에 2Dconv 적용 가능

    def set_obstacle(self):
        '''
        장애물이 있어야하는 위치는 미리 obstacles.csv에 정의되어 있다. 이 좌표들을 0으로 표시한다.
        '''
        obstacles_data = pd.read_csv(os.path.join(local_path, "./data/obstacles.csv"))
        for obstacle in obstacles_data.itertuples(index = True, name ='Pandas'):
            self.grid[getattr(obstacle, "row")][getattr(obstacle, "col")] = 0

    def reset(self, epi):
        '''
        reset()은 첫 스텝에서 사용되며 그리드에서 에이전트 위치가 start point에 있게 한다.

        :param epi: episode, 에피소드 마다 가져와야 할 아이템 리스트를 불러올 때 사용
        :return: 초기셋팅 된 그리드
        :rtype: numpy.ndarray
        _____________________________________________________________________________________
        items : 이번 에피소드에서 가져와야하는 아이템들
        terminal_location : 현재 에이전트가 찾아가야하는 목적지
        local_target : 한 에피소드에서 찾아가야하는 아이템 좌표, 마지막 엔드 포인트 등의 위치좌표들
        actions: visualization을 위해 에이전트 action을 저장하는 리스트
        curloc : 현재 위치
        '''

        # initial episode parameter setting
        self.epi = epi
        self.items = list(self.files.iloc[self.epi])[0]
        self.cumulative_reward = 0
        self.terminal_location = None
        self.local_target = []
        self.actions = []

        # initial grid setting
        self.grid = np.ones((self.height, self.width), dtype="float16")

        # set information about the gridworld
        self.set_box()
        self.set_obstacle()

        # start point를 grid에 표시
        self.curloc = [9, 4]
        self.grid[int(self.curloc[0])][int(self.curloc[1])] = 3
        self.item_now = False

        self.done = False
        
        #print('###########################')  ###################################수정한 부분
        #print(f'items loc : {self.local_target}')
        
        
        return self.grid

    def apply_action(self, action, cur_x, cur_y):
        '''
        에이전트가 행한 action대로 현 에이전트의 위치좌표를 바꾼다.
        action은 discrete하며 4가지 up,down,left,right으로 정의된다.
        
        :param x: 에이전트의 현재 x 좌표
        :param y: 에이전트의 현재 y 좌표
        :return: action에 따라 변한 에이전트의 x 좌표, y 좌표
        :rtype: int, int
        '''
        new_x = cur_x
        new_y = cur_y
        # up
        if action == 0:
            new_x = cur_x - 1
        # down
        elif action == 1:
            new_x = cur_x + 1
        # left
        elif action == 2:
            new_y = cur_y - 1
        # right
        else:
            new_y = cur_y + 1

        return int(new_x), int(new_y)


    def get_reward(self, new_x, new_y, out_of_boundary):
        '''
        get_reward함수는 리워드를 계산하는 함수이며, 상황에 따라 에이전트가 action을 옳게 했는지 판단하는 지표가 된다.

        :param new_x: action에 따른 에이전트 새로운 위치좌표 x
        :param new_y: action에 따른 에이전트 새로운 위치좌표 y
        :param out_of_boundary: 에이전트 위치가 그리드 밖이 되지 않도록 제한
        :return: action에 따른 리워드
        :rtype: float
        '''

        # 바깥으로 나가는 경우
        if any(out_of_boundary):
            reward = obs_reward
                       
        else:
            # 장애물에 부딪히는 경우 
            if self.grid[new_x][new_y] == 0:
                reward = obs_reward  

            # 현재 목표에 도달한 경우
            elif self.grid[new_x][new_y] == 3:
                reward = goal_reward

            # 그냥 움직이는 경우 
            else:
                reward = move_reward

        return reward

    def step(self, action):
        ''' 
        에이전트의 action에 따라 step을 진행한다.
        action에 따라 에이전트 위치를 변환하고, action에 대해 리워드를 받고, 어느 상황에 에피소드가 종료되어야 하는지 등을 판단한다.
        에이전트가 endpoint에 도착하면 gif로 에피소드에서 에이전트의 행동이 저장된다.

        :param action: 에이전트 행동
        :return:
            grid, 그리드
            reward, 리워드
            cumulative_reward, 누적 리워드
            done, 종료 여부
            goal_ob_reward, goal까지 아이템을 모두 가지고 돌아오는 finish율 계산을 위한 파라미터

        :rtype: numpy.ndarray, float, float, bool, bool/str

        (Hint : 시작 위치 (9,4)에서 up말고 다른 action은 전부 장애물이므로 action을 고정하는 것이 좋음)
        '''
        if self.local_target:
            self.terminal_location = self.local_target[0]
            
        cur_x,cur_y = self.curloc
        self.actions.append((cur_x, cur_y))

        goal_ob_reward = False
        
        new_x, new_y = self.apply_action(action, cur_x, cur_y)

        out_of_boundary = [new_x < 0, new_x >= self.height, new_y < 0, new_y >= self.width]
        
        # 바깥으로 나가는 경우 종료
        if any(out_of_boundary):
            reward = self.get_reward(new_x, new_y, out_of_boundary) ######################수정
            # self.done = True
            # goal_ob_reward = True
            
        else:
            # 장애물에 부딪히는 경우 종료
            if self.grid[new_x][new_y] == 0:
                # self.done = True
                # goal_ob_reward = True
                reward = self.get_reward(new_x, new_y, out_of_boundary) ###############################수정

                
                
                # 1. 어떤 아이템이든 들어가면 0으로 바꾸고, 로컬타겟의 해당 좌표만 삭제
                #2. 모든 아이템을 다 먹으면 터미널로케이션으로 9,4 설정
                
            # 현재 목표에 도달한 경우, 다음 목표설정
            elif self.grid[new_x][new_y] == 2:
                
                # 타깃 삭제
                self.local_target.remove([new_x, new_y])
                
                #전 state 에 item 이 있었으면
                if self.item_now:
                    self.grid[cur_x][cur_y] = 0
                    self.grid[new_x][new_y] = 3
                else:
                    self.grid[cur_x][cur_y] = 1
                    self.grid[new_x][new_y] = 3

                self.item_now = True
#                 self.local_target.remove(self.local_target[0]) 삭제
                
                goal_ob_reward = True
                self.curloc = [new_x, new_y]
                
                reward = self.get_reward(new_x, new_y, out_of_boundary)
                
            elif [new_x, new_y] == [9,4] and len(self.local_target) == 1:
                self.done = True
                reward = 20
                
            else:
                # 그냥 움직이는 경우 
                
                if self.item_now:
                    self.grid[cur_x][cur_y] = 0
                    self.grid[new_x][new_y] = 3
                    self.item_now = False
                    
                else:
                    self.grid[cur_x][cur_y] = 1
                    self.grid[new_x][new_y] = 3
                    
                
                self.curloc = [new_x,new_y]
                
                reward = self.get_reward(new_x, new_y, out_of_boundary)
                
        #reward = self.get_reward(new_x, new_y, out_of_boundary)  ################################# 수정
        #print('reward : ' ,reward) ####################################수정한 부분
        
        self.cumulative_reward += reward

        if self.done == True:
            if [new_x, new_y] == [9, 4]:
                if self.terminal_location == [9, 4]:
                    # 완료되면 GIFS 저장
                    goal_ob_reward = 'finish'
                    height = 10
                    width = 9 
                    display = Display(visible=False, size=(width, height))
                    display.start()

                    start_point = (9, 4)
                    unit = 50
                    screen_height = height * unit
                    screen_width = width * unit
                    log_path = "./ppologs"
                    data_path = "./data"
                    render_cls = Render(screen_width, screen_height, unit, start_point, data_path, log_path)
                    for idx, new_pos in enumerate(self.actions):
                        render_cls.update_movement(new_pos, idx+1)
                    
                    render_cls.save_gif(self.epi)
                    render_cls.viewer.close()
                    display.stop()
        
        
        return self.grid, reward, self.cumulative_reward, self.done, goal_ob_reward


reward: -0.1 -0.2 15


## 1. Agent 구성


## 1-1 PPO

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical
import numpy as np
import time
from tqdm import tqdm



################################## PPO Policy ##################################
class RolloutBuffer:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []
    
    def clear(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]


class ActorCritic(nn.Module):
    def __init__(self,action_dim, has_continuous_action_space, action_std_init):
        super(ActorCritic, self).__init__()

        self.has_continuous_action_space = has_continuous_action_space
        
        if has_continuous_action_space:
            self.action_dim = action_dim
            self.action_var = torch.full((action_dim,), action_std_init * action_std_init)
        # actor
        if has_continuous_action_space :
            self.actor = nn.Sequential(
                            nn.Flatten(),
                            nn.Linear(9, 64),
                            nn.Tanh(),
                            nn.Linear(64, 64),
                            nn.Tanh(),
                            nn.Linear(64, action_dim),
                        )
        else:
            self.actor = nn.Sequential(
                            nn.Flatten(),
                            nn.Linear(90, 128),
                            nn.Tanh(),
                            nn.Linear(128, 64),
                            nn.Tanh(),
                            nn.Linear(64, action_dim),
                            nn.Softmax(dim=-1)
                        )
        # critic
        self.critic = nn.Sequential(
                        nn.Flatten(),
                        nn.Linear(90, 128),
                        nn.Tanh(),
                        nn.Linear(128, 64),
                        nn.Tanh(),
                        nn.Linear(64, 1)
                    )
        
    def set_action_std(self, new_action_std):
        if self.has_continuous_action_space:
            self.action_var = torch.full((self.action_dim,), new_action_std * new_action_std)
        else:
            print("--------------------------------------------------------------------------------------------")
            print("WARNING : Calling ActorCritic::set_action_std() on discrete action space policy")
            print("--------------------------------------------------------------------------------------------")

    def forward(self):
        raise NotImplementedError
    
    def act(self, state):
        if self.has_continuous_action_space:
            action_mean = self.actor(state)
            cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
            dist = MultivariateNormal(action_mean, cov_mat)
        else:
            action_probs = self.actor(state)
            dist = Categorical(action_probs)

        action = dist.sample()
        action_logprob = dist.log_prob(action)
        
        return action.detach(), action_logprob.detach()
    
    def evaluate(self, state, action):

        if self.has_continuous_action_space:
            action_mean = self.actor(state)
            
            action_var = self.action_var.expand_as(action_mean)
            cov_mat = torch.diag_embed(action_var)
            dist = MultivariateNormal(action_mean, cov_mat)
            
            # For Single Action Environments.
            if self.action_dim == 1:
                action = action.reshape(-1, self.action_dim)
        else:
            action_probs = self.actor(state)
            dist = Categorical(action_probs)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state)
        
        return action_logprobs, state_values, dist_entropy


class PPO:
    def __init__(self, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std_init=0.6):

        self.has_continuous_action_space = has_continuous_action_space

        if has_continuous_action_space:
            self.action_std = action_std_init

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        
        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(action_dim, has_continuous_action_space, action_std_init)
        self.optimizer = torch.optim.Adam([
                        {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                        {'params': self.policy.critic.parameters(), 'lr': lr_critic}
                    ])

        self.policy_old = ActorCritic(action_dim, has_continuous_action_space, action_std_init)
        self.policy_old.load_state_dict(self.policy.state_dict())
        
        self.MseLoss = nn.MSELoss()

    def set_action_std(self, new_action_std):
        if self.has_continuous_action_space:
            self.action_std = new_action_std
            self.policy.set_action_std(new_action_std)
            self.policy_old.set_action_std(new_action_std)
        else:
            print("--------------------------------------------------------------------------------------------")
            print("WARNING : Calling PPO::set_action_std() on discrete action space policy")
            print("--------------------------------------------------------------------------------------------")

    def decay_action_std(self, action_std_decay_rate, min_action_std):
        print("--------------------------------------------------------------------------------------------")
        if self.has_continuous_action_space:
            self.action_std = self.action_std - action_std_decay_rate
            self.action_std = round(self.action_std, 4)
            if (self.action_std <= min_action_std):
                self.action_std = min_action_std
                print("setting actor output action_std to min_action_std : ", self.action_std)
            else:
                print("setting actor output action_std to : ", self.action_std)
            self.set_action_std(self.action_std)

        else:
            print("WARNING : Calling PPO::decay_action_std() on discrete action space policy")
        print("--------------------------------------------------------------------------------------------")

    def select_action(self, state):

        if self.has_continuous_action_space:
            with torch.no_grad():
                state = torch.FloatTensor(state)
                action, action_logprob = self.policy_old.act(state)

            self.buffer.states.append(state)
            self.buffer.actions.append(action)
            self.buffer.logprobs.append(action_logprob)

            return action.detach().cpu().numpy().flatten()
        else:
            with torch.no_grad():
                state = torch.FloatTensor(state)
                action, action_logprob = self.policy_old.act(state)
            
            self.buffer.states.append(state)
            self.buffer.actions.append(action)
            self.buffer.logprobs.append(action_logprob)

            return action.item()

    def update(self):
        # Monte Carlo estimate of returns
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)
            
        # Normalizing the rewards
        rewards = torch.tensor(rewards, dtype=torch.float32)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach()
        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach()
        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # match state_values tensor dimensions with rewards tensor
            state_values = torch.squeeze(state_values)
            
            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss
            advantages = rewards - state_values.detach()   
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            # final loss of clipped objective PPO
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy
            
            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
            
        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        self.buffer.clear()
    
    def save(self, checkpoint_path):
        torch.save(self.policy_old.state_dict(), checkpoint_path)
   
    def load(self, checkpoint_path):
        self.policy_old.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))
        self.policy.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
import glob
import time
from datetime import datetime

import torch
import numpy as np


################################### Training ###################################
def train():
    print("============================================================================================")

    ####### initialize environment hyperparameters ######
    env_name = "Grid World"

    has_continuous_action_space = False  # continuous action space; else discrete

    max_ep_len = 100                   # max timesteps in one episode
    max_training_timesteps = int(3e6)   # break training loop if timeteps > max_training_timesteps

    print_freq = max_ep_len * 20        # print avg reward in the interval (in num timesteps)
    log_freq = max_ep_len * 2           # log avg reward in the interval (in num timesteps)
    save_model_freq = int(1e5)          # save model frequency (in num timesteps)

    action_std = 0.6                    # starting std for action distribution (Multivariate Normal)
    action_std_decay_rate = 0.05        # linearly decay action_std (action_std = action_std - action_std_decay_rate)
    min_action_std = 0.1                # minimum action_std (stop decay after action_std <= min_action_std)
    action_std_decay_freq = int(2.5e5)  # action_std decay frequency (in num timesteps)
    #####################################################

    ## Note : print/log frequencies should be > than max_ep_len

    ################ PPO hyperparameters ################
    update_timestep = max_ep_len * 4      # update policy every n timesteps
    K_epochs = 80               # update policy for K epochs in one PPO update

    eps_clip = 0.2          # clip parameter for PPO
    gamma = 0.99            # discount factor

    lr_actor = 0.0003       # learning rate for actor network
    lr_critic = 0.001       # learning rate for critic network

    random_seed = 0         # set random seed if required (0 = no random seed)
    #####################################################

    env = Simulator()
    
#     state_dim = env.observation_space.shape[0]

    if has_continuous_action_space:
        action_dim = env.action_space.shape[0]
    else:
        action_dim = 4

    ###################### logging ######################

    #### log files for multiple runs are NOT overwritten
    log_dir = "PPO_logs"
    if not os.path.exists(log_dir):
          os.makedirs(log_dir)

#     log_dir = log_dir + '/' + env_name + '/'
#     if not os.path.exists(log_dir):
#           os.makedirs(log_dir)

    #### get number of log files in log directory
    run_num = 0
    current_num_files = next(os.walk(log_dir))[2]
    run_num = len(current_num_files)

    #### create new log file for each run
    log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"

    print("current logging run number for " + env_name + " : ", run_num)
    print("logging at : " + log_f_name)
    #####################################################

    ################### checkpointing ###################
    run_num_pretrained = 0      #### change this to prevent overwriting weights in same env_name folder

    directory = "PPO_preTrained"
    if not os.path.exists(directory):
          os.makedirs(directory)

#     directory = directory + '/' + env_name + '/'
#     if not os.path.exists(directory):
#           os.makedirs(directory)


    checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
    print("save checkpoint path : " + checkpoint_path)
    #####################################################

    
    ################# training procedure ################

    # initialize a PPO agent
    ppo_agent = PPO(action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std)

    # track total training time
    start_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)

    print("============================================================================================")

    # logging file
    log_f = open(log_f_name,"w+")
    log_f.write('episode,timestep,reward\n')

    # printing and logging variables
    print_running_reward = 0
    print_running_episodes = 0

    log_running_reward = 0
    log_running_episodes = 0

    time_step = 0
    i_episode = 0

    # training loop
    while time_step <= max_training_timesteps:
        for epi in tqdm(range(39999)):
            state = env.reset(epi)
            current_ep_reward = 0


            for t in range(1, max_ep_len+1):

                state = torch.from_numpy(state).float()
                state = torch.reshape(state, (-1, 1, 10, 9))

                action = ppo_agent.select_action(state)

                state, reward, cumul, done, goal_reward = env.step(action)
    #             state, reward, drewardone, _ = env.step(action)
                if reward < 0:
                    reward = reward + reward*(t)*0.1

                # saving reward and is_terminals
                ppo_agent.buffer.rewards.append(reward)
                ppo_agent.buffer.is_terminals.append(done)

                time_step +=1
                current_ep_reward += reward

                # update PPO agent
                if time_step % update_timestep == 0:
                    ppo_agent.update()

                # if continuous action space; then decay action std of ouput action distribution
                if has_continuous_action_space and time_step % action_std_decay_freq == 0:
                    ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)

                # log in logging file
                if time_step % log_freq == 0:

                    # log average reward till last episode
                    log_avg_reward = log_running_reward / log_running_episodes
                    log_avg_reward = round(log_avg_reward, 4)

                    log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                    log_f.flush()

                    log_running_reward = 0
                    log_running_episodes = 0

                # printing average reward
                if time_step % print_freq == 0:

                    # print average reward till last episode
                    print_avg_reward = print_running_reward / print_running_episodes
                    print_avg_reward = round(print_avg_reward, 2)

                    print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))

                    print_running_reward = 0
                    print_running_episodes = 0

                # save model weights
                if time_step % save_model_freq == 0:
                    print("--------------------------------------------------------------------------------------------")
                    print("saving model at : " + checkpoint_path)
                    ppo_agent.save(checkpoint_path)
                    print("model saved")
                    print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
                    print("--------------------------------------------------------------------------------------------")

                # break; if the episode is over
                if done:
                    break

            print_running_reward += current_ep_reward
            print_running_episodes += 1

            log_running_reward += current_ep_reward
            log_running_episodes += 1

            i_episode += 1

    log_f.close()
    env.close()

    # print total training time
    print("============================================================================================")
    end_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)
    print("Finished training at (GMT) : ", end_time)
    print("Total training time  : ", end_time - start_time)
    print("============================================================================================")


if __name__ == '__main__':

    train()

current logging run number for Grid World :  21
logging at : PPO_logs/PPO_Grid World_log_21.csv
save checkpoint path : PPO_preTrainedPPO_Grid World_0_0.pth
Started training at (GMT) :  2022-05-25 02:30:09


  0%|                                                                     | 21/39999 [00:04<2:48:51,  3.95it/s]

Episode : 19 		 Timestep : 2000 		 Average Reward : 985.03


  0%|                                                                     | 41/39999 [00:09<2:47:29,  3.98it/s]

Episode : 39 		 Timestep : 4000 		 Average Reward : 955.31


  0%|                                                                     | 62/39999 [00:14<2:27:42,  4.51it/s]

Episode : 59 		 Timestep : 6000 		 Average Reward : 1003.35


  0%|▏                                                                    | 82/39999 [00:19<2:17:11,  4.85it/s]

Episode : 79 		 Timestep : 8000 		 Average Reward : 1004.33


  0%|▏                                                                   | 100/39999 [00:24<3:25:45,  3.23it/s]

Episode : 99 		 Timestep : 10000 		 Average Reward : 1097.45


  0%|▏                                                                   | 122/39999 [00:30<2:49:12,  3.93it/s]

Episode : 119 		 Timestep : 12000 		 Average Reward : 1127.4


  0%|▏                                                                   | 141/39999 [00:34<2:39:29,  4.17it/s]

Episode : 139 		 Timestep : 14000 		 Average Reward : 1065.57


  0%|▎                                                                   | 162/39999 [00:39<2:07:47,  5.20it/s]

Episode : 159 		 Timestep : 16000 		 Average Reward : 1221.56


  0%|▎                                                                   | 182/39999 [00:44<2:15:29,  4.90it/s]

Episode : 179 		 Timestep : 18000 		 Average Reward : 1258.54


  1%|▎                                                                   | 202/39999 [00:49<2:32:08,  4.36it/s]

Episode : 199 		 Timestep : 20000 		 Average Reward : 1281.46


  1%|▍                                                                   | 221/39999 [00:54<2:58:35,  3.71it/s]

Episode : 219 		 Timestep : 22000 		 Average Reward : 1317.16


  1%|▍                                                                   | 240/39999 [00:59<3:35:32,  3.07it/s]

Episode : 239 		 Timestep : 24000 		 Average Reward : 1338.31


  1%|▍                                                                   | 261/39999 [01:05<3:19:42,  3.32it/s]

Episode : 259 		 Timestep : 26000 		 Average Reward : 1282.56


  1%|▍                                                                   | 280/39999 [01:10<3:20:22,  3.30it/s]

Episode : 279 		 Timestep : 28000 		 Average Reward : 1297.13


  1%|▌                                                                   | 300/39999 [01:15<3:31:30,  3.13it/s]

Episode : 299 		 Timestep : 30000 		 Average Reward : 1258.35


  1%|▌                                                                   | 322/39999 [01:20<2:27:38,  4.48it/s]

Episode : 319 		 Timestep : 32000 		 Average Reward : 1252.78


  1%|▌                                                                   | 342/39999 [01:25<2:52:31,  3.83it/s]

Episode : 339 		 Timestep : 34000 		 Average Reward : 1123.87


  1%|▌                                                                   | 362/39999 [01:30<2:26:41,  4.50it/s]

Episode : 359 		 Timestep : 36000 		 Average Reward : 1029.58


  1%|▋                                                                   | 381/39999 [01:35<3:02:14,  3.62it/s]

Episode : 379 		 Timestep : 38000 		 Average Reward : 1175.36


  1%|▋                                                                   | 401/39999 [01:41<2:58:59,  3.69it/s]

Episode : 399 		 Timestep : 40000 		 Average Reward : 1351.49


  1%|▋                                                                   | 422/39999 [01:46<2:30:44,  4.38it/s]

Episode : 419 		 Timestep : 42000 		 Average Reward : 1243.02


  1%|▊                                                                   | 442/39999 [01:51<2:16:22,  4.83it/s]

Episode : 439 		 Timestep : 44000 		 Average Reward : 1244.5


  1%|▊                                                                   | 461/39999 [01:56<3:09:06,  3.48it/s]

Episode : 459 		 Timestep : 46000 		 Average Reward : 1192.7


  1%|▊                                                                   | 481/39999 [02:01<2:47:06,  3.94it/s]

Episode : 479 		 Timestep : 48000 		 Average Reward : 1132.25


  1%|▊                                                                   | 502/39999 [02:06<2:13:28,  4.93it/s]

Episode : 499 		 Timestep : 50000 		 Average Reward : 1218.03


  1%|▉                                                                   | 521/39999 [02:11<2:47:31,  3.93it/s]

Episode : 519 		 Timestep : 52000 		 Average Reward : 1196.15


  1%|▉                                                                   | 540/39999 [02:16<3:38:27,  3.01it/s]

Episode : 539 		 Timestep : 54000 		 Average Reward : 1231.69


  1%|▉                                                                   | 561/39999 [02:21<2:51:15,  3.84it/s]

Episode : 559 		 Timestep : 56000 		 Average Reward : 1262.08


  1%|▉                                                                   | 581/39999 [02:26<2:48:56,  3.89it/s]

Episode : 579 		 Timestep : 58000 		 Average Reward : 1233.77


  2%|█                                                                   | 600/39999 [02:30<3:12:59,  3.40it/s]

Episode : 599 		 Timestep : 60000 		 Average Reward : 1289.91


  2%|█                                                                   | 621/39999 [02:35<2:38:10,  4.15it/s]

Episode : 619 		 Timestep : 62000 		 Average Reward : 1239.6


  2%|█                                                                   | 642/39999 [02:40<2:01:51,  5.38it/s]

Episode : 639 		 Timestep : 64000 		 Average Reward : 1235.59


  2%|█▏                                                                  | 662/39999 [02:45<2:14:05,  4.89it/s]

Episode : 659 		 Timestep : 66000 		 Average Reward : 1354.21


  2%|█▏                                                                  | 682/39999 [02:50<2:33:20,  4.27it/s]

Episode : 679 		 Timestep : 68000 		 Average Reward : 1369.69


  2%|█▏                                                                  | 701/39999 [02:54<2:40:19,  4.09it/s]

Episode : 699 		 Timestep : 70000 		 Average Reward : 1332.51


  2%|█▏                                                                  | 721/39999 [03:00<3:21:59,  3.24it/s]

Episode : 719 		 Timestep : 72000 		 Average Reward : 1423.12


  2%|█▎                                                                  | 742/39999 [03:04<2:11:24,  4.98it/s]

Episode : 739 		 Timestep : 74000 		 Average Reward : 1305.9


  2%|█▎                                                                  | 761/39999 [03:10<3:12:40,  3.39it/s]

Episode : 759 		 Timestep : 76000 		 Average Reward : 1330.34


  2%|█▎                                                                  | 782/39999 [03:15<2:19:26,  4.69it/s]

Episode : 779 		 Timestep : 78000 		 Average Reward : 1341.9


  2%|█▎                                                                  | 801/39999 [03:20<3:08:58,  3.46it/s]

Episode : 799 		 Timestep : 80000 		 Average Reward : 1391.9


  2%|█▍                                                                  | 822/39999 [03:25<2:27:39,  4.42it/s]

Episode : 819 		 Timestep : 82000 		 Average Reward : 1333.14


  2%|█▍                                                                  | 842/39999 [03:30<2:19:05,  4.69it/s]

Episode : 839 		 Timestep : 84000 		 Average Reward : 1405.84


  2%|█▍                                                                  | 862/39999 [03:35<2:06:26,  5.16it/s]

Episode : 859 		 Timestep : 86000 		 Average Reward : 1355.77


  2%|█▍                                                                  | 882/39999 [03:40<2:05:23,  5.20it/s]

Episode : 879 		 Timestep : 88000 		 Average Reward : 1332.38


  2%|█▌                                                                  | 902/39999 [03:45<2:13:25,  4.88it/s]

Episode : 899 		 Timestep : 90000 		 Average Reward : 1336.63


  2%|█▌                                                                  | 922/39999 [03:49<2:04:24,  5.24it/s]

Episode : 919 		 Timestep : 92000 		 Average Reward : 1326.64


  2%|█▌                                                                  | 942/39999 [03:55<2:31:03,  4.31it/s]

Episode : 939 		 Timestep : 94000 		 Average Reward : 1330.3


  2%|█▋                                                                  | 962/39999 [03:59<2:22:36,  4.56it/s]

Episode : 959 		 Timestep : 96000 		 Average Reward : 1369.45


  2%|█▋                                                                  | 981/39999 [04:04<2:06:46,  5.13it/s]

Episode : 979 		 Timestep : 98000 		 Average Reward : 1344.15


  3%|█▊                                                                 | 1082/39999 [04:29<2:24:53,  4.48it/s]

Episode : 1079 		 Timestep : 108000 		 Average Reward : 1264.52


  3%|█▊                                                                 | 1102/39999 [04:34<2:33:48,  4.21it/s]

Episode : 1099 		 Timestep : 110000 		 Average Reward : 1353.82


  3%|█▉                                                                 | 1122/39999 [04:40<2:11:57,  4.91it/s]

Episode : 1119 		 Timestep : 112000 		 Average Reward : 1341.57


  3%|█▉                                                                 | 1142/39999 [04:45<2:05:21,  5.17it/s]

Episode : 1139 		 Timestep : 114000 		 Average Reward : 1315.67


  3%|█▉                                                                 | 1162/39999 [04:49<2:05:24,  5.16it/s]

Episode : 1159 		 Timestep : 116000 		 Average Reward : 1332.96


  3%|█▉                                                                 | 1181/39999 [04:54<2:40:34,  4.03it/s]

Episode : 1179 		 Timestep : 118000 		 Average Reward : 1383.13


  3%|██                                                                 | 1202/39999 [04:58<2:14:55,  4.79it/s]

Episode : 1199 		 Timestep : 120000 		 Average Reward : 1331.24


  3%|██                                                                 | 1221/39999 [05:03<2:58:26,  3.62it/s]

Episode : 1219 		 Timestep : 122000 		 Average Reward : 1248.03


  3%|██                                                                 | 1241/39999 [05:09<3:01:41,  3.56it/s]

Episode : 1239 		 Timestep : 124000 		 Average Reward : 1299.37


  3%|██                                                                 | 1262/39999 [05:14<2:34:23,  4.18it/s]

Episode : 1259 		 Timestep : 126000 		 Average Reward : 1348.72


  3%|██▏                                                                | 1282/39999 [05:18<2:06:05,  5.12it/s]

Episode : 1279 		 Timestep : 128000 		 Average Reward : 1341.45


  3%|██▏                                                                | 1301/39999 [05:23<2:36:17,  4.13it/s]

Episode : 1299 		 Timestep : 130000 		 Average Reward : 1282.68


  3%|██▏                                                                | 1322/39999 [05:28<2:30:47,  4.27it/s]

Episode : 1319 		 Timestep : 132000 		 Average Reward : 1325.56


  3%|██▏                                                                | 1341/39999 [05:34<3:23:08,  3.17it/s]

Episode : 1339 		 Timestep : 134000 		 Average Reward : 1288.15


  3%|██▎                                                                | 1361/39999 [05:39<2:47:28,  3.84it/s]

Episode : 1359 		 Timestep : 136000 		 Average Reward : 1348.31


  3%|██▎                                                                | 1382/39999 [05:44<2:36:43,  4.11it/s]

Episode : 1379 		 Timestep : 138000 		 Average Reward : 1314.96


  4%|██▎                                                                | 1401/39999 [05:49<2:31:12,  4.25it/s]

Episode : 1399 		 Timestep : 140000 		 Average Reward : 1300.04


  4%|██▍                                                                | 1422/39999 [05:54<2:28:04,  4.34it/s]

Episode : 1419 		 Timestep : 142000 		 Average Reward : 1166.92


  4%|██▍                                                                | 1441/39999 [05:59<2:31:01,  4.26it/s]

Episode : 1439 		 Timestep : 144000 		 Average Reward : 1250.55


  4%|██▍                                                                | 1462/39999 [06:04<2:22:15,  4.51it/s]

Episode : 1459 		 Timestep : 146000 		 Average Reward : 1296.95


  4%|██▍                                                                | 1481/39999 [06:09<2:37:47,  4.07it/s]

Episode : 1479 		 Timestep : 148000 		 Average Reward : 1239.02


  4%|██▌                                                                | 1502/39999 [06:14<2:30:52,  4.25it/s]

Episode : 1499 		 Timestep : 150000 		 Average Reward : 1314.52


  4%|██▌                                                                | 1521/39999 [06:19<2:44:25,  3.90it/s]

Episode : 1519 		 Timestep : 152000 		 Average Reward : 1332.64


  4%|██▌                                                                | 1542/39999 [06:24<2:35:33,  4.12it/s]

Episode : 1539 		 Timestep : 154000 		 Average Reward : 1238.19


  4%|██▌                                                                | 1562/39999 [06:29<2:14:41,  4.76it/s]

Episode : 1559 		 Timestep : 156000 		 Average Reward : 1283.04


  4%|██▋                                                                | 1582/39999 [06:34<2:13:52,  4.78it/s]

Episode : 1579 		 Timestep : 158000 		 Average Reward : 1209.83


  4%|██▋                                                                | 1602/39999 [06:39<2:15:07,  4.74it/s]

Episode : 1599 		 Timestep : 160000 		 Average Reward : 1330.42


  4%|██▋                                                                | 1622/39999 [06:44<2:17:08,  4.66it/s]

Episode : 1619 		 Timestep : 162000 		 Average Reward : 1340.43


  4%|██▊                                                                | 1642/39999 [06:49<1:56:37,  5.48it/s]

Episode : 1639 		 Timestep : 164000 		 Average Reward : 1381.47


  4%|██▊                                                                | 1663/39999 [06:54<2:04:08,  5.15it/s]

Episode : 1659 		 Timestep : 166000 		 Average Reward : 1337.12


  4%|██▊                                                                | 1682/39999 [06:58<2:13:29,  4.78it/s]

Episode : 1679 		 Timestep : 168000 		 Average Reward : 1329.61


  4%|██▊                                                                | 1702/39999 [07:03<2:24:35,  4.41it/s]

Episode : 1699 		 Timestep : 170000 		 Average Reward : 1345.64


  4%|██▉                                                                | 1721/39999 [07:08<2:34:54,  4.12it/s]

Episode : 1719 		 Timestep : 172000 		 Average Reward : 1339.18


  4%|██▉                                                                | 1742/39999 [07:13<2:15:43,  4.70it/s]

Episode : 1739 		 Timestep : 174000 		 Average Reward : 1350.54


  4%|██▉                                                                | 1762/39999 [07:18<2:16:20,  4.67it/s]

Episode : 1759 		 Timestep : 176000 		 Average Reward : 1301.7


  4%|██▉                                                                | 1781/39999 [07:22<2:34:48,  4.11it/s]

Episode : 1779 		 Timestep : 178000 		 Average Reward : 1221.71


  5%|███                                                                | 1801/39999 [07:27<2:45:51,  3.84it/s]

Episode : 1799 		 Timestep : 180000 		 Average Reward : 1264.48


  5%|███                                                                | 1822/39999 [07:33<2:21:50,  4.49it/s]

Episode : 1819 		 Timestep : 182000 		 Average Reward : 1306.77


  5%|███                                                                | 1841/39999 [07:37<2:28:31,  4.28it/s]

Episode : 1839 		 Timestep : 184000 		 Average Reward : 1309.1


  5%|███                                                                | 1861/39999 [07:42<2:28:21,  4.28it/s]

Episode : 1859 		 Timestep : 186000 		 Average Reward : 1329.02


  5%|███▏                                                               | 1882/39999 [07:46<1:49:35,  5.80it/s]

Episode : 1879 		 Timestep : 188000 		 Average Reward : 1321.31


  5%|███▏                                                               | 1902/39999 [07:51<2:08:59,  4.92it/s]

Episode : 1899 		 Timestep : 190000 		 Average Reward : 1285.95


  5%|███▏                                                               | 1922/39999 [07:56<2:32:23,  4.16it/s]

Episode : 1919 		 Timestep : 192000 		 Average Reward : 1265.42


  5%|███▎                                                               | 1942/39999 [08:00<2:01:34,  5.22it/s]

Episode : 1939 		 Timestep : 194000 		 Average Reward : 1213.74


  5%|███▎                                                               | 1962/39999 [08:05<2:24:17,  4.39it/s]

Episode : 1959 		 Timestep : 196000 		 Average Reward : 1349.46


  5%|███▎                                                               | 1981/39999 [08:10<2:35:55,  4.06it/s]

Episode : 1979 		 Timestep : 198000 		 Average Reward : 1347.92


  5%|███▎                                                               | 2001/39999 [08:16<2:55:55,  3.60it/s]

Episode : 1999 		 Timestep : 200000 		 Average Reward : 1308.92
--------------------------------------------------------------------------------------------
saving model at : PPO_preTrainedPPO_Grid World_0_0.pth
model saved
Elapsed Time  :  0:08:17
--------------------------------------------------------------------------------------------


  5%|███▍                                                               | 2022/39999 [08:20<2:14:41,  4.70it/s]

Episode : 2019 		 Timestep : 202000 		 Average Reward : 1298.71


  5%|███▍                                                               | 2042/39999 [08:25<2:01:53,  5.19it/s]

Episode : 2039 		 Timestep : 204000 		 Average Reward : 1298.2


  5%|███▍                                                               | 2061/39999 [08:31<2:42:07,  3.90it/s]

Episode : 2059 		 Timestep : 206000 		 Average Reward : 1327.31


  5%|███▍                                                               | 2081/39999 [08:36<3:06:38,  3.39it/s]

Episode : 2079 		 Timestep : 208000 		 Average Reward : 1347.51


  5%|███▌                                                               | 2101/39999 [08:40<2:24:11,  4.38it/s]

Episode : 2099 		 Timestep : 210000 		 Average Reward : 1349.79


  5%|███▌                                                               | 2122/39999 [08:46<2:29:36,  4.22it/s]

Episode : 2119 		 Timestep : 212000 		 Average Reward : 1344.1


  5%|███▌                                                               | 2142/39999 [08:50<2:17:09,  4.60it/s]

Episode : 2139 		 Timestep : 214000 		 Average Reward : 1330.16


  5%|███▌                                                               | 2162/39999 [08:55<2:13:02,  4.74it/s]

Episode : 2159 		 Timestep : 216000 		 Average Reward : 1349.67


  5%|███▋                                                               | 2181/39999 [09:00<2:41:45,  3.90it/s]

Episode : 2179 		 Timestep : 218000 		 Average Reward : 1343.71


  6%|███▋                                                               | 2201/39999 [09:05<2:40:05,  3.94it/s]

Episode : 2199 		 Timestep : 220000 		 Average Reward : 1309.81


  6%|███▋                                                               | 2221/39999 [09:10<2:41:05,  3.91it/s]

Episode : 2219 		 Timestep : 222000 		 Average Reward : 1394.06


  6%|███▊                                                               | 2242/39999 [09:14<1:59:09,  5.28it/s]

Episode : 2239 		 Timestep : 224000 		 Average Reward : 1395.31


  6%|███▊                                                               | 2261/39999 [09:19<2:50:58,  3.68it/s]

Episode : 2259 		 Timestep : 226000 		 Average Reward : 1327.84


  6%|███▊                                                               | 2281/39999 [09:24<2:25:10,  4.33it/s]

Episode : 2279 		 Timestep : 228000 		 Average Reward : 1235.83


  6%|███▊                                                               | 2302/39999 [09:29<2:20:40,  4.47it/s]

Episode : 2299 		 Timestep : 230000 		 Average Reward : 1304.35


  6%|███▉                                                               | 2322/39999 [09:34<2:04:29,  5.04it/s]

Episode : 2319 		 Timestep : 232000 		 Average Reward : 1380.67


  6%|███▉                                                               | 2342/39999 [09:39<2:07:08,  4.94it/s]

Episode : 2339 		 Timestep : 234000 		 Average Reward : 1327.76


  6%|███▉                                                               | 2362/39999 [09:43<1:54:29,  5.48it/s]

Episode : 2359 		 Timestep : 236000 		 Average Reward : 1264.74


  6%|███▉                                                               | 2380/39999 [09:48<2:34:25,  4.06it/s]

Episode : 2379 		 Timestep : 238000 		 Average Reward : 1286.86


  6%|████                                                               | 2402/39999 [09:53<2:12:07,  4.74it/s]

Episode : 2399 		 Timestep : 240000 		 Average Reward : 1276.93


  6%|████                                                               | 2422/39999 [09:58<2:01:44,  5.14it/s]

Episode : 2419 		 Timestep : 242000 		 Average Reward : 1325.29


  6%|████                                                               | 2442/39999 [10:03<2:03:58,  5.05it/s]

Episode : 2439 		 Timestep : 244000 		 Average Reward : 1348.24


  6%|████                                                               | 2461/39999 [10:08<2:43:33,  3.83it/s]

Episode : 2459 		 Timestep : 246000 		 Average Reward : 1339.87


  6%|████▏                                                              | 2482/39999 [10:13<1:50:12,  5.67it/s]

Episode : 2479 		 Timestep : 248000 		 Average Reward : 1401.23


  6%|████▏                                                              | 2500/39999 [10:18<3:22:52,  3.08it/s]

Episode : 2499 		 Timestep : 250000 		 Average Reward : 1374.39


  6%|████▏                                                              | 2522/39999 [10:23<2:18:37,  4.51it/s]

Episode : 2519 		 Timestep : 252000 		 Average Reward : 1340.24


  6%|████▎                                                              | 2542/39999 [10:28<2:27:55,  4.22it/s]

Episode : 2539 		 Timestep : 254000 		 Average Reward : 1257.01


  6%|████▎                                                              | 2561/39999 [10:33<2:50:08,  3.67it/s]

Episode : 2559 		 Timestep : 256000 		 Average Reward : 1286.26


  6%|████▎                                                              | 2581/39999 [10:38<2:54:28,  3.57it/s]

Episode : 2579 		 Timestep : 258000 		 Average Reward : 1330.26


  7%|████▎                                                              | 2602/39999 [10:43<2:15:02,  4.62it/s]

Episode : 2599 		 Timestep : 260000 		 Average Reward : 1360.36


  7%|████▍                                                              | 2622/39999 [10:48<2:12:18,  4.71it/s]

Episode : 2619 		 Timestep : 262000 		 Average Reward : 1336.69


  7%|████▍                                                              | 2642/39999 [10:53<2:11:58,  4.72it/s]

Episode : 2639 		 Timestep : 264000 		 Average Reward : 1384.26


  7%|████▍                                                              | 2661/39999 [10:58<2:41:38,  3.85it/s]

Episode : 2659 		 Timestep : 266000 		 Average Reward : 1306.54


  7%|████▍                                                              | 2682/39999 [11:03<2:40:40,  3.87it/s]

Episode : 2679 		 Timestep : 268000 		 Average Reward : 1300.08


  7%|████▌                                                              | 2701/39999 [11:08<2:39:52,  3.89it/s]

Episode : 2699 		 Timestep : 270000 		 Average Reward : 1219.92


  7%|████▌                                                              | 2722/39999 [11:13<2:19:40,  4.45it/s]

Episode : 2719 		 Timestep : 272000 		 Average Reward : 1250.17


  7%|████▌                                                              | 2742/39999 [11:19<2:35:04,  4.00it/s]

Episode : 2739 		 Timestep : 274000 		 Average Reward : 1213.42


  7%|████▋                                                              | 2762/39999 [11:24<1:34:19,  6.58it/s]

Episode : 2759 		 Timestep : 276000 		 Average Reward : 1235.7


  7%|████▋                                                              | 2782/39999 [11:27<1:23:42,  7.41it/s]

Episode : 2779 		 Timestep : 278000 		 Average Reward : 1261.37


  7%|████▋                                                              | 2802/39999 [11:30<1:25:04,  7.29it/s]

Episode : 2799 		 Timestep : 280000 		 Average Reward : 1346.84


  7%|████▋                                                              | 2821/39999 [11:33<1:47:34,  5.76it/s]

Episode : 2819 		 Timestep : 282000 		 Average Reward : 1361.4


  7%|████▊                                                              | 2841/39999 [11:35<1:26:06,  7.19it/s]

Episode : 2839 		 Timestep : 284000 		 Average Reward : 1342.32


  7%|████▊                                                              | 2862/39999 [11:39<1:34:27,  6.55it/s]

Episode : 2859 		 Timestep : 286000 		 Average Reward : 1342.44


  7%|████▊                                                              | 2882/39999 [11:42<1:26:22,  7.16it/s]

Episode : 2879 		 Timestep : 288000 		 Average Reward : 1386.54


  7%|████▊                                                              | 2902/39999 [11:45<1:39:44,  6.20it/s]

Episode : 2899 		 Timestep : 290000 		 Average Reward : 1271.74


  7%|████▉                                                              | 2923/39999 [11:49<1:25:23,  7.24it/s]

Episode : 2919 		 Timestep : 292000 		 Average Reward : 1361.04


  7%|████▉                                                              | 2941/39999 [11:52<1:34:20,  6.55it/s]

Episode : 2939 		 Timestep : 294000 		 Average Reward : 1315.16


  7%|████▉                                                              | 2962/39999 [11:55<1:41:28,  6.08it/s]

Episode : 2959 		 Timestep : 296000 		 Average Reward : 1322.75


  7%|████▉                                                              | 2982/39999 [11:59<1:42:00,  6.05it/s]

Episode : 2979 		 Timestep : 298000 		 Average Reward : 1337.32


  8%|█████                                                              | 3003/39999 [12:02<1:11:07,  8.67it/s]

Episode : 2999 		 Timestep : 300000 		 Average Reward : 1292.23
--------------------------------------------------------------------------------------------
saving model at : PPO_preTrainedPPO_Grid World_0_0.pth
model saved
Elapsed Time  :  0:12:03
--------------------------------------------------------------------------------------------


  8%|█████                                                              | 3023/39999 [12:05<1:10:18,  8.76it/s]

Episode : 3019 		 Timestep : 302000 		 Average Reward : 1273.72


  8%|█████                                                              | 3041/39999 [12:08<1:37:07,  6.34it/s]

Episode : 3039 		 Timestep : 304000 		 Average Reward : 1217.26


  8%|█████▏                                                             | 3062/39999 [12:12<1:51:52,  5.50it/s]

Episode : 3059 		 Timestep : 306000 		 Average Reward : 1190.64


  8%|█████▏                                                             | 3082/39999 [12:16<2:43:44,  3.76it/s]

Episode : 3079 		 Timestep : 308000 		 Average Reward : 1352.59


  8%|█████▏                                                             | 3103/39999 [12:19<1:20:49,  7.61it/s]

Episode : 3099 		 Timestep : 310000 		 Average Reward : 1366.58


  8%|█████▏                                                             | 3123/39999 [12:22<1:23:19,  7.38it/s]

Episode : 3119 		 Timestep : 312000 		 Average Reward : 1381.86


  8%|█████▎                                                             | 3143/39999 [12:26<1:23:37,  7.35it/s]

Episode : 3139 		 Timestep : 314000 		 Average Reward : 1304.71


  8%|█████▎                                                             | 3161/39999 [12:29<2:02:24,  5.02it/s]

Episode : 3159 		 Timestep : 316000 		 Average Reward : 1270.1


  8%|█████▎                                                             | 3182/39999 [12:33<1:31:44,  6.69it/s]

Episode : 3179 		 Timestep : 318000 		 Average Reward : 1258.66


  8%|█████▎                                                             | 3202/39999 [12:36<1:40:57,  6.07it/s]

Episode : 3199 		 Timestep : 320000 		 Average Reward : 1346.57


  8%|█████▍                                                             | 3222/39999 [12:40<1:41:13,  6.06it/s]

Episode : 3219 		 Timestep : 322000 		 Average Reward : 1361.03


  8%|█████▍                                                             | 3242/39999 [12:44<1:41:14,  6.05it/s]

Episode : 3239 		 Timestep : 324000 		 Average Reward : 1255.59


  8%|█████▍                                                             | 3262/39999 [12:47<1:41:41,  6.02it/s]

Episode : 3259 		 Timestep : 326000 		 Average Reward : 1296.8


  8%|█████▍                                                             | 3282/39999 [12:51<1:42:42,  5.96it/s]

Episode : 3279 		 Timestep : 328000 		 Average Reward : 1270.54


  8%|█████▌                                                             | 3302/39999 [12:56<2:07:26,  4.80it/s]

Episode : 3299 		 Timestep : 330000 		 Average Reward : 1253.27


  8%|█████▌                                                             | 3322/39999 [13:01<2:13:14,  4.59it/s]

Episode : 3319 		 Timestep : 332000 		 Average Reward : 1170.08


  8%|█████▌                                                             | 3341/39999 [13:06<3:14:51,  3.14it/s]

Episode : 3339 		 Timestep : 334000 		 Average Reward : 1249.79


  8%|█████▋                                                             | 3362/39999 [13:11<2:27:09,  4.15it/s]

Episode : 3359 		 Timestep : 336000 		 Average Reward : 1308.46


  8%|█████▋                                                             | 3382/39999 [13:16<2:04:27,  4.90it/s]

Episode : 3379 		 Timestep : 338000 		 Average Reward : 1255.96


  9%|█████▋                                                             | 3401/39999 [13:21<2:37:14,  3.88it/s]

Episode : 3399 		 Timestep : 340000 		 Average Reward : 1377.81


  9%|█████▋                                                             | 3421/39999 [13:25<2:47:06,  3.65it/s]

Episode : 3419 		 Timestep : 342000 		 Average Reward : 1297.2


  9%|█████▊                                                             | 3442/39999 [13:31<2:14:00,  4.55it/s]

Episode : 3439 		 Timestep : 344000 		 Average Reward : 1336.28


  9%|█████▊                                                             | 3461/39999 [13:35<2:24:32,  4.21it/s]

Episode : 3459 		 Timestep : 346000 		 Average Reward : 1360.55


  9%|█████▊                                                             | 3481/39999 [13:40<2:38:07,  3.85it/s]

Episode : 3479 		 Timestep : 348000 		 Average Reward : 1280.25


  9%|█████▊                                                             | 3501/39999 [13:45<2:32:34,  3.99it/s]

Episode : 3499 		 Timestep : 350000 		 Average Reward : 1364.59


  9%|█████▉                                                             | 3521/39999 [13:50<2:52:30,  3.52it/s]

Episode : 3519 		 Timestep : 352000 		 Average Reward : 1275.28


  9%|█████▉                                                             | 3541/39999 [13:55<2:25:32,  4.18it/s]

Episode : 3539 		 Timestep : 354000 		 Average Reward : 1332.49


  9%|█████▉                                                             | 3562/39999 [14:00<2:16:38,  4.44it/s]

Episode : 3559 		 Timestep : 356000 		 Average Reward : 1323.19


  9%|██████                                                             | 3582/39999 [14:05<1:56:29,  5.21it/s]

Episode : 3579 		 Timestep : 358000 		 Average Reward : 1276.31


  9%|██████                                                             | 3602/39999 [14:10<2:10:53,  4.63it/s]

Episode : 3599 		 Timestep : 360000 		 Average Reward : 1340.91


  9%|██████                                                             | 3622/39999 [14:15<1:59:38,  5.07it/s]

Episode : 3619 		 Timestep : 362000 		 Average Reward : 1356.62


  9%|██████                                                             | 3640/39999 [14:20<2:44:45,  3.68it/s]

Episode : 3639 		 Timestep : 364000 		 Average Reward : 1407.97


  9%|██████▏                                                            | 3661/39999 [14:25<2:45:21,  3.66it/s]

Episode : 3659 		 Timestep : 366000 		 Average Reward : 1367.22


  9%|██████▏                                                            | 3681/39999 [14:30<2:42:33,  3.72it/s]

Episode : 3679 		 Timestep : 368000 		 Average Reward : 1375.03


  9%|██████▏                                                            | 3701/39999 [14:35<2:25:24,  4.16it/s]

Episode : 3699 		 Timestep : 370000 		 Average Reward : 1358.75


  9%|██████▏                                                            | 3721/39999 [14:41<2:51:46,  3.52it/s]

Episode : 3719 		 Timestep : 372000 		 Average Reward : 1314.35


  9%|██████▎                                                            | 3741/39999 [14:45<2:31:30,  3.99it/s]

Episode : 3739 		 Timestep : 374000 		 Average Reward : 1264.39


  9%|██████▎                                                            | 3762/39999 [14:50<2:08:31,  4.70it/s]

Episode : 3759 		 Timestep : 376000 		 Average Reward : 1164.21


  9%|██████▎                                                            | 3782/39999 [14:55<1:43:50,  5.81it/s]

Episode : 3779 		 Timestep : 378000 		 Average Reward : 1255.17


 10%|██████▎                                                            | 3802/39999 [15:00<2:10:20,  4.63it/s]

Episode : 3799 		 Timestep : 380000 		 Average Reward : 1282.92


 10%|██████▍                                                            | 3822/39999 [15:05<2:14:52,  4.47it/s]

Episode : 3819 		 Timestep : 382000 		 Average Reward : 1274.39


 10%|██████▍                                                            | 3842/39999 [15:10<2:39:55,  3.77it/s]

Episode : 3839 		 Timestep : 384000 		 Average Reward : 1313.06


 10%|██████▍                                                            | 3862/39999 [15:15<1:58:26,  5.09it/s]

Episode : 3859 		 Timestep : 386000 		 Average Reward : 1263.91


 10%|██████▌                                                            | 3882/39999 [15:20<2:18:49,  4.34it/s]

Episode : 3879 		 Timestep : 388000 		 Average Reward : 1278.21


 10%|██████▌                                                            | 3902/39999 [15:24<1:58:31,  5.08it/s]

Episode : 3899 		 Timestep : 390000 		 Average Reward : 1333.3


 10%|██████▌                                                            | 3922/39999 [15:29<2:12:57,  4.52it/s]

Episode : 3919 		 Timestep : 392000 		 Average Reward : 1314.81


 10%|██████▌                                                            | 3940/39999 [15:34<2:45:51,  3.62it/s]

Episode : 3939 		 Timestep : 394000 		 Average Reward : 1303.97


 10%|██████▋                                                            | 3961/39999 [15:39<2:38:17,  3.79it/s]

Episode : 3959 		 Timestep : 396000 		 Average Reward : 1355.35


 10%|██████▋                                                            | 3982/39999 [15:44<2:12:13,  4.54it/s]

Episode : 3979 		 Timestep : 398000 		 Average Reward : 1368.78


 10%|██████▋                                                            | 4001/39999 [15:48<2:38:06,  3.79it/s]

Episode : 3999 		 Timestep : 400000 		 Average Reward : 1358.98
--------------------------------------------------------------------------------------------
saving model at : PPO_preTrainedPPO_Grid World_0_0.pth
model saved
Elapsed Time  :  0:15:49
--------------------------------------------------------------------------------------------


 10%|██████▋                                                            | 4022/39999 [15:53<2:07:29,  4.70it/s]

Episode : 4019 		 Timestep : 402000 		 Average Reward : 1295.56


 10%|██████▊                                                            | 4042/39999 [15:58<2:08:32,  4.66it/s]

Episode : 4039 		 Timestep : 404000 		 Average Reward : 1269.5


 10%|██████▊                                                            | 4062/39999 [16:03<2:06:22,  4.74it/s]

Episode : 4059 		 Timestep : 406000 		 Average Reward : 1342.16


 10%|██████▊                                                            | 4082/39999 [16:08<2:11:06,  4.57it/s]

Episode : 4079 		 Timestep : 408000 		 Average Reward : 1212.96


 10%|██████▊                                                            | 4102/39999 [16:13<2:03:30,  4.84it/s]

Episode : 4099 		 Timestep : 410000 		 Average Reward : 1236.97


 10%|██████▉                                                            | 4122/39999 [16:18<2:09:18,  4.62it/s]

Episode : 4119 		 Timestep : 412000 		 Average Reward : 1353.99


 10%|██████▉                                                            | 4142/39999 [16:23<2:06:10,  4.74it/s]

Episode : 4139 		 Timestep : 414000 		 Average Reward : 1369.7


 10%|██████▉                                                            | 4162/39999 [16:27<2:11:54,  4.53it/s]

Episode : 4159 		 Timestep : 416000 		 Average Reward : 1320.83


 10%|███████                                                            | 4181/39999 [16:32<3:04:54,  3.23it/s]

Episode : 4179 		 Timestep : 418000 		 Average Reward : 1319.91


 11%|███████                                                            | 4202/39999 [16:37<1:53:47,  5.24it/s]

Episode : 4199 		 Timestep : 420000 		 Average Reward : 1350.62


 11%|███████                                                            | 4221/39999 [16:43<3:00:20,  3.31it/s]

Episode : 4219 		 Timestep : 422000 		 Average Reward : 1360.43


 11%|███████                                                            | 4242/39999 [16:48<2:35:57,  3.82it/s]

Episode : 4239 		 Timestep : 424000 		 Average Reward : 1295.19


 11%|███████▏                                                           | 4261/39999 [16:53<2:35:45,  3.82it/s]

Episode : 4259 		 Timestep : 426000 		 Average Reward : 1279.69


 11%|███████▏                                                           | 4281/39999 [16:58<2:42:26,  3.66it/s]

Episode : 4279 		 Timestep : 428000 		 Average Reward : 1279.89


 11%|███████▏                                                           | 4301/39999 [17:04<2:56:43,  3.37it/s]

Episode : 4299 		 Timestep : 430000 		 Average Reward : 1253.67


 11%|███████▏                                                           | 4322/39999 [17:08<1:58:59,  5.00it/s]

Episode : 4319 		 Timestep : 432000 		 Average Reward : 1293.96


 11%|███████▎                                                           | 4342/39999 [17:13<1:49:38,  5.42it/s]

Episode : 4339 		 Timestep : 434000 		 Average Reward : 1325.67


 11%|███████▎                                                           | 4362/39999 [17:18<2:16:07,  4.36it/s]

Episode : 4359 		 Timestep : 436000 		 Average Reward : 1300.33


 11%|███████▎                                                           | 4381/39999 [17:23<2:14:55,  4.40it/s]

Episode : 4379 		 Timestep : 438000 		 Average Reward : 1316.3


 11%|███████▎                                                           | 4402/39999 [17:28<2:03:57,  4.79it/s]

Episode : 4399 		 Timestep : 440000 		 Average Reward : 1326.81


 11%|███████▍                                                           | 4421/39999 [17:33<2:33:45,  3.86it/s]

Episode : 4419 		 Timestep : 442000 		 Average Reward : 1269.11


 11%|███████▍                                                           | 4441/39999 [17:38<2:43:12,  3.63it/s]

Episode : 4439 		 Timestep : 444000 		 Average Reward : 1209.49


 11%|███████▍                                                           | 4461/39999 [17:44<3:11:34,  3.09it/s]

Episode : 4459 		 Timestep : 446000 		 Average Reward : 1257.66


 11%|███████▌                                                           | 4480/39999 [17:49<3:18:30,  2.98it/s]

Episode : 4479 		 Timestep : 448000 		 Average Reward : 1271.64


 11%|███████▌                                                           | 4502/39999 [17:55<1:53:05,  5.23it/s]

Episode : 4499 		 Timestep : 450000 		 Average Reward : 1252.34


 11%|███████▌                                                           | 4521/39999 [17:59<2:38:28,  3.73it/s]

Episode : 4519 		 Timestep : 452000 		 Average Reward : 1309.59


 11%|███████▌                                                           | 4541/39999 [18:04<2:16:39,  4.32it/s]

Episode : 4539 		 Timestep : 454000 		 Average Reward : 1333.7


 11%|███████▋                                                           | 4562/39999 [18:09<2:17:46,  4.29it/s]

Episode : 4559 		 Timestep : 456000 		 Average Reward : 1285.26


 11%|███████▋                                                           | 4582/39999 [18:14<2:14:53,  4.38it/s]

Episode : 4579 		 Timestep : 458000 		 Average Reward : 1236.84


 12%|███████▋                                                           | 4602/39999 [18:19<1:55:02,  5.13it/s]

Episode : 4599 		 Timestep : 460000 		 Average Reward : 1341.36


 12%|███████▋                                                           | 4622/39999 [18:24<1:52:43,  5.23it/s]

Episode : 4619 		 Timestep : 462000 		 Average Reward : 1317.38


 12%|███████▊                                                           | 4641/39999 [18:28<2:07:14,  4.63it/s]

Episode : 4639 		 Timestep : 464000 		 Average Reward : 1223.13


 12%|███████▊                                                           | 4662/39999 [18:34<2:39:39,  3.69it/s]

Episode : 4659 		 Timestep : 466000 		 Average Reward : 1215.77


 12%|███████▊                                                           | 4681/39999 [18:39<2:21:20,  4.16it/s]

Episode : 4679 		 Timestep : 468000 		 Average Reward : 1325.43


 12%|███████▊                                                           | 4701/39999 [18:44<2:54:23,  3.37it/s]

Episode : 4699 		 Timestep : 470000 		 Average Reward : 1297.4


 12%|███████▉                                                           | 4722/39999 [18:49<2:01:35,  4.84it/s]

Episode : 4719 		 Timestep : 472000 		 Average Reward : 1293.66


 12%|███████▉                                                           | 4740/39999 [18:53<2:58:34,  3.29it/s]

Episode : 4739 		 Timestep : 474000 		 Average Reward : 1286.39


 12%|███████▉                                                           | 4760/39999 [18:58<2:36:25,  3.75it/s]

Episode : 4759 		 Timestep : 476000 		 Average Reward : 1330.54


 12%|████████                                                           | 4782/39999 [19:03<1:49:32,  5.36it/s]

Episode : 4779 		 Timestep : 478000 		 Average Reward : 1255.9


 12%|████████                                                           | 4802/39999 [19:08<1:53:43,  5.16it/s]

Episode : 4799 		 Timestep : 480000 		 Average Reward : 1270.19


 12%|████████                                                           | 4820/39999 [19:12<2:43:47,  3.58it/s]

Episode : 4819 		 Timestep : 482000 		 Average Reward : 1320.99


 12%|████████                                                           | 4840/39999 [19:17<2:57:25,  3.30it/s]

Episode : 4839 		 Timestep : 484000 		 Average Reward : 1261.58


 12%|████████▏                                                          | 4862/39999 [19:23<2:06:48,  4.62it/s]

Episode : 4859 		 Timestep : 486000 		 Average Reward : 1375.84


 12%|████████▏                                                          | 4882/39999 [19:27<1:58:01,  4.96it/s]

Episode : 4879 		 Timestep : 488000 		 Average Reward : 1383.38


 12%|████████▏                                                          | 4902/39999 [19:32<1:59:44,  4.88it/s]

Episode : 4899 		 Timestep : 490000 		 Average Reward : 1301.77


 12%|████████▏                                                          | 4921/39999 [19:36<2:22:58,  4.09it/s]

Episode : 4919 		 Timestep : 492000 		 Average Reward : 1302.4


 12%|████████▎                                                          | 4942/39999 [19:42<2:09:48,  4.50it/s]

Episode : 4939 		 Timestep : 494000 		 Average Reward : 1171.2


 12%|████████▎                                                          | 4962/39999 [19:47<2:30:57,  3.87it/s]

Episode : 4959 		 Timestep : 496000 		 Average Reward : 1271.32


 12%|████████▎                                                          | 4982/39999 [19:52<2:14:29,  4.34it/s]

Episode : 4979 		 Timestep : 498000 		 Average Reward : 1304.76


 13%|████████▍                                                          | 5002/39999 [19:58<2:06:16,  4.62it/s]

Episode : 4999 		 Timestep : 500000 		 Average Reward : 1246.71
--------------------------------------------------------------------------------------------
saving model at : PPO_preTrainedPPO_Grid World_0_0.pth
model saved
Elapsed Time  :  0:19:58
--------------------------------------------------------------------------------------------


 13%|████████▍                                                          | 5021/39999 [20:02<2:22:52,  4.08it/s]

Episode : 5019 		 Timestep : 502000 		 Average Reward : 1381.12


 13%|████████▍                                                          | 5041/39999 [20:07<2:20:20,  4.15it/s]

Episode : 5039 		 Timestep : 504000 		 Average Reward : 1359.17


 13%|████████▍                                                          | 5061/39999 [20:12<2:38:18,  3.68it/s]

Episode : 5059 		 Timestep : 506000 		 Average Reward : 1314.94


 13%|████████▌                                                          | 5081/39999 [20:18<2:30:44,  3.86it/s]

Episode : 5079 		 Timestep : 508000 		 Average Reward : 1280.12


 13%|████████▌                                                          | 5101/39999 [20:23<2:40:11,  3.63it/s]

Episode : 5099 		 Timestep : 510000 		 Average Reward : 1315.16


 13%|████████▌                                                          | 5122/39999 [20:28<1:59:05,  4.88it/s]

Episode : 5119 		 Timestep : 512000 		 Average Reward : 1372.6


 13%|████████▌                                                          | 5140/39999 [20:33<3:55:18,  2.47it/s]

Episode : 5139 		 Timestep : 514000 		 Average Reward : 1303.65


 13%|████████▋                                                          | 5161/39999 [20:39<2:39:20,  3.64it/s]

Episode : 5159 		 Timestep : 516000 		 Average Reward : 1305.5


 13%|████████▋                                                          | 5181/39999 [20:44<2:44:47,  3.52it/s]

Episode : 5179 		 Timestep : 518000 		 Average Reward : 1409.52


 13%|████████▋                                                          | 5202/39999 [20:49<2:08:27,  4.51it/s]

Episode : 5199 		 Timestep : 520000 		 Average Reward : 1361.1


 13%|████████▋                                                          | 5220/39999 [20:54<2:34:36,  3.75it/s]

Episode : 5219 		 Timestep : 522000 		 Average Reward : 1372.62


 13%|████████▊                                                          | 5241/39999 [21:00<3:03:58,  3.15it/s]

Episode : 5239 		 Timestep : 524000 		 Average Reward : 1292.42


 13%|████████▊                                                          | 5262/39999 [21:05<1:45:04,  5.51it/s]

Episode : 5259 		 Timestep : 526000 		 Average Reward : 1352.01


 13%|████████▊                                                          | 5281/39999 [21:10<2:07:04,  4.55it/s]

Episode : 5279 		 Timestep : 528000 		 Average Reward : 1212.24


 13%|████████▉                                                          | 5302/39999 [21:15<2:22:17,  4.06it/s]

Episode : 5299 		 Timestep : 530000 		 Average Reward : 1273.43


 13%|████████▉                                                          | 5322/39999 [21:20<2:01:29,  4.76it/s]

Episode : 5319 		 Timestep : 532000 		 Average Reward : 1384.46


 13%|████████▉                                                          | 5342/39999 [21:24<1:59:50,  4.82it/s]

Episode : 5339 		 Timestep : 534000 		 Average Reward : 1258.7


 13%|████████▉                                                          | 5361/39999 [21:29<2:26:13,  3.95it/s]

Episode : 5359 		 Timestep : 536000 		 Average Reward : 1282.75


 13%|█████████                                                          | 5381/39999 [21:34<2:37:07,  3.67it/s]

Episode : 5379 		 Timestep : 538000 		 Average Reward : 1311.38


 14%|█████████                                                          | 5401/39999 [21:39<2:24:29,  3.99it/s]

Episode : 5399 		 Timestep : 540000 		 Average Reward : 1347.92


 14%|█████████                                                          | 5421/39999 [21:44<2:32:24,  3.78it/s]

Episode : 5419 		 Timestep : 542000 		 Average Reward : 1304.36


 14%|█████████                                                          | 5442/39999 [21:50<2:25:38,  3.95it/s]

Episode : 5439 		 Timestep : 544000 		 Average Reward : 1368.45


 14%|█████████▏                                                         | 5462/39999 [21:55<2:11:09,  4.39it/s]

Episode : 5459 		 Timestep : 546000 		 Average Reward : 1328.44


 14%|█████████▏                                                         | 5482/39999 [22:00<1:50:00,  5.23it/s]

Episode : 5479 		 Timestep : 548000 		 Average Reward : 1286.08


 14%|█████████▏                                                         | 5502/39999 [22:05<2:37:37,  3.65it/s]

Episode : 5499 		 Timestep : 550000 		 Average Reward : 1270.03


 14%|█████████▏                                                         | 5521/39999 [22:10<2:34:04,  3.73it/s]

Episode : 5519 		 Timestep : 552000 		 Average Reward : 1300.32


 14%|█████████▎                                                         | 5541/39999 [22:15<2:13:52,  4.29it/s]

Episode : 5539 		 Timestep : 554000 		 Average Reward : 1299.62


 14%|█████████▎                                                         | 5562/39999 [22:20<2:03:53,  4.63it/s]

Episode : 5559 		 Timestep : 556000 		 Average Reward : 1322.48


 14%|█████████▎                                                         | 5582/39999 [22:25<2:04:13,  4.62it/s]

Episode : 5579 		 Timestep : 558000 		 Average Reward : 1358.31


 14%|█████████▍                                                         | 5601/39999 [22:29<2:24:46,  3.96it/s]

Episode : 5599 		 Timestep : 560000 		 Average Reward : 1420.86


 14%|█████████▍                                                         | 5621/39999 [22:34<2:07:03,  4.51it/s]

Episode : 5619 		 Timestep : 562000 		 Average Reward : 1307.37


 14%|█████████▍                                                         | 5641/39999 [22:39<2:21:14,  4.05it/s]

Episode : 5639 		 Timestep : 564000 		 Average Reward : 1257.22


 14%|█████████▍                                                         | 5661/39999 [22:45<2:22:46,  4.01it/s]

Episode : 5659 		 Timestep : 566000 		 Average Reward : 1269.53


 14%|█████████▌                                                         | 5682/39999 [22:49<2:03:29,  4.63it/s]

Episode : 5679 		 Timestep : 568000 		 Average Reward : 1336.72


 14%|█████████▌                                                         | 5701/39999 [22:54<2:42:52,  3.51it/s]

Episode : 5699 		 Timestep : 570000 		 Average Reward : 1317.21


 14%|█████████▌                                                         | 5721/39999 [22:59<2:40:59,  3.55it/s]

Episode : 5719 		 Timestep : 572000 		 Average Reward : 1277.55


 14%|█████████▌                                                         | 5742/39999 [23:04<1:59:27,  4.78it/s]

Episode : 5739 		 Timestep : 574000 		 Average Reward : 1330.19


 14%|█████████▋                                                         | 5762/39999 [23:09<2:04:59,  4.57it/s]

Episode : 5759 		 Timestep : 576000 		 Average Reward : 1306.51


 14%|█████████▋                                                         | 5781/39999 [23:14<2:36:31,  3.64it/s]

Episode : 5779 		 Timestep : 578000 		 Average Reward : 1318.16


 15%|█████████▋                                                         | 5802/39999 [23:18<2:08:24,  4.44it/s]

Episode : 5799 		 Timestep : 580000 		 Average Reward : 1324.0


 15%|█████████▊                                                         | 5822/39999 [23:23<1:49:56,  5.18it/s]

Episode : 5819 		 Timestep : 582000 		 Average Reward : 1271.94


 15%|█████████▊                                                         | 5842/39999 [23:27<1:49:38,  5.19it/s]

Episode : 5839 		 Timestep : 584000 		 Average Reward : 1326.65


 15%|█████████▊                                                         | 5862/39999 [23:32<1:44:36,  5.44it/s]

Episode : 5859 		 Timestep : 586000 		 Average Reward : 1330.63


 15%|█████████▊                                                         | 5882/39999 [23:37<2:19:40,  4.07it/s]

Episode : 5879 		 Timestep : 588000 		 Average Reward : 1270.19


 15%|█████████▉                                                         | 5901/39999 [23:42<2:26:58,  3.87it/s]

Episode : 5899 		 Timestep : 590000 		 Average Reward : 1362.26


 15%|█████████▉                                                         | 5922/39999 [23:47<2:09:38,  4.38it/s]

Episode : 5919 		 Timestep : 592000 		 Average Reward : 1253.6


 15%|█████████▉                                                         | 5941/39999 [23:51<2:06:12,  4.50it/s]

Episode : 5939 		 Timestep : 594000 		 Average Reward : 1293.5


 15%|█████████▉                                                         | 5962/39999 [23:56<1:55:40,  4.90it/s]

Episode : 5959 		 Timestep : 596000 		 Average Reward : 1218.95


 15%|██████████                                                         | 5981/39999 [24:00<2:07:18,  4.45it/s]

Episode : 5979 		 Timestep : 598000 		 Average Reward : 1298.62


 15%|██████████                                                         | 6002/39999 [24:05<2:05:59,  4.50it/s]

Episode : 5999 		 Timestep : 600000 		 Average Reward : 1308.81
--------------------------------------------------------------------------------------------
saving model at : PPO_preTrainedPPO_Grid World_0_0.pth
model saved
Elapsed Time  :  0:24:06
--------------------------------------------------------------------------------------------


 15%|██████████                                                         | 6022/39999 [24:10<1:56:32,  4.86it/s]

Episode : 6019 		 Timestep : 602000 		 Average Reward : 1426.06


 15%|██████████                                                         | 6042/39999 [24:15<1:56:08,  4.87it/s]

Episode : 6039 		 Timestep : 604000 		 Average Reward : 1386.1


 15%|██████████▏                                                        | 6061/39999 [24:20<2:20:44,  4.02it/s]

Episode : 6059 		 Timestep : 606000 		 Average Reward : 1334.42


 15%|██████████▏                                                        | 6081/39999 [24:25<2:30:05,  3.77it/s]

Episode : 6079 		 Timestep : 608000 		 Average Reward : 1372.96


 15%|██████████▏                                                        | 6102/39999 [24:30<2:02:46,  4.60it/s]

Episode : 6099 		 Timestep : 610000 		 Average Reward : 1387.93


 15%|██████████▎                                                        | 6121/39999 [24:35<2:32:33,  3.70it/s]

Episode : 6119 		 Timestep : 612000 		 Average Reward : 1386.72


 15%|██████████▎                                                        | 6141/39999 [24:39<2:05:10,  4.51it/s]

Episode : 6139 		 Timestep : 614000 		 Average Reward : 1377.49


 15%|██████████▎                                                        | 6162/39999 [24:44<1:54:59,  4.90it/s]

Episode : 6159 		 Timestep : 616000 		 Average Reward : 1223.06


 15%|██████████▎                                                        | 6182/39999 [24:48<1:41:36,  5.55it/s]

Episode : 6179 		 Timestep : 618000 		 Average Reward : 1306.18


 16%|██████████▍                                                        | 6202/39999 [24:53<2:05:03,  4.50it/s]

Episode : 6199 		 Timestep : 620000 		 Average Reward : 1312.89


 16%|██████████▍                                                        | 6222/39999 [24:58<1:54:27,  4.92it/s]

Episode : 6219 		 Timestep : 622000 		 Average Reward : 1269.7


 16%|██████████▍                                                        | 6242/39999 [25:03<2:00:53,  4.65it/s]

Episode : 6239 		 Timestep : 624000 		 Average Reward : 1343.61


 16%|██████████▍                                                        | 6262/39999 [25:08<2:16:14,  4.13it/s]

Episode : 6259 		 Timestep : 626000 		 Average Reward : 1385.61


 16%|██████████▌                                                        | 6281/39999 [25:13<2:48:54,  3.33it/s]

Episode : 6279 		 Timestep : 628000 		 Average Reward : 1196.37


 16%|██████████▌                                                        | 6302/39999 [25:18<2:01:07,  4.64it/s]

Episode : 6299 		 Timestep : 630000 		 Average Reward : 1327.79


 16%|██████████▌                                                        | 6322/39999 [25:23<1:51:32,  5.03it/s]

Episode : 6319 		 Timestep : 632000 		 Average Reward : 1194.04


 16%|██████████▌                                                        | 6341/39999 [25:28<2:14:33,  4.17it/s]

Episode : 6339 		 Timestep : 634000 		 Average Reward : 1310.54


 16%|██████████▋                                                        | 6362/39999 [25:33<2:02:18,  4.58it/s]

Episode : 6359 		 Timestep : 636000 		 Average Reward : 1197.09


 16%|██████████▋                                                        | 6370/39999 [25:35<2:10:03,  4.31it/s]