## 필요한 모듈 설치 확인

In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import numpy as np

import random
import os
import pickle
import time
from collections import deque
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

from wrappers import wrap, wrap_cover, SubprocVecEnv
from runner import Runner

## 하이퍼 파라미터 정의하기

In [2]:
'''PPO Settings'''
# rollout length per environment (handed to Runner as nsteps)
TRAJ_LEN = 128
# number of optimisation passes over each collected rollout
N_OPT_EPOCHS = 10
# weight of the entropy term in the PPO loss
ENT_COEF = 1e-2
# clipping range applied to the PPO probability ratio
CLIP_RANGE = 0.1
# handed to Runner as lam (presumably the GAE lambda -- confirm in runner.py)
LAMBDA = 0.95

'''RND Settings'''
# RND start step: handed to Runner as rnd_start; the training loop only
# optimises once the agent's step counter exceeds this value
RND_START = 0
# discount rate for the intrinsic reward
INT_GAMMA = 0.99

'''Environment Settings'''
# number of sequential images that define one state
STATE_LEN = 4
# OpenAI gym environment id
ENV_NAME = 'BreakoutNoFrameskip-v4'
# number of parallel environments
N_ENVS = 4
# build the vectorised environment from N_ENVS entries produced by wrap_cover
env = SubprocVecEnv([wrap_cover(ENV_NAME) for _ in range(N_ENVS)])
# report the action / observation spaces exposed by the environment
N_ACTIONS = env.action_space.n
print('N_ACTIONS : ', N_ACTIONS)
N_STATES = env.observation_space.shape
print('N_STATES : ', N_STATES)
# total number of simulation steps
N_STEP = 10_000_000
# discount rate for the extrinsic reward (handed to Runner as gamma)
GAMMA = 0.9999
# visualise the agent while playing (not referenced in the visible cells)
RENDERING = False

'''Training settings'''
# check GPU availability
USE_GPU = torch.cuda.is_available()
print(f'USE GPU: {USE_GPU}')
# mini-batch size
BATCH_SIZE = 128
# learning rate shared by both Adam optimisers
LR = 1e-4
# maximum gradient norm used for clipping
MAX_GRAD_NORM = 0.1
# log optimisation details (not referenced in the visible cells)
LOG_OPT = False

'''Save&Load Settings'''
# log (and optionally save) every LOG_FREQ updates
LOG_FREQ = 10
# toggles for saving / loading checkpoints
SAVE = True
LOAD = False
# checkpoint paths for the PPO network and the RND predictor network
NET_PATH = './data/model/ppo_net.pkl'
PRED_PATH = './data/model/pred_net.pkl'



N_ACTIONS :  4
N_STATES :  (4, 84, 84)
USE GPU: True


Process Process-3:
Process Process-4:
Process Process-2:
Process Process-1:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/sungyubkim/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/sungyubkim/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/sungyubkim/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/sungyubkim/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/sungyubkim/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/sungyubkim/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/sungyubkim/anaconda3/l

## 네트워크 구조 정의하기

In [3]:
class ConvNet(nn.Module):
    """Actor-critic network with one shared trunk and three linear heads.

    The trunk is a three-layer convolutional stack followed by two fully
    connected layers of width 256. On top of it sit a policy head
    (``N_ACTIONS`` outputs), an extrinsic value head and an intrinsic value
    head (one output each).
    """

    def __init__(self):
        super().__init__()
        # convolutional trunk; the first layer takes STATE_LEN input channels
        conv_layers = [
            nn.Conv2d(STATE_LEN, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
        ]
        self.feature_extraction = nn.Sequential(*conv_layers)
        self.fc1 = nn.Linear(7 * 7 * 64, 256)
        self.fc2 = nn.Linear(256, 256)
        # policy head
        self.actor = nn.Linear(256, N_ACTIONS)
        # extrinsic value head
        self.critic = nn.Linear(256, 1)
        # intrinsic value head
        self.int_critic = nn.Linear(256, 1)

        # orthogonal weights (gain sqrt(2)) and zero biases for every layer
        for layer in self.modules():
            if not isinstance(layer, (nn.Conv2d, nn.Linear)):
                continue
            nn.init.orthogonal_(layer.weight, gain=np.sqrt(2))
            nn.init.constant_(layer.bias, 0.0)

    def forward(self, x):
        """Return ``(action_log_prob, state_value, int_state_value)``.

        ``x`` is a batch of stacked frames, (m, 4, 84, 84) according to the
        original notes; it is divided by 255 before entering the trunk.
        The log-probabilities have shape (m, N_ACTIONS) and both value
        outputs have shape (m, 1).
        """
        features = self.feature_extraction(x / 255.0)
        # flatten everything except the mini-batch dimension
        flat = features.view(features.size(0), -1)
        hidden = self.fc2(F.relu(self.fc1(F.relu(flat))))
        head_input = F.relu(hidden)
        # log_softmax is used for numerical stability
        action_log_prob = F.log_softmax(self.actor(head_input), dim=1)
        state_value = self.critic(head_input)
        int_state_value = self.int_critic(head_input)
        return action_log_prob, state_value, int_state_value

    def save(self, PATH):
        """Write this module's state_dict to ``PATH``."""
        weights = self.state_dict()
        torch.save(weights, PATH)

    def load(self, PATH):
        """Load a state_dict previously stored at ``PATH`` into this module."""
        weights = torch.load(PATH)
        self.load_state_dict(weights)
        
class RandomPredNet(nn.Module):
    """Predictor network of Random Network Distillation (RND).

    A three-layer convolutional stack with LeakyReLU(0.2) activations
    followed by two fully connected layers of width 256. It has one more
    fully connected layer than the target network so that it has enough
    capacity to fit the target's output.

    Args:
        in_channels: number of channels of the input images. Defaults to 4
            (the stacked-frame state). Pass 1 when feeding single normalised
            frames, as the optional RND feature-normalisation path does.
    """

    def __init__(self, in_channels=4):
        super(RandomPredNet, self).__init__()
        # architecture def
        self.feature_extraction = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.LeakyReLU(negative_slope=2e-1),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.LeakyReLU(negative_slope=2e-1),
            nn.Conv2d(64, 64, kernel_size=3, stride=1)
        )
        self.fc1 = nn.Linear(7 * 7 * 64, 256)
        # one more layer than target network for enough capacity
        self.fc2 = nn.Linear(256, 256)

        # orthogonal init for conv layers, Kaiming-normal for linear layers,
        # zero biases everywhere
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
                nn.init.constant_(m.bias, 0.0)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.constant_(m.bias, 0.0)

    def forward(self, x):
        """Map a batch of images (m, in_channels, 84, 84) to features (m, 256)."""
        # if you use feature normalization in RND, remove division by 255.0
        x = self.feature_extraction(x / 255.0)
        # x.size(0) : mini-batch size
        x = x.view(x.size(0), -1)
        x = self.fc1(F.leaky_relu(x, negative_slope=2e-1))
        x = self.fc2(F.leaky_relu(x, negative_slope=2e-1))
        return x

    def save(self, PATH):
        """Write this module's state_dict to ``PATH``."""
        torch.save(self.state_dict(), PATH)

    def load(self, PATH):
        """Load a state_dict previously stored at ``PATH`` into this module."""
        self.load_state_dict(torch.load(PATH))
        
class RandomTargetNet(nn.Module):
    """Fixed, randomly initialised target network of RND.

    A three-layer convolutional stack with ReLU activations followed by a
    single fully connected layer of width 256. No optimizer in this file is
    given its parameters, so they are frozen right after initialisation:
    backward passes then neither compute nor accumulate gradients for them.
    Gradients with respect to the *input* still flow through the network,
    which the gradient penalty in ``PPO.learn_predict`` relies on.

    Args:
        in_channels: number of channels of the input images. Defaults to 4
            (the stacked-frame state). Pass 1 when feeding single normalised
            frames, as the optional RND feature-normalisation path does.
    """

    def __init__(self, in_channels=4):
        super(RandomTargetNet, self).__init__()
        # architecture def
        self.feature_extraction = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
        )
        self.fc1 = nn.Linear(7 * 7 * 64, 256)

        # orthogonal init for conv layers, Kaiming-normal for the linear
        # layer, zero biases everywhere
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
                nn.init.constant_(m.bias, 0.0)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.constant_(m.bias, 0.0)

        # the target is never trained: freeze it so that no gradients are
        # computed or stored for its weights
        for param in self.parameters():
            param.requires_grad_(False)

    def forward(self, x):
        """Map a batch of images (m, in_channels, 84, 84) to features (m, 256)."""
        # if you use feature normalization in RND, remove division by 255.0
        x = self.feature_extraction(x / 255.0)
        # x.size(0) : mini-batch size
        x = x.view(x.size(0), -1)
        x = self.fc1(F.relu(x))
        return x

## RND 정의하기

In [4]:
class PPO:
    """PPO agent driven by Random Network Distillation (RND) rewards.

    Holds the actor-critic network (``ConvNet``), the fixed random target
    network and the trainable predictor network, plus one Adam optimizer for
    the actor-critic and one for the predictor.
    """

    def __init__(self):
        self.net = ConvNet()
        self.rand_target = RandomTargetNet()
        self.rand_pred = RandomPredNet()
        # use gpu
        if USE_GPU:
            self.net = self.net.cuda()
            self.rand_target = self.rand_target.cuda()
            self.rand_pred = self.rand_pred.cuda()

        # counts calls to choose_action (one call per vectorised env step)
        self.memory_counter = 0

        # optimizer for the actor-critic network
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=LR)
        # optimizer for the predictor network
        self.rand_pred_opt = torch.optim.Adam(self.rand_pred.parameters(), lr=LR)

        # ppo clip range
        self.clip_range = CLIP_RANGE

        # observation statistics for RND (if you use feature normalization in RND)
        self.s_mu = None
        self.s_sigma = None

    @staticmethod
    def _to_device(tensor):
        """Move ``tensor`` to the GPU when ``USE_GPU`` is set."""
        return tensor.cuda() if USE_GPU else tensor

    def save_model(self):
        """Save the actor-critic and predictor weights as CPU checkpoints.

        The directories of ``NET_PATH`` and ``PRED_PATH`` are created when
        missing, so the first save of a fresh checkout does not fail.
        """
        for path in (NET_PATH, PRED_PATH):
            folder = os.path.dirname(path)
            if folder:
                os.makedirs(folder, exist_ok=True)

        # store CPU tensors, then move the networks back to the GPU
        self.net.cpu()
        self.rand_pred.cpu()

        self.net.save(NET_PATH)
        self.rand_pred.save(PRED_PATH)
        if USE_GPU:
            self.net.cuda()
            self.rand_pred.cuda()

    def load_model(self):
        """Load the actor-critic and predictor weights from their checkpoints.

        NOTE(review): the random target network is neither saved nor loaded,
        so a restored predictor is paired with a newly initialised target.
        """
        self.net.cpu()
        self.rand_pred.cpu()

        self.net.load(NET_PATH)
        self.rand_pred.load(PRED_PATH)
        if USE_GPU:
            self.net.cuda()
            self.rand_pred.cuda()

    def choose_action(self, x):
        """Sample one action per environment from the current policy.

        Args:
            x: np.array of observations, assumed to be (nenvs, 4, 84, 84).

        Returns:
            Tuple of np.arrays ``(actions, state_values, int_state_values,
            selected_log_probs)``, each with one entry per environment.
        """
        self.memory_counter += 1
        x = self._to_device(torch.FloatTensor(x))
        # pure inference: no autograd graph is needed for acting
        with torch.no_grad():
            action_log_probs, state_values, int_state_values = self.net(x)
            probs = F.softmax(action_log_probs, dim=1).cpu().numpy()
        # sample actions
        actions = np.array([np.random.choice(N_ACTIONS, p=p) for p in probs])
        # convert tensor to np.array
        action_log_probs = action_log_probs.cpu().numpy()
        state_values = state_values.squeeze(1).cpu().numpy()
        int_state_values = int_state_values.squeeze(1).cpu().numpy()
        # log-probability of each sampled action
        selected_log_probs = np.array(
            [row[action] for row, action in zip(action_log_probs, actions)])
        return actions, state_values, int_state_values, selected_log_probs

    def r_int(self, s):
        """Return the intrinsic reward for each observation in ``s``.

        The reward is the mean squared error between the target and the
        predictor features, averaged over the feature dimension.
        """
        s = torch.FloatTensor(s)
        # feature normalization part in RND (disabled)
#         r_input = list()
#         for i in range(len(s)):
#             r_input.append((s[i, -1] - self.s_mu)/(self.s_sigma + 1e-8))
#         s = torch.clamp(torch.FloatTensor(r_input).unsqueeze(1), -5., 5.) # (N_ENVS, 1, 84, 84)
        s = self._to_device(s)
        # pure inference: no autograd graph is needed for the reward
        with torch.no_grad():
            r_target = self.rand_target(s)  # (N_ENVS, 256)
            r_pred = self.rand_pred(s)  # (N_ENVS, 256)
            r_int = torch.mean(F.mse_loss(r_target, r_pred, reduction='none'), dim=1)

        return r_int.cpu().numpy()

    def learn_predict(self, s):
        """Run one optimisation step of the predictor network on ``s``.

        The loss is the mean prediction error plus a zero-centred gradient
        penalty on the error's gradient with respect to the input.

        Returns:
            np.array with the per-sample prediction error before the step.
        """
        s = torch.FloatTensor(s)
        # feature normalization part in RND (disabled)
#         r_input = list()
#         for i in range(len(s)):
#             r_input.append((s[i, -1] - self.s_mu)/(self.s_sigma + 1e-8))
#         s = torch.clamp(torch.FloatTensor(r_input).unsqueeze(1), -5., 5.) # (N_ENVS, 1, 84, 84)
        s = self._to_device(s)
        # the gradient penalty differentiates the error w.r.t. the input
        s.requires_grad_(True)
        r_target = self.rand_target(s)  # (N_ENVS, 256)
        r_pred = self.rand_pred(s)  # (N_ENVS, 256)
        r_int = torch.mean(F.mse_loss(r_pred, r_target, reduction='none'), dim=1)
        # (N_ENVS)
        # zero-centered gradient penalty for vanishing gradient problem. You can remove this part
        # check https://arxiv.org/abs/1801.04406 for more information.
        grad = autograd.grad(r_int, s, create_graph=True,
                             grad_outputs=torch.ones_like(r_int),
                             retain_graph=True)[0].view(len(s), -1)
        grad = grad.norm(dim=1)
        loss = r_int.mean() + 100.0 * (grad ** 2).mean()

        self.rand_pred_opt.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.rand_pred.parameters(), MAX_GRAD_NORM)
        self.rand_pred_opt.step()
        return r_int.detach().cpu().numpy()

    def learn(self, obs, returns, int_returns, masks, actions, values, int_values,  selected_log_probs):
        """Run one PPO optimisation step on a mini-batch.

        Args:
            obs: observations, (m, 4, 84, 84).
            returns: extrinsic returns, (m).
            int_returns: intrinsic returns, (m).
            masks: accepted for interface compatibility; not used here.
            actions: actions taken during the rollout, (m).
            values: extrinsic value estimates from the rollout, (m); only
                needed by the commented-out combined advantage.
            int_values: intrinsic value estimates from the rollout, (m).
            selected_log_probs: rollout log-probabilities of ``actions``, (m).

        Returns:
            ``(actor_loss, critic_loss, ent_loss, total_loss)`` as floats
            rounded to four decimals.
        """
        # np.array -> torch.Tensor
        obs = self._to_device(torch.FloatTensor(obs))  # (m, 4, 84, 84)
        returns = self._to_device(torch.FloatTensor(returns))  # (m)
        int_returns = self._to_device(torch.FloatTensor(int_returns))  # (m)
        actions = self._to_device(torch.LongTensor(actions))  # (m)
        selected_log_probs = self._to_device(torch.FloatTensor(selected_log_probs))  # (m)
        values = self._to_device(torch.FloatTensor(values))  # (m)
        int_values = self._to_device(torch.FloatTensor(int_values))  # (m)

        # get action log probs and state values
        action_log_probs, state_values, int_state_values = self.net(obs)
        # (m, N_ACTIONS), (m, 1), (m, 1)

        # calculate the advantages
        # original RND
#         advs = 2 * (returns - values) + (int_returns - int_values)
        # only intrinsic motivation agent's advantage
        advs = int_returns - int_values
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        # calc probs
        probs = F.softmax(action_log_probs, dim=1)  # (m, N_ACTIONS)

        # sum(p * log p) is the negative entropy, so minimising this term
        # pushes the policy entropy up
        ent_loss = ENT_COEF * (action_log_probs * probs).sum(dim=1).mean()

        # log-probabilities of the taken actions under the current policy
        cur_log_probs = action_log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)  # (m)
        ratio = torch.exp(cur_log_probs - selected_log_probs)  # (m)

        # clipped surrogate actor loss
        surr1 = ratio * advs  # (m)
        surr2 = torch.clamp(ratio, 1.0 - self.clip_range, 1.0 + self.clip_range) * advs  # (m)
        actor_loss = -torch.min(surr1, surr2).mean()
        # critic loss
        critic_loss = F.smooth_l1_loss(state_values.squeeze(1), returns)
        # int critic loss
        int_critic_loss = F.smooth_l1_loss(int_state_values.squeeze(1), int_returns)

        loss = actor_loss + critic_loss + ent_loss + int_critic_loss

        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.net.parameters(), MAX_GRAD_NORM)
        self.optimizer.step()

        return (round(actor_loss.item(), 4), round(critic_loss.item(), 4),
                round(ent_loss.item(), 4), round(loss.item(), 4))

## 학습

In [5]:
ppo = PPO()
runner = Runner(env=env, model=ppo, nsteps=TRAJ_LEN, gamma=GAMMA, int_gamma=INT_GAMMA, lam=LAMBDA, rnd_start=RND_START)

# pickle holding the logged mean returns, stored next to the network checkpoints
RESULT_PATH = os.path.join(os.path.dirname(NET_PATH), 'ppo_result.pkl')

# model load with check: ppo.load_model() reads NET_PATH and PRED_PATH,
# so those are the files that must exist
if LOAD and os.path.isfile(NET_PATH) and os.path.isfile(PRED_PATH):
    ppo.load_model()
    if os.path.isfile(RESULT_PATH):
        # the file is written by this notebook; do not point it at untrusted data
        with open(RESULT_PATH, 'rb') as pkl_file:
            result = pickle.load(pkl_file)
    else:
        result = []
    print('Load complete!')
else:
    result = []
    print('Initialize results!')

# make sure every output directory exists before the first save
if SAVE:
    for path in (NET_PATH, PRED_PATH, RESULT_PATH):
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)

print('Collecting experience...')

# recent episode infos used for the running mean return
epinfobuf = deque(maxlen=100)
# in PPO, we iterate over optimization step
nbatch = N_ENVS * TRAJ_LEN
nupdates = N_STEP // nbatch
# check learning time
start_time = time.time()

for update in tqdm(range(1, nupdates+1)):
    # collect one rollout from every environment
    obs, returns, int_rewards, int_returns, masks, actions, values, int_values, neglogpacs, epinfos = runner.run()
    epinfobuf.extend(epinfos)

    if ppo.memory_counter > RND_START:
        inds = np.arange(nbatch)
        for _ in range(N_OPT_EPOCHS):
            # shuffle indices for i.i.d.
            np.random.shuffle(inds)
            # walk over the rollout in mini-batches of BATCH_SIZE
            for start in range(0, nbatch, BATCH_SIZE):
                end = start + BATCH_SIZE
                mbinds = inds[start:end]
                slices = (arr[mbinds] for arr in (obs, returns, int_returns, masks, actions, values, int_values, neglogpacs))
                actor_loss, critic_loss, ent_loss, total_loss = ppo.learn(*slices)
                # train the predictor on roughly a quarter of the mini-batches
                if np.random.rand() <= 0.25:
                    ppo.learn_predict(obs[mbinds])

        if update % LOG_FREQ == 0:
            # print log and save
            # check time interval
            time_interval = round(time.time() - start_time, 2)
            # mean return over the buffered episodes (nan while none finished)
            if epinfobuf:
                mean_100_ep_return = round(np.mean([epinfo['r'] for epinfo in epinfobuf]), 2)
            else:
                mean_100_ep_return = float('nan')
            result.append(mean_100_ep_return)
            # print epi log
            print('N : ',update,
                  '| Return mean: ', mean_100_ep_return,
                  '| R_int_mean : ', round(np.mean(int_rewards),3),
                  '| R_int_std : ', round(np.std(int_rewards),3),
                  '| Values : ', round(np.mean(values), 3),
                  '| Int_values : ', round(np.mean(int_values), 3),
                  '| Time:',time_interval,
                  '| Used Step:',ppo.memory_counter*N_ENVS)
            # save the networks and the result log so a later run can resume
            if SAVE:
                ppo.save_model()
                with open(RESULT_PATH, 'wb') as pkl_file:
                    pickle.dump(result, pkl_file)
env.close()

  0%|          | 0/19531 [00:00<?, ?it/s]

Initialize results!
Collecting experience...


  0%|          | 10/19531 [00:17<9:18:23,  1.72s/it]

N :  10 | Return mean:  8.05 | R_int_mean :  0.046 | R_int_std :  0.006 | Values :  0.421 | Int_values :  3.121 | Time: 17.17 | Used Step: 5200


  0%|          | 20/19531 [00:33<8:35:40,  1.59s/it]

N :  20 | Return mean:  8.0 | R_int_mean :  0.077 | R_int_std :  0.009 | Values :  0.453 | Int_values :  1.718 | Time: 33.72 | Used Step: 10400


  0%|          | 30/19531 [00:49<8:27:14,  1.56s/it]

N :  30 | Return mean:  8.47 | R_int_mean :  0.082 | R_int_std :  0.011 | Values :  0.582 | Int_values :  1.887 | Time: 49.3 | Used Step: 15600


  0%|          | 40/19531 [01:05<8:26:53,  1.56s/it]

N :  40 | Return mean:  8.86 | R_int_mean :  0.076 | R_int_std :  0.01 | Values :  0.619 | Int_values :  1.729 | Time: 65.97 | Used Step: 20800


  0%|          | 50/19531 [01:22<9:09:16,  1.69s/it]

N :  50 | Return mean:  8.88 | R_int_mean :  0.078 | R_int_std :  0.01 | Values :  0.587 | Int_values :  2.096 | Time: 82.32 | Used Step: 26000


  0%|          | 60/19531 [01:38<9:09:11,  1.69s/it]

N :  60 | Return mean:  9.07 | R_int_mean :  0.067 | R_int_std :  0.009 | Values :  0.877 | Int_values :  2.457 | Time: 98.73 | Used Step: 31200


  0%|          | 70/19531 [01:55<9:26:36,  1.75s/it]

N :  70 | Return mean:  9.54 | R_int_mean :  0.075 | R_int_std :  0.009 | Values :  0.774 | Int_values :  1.934 | Time: 115.72 | Used Step: 36400


  0%|          | 80/19531 [02:12<8:59:48,  1.67s/it]

N :  80 | Return mean:  9.0 | R_int_mean :  0.066 | R_int_std :  0.008 | Values :  0.6 | Int_values :  1.915 | Time: 132.0 | Used Step: 41600


  0%|          | 90/19531 [02:29<8:42:02,  1.61s/it] 

N :  90 | Return mean:  8.07 | R_int_mean :  0.093 | R_int_std :  0.012 | Values :  0.461 | Int_values :  1.925 | Time: 149.21 | Used Step: 46800


  1%|          | 100/19531 [02:46<8:59:29,  1.67s/it]

N :  100 | Return mean:  7.95 | R_int_mean :  0.083 | R_int_std :  0.011 | Values :  0.699 | Int_values :  1.938 | Time: 166.31 | Used Step: 52000


  1%|          | 110/19531 [03:02<8:57:21,  1.66s/it]

N :  110 | Return mean:  8.08 | R_int_mean :  0.073 | R_int_std :  0.009 | Values :  0.925 | Int_values :  2.128 | Time: 182.58 | Used Step: 57200


  1%|          | 120/19531 [03:18<8:49:58,  1.64s/it]

N :  120 | Return mean:  8.45 | R_int_mean :  0.076 | R_int_std :  0.01 | Values :  0.84 | Int_values :  2.018 | Time: 198.75 | Used Step: 62400


  1%|          | 130/19531 [03:35<9:05:49,  1.69s/it]

N :  130 | Return mean:  8.41 | R_int_mean :  0.074 | R_int_std :  0.01 | Values :  0.839 | Int_values :  2.152 | Time: 215.28 | Used Step: 67600


  1%|          | 140/19531 [03:51<8:26:35,  1.57s/it]

N :  140 | Return mean:  8.72 | R_int_mean :  0.061 | R_int_std :  0.007 | Values :  0.832 | Int_values :  2.244 | Time: 231.25 | Used Step: 72800


  1%|          | 150/19531 [04:07<8:30:13,  1.58s/it]

N :  150 | Return mean:  9.53 | R_int_mean :  0.064 | R_int_std :  0.008 | Values :  0.655 | Int_values :  2.095 | Time: 247.61 | Used Step: 78000


  1%|          | 160/19531 [04:25<9:26:05,  1.75s/it]

N :  160 | Return mean:  10.14 | R_int_mean :  0.072 | R_int_std :  0.009 | Values :  0.637 | Int_values :  2.186 | Time: 265.0 | Used Step: 83200


  1%|          | 170/19531 [04:41<8:44:30,  1.63s/it]

N :  170 | Return mean:  10.25 | R_int_mean :  0.065 | R_int_std :  0.008 | Values :  0.582 | Int_values :  1.659 | Time: 281.19 | Used Step: 88400


  1%|          | 180/19531 [04:59<9:56:27,  1.85s/it]

N :  180 | Return mean:  10.23 | R_int_mean :  0.078 | R_int_std :  0.01 | Values :  0.872 | Int_values :  2.274 | Time: 299.08 | Used Step: 93600


  1%|          | 190/19531 [05:15<8:26:49,  1.57s/it]

N :  190 | Return mean:  10.0 | R_int_mean :  0.064 | R_int_std :  0.009 | Values :  0.933 | Int_values :  2.449 | Time: 315.8 | Used Step: 98800


  1%|          | 200/19531 [05:32<8:28:56,  1.58s/it]

N :  200 | Return mean:  9.89 | R_int_mean :  0.082 | R_int_std :  0.011 | Values :  0.713 | Int_values :  1.759 | Time: 332.15 | Used Step: 104000


  1%|          | 210/19531 [05:49<9:42:27,  1.81s/it]

N :  210 | Return mean:  9.8 | R_int_mean :  0.084 | R_int_std :  0.011 | Values :  0.876 | Int_values :  2.583 | Time: 349.18 | Used Step: 109200


  1%|          | 220/19531 [06:05<8:18:34,  1.55s/it]

N :  220 | Return mean:  9.95 | R_int_mean :  0.076 | R_int_std :  0.009 | Values :  0.721 | Int_values :  1.633 | Time: 365.54 | Used Step: 114400


  1%|          | 230/19531 [06:22<9:01:06,  1.68s/it]

N :  230 | Return mean:  9.92 | R_int_mean :  0.078 | R_int_std :  0.01 | Values :  0.82 | Int_values :  2.287 | Time: 382.68 | Used Step: 119600


  1%|          | 240/19531 [06:39<8:36:26,  1.61s/it]

N :  240 | Return mean:  9.8 | R_int_mean :  0.067 | R_int_std :  0.008 | Values :  0.855 | Int_values :  2.032 | Time: 399.39 | Used Step: 124800


  1%|▏         | 250/19531 [06:56<9:15:52,  1.73s/it]

N :  250 | Return mean:  9.64 | R_int_mean :  0.061 | R_int_std :  0.008 | Values :  0.878 | Int_values :  2.115 | Time: 416.08 | Used Step: 130000


  1%|▏         | 260/19531 [07:13<9:07:09,  1.70s/it]

N :  260 | Return mean:  9.8 | R_int_mean :  0.065 | R_int_std :  0.009 | Values :  1.109 | Int_values :  2.063 | Time: 433.11 | Used Step: 135200


  1%|▏         | 270/19531 [07:29<8:37:06,  1.61s/it]

N :  270 | Return mean:  9.89 | R_int_mean :  0.08 | R_int_std :  0.011 | Values :  0.758 | Int_values :  1.729 | Time: 449.15 | Used Step: 140400


  1%|▏         | 280/19531 [07:47<9:53:11,  1.85s/it]

N :  280 | Return mean:  10.34 | R_int_mean :  0.059 | R_int_std :  0.008 | Values :  0.949 | Int_values :  2.119 | Time: 467.33 | Used Step: 145600


  1%|▏         | 290/19531 [08:04<9:07:28,  1.71s/it] 

N :  290 | Return mean:  9.97 | R_int_mean :  0.077 | R_int_std :  0.01 | Values :  0.49 | Int_values :  1.837 | Time: 484.7 | Used Step: 150800


  2%|▏         | 300/19531 [08:21<9:13:34,  1.73s/it]

N :  300 | Return mean:  9.78 | R_int_mean :  0.053 | R_int_std :  0.007 | Values :  1.605 | Int_values :  2.79 | Time: 501.95 | Used Step: 156000


  2%|▏         | 310/19531 [08:38<8:34:55,  1.61s/it]

N :  310 | Return mean:  9.57 | R_int_mean :  0.09 | R_int_std :  0.012 | Values :  0.842 | Int_values :  1.933 | Time: 518.91 | Used Step: 161200


  2%|▏         | 320/19531 [08:55<8:51:12,  1.66s/it]

N :  320 | Return mean:  9.46 | R_int_mean :  0.06 | R_int_std :  0.008 | Values :  0.938 | Int_values :  1.672 | Time: 535.53 | Used Step: 166400


  2%|▏         | 330/19531 [09:11<8:24:19,  1.58s/it]

N :  330 | Return mean:  9.04 | R_int_mean :  0.063 | R_int_std :  0.008 | Values :  1.064 | Int_values :  2.289 | Time: 551.6 | Used Step: 171600


  2%|▏         | 340/19531 [09:29<9:23:57,  1.76s/it]

N :  340 | Return mean:  8.97 | R_int_mean :  0.082 | R_int_std :  0.011 | Values :  0.854 | Int_values :  2.041 | Time: 569.08 | Used Step: 176800


  2%|▏         | 350/19531 [09:45<9:11:45,  1.73s/it]

N :  350 | Return mean:  8.89 | R_int_mean :  0.059 | R_int_std :  0.007 | Values :  0.628 | Int_values :  1.993 | Time: 585.58 | Used Step: 182000


  2%|▏         | 360/19531 [10:03<9:15:42,  1.74s/it]

N :  360 | Return mean:  8.91 | R_int_mean :  0.064 | R_int_std :  0.008 | Values :  1.069 | Int_values :  2.198 | Time: 603.42 | Used Step: 187200


  2%|▏         | 370/19531 [10:20<8:53:26,  1.67s/it]

N :  370 | Return mean:  8.89 | R_int_mean :  0.058 | R_int_std :  0.008 | Values :  1.428 | Int_values :  2.906 | Time: 620.23 | Used Step: 192400


  2%|▏         | 380/19531 [10:37<9:11:40,  1.73s/it]

N :  380 | Return mean:  8.96 | R_int_mean :  0.056 | R_int_std :  0.007 | Values :  1.028 | Int_values :  1.928 | Time: 637.14 | Used Step: 197600


  2%|▏         | 390/19531 [10:53<8:28:12,  1.59s/it]

N :  390 | Return mean:  9.09 | R_int_mean :  0.069 | R_int_std :  0.01 | Values :  1.118 | Int_values :  2.441 | Time: 653.06 | Used Step: 202800


  2%|▏         | 400/19531 [11:10<8:53:02,  1.67s/it]

N :  400 | Return mean:  9.08 | R_int_mean :  0.059 | R_int_std :  0.008 | Values :  1.31 | Int_values :  2.608 | Time: 670.22 | Used Step: 208000


  2%|▏         | 410/19531 [11:27<9:07:45,  1.72s/it]

N :  410 | Return mean:  9.35 | R_int_mean :  0.05 | R_int_std :  0.006 | Values :  1.566 | Int_values :  2.809 | Time: 687.75 | Used Step: 213200


  2%|▏         | 420/19531 [11:44<8:54:02,  1.68s/it]

N :  420 | Return mean:  9.3 | R_int_mean :  0.063 | R_int_std :  0.008 | Values :  1.297 | Int_values :  2.433 | Time: 704.29 | Used Step: 218400


  2%|▏         | 430/19531 [12:00<8:55:47,  1.68s/it]

N :  430 | Return mean:  9.51 | R_int_mean :  0.054 | R_int_std :  0.007 | Values :  1.452 | Int_values :  2.73 | Time: 720.42 | Used Step: 223600


  2%|▏         | 440/19531 [12:16<8:27:58,  1.60s/it]

N :  440 | Return mean:  9.67 | R_int_mean :  0.08 | R_int_std :  0.01 | Values :  0.94 | Int_values :  2.325 | Time: 736.52 | Used Step: 228800


  2%|▏         | 450/19531 [12:32<8:34:49,  1.62s/it]

N :  450 | Return mean:  9.97 | R_int_mean :  0.07 | R_int_std :  0.01 | Values :  1.727 | Int_values :  3.07 | Time: 752.63 | Used Step: 234000


  2%|▏         | 460/19531 [12:48<8:46:13,  1.66s/it]

N :  460 | Return mean:  9.98 | R_int_mean :  0.059 | R_int_std :  0.008 | Values :  1.013 | Int_values :  2.204 | Time: 768.96 | Used Step: 239200


  2%|▏         | 470/19531 [13:05<8:51:56,  1.67s/it]

N :  470 | Return mean:  10.05 | R_int_mean :  0.062 | R_int_std :  0.008 | Values :  1.196 | Int_values :  2.852 | Time: 785.75 | Used Step: 244400


  2%|▏         | 480/19531 [13:22<8:53:13,  1.68s/it]

N :  480 | Return mean:  10.15 | R_int_mean :  0.062 | R_int_std :  0.008 | Values :  1.29 | Int_values :  2.856 | Time: 802.53 | Used Step: 249600


  3%|▎         | 490/19531 [13:38<8:31:59,  1.61s/it]

N :  490 | Return mean:  10.12 | R_int_mean :  0.06 | R_int_std :  0.008 | Values :  0.998 | Int_values :  2.193 | Time: 818.66 | Used Step: 254800


  3%|▎         | 500/19531 [13:54<8:06:02,  1.53s/it]

N :  500 | Return mean:  10.07 | R_int_mean :  0.075 | R_int_std :  0.009 | Values :  0.612 | Int_values :  1.576 | Time: 834.66 | Used Step: 260000


  3%|▎         | 510/19531 [14:12<9:15:59,  1.75s/it]

N :  510 | Return mean:  9.81 | R_int_mean :  0.067 | R_int_std :  0.009 | Values :  1.016 | Int_values :  2.084 | Time: 852.22 | Used Step: 265200


  3%|▎         | 520/19531 [14:29<9:28:16,  1.79s/it]

N :  520 | Return mean:  9.76 | R_int_mean :  0.05 | R_int_std :  0.006 | Values :  1.165 | Int_values :  3.124 | Time: 869.43 | Used Step: 270400


  3%|▎         | 530/19531 [14:45<8:50:58,  1.68s/it]

N :  530 | Return mean:  9.65 | R_int_mean :  0.05 | R_int_std :  0.006 | Values :  1.775 | Int_values :  3.201 | Time: 885.51 | Used Step: 275600


  3%|▎         | 540/19531 [15:02<9:02:24,  1.71s/it]

N :  540 | Return mean:  9.42 | R_int_mean :  0.066 | R_int_std :  0.008 | Values :  1.589 | Int_values :  2.711 | Time: 902.03 | Used Step: 280800


  3%|▎         | 550/19531 [15:19<8:42:24,  1.65s/it]

N :  550 | Return mean:  9.53 | R_int_mean :  0.063 | R_int_std :  0.008 | Values :  1.828 | Int_values :  2.904 | Time: 919.07 | Used Step: 286000


  3%|▎         | 560/19531 [15:34<8:36:36,  1.63s/it]

N :  560 | Return mean:  9.59 | R_int_mean :  0.073 | R_int_std :  0.009 | Values :  1.139 | Int_values :  2.187 | Time: 934.88 | Used Step: 291200


  3%|▎         | 570/19531 [15:51<8:28:30,  1.61s/it]

N :  570 | Return mean:  9.45 | R_int_mean :  0.062 | R_int_std :  0.008 | Values :  1.034 | Int_values :  2.259 | Time: 951.36 | Used Step: 296400


  3%|▎         | 580/19531 [16:06<7:53:41,  1.50s/it]

N :  580 | Return mean:  9.18 | R_int_mean :  0.066 | R_int_std :  0.009 | Values :  0.867 | Int_values :  2.199 | Time: 966.78 | Used Step: 301600


  3%|▎         | 590/19531 [16:23<8:19:44,  1.58s/it]

N :  590 | Return mean:  9.15 | R_int_mean :  0.067 | R_int_std :  0.009 | Values :  1.309 | Int_values :  2.798 | Time: 983.57 | Used Step: 306800


  3%|▎         | 600/19531 [16:40<8:35:42,  1.63s/it]

N :  600 | Return mean:  8.98 | R_int_mean :  0.049 | R_int_std :  0.006 | Values :  1.835 | Int_values :  3.202 | Time: 1000.46 | Used Step: 312000


  3%|▎         | 607/19531 [16:52<9:00:25,  1.71s/it]

KeyboardInterrupt: 

## 결과 시각화

In [None]:
# learning curve: one point per logged mean return, indexed by log order
plt.plot(np.arange(len(result)), result)
plt.tight_layout()
plt.show()

In [None]:
from matplotlib import animation

def display_frames_as_gif(frames, save_path='./rnd_breakout_result.gif', fps=30):
    """Render ``frames`` as an animation and save it as a GIF.

    Args:
        frames: non-empty sequence of images accepted by ``plt.imshow``.
        save_path: destination file of the GIF.
        fps: frames per second passed to the animation writer.

    Raises:
        ValueError: if ``frames`` is empty.
    """
    if len(frames) == 0:
        raise ValueError('frames must contain at least one image')

    # draw the first frame once; later frames only swap the image data
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=5)
    anim.save(save_path, writer='imagemagick', fps=fps)

In [None]:
# play with the trained agent in a single (non-vectorised) environment
env = wrap(gym.make('BreakoutNoFrameskip-v4'))
s = np.array(env.reset())
total_reward = 0
frames = []
done_counter = 0

for _ in range(10000):
    # Render into buffer.
    frames.append(env.render(mode='rgb_array'))
    # choose_action works on batches: add a batch axis of size one and
    # take the single sampled action back out as a plain int
    a, _, _, _ = ppo.choose_action(np.expand_dims(s, axis=0))
    # take action and get next state
    s_, r, done, info = env.step(int(a[0]))
    s_ = np.array(s_)
    total_reward += r
    if done:
        # stop after the fifth `done` signal
        # (presumably one per lost life -- confirm in wrappers.py)
        done_counter += 1
        if done_counter == 5:
            break
    s = s_
env.close()
print('Total Reward : %.2f'%total_reward)
display_frames_as_gif(frames)

![alt text](./rnd_breakout_result.gif "segment")