## 필요한 모듈 설치 확인

In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

import random
import os
import pickle
import time
from collections import deque

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

from wrappers import wrap, wrap_cover, SubprocVecEnv
from runner import Runner

## 하이퍼 파라미터 정의하기

In [2]:
# ---------------- PPO settings ----------------
# steps collected per environment before each update (passed to Runner as nsteps)
TRAJ_LEN = 1000
# number of passes over the collected batch in each optimization phase
N_OPT_EPOCHS = 4
# entropy coefficient (not referenced by the code visible in this notebook)
ENT_COEF = 1e-2
# forwarded to Runner as `lam` -- presumably the GAE lambda; confirm in runner.py
LAMBDA = 0.95

# ---------------- Environment settings ----------------
# number of sequential frames stacked to form one state
STATE_LEN = 4
# OpenAI Gym environment id
ENV_NAME = 'BreakoutNoFrameskip-v4'
# number of parallel environments
N_ENVS = 4
# vectorized environment: one wrap_cover(...) entry per worker
env = SubprocVecEnv([wrap_cover(ENV_NAME) for _ in range(N_ENVS)])
# report the action / observation spaces exposed by the environment
N_ACTIONS = env.action_space.n
print('N_ACTIONS : ',N_ACTIONS) # 4 for Breakout (see the cell output)
N_STATES = env.observation_space.shape
print('N_STATES : ',N_STATES) # (4, 84, 84)
# total number of simulation steps
N_STEP = 10 ** 7
# discount factor of the MDP
GAMMA = 0.99
# whether to visualize the agent while it plays
RENDERING = False

# ---------------- Training settings ----------------
# use the GPU whenever CUDA is available
USE_GPU = torch.cuda.is_available()
print('USE GPU: '+str(USE_GPU))
# mini-batch size
BATCH_SIZE = 32
# learning rate shared by both Adam optimizers
LR = 1e-4
# gradient-clipping norm (not referenced by the code visible in this notebook)
MAX_GRAD_NORM = 0.1
# log optimization details (not referenced by the code visible in this notebook)
LOG_OPT = False

# ---------------- Save & load settings ----------------
# log every LOG_FREQ updates
LOG_FREQ = 1
# whether to save checkpoints / resume from them
SAVE = True
LOAD = False
# checkpoint paths for the actor-critic net and the regret net
NET_PATH = './data/model/ppo_net.pkl'
REGRET_PATH = './data/model/regret_net.pkl'



N_ACTIONS :  4
N_STATES :  (4, 84, 84)
USE GPU: True




## 네트워크 구조 정의하기

In [3]:
class ConvNet(nn.Module):
    """Actor-critic network: a shared convolutional trunk with two linear heads.

    The trunk maps a stack of STATE_LEN frames to a 256-dimensional feature
    vector; `actor` turns it into per-action log-probabilities and `critic`
    into a scalar state value.
    """

    def __init__(self):
        super().__init__()
        # Three conv layers; for 84x84 inputs the spatial size goes
        # 84 -> 20 -> 9 -> 7, hence the 7 * 7 * 64 inputs of `fc`.
        conv_stack = [
            nn.Conv2d(STATE_LEN, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.feature_extraction = nn.Sequential(*conv_stack)
        self.fc = nn.Linear(7 * 7 * 64, 256)
        # policy head
        self.actor = nn.Linear(256, N_ACTIONS)
        # value head
        self.critic = nn.Linear(256, 1)

        # Orthogonal weights (gain sqrt(2)) and zero biases for every
        # convolutional and linear layer, heads included.
        gain = np.sqrt(2)
        for layer in self.modules():
            if not isinstance(layer, (nn.Conv2d, nn.Linear)):
                continue
            nn.init.orthogonal_(layer.weight, gain=gain)
            nn.init.constant_(layer.bias, 0.0)

    def forward(self, x):
        """Return `(action_log_prob, state_value)` for a batch of states.

        `x` is a tensor of shape (m, 4, 84, 84); it is divided by 255 before
        entering the trunk. The outputs have shape (m, N_ACTIONS) and (m, 1).
        """
        features = self.feature_extraction(x / 255.0)
        # flatten everything except the mini-batch dimension
        flat = features.view(features.size(0), -1)
        hidden = F.relu(self.fc(flat))
        # log_softmax is used instead of softmax for numerical stability
        log_policy = F.log_softmax(self.actor(hidden), dim=1)
        value = self.critic(hidden)
        return log_policy, value

    def save(self, PATH):
        """Write this network's state dict to PATH."""
        weights = self.state_dict()
        torch.save(weights, PATH)

    def load(self, PATH):
        """Restore this network's state dict from PATH."""
        weights = torch.load(PATH)
        self.load_state_dict(weights)
        
class RegretNet(nn.Module):
    """Convolutional network that outputs one value per action.

    PPO trains these outputs as per-action regret estimates and uses their
    softmax as the target distribution for the policy.
    """

    def __init__(self):
        super(RegretNet, self).__init__()
        # architecture def (same trunk layout as ConvNet)
        self.feature_extraction = nn.Sequential(
            nn.Conv2d(STATE_LEN, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )
        self.fc = nn.Linear(7 * 7 * 64, 256)
        # per-action value head
        self.action_critic = nn.Linear(256, N_ACTIONS)

        # conv layers: orthogonal init; linear layers: Kaiming-normal init;
        # all biases start at zero
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
                nn.init.constant_(m.bias, 0.0)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.constant_(m.bias, 0.0)

    def forward(self, x):
        """Return per-action values of shape (m, N_ACTIONS).

        `x` is a tensor of shape (m, 4, 84, 84); it is divided by 255 before
        entering the convolutional trunk.
        """
        x = self.feature_extraction(x / 255.0)
        # x.size(0) : mini-batch size
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc(x))
        action_value = self.action_critic(x)

        return action_value

    def save(self, PATH):
        """Write this network's state dict to PATH."""
        # Honor the caller-supplied path rather than a module-level constant,
        # so several RegretNet instances can be checkpointed separately.
        torch.save(self.state_dict(), PATH)

    def load(self, PATH):
        """Restore this network's state dict from PATH."""
        self.load_state_dict(torch.load(PATH))

## PPO 정의하기

In [4]:
class PPO:
    """Agent holding an actor-critic net and a pair of regret nets.

    `net` supplies the policy and the state values. `regret_net` is trained
    towards `regret_net_old(obs)[action] + advantage`, and the policy is then
    pulled towards `softmax(regret_net(obs))`.
    """

    def __init__(self):
        self.net = ConvNet()
        self.regret_net = RegretNet()
        # frozen copy used to build regret targets; refreshed via update_target
        # NOTE(review): it starts from its own random initialization, not as a
        # copy of regret_net -- confirm this is intended.
        self.regret_net_old = RegretNet()
        # use gpu
        if USE_GPU:
            self.net = self.net.cuda()
            self.regret_net = self.regret_net.cuda()
            self.regret_net_old = self.regret_net_old.cuda()

        # number of choose_action calls made so far
        self.memory_counter = 0

        # one optimizer per trainable network
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=LR)
        self.regret_opt = torch.optim.Adam(self.regret_net.parameters(), lr=LR)

    @staticmethod
    def _to_device(tensor):
        """Move `tensor` to the GPU when USE_GPU is set, else return it as is."""
        return tensor.cuda() if USE_GPU else tensor

    def save_model(self):
        """Save the actor-critic net to NET_PATH (on the CPU, for portability).

        NOTE(review): the regret nets are not checkpointed here.
        """
        self.net.cpu()
        self.net.save(NET_PATH)
        if USE_GPU:
            self.net.cuda()

    def load_model(self):
        """Load the actor-critic net from NET_PATH."""
        self.net.cpu()
        self.net.load(NET_PATH)
        if USE_GPU:
            self.net.cuda()

    def update_target(self, target, pred, update_rate):
        """Blend `pred`'s parameters into `target` in place.

        Each target parameter becomes
        `(1 - update_rate) * target + update_rate * pred`; an `update_rate`
        of 1.0 is therefore a hard copy.
        """
        with torch.no_grad():
            for target_param, pred_param in zip(target.parameters(), pred.parameters()):
                target_param.copy_((1.0 - update_rate) * target_param
                                   + update_rate * pred_param)

    def choose_action(self, x):
        """Sample one action per environment from the current policy.

        `x` is assumed to be a np.array of shape (nenvs, 4, 84, 84).
        Returns `(actions, state_values, selected_log_probs)`, each a
        np.array with one entry per environment.
        """
        self.memory_counter += 1
        x = self._to_device(torch.FloatTensor(x))
        # Pure inference: no autograd graph is needed during rollouts.
        with torch.no_grad():
            action_log_probs, state_values = self.net(x)  # (nenvs, N_ACTIONS), (nenvs, 1)
            probs = F.softmax(action_log_probs, dim=1).cpu().numpy()
        # renormalize so np.random.choice accepts the float32 probabilities
        probs = (probs + 1e-8) / np.sum(probs + 1e-8, axis=1, keepdims=True)
        # sample actions
        actions = np.array([np.random.choice(N_ACTIONS, p=p) for p in probs])
        # convert tensors to np.array
        action_log_probs = action_log_probs.cpu().numpy()
        state_values = state_values.squeeze(1).cpu().numpy()
        # log-probability of the action actually taken in each environment
        selected_log_probs = action_log_probs[np.arange(len(actions)), actions]
        return actions, state_values, selected_log_probs

    def learn_value(self, obs, returns, masks, actions, values, selected_log_probs):
        """Run one critic update on a minibatch.

        Only `obs` (m, 4, 84, 84) and `returns` (m) are used; the remaining
        arguments exist so all learn_* methods share one call signature.
        """
        obs = self._to_device(torch.FloatTensor(obs))
        returns = self._to_device(torch.FloatTensor(returns))

        _, state_values = self.net(obs)
        # critic loss: Huber loss between predicted values and returns
        critic_loss = F.smooth_l1_loss(state_values.squeeze(1), returns)

        self.optimizer.zero_grad()
        critic_loss.backward()
        self.optimizer.step()

    def learn_regret(self, obs, returns, masks, actions, values, selected_log_probs):
        """Run one regret-net update on a minibatch and return the loss.

        The target for the taken action is the frozen net's estimate plus
        the advantage `returns - values`. Returns the loss rounded to four
        decimals.
        """
        # calculate the advantages
        advs = returns - values

        obs = self._to_device(torch.FloatTensor(obs))  # (m, 4, 84, 84)
        advs = self._to_device(torch.FloatTensor(advs))  # (m)
        action_index = self._to_device(torch.LongTensor(actions)).unsqueeze(1)  # (m, 1)

        regret = self.regret_net(obs).gather(1, action_index).squeeze(1)  # (m)
        # the target comes from the frozen net, so no graph is required
        with torch.no_grad():
            previous = self.regret_net_old(obs).gather(1, action_index).squeeze(1)
            regret_target = previous + advs  # (m)

        # regret loss
        regret_loss = F.smooth_l1_loss(regret, regret_target)

        self.regret_opt.zero_grad()
        regret_loss.backward()
        self.regret_opt.step()
        return round(regret_loss.item(), 4)

    def learn_policy(self, obs, returns, masks, actions, values, selected_log_probs):
        """Run one policy update on a minibatch.

        Minimizes the KL divergence from the policy to
        `softmax(regret_net(obs))`, summed over the minibatch. Only `obs`
        is used; the other arguments keep the shared call signature.
        """
        obs = self._to_device(torch.FloatTensor(obs))  # (m, 4, 84, 84)

        action_log_probs, _ = self.net(obs)  # (m, N_ACTIONS)
        # fixed target distribution derived from the regret estimates
        target = F.softmax(self.regret_net(obs), dim=1).detach()  # (m, N_ACTIONS)

        # calc probs
        probs = F.softmax(action_log_probs, dim=1)  # (m, N_ACTIONS)

        loss = torch.sum(probs * (action_log_probs - torch.log(target + 1e-8)))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

## 학습

In [None]:
ppo = PPO()
runner = Runner(env=env, model=ppo, nsteps=TRAJ_LEN, gamma=GAMMA, lam=LAMBDA)

# file holding the pickled history of mean returns (written below when SAVE is set)
RESULT_PATH = './data/model/ppo_result.pkl'


def _run_minibatch_epochs(learn_fn, batch, batch_len):
    """Call `learn_fn` on shuffled minibatches of `batch` for N_OPT_EPOCHS epochs.

    `batch` is a tuple of arrays that all have `batch_len` rows. Returns the
    value of the last `learn_fn` call (None when nothing was run).
    """
    inds = np.arange(batch_len)
    last_out = None
    for _ in range(N_OPT_EPOCHS):
        # shuffle indices so minibatches are approximately i.i.d.
        np.random.shuffle(inds)
        for start in range(0, batch_len, BATCH_SIZE):
            mbinds = inds[start:start + BATCH_SIZE]
            last_out = learn_fn(*(arr[mbinds] for arr in batch))
    return last_out


# Resume only when a checkpoint exists. The check uses NET_PATH because that
# is the only file PPO.save_model writes.
if LOAD and os.path.isfile(NET_PATH):
    ppo.load_model()
    if os.path.isfile(RESULT_PATH):
        # NOTE: pickle must only be used on files this notebook wrote itself.
        with open(RESULT_PATH, 'rb') as pkl_file:
            result = pickle.load(pkl_file)
    else:
        result = []
    print('Load complete!')
else:
    result = []
    print('Initialize results!')

print('Collecting experience...')

# infos of the most recent 100 finished episodes
epinfobuf = deque(maxlen=100)
# number of samples gathered per update, and number of updates to run
nbatch = N_ENVS * TRAJ_LEN
nupdates = N_STEP // nbatch
# check learning time
start_time = time.time()

for update in range(1, nupdates+1):
    # collect one rollout from every environment
    obs, returns, masks, actions, values, neglogpacs, epinfos = runner.run()
    epinfobuf.extend(epinfos)
    batch = (obs, returns, masks, actions, values, neglogpacs)

    # phase 1: fit the critic
    _run_minibatch_epochs(ppo.learn_value, batch, nbatch)

    # phase 2: fit the regret net against the frozen copy, then refresh the copy
    regret_loss = _run_minibatch_epochs(ppo.learn_regret, batch, nbatch)
    ppo.update_target(ppo.regret_net_old, ppo.regret_net, 1.0)

    # phase 3: move the policy towards softmax(regret)
    _run_minibatch_epochs(ppo.learn_policy, batch, nbatch)

    if update % LOG_FREQ == 0:
        # print log and save
        # check time interval
        time_interval = round(time.time() - start_time, 2)
        # mean return over the buffered episodes (nan until one has finished)
        if epinfobuf:
            mean_100_ep_return = round(np.mean([epinfo['r'] for epinfo in epinfobuf]),2)
        else:
            mean_100_ep_return = float('nan')
        result.append(mean_100_ep_return)
        # print epi log
        print('N update: ',update,
              '| Mean ep 100 return: ', mean_100_ep_return,
              '/Used Time:',time_interval,
              '/Used Step:',ppo.memory_counter*N_ENVS)
        # save the model together with the history so LOAD can restore both
        if SAVE:
            ppo.save_model()
            with open(RESULT_PATH, 'wb') as pkl_file:
                pickle.dump(result, pkl_file)

Initialize results!
Collecting experience...
N update:  1 | Mean ep 100 return:  0.88 /Used Time: 8.98 /Used Step: 4004
N update:  2 | Mean ep 100 return:  0.81 /Used Time: 17.95 /Used Step: 8008
N update:  3 | Mean ep 100 return:  0.9 /Used Time: 26.81 /Used Step: 12012
N update:  4 | Mean ep 100 return:  1.01 /Used Time: 35.91 /Used Step: 16016
N update:  5 | Mean ep 100 return:  1.08 /Used Time: 44.93 /Used Step: 20020
N update:  6 | Mean ep 100 return:  1.3 /Used Time: 53.89 /Used Step: 24024
N update:  7 | Mean ep 100 return:  1.29 /Used Time: 62.91 /Used Step: 28028
N update:  8 | Mean ep 100 return:  1.44 /Used Time: 71.87 /Used Step: 32032
N update:  9 | Mean ep 100 return:  1.46 /Used Time: 80.83 /Used Step: 36036
N update:  10 | Mean ep 100 return:  1.61 /Used Time: 89.82 /Used Step: 40040
N update:  11 | Mean ep 100 return:  1.81 /Used Time: 98.75 /Used Step: 44044
N update:  12 | Mean ep 100 return:  1.98 /Used Time: 107.7 /Used Step: 48048
N update:  13 | Mean ep 100 retur

N update:  104 | Mean ep 100 return:  12.64 /Used Time: 902.08 /Used Step: 416416
N update:  105 | Mean ep 100 return:  12.76 /Used Time: 910.63 /Used Step: 420420
N update:  106 | Mean ep 100 return:  12.83 /Used Time: 919.11 /Used Step: 424424
N update:  107 | Mean ep 100 return:  12.8 /Used Time: 927.66 /Used Step: 428428
N update:  108 | Mean ep 100 return:  12.77 /Used Time: 936.43 /Used Step: 432432
N update:  109 | Mean ep 100 return:  13.13 /Used Time: 944.98 /Used Step: 436436
N update:  110 | Mean ep 100 return:  13.16 /Used Time: 953.46 /Used Step: 440440
N update:  111 | Mean ep 100 return:  13.39 /Used Time: 961.98 /Used Step: 444444
N update:  112 | Mean ep 100 return:  13.22 /Used Time: 970.68 /Used Step: 448448
N update:  113 | Mean ep 100 return:  13.59 /Used Time: 979.21 /Used Step: 452452
N update:  114 | Mean ep 100 return:  13.75 /Used Time: 987.9 /Used Step: 456456
N update:  115 | Mean ep 100 return:  13.97 /Used Time: 996.62 /Used Step: 460460
N update:  116 | M

## 결과 시각화

In [None]:
# x axis: index of each logged update; y axis: the mean return logged for it
logged_updates = range(len(result))
plt.plot(logged_updates, result)
plt.tight_layout()
plt.show()

In [None]:
from matplotlib import animation

def display_frames_as_gif(frames, path='./cfh_breakout_result.gif', fps=30):
    """Animate `frames` on the current figure and save the result as a GIF.

    Args:
        frames: non-empty sequence of images accepted by `plt.imshow`.
        path: output file; defaults to the location this notebook has
            always written to.
        fps: frame rate passed to the writer.

    The file is written with matplotlib's 'imagemagick' writer.
    """
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        # swap the displayed image for frame i
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=5)
    anim.save(path, writer='imagemagick', fps=fps)

In [None]:
# Single (non-vectorized) environment for the evaluation rollout.
# NOTE(review): this rebinds `env`, which until now referred to the
# SubprocVecEnv used for training; that object is not closed here.
env = wrap(gym.make('BreakoutNoFrameskip-v4'))
s = np.array(env.reset())
total_reward = 0
frames = []

try:
    for t in range(10000):
        # Render into buffer.
        frames.append(env.render(mode = 'rgb_array'))
        # choose_action works on batches, so add and later strip a batch axis
        a, v, l = ppo.choose_action(np.expand_dims(s,axis=0))
        # take action and get next state; step() of a single env gets a plain
        # int rather than a length-1 array
        s_, r, done, info = env.step(int(a[0]))
        s_ = np.array(s_)
        total_reward += r
        if done:
            break
        s = s_
finally:
    # release the emulator even when the rollout fails part-way
    env.close()
print('Total Reward : %.2f'%total_reward)
display_frames_as_gif(frames)

![Breakout agent rollout](./cfh_breakout_result.gif "segment")