## 필요한 모듈 설치 확인

In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import numpy as np

import random
import os
import pickle
import time
from collections import deque

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

from wrappers import wrap, wrap_cover, SubprocVecEnv
from runner import Runner

## 하이퍼 파라미터 정의하기

In [2]:
'''PPO Settings'''
TRAJ_LEN = 1000
N_OPT_EPOCHS = 4
ENT_COEF = 1e-2
CLIP_RANGE = 0.1
LAMBDA = 0.95

'''Environment Settings'''
# sequential images to define state
STATE_LEN = 4
# openai gym env name
ENV_NAME = 'BreakoutNoFrameskip-v4'
# number of environments for A2C
N_ENVS = 4
# define gym 
env = SubprocVecEnv([wrap_cover(ENV_NAME) for i in range(N_ENVS)])
# check gym setting
N_ACTIONS = env.action_space.n;print('N_ACTIONS : ',N_ACTIONS) #  6
N_STATES = env.observation_space.shape;print('N_STATES : ',N_STATES) # (4, 84, 84)
# Total simulation step
N_STEP = 10**7
# gamma for MDP
GAMMA = 0.99
# visualize for agent playing
RENDERING = False

'''Training settings'''
# check GPU usage
USE_GPU = torch.cuda.is_available()
print('USE GPU: '+str(USE_GPU))
# mini-batch size
BATCH_SIZE = 32
# learning rage
LR = 1e-4
# clip gradient
MAX_GRAD_NORM = 0.1
ZERO_GP = True
# log optimization
LOG_OPT = False

'''Save&Load Settings'''
# log frequency
LOG_FREQ = 1
# check save/load
SAVE = True
LOAD = False
# paths for predction net, target net, result log
NET_PATH = './data/model/ppo_net.pkl'
DIS_PATH = './data/model/gail_dis_net.pkl'



N_ACTIONS :  4
N_STATES :  (4, 84, 84)
USE GPU: True


Process Process-2:
Process Process-3:
Process Process-4:
Process Process-1:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/sungyubkim/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/sungyubkim/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/sungyubkim/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/sungyubkim/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/sungyubkim/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/sungyubkim/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/sungyubkim/Dropbox/Dee

## 네트워크 구조 정의하기

In [3]:
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        # architecture def
        self.feature_extraction = nn.Sequential(
            nn.Conv2d(STATE_LEN, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )
        self.fc = nn.Linear(7 * 7 * 64, 256)
        # actor
        self.actor = nn.Linear(256, N_ACTIONS)
        # critic
        self.critic = nn.Linear(256, 1)
            
        # parameter initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                nn.init.orthogonal_(m.weight, gain = np.sqrt(2))
                nn.init.constant_(m.bias, 0.0)
            

    def forward(self, x):
        # x is a tensor of (m, 4, 84, 84)
        x = self.feature_extraction(x / 255.0)
        # x.size(0) : mini-batch size
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc(x))
        # use log_softmax for numerical stability
        action_log_prob = F.log_softmax(self.actor(x), dim=1)
        state_value = self.critic(x)

        return action_log_prob, state_value

    def save(self, PATH):
        torch.save(self.state_dict(),PATH)

    def load(self, PATH):
        self.load_state_dict(torch.load(PATH))
        
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        # architecture def
        self.feature_extraction = nn.Sequential(
            nn.Conv2d(STATE_LEN, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )
        self.fc1 = nn.Linear(7 * 7 * 64, 256)
        self.action_feature = nn.Linear(N_ACTIONS, 256)
        # actor
        self.fc2 = nn.Linear(256, 1)
        
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.orthogonal_(m.weight, gain = np.sqrt(2))
                nn.init.constant_(m.bias, 0.0)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.constant_(m.bias, 0.0)
            

    def forward(self, x, a):
        # x is a tensor of (m, 4, 84, 84)
        x = self.feature_extraction(x / 255.0)
        # x.size(0) : mini-batch size
        x = x.view(x.size(0), -1)
        a_onehot = torch.zeros(x.size(0), N_ACTIONS)
        if USE_GPU:
            a_onehot = a_onehot.cuda(device=0)
        a_onehot.scatter_(1, a.unsqueeze(1), 1)
        x = F.relu(self.fc1(x) * F.leaky_relu(self.action_feature(a_onehot), negative_slope=2e-1))
        action_value = self.fc2(x)

        return action_value

    def save(self, PATH):
        torch.save(self.state_dict(),PATH)

    def load(self, PATH):
        self.load_state_dict(torch.load(PATH))

## PPO 정의하기

In [4]:
class PPO:
    def __init__(self):
        self.net = ConvNet()
        self.dis_net = Discriminator()
        # use gpu
        if USE_GPU:
            self.net = self.net.cuda()
            self.dis_net = self.dis_net.cuda()
            
        # simulator step conter
        self.memory_counter = 0
        
        # create the replay buffer for expert
        with open( "replay.pkl", "rb" ) as f:
            self.expert_replay_buffer = pickle.load(f)
        
        # define optimizer
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=LR)
        self.dis_opt = torch.optim.RMSprop(self.dis_net.parameters(), lr=LR, alpha=0.9)
        
        # ppo clip range
        self.clip_range = CLIP_RANGE
        
    def save_model(self):
        self.net.cpu()
        self.dis_net.cpu()
        self.net.save(NET_PATH)
        self.dis_net.save(DIS_PATH)
        if USE_GPU:
            self.net.cuda()
            self.dis_net.cuda()
            
    def load_model(self):
        self.net.cpu()
        self.dis_net.cpu()
        self.net.load(NET_PATH)
        self.dis_net.load(DIS_PATH)
        if USE_GPU:
            self.net.cuda()
            self.dis_net.cuda()
        
    def choose_action(self, x):
        self.memory_counter += 1
        # Assume that x is a np.array of shape (nenvs, 4, 84, 84)
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()
        # get action log probs and state values
        action_log_probs, state_values = self.net(x) # (nenvs, N_ACTIONS)
        probs = F.softmax(action_log_probs, dim=1).data.cpu().numpy()
        probs = (probs+1e-8)/np.sum((probs+1e-8), axis=1, keepdims=True)
        # sample actions
        actions = np.array([np.random.choice(N_ACTIONS,p=probs[i]) for i in range(len(probs))])
        # convert tensor to np.array
        action_log_probs , state_values = action_log_probs.data.cpu().numpy() , state_values.squeeze(1).data.cpu().numpy()
        # calc selected logprob
        selected_log_probs = np.array([action_log_probs[i][actions[i]] for i in range(len(probs))])
        return actions, state_values, selected_log_probs
    
    def reward_dis(self, s, a):
        s = torch.FloatTensor(s)
        a = torch.LongTensor(a)
        
        if USE_GPU:
            s , a = s.cuda(), a.cuda()
        
        d_reward = -torch.log(torch.sigmoid(self.dis_net(s, a)))+torch.log(1-torch.sigmoid(self.dis_net(s, a)))
        
        return d_reward.squeeze(1).data.cpu().numpy()
    
    def learn_dis(self, obs, returns, masks, actions, values, selected_log_probs):
        # optimize discriminator
        b_s = torch.FloatTensor(obs)
        b_s.requires_grad = True
        b_a = torch.LongTensor(actions)
        
        # data sample from expert experience replay
        e_state_memory, e_action_memory, e_reward_memory, \
        e_next_state_memory, e_done = self.expert_replay_buffer.sample(len(b_s))
        
        e_s = torch.FloatTensor(e_state_memory)
        e_s.requires_grad = True
        e_a = torch.LongTensor(e_action_memory)
        
        if USE_GPU:
            b_s, b_a, e_s, e_a = b_s.cuda(device=0), b_a.cuda(device=0), e_s.cuda(device=0), e_a.cuda(device=0)
        
        d_policy = self.dis_net(b_s, b_a).squeeze(1) # (m)
        d_expert = self.dis_net(e_s, e_a).squeeze(1) # (m)
        
        d_loss = -torch.log(torch.sigmoid(d_policy)).mean() -torch.log(1-torch.sigmoid(d_expert)).mean()
        loss = d_loss
        
        # calc gradient penalty
        if ZERO_GP:
            b_grad = autograd.grad(d_policy, b_s, create_graph=True,
                            grad_outputs=torch.ones_like(d_policy),
                            retain_graph=True, only_inputs=True)[0].view(BATCH_SIZE, -1)
            e_grad = autograd.grad(d_expert, e_s, create_graph=True,
                            grad_outputs=torch.ones_like(d_expert),
                            retain_graph=True, only_inputs=True)[0].view(BATCH_SIZE, -1)
            b_grad = b_grad.norm(dim=1)
            e_grad = e_grad.norm(dim=1)
            gp_loss = 1e+5 * ((b_grad)**2 + (e_grad)**2).mean()
            loss += gp_loss
        
        self.dis_opt.zero_grad()
        loss.backward()
        self.dis_opt.step()
        
        return round(float(d_loss.item()), 4), round(float(gp_loss.item()), 4)

    def learn(self, obs, returns, masks, actions, values, selected_log_probs):
        
        # calculate the advantages
        advs = returns - values
        advs = (advs - advs.mean())/(advs.std() + 1e-8)
        
        # np.array -> torch.Tensor
        obs = torch.FloatTensor(obs) # (m, 4, 84, 84)
        returns = torch.FloatTensor(returns) # (m)
        advs = torch.FloatTensor(advs) # (m)
        actions = torch.LongTensor(actions) # (m)
        selected_log_probs = torch.FloatTensor(selected_log_probs) # (m)
        values = torch.FloatTensor(values) # (m)
        if USE_GPU:
            obs = obs.cuda()
            returns = returns.cuda()
            advs = advs.cuda()
            actions = actions.cuda()
            selected_log_probs = selected_log_probs.cuda()
            values = values.cuda()
        
        # get action log probs and state values
        action_log_probs, state_values = self.net(obs)
        # (m, N_ACTIONS), (m, 1)
        
        # calc probs
        probs = F.softmax(action_log_probs, dim=1)
        # (m, N_ACTIONS)
        
        # calc entropy loss
        ent_loss = ENT_COEF *((action_log_probs * probs).sum(dim=1)).mean()
        # (1)
        
        # calc log probs
        cur_log_probs = action_log_probs.gather(1,actions.unsqueeze(1))
        # cur : (m, 1)
        ratio = torch.exp(cur_log_probs.squeeze(1)-selected_log_probs)
        # (m)
        
        # actor loss
        surr1 = ratio * advs # (m)
        surr2 = torch.clamp(ratio, 1.0 - self.clip_range, 1.0 + self.clip_range)*advs # (m)
        actor_loss = -torch.min(surr1, surr2).mean() # (1)
        # critic loss
        critic_loss = F.smooth_l1_loss(state_values.squeeze(1), returns) # (1)

        loss = actor_loss + critic_loss + ent_loss # (1)
        
        actor_loss, critic_loss, ent_loss, total_loss = actor_loss.data.cpu().numpy(), \
        critic_loss.data.cpu().numpy(), ent_loss.data.cpu().numpy(), loss.data.cpu().numpy()

        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.net.parameters(), MAX_GRAD_NORM)
        self.optimizer.step()
        
        return round(float(actor_loss), 4), round(float(critic_loss), 4),\
    round(float(ent_loss), 4), round(float(total_loss), 4)

## 학습

In [5]:
ppo = PPO()
runner = Runner(env=env, model=ppo, nsteps=TRAJ_LEN, gamma=GAMMA, lam=LAMBDA)

# model load with check
if LOAD and os.path.isfile(PRED_PATH) and os.path.isfile(TARGET_PATH):
    ppo.load_model()
    pkl_file = open(RESULT_PATH,'rb')
    result = pickle.load(pkl_file)
    pkl_file.close()
    print('Load complete!')
else:
    result = []
    print('Initialize results!')

print('Collecting experience...')

# episode step for accumulate reward 
epinfobuf = deque(maxlen=100)
# in PPO, we iterate over optimization step
nbatch = N_ENVS * TRAJ_LEN
nupdates = N_STEP// nbatch
# check learning time
start_time = time.time()

for update in range(1, nupdates+1):
    # get minibatch
    obs, returns, masks, actions, values, neglogpacs, epinfos = runner.run()
    epinfobuf.extend(epinfos)
    
    inds = np.arange(nbatch)
    for start in range(0, nbatch, BATCH_SIZE):
        end = start + BATCH_SIZE
        mbinds = inds[start:end]
        slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
        d_loss, gp_loss = ppo.learn_dis(*slices)
    
    # calculate loss
    inds = np.arange(nbatch)
    for _ in range(N_OPT_EPOCHS):
        a_losses, c_losses, e_losses, t_losses = list(), list(), list(), list()
        # shuffle indices for i.i.d.
        np.random.shuffle(inds)
        # 0 to batch_size with batch_train_size step
        for start in range(0, nbatch, BATCH_SIZE):
            end = start + BATCH_SIZE
            mbinds = inds[start:end]
            slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
            actor_loss, critic_loss, ent_loss, total_loss = ppo.learn(*slices)
            # save opt log
            a_losses.append(actor_loss)
            c_losses.append(critic_loss)
            e_losses.append(ent_loss)
            t_losses.append(total_loss)
        # print opt log
        if LOG_OPT:
            print('Iter ',_,
                 'actor loss : ',round(float(np.mean(a_losses)), 3),
                 'critic loss : ', round(float(np.mean(c_losses)), 3),
                 'ent loss : ', round(float(np.mean(e_losses)), 3),
                 'total loss : ', round(float(np.mean(t_losses)), 3))
            
    if update % LOG_FREQ == 0:
        # print log and save
        # check time interval
        time_interval = round(time.time() - start_time, 2)
        # calc mean return
        mean_100_ep_return = round(np.mean([epinfo['r'] for epinfo in epinfobuf]),2)
        result.append(mean_100_ep_return)
        # print epi log
        print('N update: ',update,
              '| Mean ep 100 return: ', mean_100_ep_return,
              '/Used Time:',time_interval,
              '/Used Step:',ppo.memory_counter*N_ENVS,
             '| Dis loss : ',d_loss,
             '| GP loss : ', gp_loss)
        # save model
        if SAVE:
            ppo.save_model()

Initialize results!
Collecting experience...
N update:  1 | Mean ep 100 return:  1.79 /Used Time: 9.8 /Used Step: 4004 | Dis loss :  1.1856 | GP loss :  0.3294
N update:  2 | Mean ep 100 return:  1.82 /Used Time: 19.57 /Used Step: 8008 | Dis loss :  0.9765 | GP loss :  0.2363
N update:  3 | Mean ep 100 return:  1.69 /Used Time: 29.41 /Used Step: 12012 | Dis loss :  0.783 | GP loss :  0.2175
N update:  4 | Mean ep 100 return:  1.51 /Used Time: 39.19 /Used Step: 16016 | Dis loss :  0.6666 | GP loss :  0.1836
N update:  5 | Mean ep 100 return:  1.5 /Used Time: 49.11 /Used Step: 20020 | Dis loss :  0.72 | GP loss :  0.2073
N update:  6 | Mean ep 100 return:  1.4 /Used Time: 59.13 /Used Step: 24024 | Dis loss :  0.7069 | GP loss :  0.1763
N update:  7 | Mean ep 100 return:  1.28 /Used Time: 69.02 /Used Step: 28028 | Dis loss :  0.6624 | GP loss :  0.1751
N update:  8 | Mean ep 100 return:  1.21 /Used Time: 78.88 /Used Step: 32032 | Dis loss :  0.6206 | GP loss :  0.1813
N update:  9 | Mean 

KeyboardInterrupt: 

## 결과 시각화

In [None]:
plt.plot(range(len(result)), result)
plt.tight_layout()
plt.show()

In [None]:
from matplotlib import animation

def display_frames_as_gif(frames):
    patch = plt.imshow(frames[0])
    plt.axis('off')
    def animate(i):
        patch.set_data(frames[i])
        
    anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=5)
    anim.save('./gail_breakout_result.gif', writer='imagemagick', fps=30)

In [None]:
env = wrap(gym.make('BreakoutNoFrameskip-v4'))
s = np.array(env.reset())
total_reward = 0
frames = []

for t in range(10000):
    # Render into buffer. 
    frames.append(env.render(mode = 'rgb_array'))
    a, v, l = ppo.choose_action(np.expand_dims(s,axis=0))
    # take action and get next state
    s_, r, done, info = env.step(a)
    s_ = np.array(s_)
    total_reward += r
    if done:
        break
    s = s_
env.close()
print('Total Reward : %.2f'%total_reward)
display_frames_as_gif(frames)

![alt text](./ppo_pong_result.gif "segment")