## 필요한 모듈 설치 확인

In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

import random
import os
import pickle
import time
from collections import deque

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

from wrappers import wrap_cover, SubprocVecEnv
from runner import Runner

## 하이퍼 파라미터 정의하기

In [2]:
'''A2C Settings'''
TRAJ_LEN = 1000
ENT_COEF = 1e-2
LAMBDA = 0.95

'''Environment Settings'''
# sequential images to define state
STATE_LEN = 4
# openai gym env name
ENV_NAME = 'PongNoFrameskip-v4'
# number of environments for A2C
N_ENVS = 4
# define gym 
env = SubprocVecEnv([wrap_cover(ENV_NAME) for i in range(N_ENVS)])
# check gym setting
N_ACTIONS = env.action_space.n;print('N_ACTIONS : ',N_ACTIONS) #  6
N_STATES = env.observation_space.shape;print('N_STATES : ',N_STATES) # (4, 84, 84)
# Total simulation step
N_STEP = 10**7
# gamma for MDP
GAMMA = 0.99
# visualize for agent playing
RENDERING = False

'''Training settings'''
# check GPU usage
USE_GPU = torch.cuda.is_available()
print('USE GPU: '+str(USE_GPU))
# learning rage
LR = 1e-4
# epsilon-greedy
EPSILON = 0.0
# clip gradient
MAX_GRAD_NORM = 0.1
# log optimization
LOG_OPT = False

'''Save&Load Settings'''
# log frequency
LOG_FREQ = 10
# check save/load
SAVE = True
LOAD = False
# paths for predction net, target net, result log
NET_PATH = './data/model/a2c_net.pkl'



N_ACTIONS :  6
N_STATES :  (4, 84, 84)
USE GPU: True




## 네트워크 구조 정의하기

In [3]:
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        # architecture def
        self.feature_extraction = nn.Sequential(
            nn.Conv2d(STATE_LEN, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )
        self.fc = nn.Linear(7 * 7 * 64, 256)
        # actor
        self.actor = nn.Linear(256, N_ACTIONS)
        # critic
        self.critic = nn.Linear(256, 1)
            
        # parameter initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                nn.init.orthogonal_(m.weight, gain = np.sqrt(2))
                nn.init.constant_(m.bias, 0.0)
            

    def forward(self, x):
        # x is a tensor of (m, 4, 84, 84)
        x = self.feature_extraction(x / 255.0)
        # x.size(0) : mini-batch size
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc(x))
        # use log_softmax for numerical stability
        action_log_prob = F.log_softmax(self.actor(x), dim=1)
        state_value = self.critic(x)

        return action_log_prob, state_value

    def save(self, PATH):
        torch.save(self.state_dict(),PATH)

    def load(self, PATH):
        self.load_state_dict(torch.load(PATH))

## A2C 정의하기

In [4]:
class A2C:
    def __init__(self):
        self.net = ConvNet()
        # use gpu
        if USE_GPU:
            self.net = self.net.cuda()
            
        # simulator step conter
        self.memory_counter = 0
        
        # define optimizer
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=LR, weight_decay=1e-4)
        
    def save_model(self):
        self.net.cpu()
        self.net.save(NET_PATH)
        if USE_GPU:
            self.net.cuda()
            
    def load_model(self):
        self.net.cpu()
        self.net.load(NET_PATH)
        if USE_GPU:
            self.net.cuda()
        
    def choose_action(self, x):
        self.memory_counter += 1
        # Assume that x is a np.array of shape (nenvs, 4, 84, 84)
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()
        # get action log probs and state values
        action_log_probs, state_values = self.net(x) # (nenvs, N_ACTIONS)
        probs = F.softmax(action_log_probs, dim=1).data.cpu().numpy()
        probs = (probs+1e-8)/np.sum((probs+1e-8), axis=1, keepdims=True)
        # sample actions
        actions = np.array([np.random.choice(N_ACTIONS,p=probs[i]) for i in range(len(probs))])
        # convert tensor to np.array
        action_log_probs , state_values = action_log_probs.data.cpu().numpy() , state_values.squeeze(1).data.cpu().numpy()
        # calc selected logprob
        selected_log_probs = np.array([action_log_probs[i][actions[i]] for i in range(len(probs))])
        return actions, state_values, selected_log_probs

    def learn(self, obs, returns, masks, actions, values, selected_log_probs):
        
        # calculate the advantages
        advs = returns - values
        # Normalize the advantages for numerical stability
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        
        # np.array -> torch.Tensor
        obs = torch.FloatTensor(obs) # (m, 4, 84, 84)
        advs = torch.FloatTensor(advs) # (m)
        actions = torch.LongTensor(actions) # (m)
        selected_log_probs = torch.FloatTensor(selected_log_probs) # (m)
        values = torch.FloatTensor(values) # (m)
        if USE_GPU:
            obs = obs.cuda()
            advs = advs.cuda()
            actions = actions.cuda()
            selected_log_probs = selected_log_probs.cuda()
            values = values.cuda()
        
        # get action log probs and state values
        action_log_probs, state_values = self.net(obs)
        # (m, N_ACTIONS), (m, 1)
        
        # calc probs
        probs = F.softmax(action_log_probs, dim=1)
        # (m, N_ACTIONS)
        
        # calc entropy loss
        ent_loss = ENT_COEF *((action_log_probs * probs).sum(dim=1)).mean()
        # (1)
        
        # calc log probs
        cur_log_probs = action_log_probs.gather(1,actions.unsqueeze(1))
        # cur : (m, 1)
        
        # actor loss
        actor_loss = torch.mean(- cur_log_probs.squeeze(1) * advs) # (1)
        # critic loss
        critic_loss = 0.5 * torch.mean( (state_values.squeeze(1) - values)**2 ) # (1)

        loss = actor_loss + critic_loss + ent_loss # (1)
        
        actor_loss, critic_loss, ent_loss, total_loss = actor_loss.data.cpu().numpy(), \
        critic_loss.data.cpu().numpy(), ent_loss.data.cpu().numpy(), loss.data.cpu().numpy()

        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.net.parameters(), MAX_GRAD_NORM)
        self.optimizer.step()
        
        return round(float(actor_loss), 4), round(float(critic_loss), 4),\
    round(float(ent_loss), 4), round(float(total_loss), 4)

## 학습

In [None]:
a2c = A2C()
runner = Runner(env=env, model=a2c, nsteps=TRAJ_LEN, gamma=GAMMA, lam=LAMBDA)

# model load with check
if LOAD and os.path.isfile(PRED_PATH) and os.path.isfile(TARGET_PATH):
    a2c.load_model()
    pkl_file = open(RESULT_PATH,'rb')
    result = pickle.load(pkl_file)
    pkl_file.close()
    print('Load complete!')
else:
    result = []
    print('Initialize results!')

print('Collecting experience...')

# episode step for accumulate reward 
epinfobuf = deque(maxlen=100)
# in A2C, we iterate over optimization step
nbatch = N_ENVS * TRAJ_LEN
nupdates = N_STEP// nbatch
# check learning time
start_time = time.time()

for update in range(1, nupdates+1):
    # get minibatch
    obs, returns, masks, actions, values, neglogpacs, epinfos = runner.run()
    epinfobuf.extend(epinfos)
    slices = (obs, returns, masks, actions, values, neglogpacs)
    actor_loss, critic_loss, ent_loss, total_loss = a2c.learn(*slices)
    
    # print opt log
    if LOG_OPT and update % LOG_FREQ == 0:
        print('Iter ',_,
             'actor loss : ',round(actor_loss, 3),
             'critic loss : ', round(critic_loss, 3),
             'ent loss : ', round(ent_loss, 3),
             'total loss : ', round(total_loss, 3))
            
    if update % LOG_FREQ == 0:
        # print log and save
        # check time interval
        time_interval = round(time.time() - start_time, 2)
        # calc mean return
        mean_100_ep_return = round(np.mean([epinfo['r'] for epinfo in epinfobuf]),2)
        result.append(mean_100_ep_return)
        # print epi log
        print('N update: ',update,
              '| Mean ep 100 return: ', mean_100_ep_return,
              '/Used Time:',time_interval,
              '/Used Step:',a2c.memory_counter*N_ENVS)
        # save model
        if SAVE:
            a2c.save_model()

Initialize results!
Collecting experience...
N update:  10 | Mean ep 100 return:  -20.5 /Used Time: 35.44 /Used Step: 40040
N update:  20 | Mean ep 100 return:  -20.49 /Used Time: 71.21 /Used Step: 80080
N update:  30 | Mean ep 100 return:  -20.61 /Used Time: 106.67 /Used Step: 120120
N update:  40 | Mean ep 100 return:  -20.57 /Used Time: 142.51 /Used Step: 160160
N update:  50 | Mean ep 100 return:  -20.34 /Used Time: 177.75 /Used Step: 200200
N update:  60 | Mean ep 100 return:  -20.2 /Used Time: 213.81 /Used Step: 240240
N update:  70 | Mean ep 100 return:  -20.35 /Used Time: 248.9 /Used Step: 280280
N update:  80 | Mean ep 100 return:  -20.39 /Used Time: 283.52 /Used Step: 320320
N update:  90 | Mean ep 100 return:  -20.19 /Used Time: 318.41 /Used Step: 360360
N update:  100 | Mean ep 100 return:  -20.04 /Used Time: 353.7 /Used Step: 400400
N update:  110 | Mean ep 100 return:  -20.06 /Used Time: 388.95 /Used Step: 440440
N update:  120 | Mean ep 100 return:  -20.13 /Used Time: 42

N update:  980 | Mean ep 100 return:  -20.07 /Used Time: 3423.9 /Used Step: 3923920
N update:  990 | Mean ep 100 return:  -20.05 /Used Time: 3458.41 /Used Step: 3963960
N update:  1000 | Mean ep 100 return:  -20.12 /Used Time: 3492.92 /Used Step: 4004000
N update:  1010 | Mean ep 100 return:  -20.06 /Used Time: 3527.66 /Used Step: 4044040
N update:  1020 | Mean ep 100 return:  -20.01 /Used Time: 3562.22 /Used Step: 4084080
N update:  1030 | Mean ep 100 return:  -20.04 /Used Time: 3596.83 /Used Step: 4124120
N update:  1040 | Mean ep 100 return:  -20.07 /Used Time: 3631.88 /Used Step: 4164160
N update:  1050 | Mean ep 100 return:  -20.16 /Used Time: 3666.27 /Used Step: 4204200
N update:  1060 | Mean ep 100 return:  -20.16 /Used Time: 3701.01 /Used Step: 4244240
N update:  1070 | Mean ep 100 return:  -20.03 /Used Time: 3736.24 /Used Step: 4284280
N update:  1080 | Mean ep 100 return:  -19.88 /Used Time: 3770.99 /Used Step: 4324320
N update:  1090 | Mean ep 100 return:  -19.89 /Used Time:

## 결과 시각화

In [None]:
plt.plot(range(len(result)), result)
plt.tight_layout()
plt.show()

In [None]:
from matplotlib import animation

def display_frames_as_gif(frames):
    patch = plt.imshow(frames[0])
    plt.axis('off')
    def animate(i):
        patch.set_data(frames[i])
        
    anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=5)
    anim.save('./a2c_pong_result.gif', writer='imagemagick', fps=30)

In [None]:
env = wrap(gym.make('PongNoFrameskip-v4'))
s = np.array(env.reset())
total_reward = 0
frames = []

dqn = DQN()
dqn.load_model()

for t in range(10000):
    # Render into buffer. 
    frames.append(env.render(mode = 'rgb_array'))
    a = dqn.choose_action(s, 1.0)
    # take action and get next state
    s_, r, done, info = env.step(a)
    s_ = np.array(s_)
    total_reward += r
    if done:
        break
    s = s_
env.close()
print('Total Reward : %.2f'%total_reward)
display_frames_as_gif(frames)

![alt text](./a2c_pong_result.gif "segment")