## 필요한 모듈 설치 확인

In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from replay_memory import ReplayBuffer, PrioritizedReplayBuffer

import random
import os
import pickle
import time
from copy import deepcopy

# 만약 opencv-python이 설치되어있지 않다면 다음을 통해서 설치해주세요.
# pip install opencv-python
# 만약 설치에 오류가 발생한다면 다음을 참고해주세요.
# https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_setup/py_table_of_contents_setup/py_table_of_contents_setup.html#py-table-of-content-setup
from wrappers import wrap

## 하이퍼 파라미터 정의하기

In [2]:
'''DQN settings'''
# sequential images to define state
STATE_LEN = 4
# target policy synchronize interval
TARGET_REPLACE_ITER = 10**4
# (prioritized) experience replay memory size
MEMORY_CAPACITY = 10**6
# check per
is_per = True
# alpha of PER
PER_ALPHA = 0.6
PER_BETA = 0.4
PER_EPSILON = 1e-6
# Double DQN
DOUBLE = False
# Dueling architecture
DUEL = False

'''Environment Settings'''
# openai gym env name
ENV_NAME = 'PongNoFrameskip-v4'
env = wrap(gym.make(ENV_NAME))
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape
# Total simulation step
STEP_NUM = 2*10**7
# gamma for MDP
GAMMA = 0.99
# visualize for agent playing
RENDERING = False

'''Training settings'''
# check GPU usage
USE_GPU = torch.cuda.is_available()
print('USE GPU: '+str(USE_GPU))
# mini-batch size
BATCH_SIZE = 32
# learning rage
LR = 1e-4
# epsilon-greedy
EPSILON = 0.0

'''Save&Load Settings'''
# check save/load
SAVE = True
LOAD = False
# save frequency
SAVE_FREQ = 10**4
# paths for predction net, target net, result log
PRED_PATH = './data/model/pred_net.pkl'
TARGET_PATH = './data/model/target_net.pkl'
RESULT_PATH = './data/plots/result.pkl'

USE GPU: True




## 네트워크 구조 정의하기

In [3]:
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        # nn.Sequential을 사용하면 다음과 같입 코드를 간결하게 바꿀 수 있습니다.
        self.feature_extraction = nn.Sequential(
            nn.Conv2d(STATE_LEN, 32, kernel_size=8, stride=4),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.LeakyReLU(negative_slope=0.01),
        )
        self.fc = nn.Linear(7 * 7 * 64, 512)

        if DUEL:
            # advantage function/ state value function
            self.fc_adv = nn.Linear(512, N_ACTIONS)
            self.fc_val = nn.Linear(512, 1)
        else:
            # action value function
            self.fc_q = nn.Linear(512, N_ACTIONS) 
            
        # 파라미터 값 초기화 코드는 다음과 같이 간결하게 바꿀 수 있습니다.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.orthogonal_(m.weight, gain = np.sqrt(2))
                nn.init.constant_(m.bias, 0.0)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.constant_(m.bias, 0.0)
            

    def forward(self, x):
        # x는 (m, 84, 84, 4)의 tensor
        x = self.feature_extraction(x / 255.0)
        # x.size(0) : mini-batch size
        x = x.view(x.size(0), -1)
        x = F.leaky_relu(self.fc(x), negative_slope=0.01)
        
        if DUEL:
            adv = self.fc_adv(x)
            val = self.fc_val(x)
            action_value = val + adv - adv.mean(1).unsqueeze(1)
        else:
            action_value = self.fc_q(x)

        return action_value

    def save(self, PATH):
        torch.save(self.state_dict(),PATH)

    def load(self, PATH):
        self.load_state_dict(torch.load(PATH))

## DQN 정의하기

In [4]:
class DQN(object):
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync eval target
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()
            
        # simulator step conter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        
        # ceate the replay buffer
        if is_per:
            self.replay_buffer = PrioritizedReplayBuffer(MEMORY_CAPACITY, alpha=PER_ALPHA)
        else:
            self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
        
        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)
        
    def update_target(self, target, pred, update_rate):
        # update target network parameters using predcition network
        for target_param, pred_param in zip(target.parameters(), pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) \
                                    * target_param.data + update_rate*pred_param.data)
            
    def save_model(self):
        # save prediction network and target network
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()

        if np.random.uniform() < EPSILON:
            # greedy case
            action_value = self.pred_net(x.unsqueeze(0))
            action = torch.argmax(action_value).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS)
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self, beta):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1.0)
        
        # data sample from experience replay
        if is_per:
            experience = self.replay_buffer.sample(BATCH_SIZE, beta=beta)
            (b_state_memory, b_action_memory, b_reward_memory, \
             b_next_state_memory, b_done, b_weights, b_idxes) = experience
        else:
            b_state_memory, b_action_memory, b_reward_memory, \
            b_next_state_memory, b_done = self.replay_buffer.sample(BATCH_SIZE)
            b_weights, b_idxes = np.ones_like(b_reward_memory), None
            
        b_s = torch.FloatTensor(b_state_memory)
        b_a = torch.LongTensor(b_action_memory)
        b_r = torch.FloatTensor(b_reward_memory)
        b_s_ = torch.FloatTensor(b_next_state_memory)
        b_d = torch.FloatTensor(b_done)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(), b_s_.cuda(), b_d.cuda()

        # action value prediction
        q_eval = self.pred_net(b_s).gather(1, b_a.unsqueeze(1)).view(-1)
        # shape : (m, 1)

        if DOUBLE:
            # get best actions of next state
            _ , best_actions = self.pred_net(b_s_).detach().max(1)
            # get next state value
            q_next = self.target_net(b_s_).detach()
            # get target value
            q_target = b_r + GAMMA *(1.-b_d)* q_next.gather(1, best_actions.unsqueeze(1)).squeeze(1)
            # shape (m, 1)
        else:
            # get next state value
            q_next = self.target_net(b_s_).detach()
            # get target value
            q_target = b_r + GAMMA *(1.-b_d)* q_next.max(1)[0]
            # shape (m, 1)
            
        # calc huber loss, dont reduce for importance weight
        loss = F.smooth_l1_loss(q_eval, q_target, reduce= False)
        # calc importance weighted loss
        loss = torch.mean(torch.Tensor(b_weights).cuda()*loss)
        # get td error
        td_error = (q_target - q_eval).data.cpu().numpy()
        
        # update importance weight
        if is_per:
            new_priorities = np.abs(td_error) + PER_EPSILON
            self.replay_buffer.update_priorities(b_idxes, new_priorities)
        
        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.pred_net.parameters(),10.)
        self.optimizer.step()

## 학습

In [None]:
dqn = DQN()

# model load with check
if LOAD and os.path.isfile(EVAL_PATH) and os.path.isfile(TARGET_PATH):
    dqn.load_model()
    pkl_file = open(RESULT_PATH,'rb')
    result = pickle.load(pkl_file)
    pkl_file.close()
    print('Load complete!')
else:
    result = []
    print('Initialize results!')

print('Collecting experience...')

# episode step for accumulate reward (since we are using EpisodicLifeEnv of OpenAI gym wrapper)
epi_step = 0
# accumulate return of current episode
entire_ep_r = 0.
# log for accumulate returns
entire_ep_rs = []
# check learning time
start_time = time.time()

while dqn.memory_counter <= STEP_NUM:
    # env reset
    s = np.array(env.reset())
    
    # initialize one episode reward
    ep_r = 0.

    while True:
        a = dqn.choose_action(s, EPSILON)

        # take action and get next state
        s_, r, done, info = env.step(a)
        s_ = np.array(s_)
        
        # accumulate return
        ep_r += r
        # clip rewards for numerical stability
        clip_r = np.sign(r)

        # store the transition
        dqn.store_transition(s, a, clip_r, s_, float(done))

        # annealing the epsilon(exploration strategy), beta(per smoothing)
        if dqn.memory_counter <= 1e+6:
            # linear annealing to 0.9 until million step
            EPSILON += 0.9/(1e+6)
        elif dqn.memory_counter <= (2e+7):
            # linear annealing to 0.99 until the end
            EPSILON += 0.09/(2e+7 - 1e+6)
            # linear annealing to 1 until the end
            PER_BETA += (1.0 - PER_BETA) /(2e+7 - 1e+6)

        # if memory fill 50K and mod 4 = 0(for speed issue), learn pred net
        if (5e+4 <= dqn.memory_counter) and (dqn.memory_counter % 4 == 0):
            dqn.learn(PER_BETA)
            
        # print log and save
        if dqn.memory_counter % SAVE_FREQ == 0:
            # check time interval
            time_interval = round(time.time() - start_time, 2)
            # calc mean return
            mean_100_ep_return = round(np.mean(entire_ep_rs[-101:-1]),2)
            result.append(mean_100_ep_return)
            # print log
            print('Ep: ',epi_step,
                  '| Mean ep 100 return: ', mean_100_ep_return,
                  '/Used Time:',time_interval,
                  '/Used Step:',dqn.memory_counter)
            # save model
            dqn.save_model()
            pkl_file = open(RESULT_PATH, 'wb')
            pickle.dump(np.array(result), pkl_file)
            pkl_file.close()
            print('Save complete!')
            
        # if agent meets end-of-life, update return, acc return
        if done:
            entire_ep_r += ep_r
            epi_step += 1
            if epi_step % 5 == 0:
                entire_ep_rs.append(entire_ep_r)
                entire_ep_r = 0.
            break

        s = s_

        if RENDERING:
            env.render()

Initialize results!
Collecting experience...




Ep:  10 | Mean ep 100 return:  -104.0 /Used Time: 9.7 /Used Step: 10000
Save complete!
Ep:  21 | Mean ep 100 return:  -101.0 /Used Time: 19.42 /Used Step: 20000
Save complete!
Ep:  32 | Mean ep 100 return:  -101.4 /Used Time: 29.18 /Used Step: 30000
Save complete!
Ep:  42 | Mean ep 100 return:  -101.14 /Used Time: 38.98 /Used Step: 40000
Save complete!




Ep:  53 | Mean ep 100 return:  -100.89 /Used Time: 49.01 /Used Step: 50000
Save complete!
Ep:  64 | Mean ep 100 return:  -101.0 /Used Time: 81.78 /Used Step: 60000
Save complete!
Ep:  74 | Mean ep 100 return:  -100.92 /Used Time: 115.1 /Used Step: 70000
Save complete!
Ep:  86 | Mean ep 100 return:  -100.88 /Used Time: 148.52 /Used Step: 80000
Save complete!
Ep:  97 | Mean ep 100 return:  -100.72 /Used Time: 181.5 /Used Step: 90000
Save complete!
Ep:  108 | Mean ep 100 return:  -100.95 /Used Time: 213.89 /Used Step: 100000
Save complete!
Ep:  118 | Mean ep 100 return:  -100.86 /Used Time: 246.43 /Used Step: 110000
Save complete!
Ep:  129 | Mean ep 100 return:  -100.79 /Used Time: 279.31 /Used Step: 120000
Save complete!
Ep:  140 | Mean ep 100 return:  -101.04 /Used Time: 311.77 /Used Step: 130000
Save complete!
Ep:  151 | Mean ep 100 return:  -101.03 /Used Time: 344.32 /Used Step: 140000
Save complete!
Ep:  162 | Mean ep 100 return:  -101.06 /Used Time: 376.72 /Used Step: 150000
Save co

Ep:  721 | Mean ep 100 return:  -90.82 /Used Time: 3079.87 /Used Step: 940000
Save complete!
Ep:  724 | Mean ep 100 return:  -90.82 /Used Time: 3115.14 /Used Step: 950000
Save complete!
Ep:  727 | Mean ep 100 return:  -90.39 /Used Time: 3150.28 /Used Step: 960000
Save complete!
Ep:  731 | Mean ep 100 return:  -89.99 /Used Time: 3185.73 /Used Step: 970000
Save complete!
Ep:  734 | Mean ep 100 return:  -89.99 /Used Time: 3221.26 /Used Step: 980000
Save complete!
Ep:  737 | Mean ep 100 return:  -89.45 /Used Time: 3258.1 /Used Step: 990000
Save complete!
Ep:  740 | Mean ep 100 return:  -88.71 /Used Time: 3293.6 /Used Step: 1000000
Save complete!
Ep:  743 | Mean ep 100 return:  -88.71 /Used Time: 3329.31 /Used Step: 1010000
Save complete!
Ep:  746 | Mean ep 100 return:  -88.12 /Used Time: 3365.13 /Used Step: 1020000
Save complete!
Ep:  749 | Mean ep 100 return:  -88.12 /Used Time: 3400.81 /Used Step: 1030000
Save complete!
Ep:  752 | Mean ep 100 return:  -87.25 /Used Time: 3436.36 /Used Ste