<a href="https://colab.research.google.com/github/thd0222/hmn-reinforcement/blob/master/dqn_breakout_deploy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%tensorflow_version 1.x

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import random
import numpy as np

import tensorflow as tf
import gym

from collections import deque
from skimage.color import rgb2gray
from skimage.transform import resize
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import backend as K
from tensorflow.losses import huber_loss
from tensorboardcolab import *

from gym import envs

print(tf.__version__)

1.15.2


Using TensorFlow backend.


In [None]:
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
Suggested packages:
  libgle3
The following NEW packages will be installed:
  python-opengl
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 496 kB of archives.
After this operation, 5,416 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 python-opengl all 3.1.0+dfsg-1 [496 kB]
Fetched 496 kB in 0s (5,978 kB/s)
Selecting previously unselected package python-opengl.
(Reading database ... 144487 files and directories currently installed.)
Preparing to unpack .../python-opengl_3.1.0+dfsg-1_all.deb ...
Unpacking python-opengl (3.1.0+dfsg-1) ...
Setting up python-opengl (3.1.0+dfsg-1) ...
Reading package lists... Done
Building dependency tree       
Reading state information... Done
ff

<pyvirtualdisplay.display.Display at 0x7fd334ffddd8>

In [None]:
GAME = "Breakout-v4"

In [None]:
import os
import io
import base64
from IPython.display import display, HTML

def ipython_show_video(path):
    """Embed the MP4 video stored at `path` in the notebook output.

    The file is read fully into memory, base64-encoded and rendered inline as
    an HTML ``<video>`` element whose source is a ``data:`` URI.

    Args:
        path: Filesystem path of the video file to show.

    Raises:
        NameError: If `path` does not refer to an existing regular file.
    """
    if not os.path.isfile(path):
        raise NameError("Cannot access: {}".format(path))

    # Read-only binary mode is sufficient (no write permission needed), and
    # the context manager closes the handle as soon as the bytes are read.
    with open(path, 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    display(HTML(
        data="""
        <video alt="test" controls>
        <source src="data:video/mp4;base64,{0}" type="video/mp4" />
        </video>
        """.format(encoded.decode('ascii'))
    ))

In [None]:
import shutil
from datetime import datetime

# Root directory for everything this notebook saves.
# NOTE(review): '/your_path' looks like a placeholder to be replaced before running.
base_path = '/your_path'

# One sub-directory per run, named after the time this cell was executed.
datetime_path = format(datetime.now(), '%Y-%m-%d_%H:%M:%S')

# <base_path>/save_model/<timestamp>
model_path = os.path.join(os.path.join(base_path, 'save_model'), datetime_path)

os.makedirs(model_path, exist_ok=True)

### 학습 속도를 높이기 위해 흑백 화면으로 전처리

In [None]:
# 210*160*3(color) --> 84*84(mono)
# float --> integer (to reduce the size of replay memory)
def pre_processing(observe):
    """Convert one colour frame into an 84x84 grayscale ``uint8`` image.

    The frame is converted to grayscale, resized to 84x84, scaled by 255 and
    cast to ``uint8`` so that stored frames take less replay-memory space than
    floats would.

    Args:
        observe: Colour frame as returned by the environment.

    Returns:
        The processed frame as an array of dtype ``uint8``.
    """
    gray_frame = rgb2gray(observe)
    small_frame = resize(gray_frame, (84, 84), mode='constant')
    return np.uint8(small_frame * 255)

### Agent 클래스 정의
* 하이퍼파라미터  
  * epsilon : 탐색을 위한 확률정보 값
  * epsilon_start, epsilon_end : 입실론 값의 범위 
  * exploration_steps : 입실론 값을 감소시킬 단계
  * epsilon_decay_step : 한번에 감소시킬 입실론 크기
  * batch_size : 배치 사이즈
  * train_start : 학습 시작 메모리 길이
  * update_target_rate : 타겟 네트워크를 업데이트 시킬 step
  * discount_factor : 감가율
  * memory : 학습정보를 담기위한 메모리 객체
  * no_op_steps : 30 스텝 이후 학습정보를 모으기 위한 설정값
* 함수
  * optimizer(self) : Huber Loss를 이용한 최적화 함수 정의
  * build_model(self) : 상태가 입력, 큐함수가 출력인 모델 생성
  * update_target_model(self) : 타겟 모델을 모델의 가중치로 업데이트
  * get_action(self, history) : 입실론 탐욕 정책으로 행동 선택
  * remember(self, history, action, reward, next_history, dead) : 샘플 <s,a,r,s'>을 리플레이 메모리에 저장
  * train_replay(self) : 리플레이 메모리에서 무작위로 추출한 배치로 모델 학습
  * save_model(self, name): 학습 모델 저장
  * load_model(self, filename): 학습 모델 로드
  * setup_summary(self): 각 에피소드당 학습 정보를 기록

In [None]:
class DQNAgent:
    def __init__(self, action_size, model_load=False):
        """Set hyper-parameters and build the networks, session and summary writer.

        Args:
            action_size: Number of discrete actions available to the agent.
            model_load: When True, restore weights from
                ``<model_path>/breakout_dqn.h5`` after the TensorFlow
                variables have been initialised.
        """
        self.render = False
        # Keep the flag under its own name: storing it as `self.load_model`
        # would hide the `load_model()` method on every instance.
        self.model_load = model_load
        # State and action dimensions
        self.state_size = (84, 84, 4)
        self.action_size = action_size
        # DQN hyper-parameters
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0.1
        # NOTE(review): with 10 steps, epsilon drops by 0.09 per training call
        # and reaches its floor almost immediately -- confirm this is intended.
        self.exploration_steps = 10.
        self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \
                                  / self.exploration_steps
        # Training parameters
        self.batch_size = 32
        self.train_start = 50000
        self.update_target_rate = 10000
        self.discount_factor = 0.99
        self.memory = deque(maxlen=400000)
        self.no_op_steps = 30

        # Create the model and the target model, then initialise the target
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

        # From here on `self.optimizer` refers to the object returned by the
        # optimizer() method, not to the method itself.
        self.optimizer = self.optimizer()

        self.sess = tf.InteractiveSession()
        K.set_session(self.sess)

        self.avg_q_max, self.avg_loss = 0, 0
        self.summary_placeholders, self.update_ops, self.summary_op = \
            self.setup_summary()

        # Presumably launches TensorBoard for Colab -- TODO confirm. Its own
        # writer is not requested: it was overwritten right away by the
        # FileWriter below, which also records the session graph.
        TensorBoardColab()

        self.summary_writer = tf.summary.FileWriter(
            './Graph', self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        if self.model_load:
            self.model.load_weights(os.path.join(model_path, "breakout_dqn.h5"))

    # Define the optimization function that uses the Huber loss
    def optimizer(self):
        # TODO(exercise): the body is blank in this notebook; `train` is not
        # defined until it is filled in.

        return train

    # Build the model whose input is the state and whose output is the Q-function
    def build_model(self):
        # TODO(exercise): the body is blank in this notebook; `model` is not
        # defined until it is filled in.

        return model

    # Update the target model with the model's weights
    def update_target_model(self):
        # TODO(exercise): the body is blank in this notebook and must be filled in.


    # Select an action with the epsilon-greedy policy
    def get_action(self, history):
        # TODO(exercise): the body is blank in this notebook and must be filled in.


    # Store the sample <s,a,r,s'> in the replay memory
    def remember(self, history, action, reward, next_history, dead):
        # TODO(exercise): the body is blank in this notebook and must be filled in.


    # Train the model on a batch drawn at random from the replay memory
    def train_replay(self):
        # Skip training until the replay memory holds at least train_start samples
        if len(self.memory) < self.train_start:
            return
        # Lower epsilon by one decay step per call while it is above epsilon_end
        if self.epsilon > self.epsilon_end:
            self.epsilon -= self.epsilon_decay_step

        mini_batch = random.sample(self.memory, self.batch_size)

        # Pre-allocated batch buffers shaped (batch_size, *state_size)
        history = np.zeros((self.batch_size, self.state_size[0],
                            self.state_size[1], self.state_size[2]))
        next_history = np.zeros((self.batch_size, self.state_size[0],
                                 self.state_size[1], self.state_size[2]))
        target = np.zeros((self.batch_size,))
        action, reward, dead = [], [], []

        # Split up the data held in the replay-memory samples
        # TODO(exercise): the loop body is blank in this notebook.
        for i in range(self.batch_size):

        # Compute Q-values with the target model
        target_value = self.target_model.predict(next_history)

        # Take the maximum Q-value of the next state s' from the target model;
        # when the sample is flagged dead the target is the reward alone
        for i in range(self.batch_size):
            if dead[i]:
                target[i] = reward[i]
            else:
                target[i] = reward[i] \
                            + self.discount_factor * np.amax(target_value[i])

        # Update the model through the self.optimizer function
        # TODO(exercise): the update call is blank in this notebook.


    def save_model(self, name):
        """Write the weights of `self.model` to the file `name`."""
        self.model.save_weights(name)
    
    def load_model(self, filename):
        """Load weights from the file `filename` into `self.model`."""
        self.model.load_weights(filename)

    # 각 에피소드당 학습 정보를 기록
    def setup_summary(self):
        """Create the scalar summaries that are written once per episode.

        Four float variables (total reward, average max Q, duration, average
        loss) each get a scalar summary, a placeholder and an assign op that
        copies the placeholder into the variable.

        Returns:
            A tuple ``(summary_placeholders, update_ops, summary_op)`` where
            the two lists are ordered as listed above and ``summary_op`` is
            the merged summary op.
        """
        tags = ('Total_Reward/Episode', 'Average_Max_Q/Episode',
                'Duration/Episode', 'Average_Loss/Episode')

        # All variables are created first, then their summaries, in tag order.
        summary_vars = [tf.Variable(0.) for _ in tags]
        for tag, variable in zip(tags, summary_vars):
            tf.summary.scalar(tag, variable)

        summary_placeholders = [tf.placeholder(tf.float32) for _ in tags]
        update_ops = [variable.assign(placeholder) for variable, placeholder
                      in zip(summary_vars, summary_placeholders)]
        return summary_placeholders, update_ops, tf.summary.merge_all()

In [None]:
EPISODES = 50000

In [None]:
if __name__ == "__main__":
    from gym import wrappers

    # Create the Breakout game environment
    env = gym.make('BreakoutDeterministic-v4')

    # Add a Monitor wrapper so the game can run on Colab (a server without a monitor)
    # NOTE(review): the output directory is built from GAME, whose value differs
    # from the environment id used above -- confirm this is intended.
    env = wrappers.Monitor(env, f"/tmp/{GAME}", force=True)

    # Create the DQN agent
    agent = DQNAgent(action_size=3)

    scores, episodes, global_step = [], [], 0

    for e in range(EPISODES):
        done = False
        dead = False

        # 1 episode = 5 lives
        step, score, start_life = 0, 0, 5
        observe = env.reset()

        # No-op phase: take action 1 for a random number of steps
        # between 1 and agent.no_op_steps
        for _ in range(random.randint(1, agent.no_op_steps)):
            observe, _, _, _ = env.step(1)

        # There are no previous frames at the start of an episode,
        # so the history is built by copying the initial state
        # TODO(exercise): blank in this notebook; `history` must be created here.

        while not done:
            if agent.render:
                env.render()
            global_step += 1
            step += 1

            # Choose an action from the previous 4 states
            # TODO(exercise): blank in this notebook; `action` must be set here.

            # 1: stay / 2: left / 3: right
            if action == 0:
                real_action = 1
            elif action == 1:
                real_action = 2
            else:
                real_action = 3

            # Advance the environment by one timestep with the chosen action
            # TODO(exercise): blank in this notebook.

            # Pre-process the observed state (image) at every step
            # TODO(exercise): blank in this notebook; `next_history` must be built here.

            # When the agent misses the ball it is dead --> the episode does not end
            # TODO(exercise): blank in this notebook.

            # Store the sample <s, a, r, s'> in the replay memory, then train
            # TODO(exercise): blank in this notebook.

            # Periodically update the target model with the model's weights
            # TODO(exercise): blank in this notebook.

            # If the agent is dead, reset the dead flag
            if dead:
                dead = False
            else:
                history = next_history

            # Record the training statistics for each episode
            if done:
                # Summaries are only written once global_step exceeds train_start
                if global_step > agent.train_start:
                    stats = [score, agent.avg_q_max / float(step), step,
                             agent.avg_loss / float(step)]
                    for i in range(len(stats)):
                        agent.sess.run(agent.update_ops[i], feed_dict={
                            agent.summary_placeholders[i]: float(stats[i])
                        })
                    summary_str = agent.sess.run(agent.summary_op)
                    agent.summary_writer.add_summary(summary_str, e + 1)

                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon,
                      "  global_step:", global_step, "  average_q:",
                      agent.avg_q_max / float(step), "  average loss:",
                      agent.avg_loss / float(step))

                # Reset the per-episode accumulators
                agent.avg_q_max, agent.avg_loss = 0, 0

        # Save the model every 100 episodes
        if e % 100 == 0:
            agent.model.save_weights(os.path.join(model_path, "breakout_dqn.h5"))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 20, 20, 32)        8224      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________________________________
flatten (Flatten)            (None, 3136)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               1606144   
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 1539      
Total params: 1,685,667
Trainable params:

In [None]:
class TestAgent:
    """Evaluation-only agent: a convolutional Q-network with a near-greedy policy.

    Args:
        action_size: Number of discrete actions; also the width of the
            network's output layer.
        epsilon: Probability of picking a uniformly random action instead of
            the greedy one. Defaults to 0.01.
    """

    def __init__(self, action_size, epsilon=0.01):
        self.state_size = (84, 84, 4)
        self.action_size = action_size
        self.no_op_steps = 20
        self.epsilon = epsilon

        self.model = self.build_model()

        self.sess = tf.InteractiveSession()
        K.set_session(self.sess)

        self.avg_q_max, self.avg_loss = 0, 0
        self.sess.run(tf.global_variables_initializer())

    def build_model(self):
        """Build the Q-network: three conv layers, then two dense layers.

        Returns:
            A Keras ``Sequential`` model taking inputs of shape
            ``self.state_size`` and producing ``self.action_size`` outputs.
            The layer summary is printed as a side effect.
        """
        model = Sequential()
        model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                         input_shape=self.state_size))
        model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
        model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.action_size))
        model.summary()

        return model

    def get_action(self, history):
        """Pick an action index for the given frame history.

        Args:
            history: Batch holding one stacked-frame state with pixel values
                on a 0-255 scale; it is divided by 255 before prediction.

        Returns:
            A random index in ``range(self.action_size)`` with probability
            ``self.epsilon``; otherwise the argmax of the predicted Q-values
            for the first batch entry.
        """
        if np.random.random() < self.epsilon:
            # Draw from the configured action space rather than a fixed 3 so
            # the policy stays valid for any `action_size`.
            return random.randrange(self.action_size)
        history = np.float32(history / 255.0)
        q_value = self.model.predict(history)
        return np.argmax(q_value[0])

    def load_model(self, filename):
        """Load network weights from the file `filename`."""
        self.model.load_weights(filename)

In [None]:
TEST_EPISODES = 10
model_to_load = os.path.join(model_path, 'breakout_dqn.h5')

In [None]:
from urllib.parse import parse_qs, urlparse

TEST_EPISODES = 10
link = "https://drive.google.com/open?id=1RbNmbp8EBXDom3MhhoWezdLNxY3xGPHL"
# Read the `id` query parameter explicitly instead of splitting on '=', and
# keep it in `file_id` so the builtin `id` is not shadowed.
file_id = parse_qs(urlparse(link).query)['id'][0]
# NOTE(review): `gdd` is not imported anywhere in the visible notebook;
# presumably it is google_drive_downloader's GoogleDriveDownloader -- confirm
# and add the import to the imports cell.
gdd.download_file_from_google_drive(file_id=file_id,
                                    dest_path='./model_trained.h5')
model_to_load = 'model_trained.h5'

In [None]:
model_to_load

'model_trained.h5'

In [None]:
if __name__ == "__main__":
    from gym import wrappers

    # add virtual monitor for capturing video
    env = gym.make('BreakoutDeterministic-v4')
    env = wrappers.Monitor(env, "/tmp/BreakoutDeterministic-v4", force=True)

    # Closing the monitored environment in `finally` lets the recorder
    # finalise its files even when an episode raises.
    try:
        agent = TestAgent(action_size=3)
        agent.load_model(model_to_load)

        for e in range(TEST_EPISODES):
            done = False
            dead = False

            step, score, start_life = 0, 0, 5
            observe = env.reset()

            # Take action 1 for a random number of steps (1..no_op_steps)
            for _ in range(random.randint(1, agent.no_op_steps)):
                observe, _, _, _ = env.step(1)

            # No earlier frames exist yet, so the first processed frame is
            # repeated four times to form the initial (1, 84, 84, 4) history.
            state = pre_processing(observe)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 84, 84, 4))

            while not done:
                env.render()
                step += 1

                action = agent.get_action(history)

                # Map the agent's action index 0/1/2 to environment action 1/2/3
                if action == 0:
                    real_action = 1
                elif action == 1:
                    real_action = 2
                else:
                    real_action = 3

                # On the step after a life was lost, force environment action 1
                if dead:
                    real_action = 1
                    dead = False

                observe, reward, done, info = env.step(real_action)

                # Put the new frame in front and drop the oldest of the four
                next_state = pre_processing(observe)
                next_state = np.reshape([next_state], (1, 84, 84, 1))
                next_history = np.append(next_state, history[:, :, :, :3], axis=3)

                # A drop in info['ale.lives'] marks a lost life
                if start_life > info['ale.lives']:
                    dead = True
                    start_life = info['ale.lives']

                score += reward

                history = next_history

                if done:
                    print("episode:", e, "  score:", score)
    finally:
        env.close()

In [None]:
!ls /tmp/BreakoutDeterministic-v4 -al

In [None]:
import glob

# The recorded file names contain run-specific numbers, so look the files up
# instead of hard-coding the name produced by one particular run. The
# zero-padded episode suffix makes the sorted-last entry the latest recording.
video_paths = sorted(glob.glob("/tmp/BreakoutDeterministic-v4/*.mp4"))
if not video_paths:
    raise FileNotFoundError(
        "No recorded videos found in /tmp/BreakoutDeterministic-v4")
ipython_show_video(video_paths[-1])

In [None]:
agent.epsilon, agent.epsilon_decay_step, agent.epsilon_end

(1.0, 0.09, 0.1)