<a href="https://colab.research.google.com/github/thkted/DeepLearning_Study/blob/main/RL_DQN_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
# 구글 드라이브 마운트
from google.colab import drive
drive.mount('/gdrive', force_remount=True)
# /gdrive/My Drive/ (폴더명)

Mounted at /gdrive


In [9]:
# 그림파일로 렌더링 하도록 패키지 설정
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.9).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [10]:
# 필요한 모듈 설치
import tensorflow as tf
import gym
from IPython import display
import cv2
from pyvirtualdisplay import Display
from IPython import display
import matplotlib.pyplot as plt
from collections import deque
import numpy as np
import random
%matplotlib inline
Display().start()

<pyvirtualdisplay.display.Display at 0x7fc6b24eb6d0>

In [11]:
# 게임 환경
env = gym.make("CartPole-v1")

In [12]:
env.render('rgb_array')

In [13]:
# 액션 종류 수 (output)
action_num = env.action_space.n
action_num

2

In [14]:
# 상태 종류 수 (input)
state_num = env.observation_space.shape[0]
state_num

4

In [15]:
# dqn 모델 (Q 함수를 모사(예측)할 모델)
dqn_model = tf.keras.models.Sequential()
dqn_model.add(tf.keras.layers.Dense(128, input_shape=(state_num,),activation='relu'))
dqn_model.add(tf.keras.layers.Dense(action_num))
dqn_model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(0.001))

# target 모델
target_model = tf.keras.models.Sequential()
target_model.add(tf.keras.layers.Dense(128, input_shape=(state_num,),activation='relu'))
target_model.add(tf.keras.layers.Dense(action_num))
target_model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(0.001))

# dqn모델과 target모델의 값이 같도록 업데이트
target_model.set_weights(dqn_model.get_weights())

In [16]:
# 에피소드 수만큼 학습
episode_count = 1000

# 플레이를 저장할 메모리 리스트 (최근 플레이 10000개까지 기억)
memory = deque(maxlen=10000)

# 점수를 기록 할 리스트
scores = []

# E-greedy에서 탐험 할 입실론 - epsilon_decay만큼 조금씩 줄어들어 결과적으로 min값으로 변경
epsilon = 0.9
epsilon_min = 0.1
epsilon_decay = epsilon_min / epsilon
epsilon_decay = epsilon ** (1. / float(300))

# 배치 사이즈
batch_size = 64

# 리워드 감가/할인율
reward_discount_rate = 0.999

# 타겟 데이터 업데이트 비율
train_count = 0
target_update_count = 30

for episode in range(episode_count):
  state = env.reset()
  # 차원 맞추기
  state = np.reshape(state, [1, state_num])
  done = False
  total_reward = 0
  while not done:
    # 액션이 입실론 값보다 작으면 랜덤 / 아니면 DQN 모델에 물어보고 가장 점수가 높은 행동
    if np.random.rand() < epsilon:
      action = env.action_space.sample()
    else:
      q_val = dqn_model.predict(state)
      action = np.argmax(q_val[0])
    
    next_state, reward, done, _ = env.step(action)
    next_state = np.reshape(next_state, [1, state_num])
    i = (state, action, reward / 100, next_state, done)

    # 메모리에 작업내용 저장
    memory.append(i)

    # 다음 상태를 현 상태로 변경하여 계속 학습 진행
    state = next_state
    total_reward += reward

  # 메모리가 일정량 차면 학습 (배치 사이즈보단 커야 함)
  if len(memory) >= 1000:
    sample = random.sample(memory, batch_size) # memory에서 batch_size만큼 랜덤하게 뽑아서 학습
    # 학습에 쓰일 리스트
    state_batch = []
    q_val_batch = []
    # 샘플에 있던 내용으로 학습
    for state_, action_, reward_, next_state_, done_ in sample:
      q_val = dqn_model.predict(state_)

      # dqn: q = r + d_r * max(q')

      target_q_val = reward_ + reward_discount_rate * np.max(target_model.predict(next_state_)[0])

      # double dqn
      # target_q_val = np.argmax(dqn_model.predict(next_state_[0]))
      # target_q_val = target_model.predict(next_state_)[0][target_q_val]
      # target_q_val = reward_ + reward_discount_rate * target_q_val

      if done_:
        q_val[0][action_] = reward_
      else:
        q_val[0][action_] = target_q_val
      
      state_batch.append(state_[0])
      q_val_batch.append(q_val[0])
    
    # 학습 후 타겟모델을 dqn 모델로 업데이트하고, 입실론값을 줄임
    dqn_model.train_on_batch(np.array(state_batch), np.array(q_val_batch))
    if epsilon > epsilon_min:
      epsilon *= epsilon_decay
    train_count += 1

    if train_count % target_update_count == 0:
      target_model.set_weights(dqn_model.get_weights())
      print('!! Target Model Update !!')
  
  scores.append(total_reward)
  mean_score = np.mean(scores)

  print(episode + 1, total_reward, epsilon)

  if (episode + 1) % 20 == 0:
    print("Episode %d: Mean survival = %0.2lf in %d episodes" %(episode+1, mean_score, 20))
    if mean_score >= 400:
      break
    scores = []

env.close()

1 15.0 0.9
2 16.0 0.9
3 50.0 0.9
4 37.0 0.9
5 34.0 0.9
6 19.0 0.9
7 11.0 0.9
8 15.0 0.9
9 37.0 0.9
10 21.0 0.9
11 13.0 0.9
12 11.0 0.9
13 18.0 0.9
14 21.0 0.9
15 16.0 0.9
16 17.0 0.9
17 11.0 0.9
18 44.0 0.9
19 27.0 0.9
20 10.0 0.9
Episode 20: Mean survival = 22.15 in 20 episodes
21 11.0 0.9
22 37.0 0.9
23 19.0 0.9
24 23.0 0.9
25 26.0 0.9
26 34.0 0.9
27 18.0 0.9
28 15.0 0.9
29 17.0 0.9
30 21.0 0.9
31 14.0 0.9
32 25.0 0.9
33 19.0 0.9
34 18.0 0.9
35 61.0 0.9
36 34.0 0.9
37 9.0 0.9
38 23.0 0.9
39 11.0 0.9
40 14.0 0.9
Episode 40: Mean survival = 22.45 in 20 episodes
41 23.0 0.9
42 15.0 0.9
43 25.0 0.9
44 11.0 0.9
45 16.0 0.9
46 14.0 0.9
47 12.0 0.8996839739507208
48 18.0 0.8993680588708457
49 17.0 0.899052254721409
50 9.0 0.8987365614634587
51 29.0 0.8984209790580563
52 24.0 0.898105507466277
53 42.0 0.8977901466492098
54 20.0 0.8974748965679571
55 11.0 0.8971597571836352
56 12.0 0.896844728457374
57 18.0 0.8965298103503169
58 46.0 0.8962150028236212
59 12.0 0.8959003058384577
60 12.0 0.895

In [18]:
dqn_model.save('/gdrive/My Drive/dqn_model_2.h5')

In [20]:
env = gym.make('CartPole-v1')
state=env.reset()
state = np.reshape(state, [1, state_num])
done=False
# img = plt.imshow(env.render('rgb_array')) # only call this once
total_reward=0
img_avi=np.zeros((400,600,3))
fcc=cv2.VideoWriter_fourcc(*'DIVX')
out=cv2.VideoWriter('/gdrive/My Drive/dqn_2.avi',fcc,10.0,(600,400))
while not done:
    # img.set_data(env.render('rgb_array')) # just update the data
    # display.display(plt.gcf())
    # display.clear_output(wait=True)
    img_avi=env.render('rgb_array')
    action = np.argmax(dqn_model.predict(state)[0])
    # action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    next_state = np.reshape(next_state, [1, state_num])
    state = next_state
    total_reward += reward
    out.write(np.uint8(img_avi))
print(total_reward)
out.release()
cv2.destroyAllWindows()

246.0
