<a href="https://colab.research.google.com/github/tmsk0711/RL/blob/main/DQN_ch08.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import collections
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Training hyperparameters shared by the cells below.
learning_rate = 5e-4  # step size handed to the Adam optimizer
gamma = 0.98  # discount factor applied to the bootstrapped next-state value
buffer_limt = 50_000  # replay buffer capacity (name spelled as the other cells reference it)
batch_size = 32  # number of transitions drawn per gradient step

In [None]:
class ReplayBuffer():
  """Fixed-capacity FIFO store of transitions used for experience replay.

  Every stored transition is a 5-tuple ``(s, a, r, s_prime, done_mask)``.
  When the buffer is full, appending a new transition drops the oldest one.
  """

  def __init__(self, limit=None):
    """Create an empty buffer.

    Args:
      limit: Maximum number of transitions to keep. When omitted, the
        module-level ``buffer_limt`` is used.
    """
    self.buffer = collections.deque(
        maxlen=buffer_limt if limit is None else limit)

  def put(self, transition):
    """Append one ``(s, a, r, s_prime, done_mask)`` transition."""
    self.buffer.append(transition)

  def sample(self, n):
    """Draw ``n`` distinct transitions uniformly at random and batch them.

    Args:
      n: Number of transitions to draw.

    Returns:
      A tuple ``(s, a, r, s_prime, done_mask)`` of tensors. ``s`` and
      ``s_prime`` are float tensors with one row per transition; ``a`` is
      an int64 tensor and ``r`` / ``done_mask`` are float tensors, each of
      shape ``(n, 1)``.

    Raises:
      ValueError: If ``n`` exceeds the number of stored transitions
        (raised by ``random.sample``).
    """
    mini_batch = random.sample(self.buffer, n)
    s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []

    for s, a, r, s_prime, done_mask in mini_batch:
      # States are already sequences, so they are appended as-is; the
      # scalar fields are wrapped in a list to form (n, 1) columns.
      s_lst.append(s)
      a_lst.append([a])
      r_lst.append([r])
      s_prime_lst.append(s_prime)
      # Append the per-transition mask value, not the accumulator list.
      done_mask_lst.append([done_mask])

    # Explicit dtypes: the action tensor is used as a gather index (int64),
    # and reward/mask must match the float32 network outputs.
    return (
        torch.tensor(s_lst, dtype=torch.float),
        torch.tensor(a_lst, dtype=torch.int64),
        torch.tensor(r_lst, dtype=torch.float),
        torch.tensor(s_prime_lst, dtype=torch.float),
        torch.tensor(done_mask_lst, dtype=torch.float),
    )

  def size(self):
    """Return the number of transitions currently stored."""
    return len(self.buffer)

In [None]:
class Qnet(nn.Module):
  """Fully connected network with two 128-unit ReLU hidden layers.

  Takes inputs whose last dimension is 4 and produces 2 output values.
  """

  def __init__(self):
    super().__init__()
    self.fc1 = nn.Linear(4, 128)
    self.fc2 = nn.Linear(128, 128)
    self.fc3 = nn.Linear(128, 2)

  def forward(self, x):
    """Return the raw (un-activated) outputs of the final linear layer."""
    hidden = F.relu(self.fc1(x))
    hidden = F.relu(self.fc2(hidden))
    return self.fc3(hidden)

  def sample_action(self, obs, epsilon):
    """Pick an action epsilon-greedily.

    With probability ``epsilon`` a uniformly random action in {0, 1} is
    returned; otherwise the index of the largest network output for ``obs``.
    """
    q_values = self.forward(obs)
    explore = random.random() < epsilon
    return random.randint(0, 1) if explore else q_values.argmax().item()



In [None]:
def train(q, q_target, memory, optimizer):
  """Run 10 gradient steps on ``q`` using mini-batches drawn from ``memory``.

  Each step samples ``batch_size`` transitions, builds the one-step TD target
  ``r + gamma * max_a' q_target(s', a') * done_mask`` and minimises the
  smooth L1 loss between that target and ``q(s, a)``. ``batch_size`` and
  ``gamma`` are read from module scope.

  Args:
    q: Network being trained; its parameters are expected to be the ones
      registered with ``optimizer``.
    q_target: Network used only to evaluate the next-state value.
    memory: Object exposing ``sample(n)`` that returns the tensors
      ``(s, a, r, s_prime, done_mask)``.
    optimizer: Optimizer stepped once per mini-batch.
  """
  for _ in range(10):
    s, a, r, s_prime, done_mask = memory.sample(batch_size)

    # Q-value of the action that was actually taken in each transition.
    # https://pytorch.org/docs/stable/generated/torch.gather.html
    q_a = q(s).gather(1, a)

    # The target is a constant with respect to this update, so build it
    # without tracking gradients; this also keeps q_target's .grad untouched.
    with torch.no_grad():
      max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
      # done_mask zeroes the bootstrapped term where it is 0.
      target = r + gamma * max_q_prime * done_mask

    loss = F.smooth_l1_loss(q_a, target)

    optimizer.zero_grad()
    loss.backward()  # compute gradients of the loss
    optimizer.step()  # update the parameters held by the optimizer


 > 1.
  q 네트워크의 파라미터 값을 q_target 네트워크에 그대로 복사한다. 학습이 진행되면서 q는 업데이트되지만, q_target은 load_state_dict 함수를 다시 호출하기 전까지 변하지 않고 마지막으로 복사된 파라미터를 유지한다.

 > 2. q_target은 학습 대상이 아니므로 optimizer에는 q 네트워크의 파라미터만 넘겨주며, 따라서 q 네트워크의 파라미터만 업데이트된다.



In [None]:
def main():
    """Train a DQN agent on CartPole-v1 and periodically print progress.

    Runs 10000 episodes with an epsilon-greedy policy, stores every
    transition in a replay buffer, and calls ``train`` after each episode
    once the buffer holds more than 2000 transitions. Every
    ``print_interval`` episodes the target network is synchronised with the
    online network and the average score over that interval is printed.
    """
    env = gym.make('CartPole-v1')
    q = Qnet()
    q_target = Qnet()
    # Start the target network from the same weights as the online network.
    q_target.load_state_dict(q.state_dict())
    memory = ReplayBuffer()

    print_interval = 20
    score = 0.0
    # Only q's parameters are optimised; q_target changes solely via
    # load_state_dict below.
    optimizer = optim.Adam(q.parameters(), lr=learning_rate)

    try:
        for n_epi in range(10000):
            epsilon = max(0.01, 0.08 - 0.01*(n_epi/200))  # Linear annealing from 8% to 1%

            # Newer Gym releases return (obs, info) from reset(); older ones
            # return the observation alone.
            reset_out = env.reset()
            s = reset_out[0] if isinstance(reset_out, tuple) else reset_out
            done = False

            while not done:
                a = q.sample_action(torch.from_numpy(s).float(), epsilon)

                # Newer Gym releases return a 5-tuple with separate
                # terminated/truncated flags; older ones return a 4-tuple.
                step_out = env.step(a)
                if len(step_out) == 5:
                    s_prime, r, terminated, truncated, _ = step_out
                    done = terminated or truncated
                else:
                    s_prime, r, done, _ = step_out

                done_mask = 0.0 if done else 1.0
                # The stored reward is scaled down by 100; the printed score
                # uses the raw reward.
                memory.put((s, a, r/100.0, s_prime, done_mask))
                s = s_prime
                score += r

            # Wait until the buffer has enough transitions before training.
            if memory.size() > 2000:
                train(q, q_target, memory, optimizer)

            if n_epi % print_interval == 0 and n_epi != 0:
                q_target.load_state_dict(q.state_dict())
                print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
                    n_epi, score/print_interval, memory.size(), epsilon*100))
                score = 0.0
    finally:
        # Release the environment even if training raises.
        env.close()

# Start training only when executed directly (script or notebook cell), not
# when this module is imported.
if __name__ == "__main__":
    main()

n_episode :20, score : 10.7, n_buffer : 213, eps : 7.9%
n_episode :40, score : 9.6, n_buffer : 404, eps : 7.8%
n_episode :60, score : 9.0, n_buffer : 584, eps : 7.7%
n_episode :80, score : 9.4, n_buffer : 773, eps : 7.6%
n_episode :100, score : 9.3, n_buffer : 960, eps : 7.5%
n_episode :120, score : 9.6, n_buffer : 1152, eps : 7.4%
n_episode :140, score : 10.7, n_buffer : 1365, eps : 7.3%
n_episode :160, score : 9.9, n_buffer : 1563, eps : 7.2%
n_episode :180, score : 10.2, n_buffer : 1767, eps : 7.1%
n_episode :200, score : 9.8, n_buffer : 1962, eps : 7.0%


ValueError: ignored