In [None]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random

In [None]:
class DDPG_Mu(nn.Module):
    """Deterministic actor network for DDPG.

    A single hidden ReLU layer followed by a tanh output that is scaled by
    ``max_action``, so every action component lies in
    ``[-max_action, max_action]``. The module owns the Adam optimizer that
    updates its own parameters.

    Args:
        state_dim: Number of input features per state (default 3).
        hidden_dim: Width of the hidden layer (default 512).
        action_dim: Number of action components produced (default 1).
        max_action: Scale applied to the tanh output (default 2.0).
        lr: Adam learning rate (default 1e-4).
    """

    def __init__(self, state_dim=3, hidden_dim=512, action_dim=1, max_action=2.0, lr=0.0001):
        super(DDPG_Mu, self).__init__()
        self.max_action = max_action
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc_mu = nn.Linear(hidden_dim, action_dim)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, x):
        """Return the action for state tensor ``x`` (last dim = ``state_dim``)."""
        x = F.relu(self.fc1(x))
        mu = torch.tanh(self.fc_mu(x)) * self.max_action
        return mu

    def train(self, loss=True):
        """Take one optimizer step on ``loss``, or switch train/eval mode.

        This method shares its name with ``nn.Module.train(mode)``, which
        ``nn.Module.eval()`` calls internally with ``False``. A plain bool is
        therefore forwarded to the base class so ``eval()`` / ``train()`` keep
        working; anything else is treated as a scalar loss tensor.

        Args:
            loss: Scalar loss tensor to minimise, or a bool mode flag.

        Returns:
            ``self`` when called with a bool (as ``nn.Module.train`` does),
            otherwise ``None``.
        """
        if isinstance(loss, bool):
            return super(DDPG_Mu, self).train(loss)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
class DDPG_Q(nn.Module):
    """Critic network Q(s, a) for DDPG.

    State and action are embedded separately (64 ReLU units each), the two
    embeddings are concatenated, passed through one 128-unit ReLU layer and
    projected to a single Q-value. The module owns the Adam optimizer that
    updates its own parameters.

    Args:
        state_dim: Number of input features per state (default 3).
        action_dim: Number of action components (default 1).
        lr: Adam learning rate (default 1e-3).
    """

    def __init__(self, state_dim=3, action_dim=1, lr=0.001):
        super(DDPG_Q, self).__init__()
        # Layer creation order is kept so seeded initialisation is unchanged.
        self.fc_a = nn.Linear(action_dim, 64)
        self.fc_s = nn.Linear(state_dim, 64)
        self.fc_1 = nn.Linear(128, 128)
        self.fc_q = nn.Linear(128, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, x, a):
        """Return Q-values for states ``x`` and actions ``a``.

        ``x`` and ``a`` must share their leading (batch) dimensions; the
        result has a trailing dimension of size 1.
        """
        x1 = F.relu(self.fc_a(a))
        x2 = F.relu(self.fc_s(x))
        # Concatenate along the feature axis; -1 equals dim=1 for batched
        # (N, dim) input and also accepts a single unbatched vector.
        x = torch.cat([x1, x2], dim=-1)
        x = F.relu(self.fc_1(x))
        q = self.fc_q(x)
        return q

    def train(self, loss=True):
        """Take one optimizer step on ``loss``, or switch train/eval mode.

        This method shares its name with ``nn.Module.train(mode)``, which
        ``nn.Module.eval()`` calls internally with ``False``. A plain bool is
        therefore forwarded to the base class so ``eval()`` / ``train()`` keep
        working; anything else is treated as a scalar loss tensor.

        Args:
            loss: Scalar loss tensor to minimise, or a bool mode flag.

        Returns:
            ``self`` when called with a bool (as ``nn.Module.train`` does),
            otherwise ``None``.
        """
        if isinstance(loss, bool):
            return super(DDPG_Q, self).train(loss)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [None]:
# Environment plus the four networks. The "_p" suffix marks the target copy
# of the online critic (Q) and of the online actor (Mu).
env = gym.make('Pendulum-v0')
Q = DDPG_Q()
Q_p = DDPG_Q()
Mu = DDPG_Mu()
Mu_p = DDPG_Mu()

# Hyperparameters (trailing notes list other values the author recorded).
GAMMA = 0.99                  # discount factor
TAU = 1e-4                    # soft target-update coefficient (also noted: 1e-3, 0.01)
BATCH_SIZE = 64               # transitions per minibatch (also noted: 128)
ITER = 100                    # gradient iterations per training() call (also noted: 1, 200000)
PARAMETER_NOISE_COEF = 5e-4   # scale of the Gaussian noise added to actor weights

# Replay memory. Any container may be used here (list, queue, dict, ...).
BUFFER_SIZE = 600000          # max stored transitions (also noted: int(1e5), 30000)
replay_buffer = []

In [None]:
def training():
    """Run ``ITER`` DDPG updates on the online networks and track the targets.

    Each iteration samples a minibatch, regresses the critic ``Q`` onto the
    one-step TD target computed from the target networks ``Q_p`` / ``Mu_p``,
    updates the actor ``Mu`` by ascending ``Q(s, Mu(s))``, and then
    soft-updates both target networks.

    The TD target is built under ``torch.no_grad()`` so no gradient can reach
    the target networks. The earlier ``Variable(..., volatile=True)`` wrapper
    did not achieve this: ``volatile`` is ignored by PyTorch >= 0.4, so
    ``critic_loss.backward()`` also populated ``.grad`` on ``Q_p`` and
    ``Mu_p``.
    """
    for _ in range(ITER):
        state_batch, action_batch, reward_batch, nextstate_batch, done_batch = make_minibatch()

        # The networks hold float32 weights; cast every batch to match
        # regardless of the dtype the replay buffer handed back.
        state_batch = state_batch.float()
        action_batch = action_batch.float()
        reward_batch = reward_batch.float()
        nextstate_batch = nextstate_batch.float()
        done_batch = done_batch.float()

        # TD target y = r + GAMMA * (1 - done) * Q'(s', Mu'(s')), graph-free.
        with torch.no_grad():
            next_q_batch = Q_p(nextstate_batch, Mu_p(nextstate_batch))
            y_batch = reward_batch + GAMMA * (1 - done_batch) * next_q_batch

        # Critic update: minimise the squared TD error.
        critic_output = Q(state_batch, action_batch)
        critic_loss = F.mse_loss(critic_output, y_batch)
        Q.train(critic_loss)

        # Actor update: maximise Q(s, Mu(s)) by minimising its negation.
        # Gradients this leaves on Q are cleared by Q's next zero_grad().
        actor_loss = -Q(state_batch, Mu(state_batch)).mean()
        Mu.train(actor_loss)

        soft_target_update(Mu, Mu_p)
        soft_target_update(Q, Q_p)

def soft_target_update(model, model_p):
    """Blend the parameters of ``model`` into ``model_p`` in place.

    Every target parameter becomes ``TAU * online + (1 - TAU) * target``,
    pairing parameters in the order ``parameters()`` yields them. The update
    works on ``.data`` and is therefore not recorded by autograd.

    Args:
        model: Online network providing the new values.
        model_p: Target network that is overwritten.
    """
    keep = 1.0 - TAU
    for online, target in zip(model.parameters(), model_p.parameters()):
        blended = online.data * TAU + target.data * keep
        target.data.copy_(blended)
        
def init_target_param(model, model_p):
    """Make ``model_p`` an exact copy of ``model`` and verify the copy.

    The state dict of ``model`` is loaded into ``model_p``; afterwards every
    parameter pair is checked for element-wise equality.

    Args:
        model: Network whose state is copied.
        model_p: Network that receives the state.

    Raises:
        AssertionError: If any parameter pair differs after loading.
    """
    source_state = model.state_dict()
    model_p.load_state_dict(source_state)

    # Sanity check: the two networks must now hold identical values.
    for online, target in zip(model.parameters(), model_p.parameters()):
        identical = (online == target).all()
        assert identical
        
def parameter_noise(model):
    """Add Gaussian noise to every parameter of ``model`` in place.

    Each parameter receives an independent standard-normal sample of its own
    shape, scaled by ``PARAMETER_NOISE_COEF``. The change is permanent and is
    made under ``torch.no_grad()`` so autograd does not record it.

    Args:
        model: Network whose parameters are perturbed.
    """
    with torch.no_grad():
        for weights in model.parameters():
            jitter = torch.randn(weights.shape) * PARAMETER_NOISE_COEF
            weights.add_(jitter)
            
def store_transition(s, a, r, s_prime, done):
    """Append one transition to ``replay_buffer``, evicting the oldest if full.

    When the buffer already holds ``BUFFER_SIZE`` or more entries, the entry
    at index 0 is removed before the new tuple is appended.

    Args:
        s: State before the step.
        a: Action taken.
        r: Reward received.
        s_prime: State after the step.
        done: Episode-termination flag returned with the step.
    """
    transition = (s, a, r, s_prime, done)
    has_room = len(replay_buffer) < BUFFER_SIZE
    if not has_room:
        # Drop the oldest transition (front of the list).
        replay_buffer.pop(0)
    replay_buffer.append(transition)
    
def make_minibatch():
    """Draw ``BATCH_SIZE`` transitions uniformly at random from ``replay_buffer``.

    Sampling is without replacement (``random.sample``). Each field is
    assembled in one pass instead of growing tensors with repeated
    ``torch.cat`` calls. Reward and done tensors are created as float32
    explicitly, so their dtype no longer depends on whether the stored reward
    was a Python float or a NumPy scalar.

    Returns:
        Tuple ``(state_batch, action_batch, reward_batch, nextstate_batch,
        done_batch)``; every element has shape ``(BATCH_SIZE, dim)``, with
        ``dim == 1`` for reward and done.

    Raises:
        ValueError: If ``replay_buffer`` holds fewer than ``BATCH_SIZE``
            transitions (raised by ``random.sample``).
    """
    sample = random.sample(replay_buffer, BATCH_SIZE)
    states, actions, rewards, next_states, dones = zip(*sample)

    # Flatten each stored tensor to one row, then concatenate once.
    state_batch = torch.cat([s.view(1, -1) for s in states])
    action_batch = torch.cat([a.view(1, -1) for a in actions])
    nextstate_batch = torch.cat([s_prime.view(1, -1) for s_prime in next_states])

    reward_batch = torch.tensor([[r] for r in rewards], dtype=torch.float).view(-1, 1)
    done_batch = torch.tensor([float(d) for d in dones], dtype=torch.float).view(-1, 1)

    return state_batch, action_batch, reward_batch, nextstate_batch, done_batch

In [None]:
# Running reward total for the current 20-episode reporting window.
reward_sum = 0.0
reward_list = []

# Start both target networks as exact copies of their online networks.
init_target_param(Mu, Mu_p)
init_target_param(Q, Q_p)

for ep in range(20000):
    observation = env.reset()
    done = False
    while not done:
        state = torch.tensor(observation, dtype=torch.float)
        # Exploration: perturb the actor weights before choosing the action.
        parameter_noise(Mu)
        action = Mu(state).detach()
        observation, reward, done, _ = env.step([action.item()])
        reward_sum += reward
        next_state = torch.tensor(observation, dtype=torch.float)
        store_transition(state, action, reward, next_state, done)

    # Train after the episode once at least 500 transitions are stored.
    if len(replay_buffer) >= 500:
        training()

    # Report every 20 episodes. Goal: the mean score over the latest 20
    # episodes exceeds -200 (all hyperparameters may be tuned to reach it).
    if (ep + 1) % 20 != 0:
        continue
    mean_reward = reward_sum / 20.0
    print('Episode %d' % ep, ', Reward mean : %f' % mean_reward)
    if mean_reward > -200.0:
        break
    reward_sum = 0.0

  """
  if sys.path[0] == '':


Episode 19 , Reward mean : -1396.688388
Episode 39 , Reward mean : -1327.857886
Episode 59 , Reward mean : -1412.292782
Episode 79 , Reward mean : -1355.379440
Episode 99 , Reward mean : -1454.268870
Episode 119 , Reward mean : -1468.980751
Episode 139 , Reward mean : -1522.613394
Episode 159 , Reward mean : -1463.612270
Episode 179 , Reward mean : -1449.870713
Episode 199 , Reward mean : -1442.853659
Episode 219 , Reward mean : -1467.714472
Episode 239 , Reward mean : -1447.718468
Episode 259 , Reward mean : -1377.436540
Episode 279 , Reward mean : -1077.226470
Episode 299 , Reward mean : -1242.213303
Episode 319 , Reward mean : -1009.013708
Episode 339 , Reward mean : -1159.879974
Episode 359 , Reward mean : -1178.854035
Episode 379 , Reward mean : -1183.102687
Episode 399 , Reward mean : -1048.369530
Episode 419 , Reward mean : -1167.780214
Episode 439 , Reward mean : -1217.789007
Episode 459 , Reward mean : -1079.103879
Episode 479 , Reward mean : -1144.566638
Episode 499 , Reward 