In [1]:
"""core.py"""
import torch
import torch.nn as nn

import copy

"""ddpg.py"""
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import gym, time
import numpy as np

In [9]:
env = gym.envs.make("MountainCarContinuous-v0")
test_env = gym.envs.make("MountainCarContinuous-v0")

In [2]:
class continuous_policy(nn.Module):
    def __init__(self, act_dim, obs_dim, hidden_layer=(400,300)):
        super().__init__()
        layer = [nn.Linear(obs_dim, hidden_layer[0]), nn.ReLU()]
        for i in range(1, len(hidden_layer)):
            layer.append(nn.Linear(hidden_layer[i-1], hidden_layer[i]))
            layer.append(nn.ReLU())
        layer.append(nn.Linear(hidden_layer[-1], act_dim))
        layer.append(nn.Tanh())
        self.policy = nn.Sequential(*layer)

    def forward(self, obs):
        return self.policy(obs)

class q_function(nn.Module):
    def __init__(self, obs_dim, hidden_layer=(400,300)):
        super().__init__()
        layer = [nn.Linear(obs_dim, hidden_layer[0]), nn.ReLU()]
        for i in range(1, len(hidden_layer)):
            layer.append(nn.Linear(hidden_layer[i-1], hidden_layer[i]))
            layer.append(nn.ReLU())
        layer.append(nn.Linear(hidden_layer[-1], 1))
        self.policy = nn.Sequential(*layer)

    def forward(self, obs):
        return self.policy(obs)

class actor_critic(nn.Module):
    def __init__(self, act_dim, obs_dim, hidden_layer=(400,300), act_limit=2):
        super().__init__()
        self.policy = continuous_policy(act_dim, obs_dim, hidden_layer)

        self.q = q_function(obs_dim+act_dim, hidden_layer)
        self.act_limit = act_limit

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.constant_(m.bias, 0)

        self.policy_targ = continuous_policy(act_dim, obs_dim, hidden_layer)
        self.q_targ = q_function(obs_dim+act_dim, hidden_layer)

        self.copy_param()

    def copy_param(self):
        self.policy_targ.load_state_dict(self.policy.state_dict())
        self.q_targ.load_state_dict(self.q.state_dict())

    def get_action(self, obs, noise_scale):
        pi = self.act_limit * self.policy(obs)
        pi += noise_scale * torch.randn_like(pi)
        pi.clamp_(max=self.act_limit, min=-self.act_limit)
        return pi.squeeze()

    def update_target(self, rho):
        # compute rho * targ_p + (1 - rho) * main_p
        for poly_p, poly_targ_p in zip(self.policy.parameters(), self.policy_targ.parameters()):
            poly_targ_p.data = rho * poly_targ_p.data + (1-rho) * poly_p.data

        for q_p, q_targ_p in zip(self.q.parameters(), self.q_targ.parameters()):
            q_targ_p.data = rho * q_targ_p.data + (1-rho) * q_p.data

    def compute_target(self, obs, gamma, rewards, done):
        # compute r + gamma * (1 - d) * Q(s', mu_targ(s'))
        pi = self.act_limit * self.policy_targ(obs)
        return (rewards + gamma * (1-done) * self.q_targ(torch.cat([obs, pi], -1)).squeeze()).detach()

    def q_function(self, obs, detach=True, action=None):
        # compute Q(s, a) or Q(s, mu(s))
        if action is None:
            pi = self.act_limit * self.policy(obs)
        else:
            pi = action
        if detach:
            pi = pi.detach()
        return self.q(torch.cat([obs, pi], -1))

In [5]:
class ReplayBuffer:
    def __init__(self, size):
        self.size, self.max_size = 0, size
        self.obs1_buf = []
        self.obs2_buf = []
        self.acts_buf = []
        self.rews_buf = []
        self.done_buf = []

    def store(self, obs, act, rew, next_obs, done):
        self.obs1_buf.append(obs)
        self.obs2_buf.append(next_obs)
        self.acts_buf.append(act)
        self.rews_buf.append(rew)
        self.done_buf.append(int(done))
        while len(self.obs1_buf) > self.max_size:
            self.obs1_buf.pop(0)
            self.obs2_buf.pop(0)
            self.acts_buf.pop(0)
            self.rews_buf.pop(0)
            self.done_buf.pop(0)

        self.size = len(self.obs1_buf)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(low=0, high=self.size, size=(batch_size,))
        obs1 = torch.FloatTensor([self.obs1_buf[i] for i in idxs])
        obs2 = torch.FloatTensor([self.obs2_buf[i] for i in idxs])
        acts = torch.FloatTensor([self.acts_buf[i] for i in idxs])
        rews = torch.FloatTensor([self.rews_buf[i] for i in idxs])
        done = torch.FloatTensor([self.done_buf[i] for i in idxs])
        return [obs1, obs2, acts, rews, done]



In [7]:
replay_size = 100

In [11]:
replay_buffer = ReplayBuffer(replay_size)

obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

act_limit = int(env.action_space.high[0])

actor_critic = actor_critic(act_dim, obs_dim, hidden_size, act_limit)

q_optimizer = optim.Adam(actor_critic.q.parameters(), q_lr)
policy_optimizer = optim.Adam(actor_critic.policy.parameters(), pi_lr)

start_time = time.time()

obs, ret, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0
total_steps = steps_per_epoch * epochs

for t in range(total_steps):
    if t > 50000:
        env.render()
    if t > start_steps:
        obs_tens = torch.from_numpy(obs).float().reshape(1,-1)
        act = actor_critic.get_action(obs_tens, act_noise).detach().numpy().reshape(-1)
    else:
        act = env.action_space.sample()

    obs2, ret, done, _ = env.step(act)

    ep_ret += ret
    ep_len += 1

    done = False if ep_len==max_ep_len else done

    replay_buffer.store(obs, act, ret, obs2, done)

    obs = obs2

    if done or (ep_len == max_ep_len):
        for _ in range(ep_len):
            obs1_tens, obs2_tens, acts_tens, rews_tens, done_tens = replay_buffer.sample_batch(batch_size)
            # compute Q(s, a)
            q = actor_critic.q_function(obs1_tens, action=acts_tens)
            # compute r + gamma * (1 - d) * Q(s', mu_targ(s'))
            q_targ = actor_critic.compute_target(obs2_tens, gamma, rews_tens, done_tens)
            # compute (Q(s, a) - y(r, s', d))^2
            q_loss = (q.squeeze()-q_targ).pow(2).mean()

            q_optimizer.zero_grad()
            q_loss.backward()
            q_optimizer.step()

            logger.store(LossQ=q_loss.item(), QVals=q.detach().numpy())

            # compute Q(s, mu(s))
            policy_loss = -actor_critic.q_function(obs1_tens, detach=False).mean()
            policy_optimizer.zero_grad()
            policy_loss.backward()
            policy_optimizer.step()

            logger.store(LossPi=policy_loss.item())

            # compute rho * targ_p + (1 - rho) * main_p
            actor_critic.update_target(polyak)

        obs, ret, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    if t > 0 and t % steps_per_epoch == 0:
        epoch = t // steps_per_epoch

NameError: name 'hidden_size' is not defined