In [1]:
import numpy as np

In [2]:
class OUActionNoise():
    """Temporally correlated exploration noise from an Ornstein-Uhlenbeck process.

    Every call advances the discretised process by one step:

        x_t = x_{t-1} + theta * (mu - x_{t-1}) * dt + sigma * sqrt(dt) * N(0, 1)

    Parameters
    ----------
    mu : array-like
        Long-run mean the process reverts to; its shape fixes the noise shape.
    sigma : float
        Scale of the random (diffusion) term.
    theta : float
        Rate at which the process is pulled back towards ``mu``.
    dt : float
        Time step of the discretisation.
    xNull : array-like or None
        Starting value of the process; zeros shaped like ``mu`` when ``None``.
    """

    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, xNull=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.xNull = xNull
        self.reset()

    # allows us to use the name of an object as a function
    def __call__(self):
        """Advance the process one step and return the new noise sample."""
        # Mean-reverting pull towards mu.
        drift = self.theta * (self.mu - self.xPrevious) * self.dt
        # The Gaussian sample must be *scaled* by sigma * sqrt(dt); adding it
        # unscaled would make the noise magnitude independent of sigma.
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=np.shape(self.mu))
        x = self.xPrevious + drift + diffusion
        # Remember the sample so the next call is correlated with this one.
        self.xPrevious = x

        return x

    def reset(self):
        """Restart the process from ``xNull`` (or from zeros if it is None)."""
        if self.xNull is not None:
            self.xPrevious = self.xNull
        else:
            self.xPrevious = np.zeros_like(self.mu)

In [3]:
# from collections import deque

In [4]:
class ReplayBuffer():
    """Fixed-size ring buffer of transitions for off-policy learning.

    Transitions are kept in pre-allocated NumPy arrays stored in the
    ``memory`` dict under the keys "null_state", "prime_state", "action",
    "reward" and "terminal". Once ``mem_size`` transitions have been written,
    new ones overwrite the oldest.

    Parameters
    ----------
    max_size : int or float
        Capacity of the buffer; coerced to ``int`` so values such as ``1e6``
        are accepted.
    input_shape : sequence of int
        Shape of a single state.
    action_shape : int or sequence of int
        Number of action components (or the full shape of one action).
    """

    def __init__(self, max_size, input_shape, action_shape):
        # Array sizes and sampling bounds must be integers.
        self.mem_size = int(max_size)
        self.mem_cntr = 0

        if np.isscalar(action_shape):
            action_dims = (int(action_shape),)
        else:
            action_dims = tuple(action_shape)

        self.memory = {
            "null_state" : np.zeros((self.mem_size, *input_shape)),
            "prime_state" : np.zeros((self.mem_size, *input_shape)),
            "action" : np.zeros((self.mem_size, *action_dims)),
            "reward" : np.zeros(self.mem_size),
            # the np.bool alias no longer exists in current NumPy; use the builtin
            "terminal" : np.zeros(self.mem_size, dtype=bool),
        }

    def store_transition(self, null_state, action, reward, prime_state, done):
        """Write one transition into the next slot, wrapping around when full."""
        index = self.mem_cntr % self.mem_size

        self.memory["null_state"][index] = null_state
        self.memory["prime_state"][index] = prime_state
        self.memory["action"][index] = action
        self.memory["reward"][index] = reward
        # mask used to zero the critic value of the state after a terminal step
        self.memory["terminal"][index] = done

        self.mem_cntr += 1

    def sample_replay(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly, with replacement.

        Returns
        -------
        tuple of numpy.ndarray
            ``(null_state, prime_state, action, reward, terminal)`` - note
            that the next state comes *second*.
        """
        # Only sample from slots that have actually been written.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)

        fields = ("null_state", "prime_state", "action", "reward", "terminal")
        return tuple(self.memory[field][batch] for field in fields)

In [5]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [6]:
class CriticNetwork(nn.Module):
    """State-action value network Q(s, a) for DDPG.

    The state goes through two linear layers with layer normalisation; the
    action is projected to the width of the second layer and added to the
    state features before a final ReLU and a scalar output layer.

    Parameters
    ----------
    beta : float
        Learning rate of the Adam optimizer.
    input_dims : sequence of int
        State dimensions; unpacked into the first ``nn.Linear``.
    hl_one, hl_two : int
        Widths of the first and second layers.
    n_actions : int
        Number of action components.
    chkpt_name : str
        Base name of the checkpoint file ('_ddpg' is appended).
    chkpt_dir : str
        Directory the checkpoint file lives in.
    """

    def __init__(self, beta, input_dims, hl_one, hl_two, n_actions, chkpt_name, chkpt_dir='tmp/ddpg'):
        super(CriticNetwork, self).__init__()

        self.beta = beta
        self.input_dims = input_dims
        self.hl_one = hl_one
        self.hl_two = hl_two
        self.n_actions = n_actions
        self.chkpt_name = chkpt_name
        self.chkpt_dir = chkpt_dir

        # Must be stored on the instance (save/load read it) and joined as
        # separate components so a path separator is inserted.
        self.chkpt_file = os.path.join(self.chkpt_dir, self.chkpt_name + '_ddpg')

        # define layers
        self.input = nn.Linear(*self.input_dims, self.hl_one)
        self.hidden = nn.Linear(self.hl_one, self.hl_two)

        # define normalizers
        self.lnormi = nn.LayerNorm(self.hl_one)
        self.lnormh = nn.LayerNorm(self.hl_two)

        # a calculation
        self.action_output = nn.Linear(self.n_actions, self.hl_two)

        # Q calculation
        self.critic_output = nn.Linear(self.hl_two, 1)

        # initialize layers
        # NOTE(review): nn.Linear stores weight as (out_features, in_features),
        # so size()[0] is the layer's *output* width even though the bound is
        # named fan_in. Left unchanged here - confirm which is intended.
        for layer in (self.input, self.hidden, self.action_output):
            fan_in = 1 / np.sqrt(layer.weight.data.size()[0])
            layer.weight.data.uniform_(-fan_in, fan_in)
            layer.bias.data.uniform_(-fan_in, fan_in)

        # small fixed bound for the output layer
        critic_fan_in = 0.003
        self.critic_output.weight.data.uniform_(-critic_fan_in, critic_fan_in)
        self.critic_output.bias.data.uniform_(-critic_fan_in, critic_fan_in)

        # device: every visible CUDA device, or the CPU when there is none
        cuda_count = torch.cuda.device_count()
        if cuda_count > 0:
            self.device = [torch.device('cuda:%s' % d) for d in range(cuda_count)]
        else:
            self.device = [torch.device('cpu')]
        self.target_device = 0
        # move the parameters before the optimizer is built around them
        self.route_data(self)

        # define optimizer
        self.optimizer = optim.Adam(self.parameters(), lr=self.beta, weight_decay=0.01)

    def route_data(self, data):
        """Move ``data`` (a tensor or module) to the currently targeted device."""
        return data.to(self.device[self.target_device])

    def forward(self, state, action):
        """Return Q(state, action) as a tensor with a trailing dimension of 1."""
        state_value = F.relu(self.lnormi(self.input(state)))
        state_value = self.lnormh(self.hidden(state_value))
        action_value = self.action_output(action)
        # the action features are merged in before the last non-linearity
        state_action_value = F.relu(torch.add(state_value, action_value))

        return self.critic_output(state_action_value)

    def save_checkpoint(self):
        """Write the network's state dict to ``chkpt_file``."""
        print("Saving checkpoint...")
        target_dir = os.path.dirname(self.chkpt_file)
        if target_dir:
            # torch.save does not create missing directories
            os.makedirs(target_dir, exist_ok=True)
        torch.save(self.state_dict(), self.chkpt_file)
        print("Checkpoint saved.")

    def load_checkpoint(self):
        """Restore the network's state dict from ``chkpt_file``."""
        print("Loading checkpoint...")
        # map_location lets a checkpoint saved on another device load here
        state = torch.load(self.chkpt_file, map_location=self.device[self.target_device])
        self.load_state_dict(state)
        print("Checkpoint loaded.")

In [7]:
class ActorNetwork(nn.Module):
    """Deterministic policy network mu(s) for DDPG.

    Two linear layers with layer normalisation and ReLU, followed by a linear
    output layer squashed with tanh, so every action component lies in
    [-1, 1].

    Parameters
    ----------
    alpha : float
        Learning rate of the Adam optimizer.
    input_dims : sequence of int
        State dimensions; unpacked into the first ``nn.Linear``.
    hl_one, hl_two : int
        Widths of the first and second layers.
    n_actions : int
        Number of action components produced.
    chkpt_name : str
        Base name of the checkpoint file ('_ddpg' is appended).
    chkpt_dir : str
        Directory the checkpoint file lives in.
    """

    def __init__(self, alpha, input_dims, hl_one, hl_two, n_actions, chkpt_name, chkpt_dir='tmp/ddpg'):
        super(ActorNetwork, self).__init__()

        self.alpha = alpha
        self.input_dims = input_dims
        self.hl_one = hl_one
        self.hl_two = hl_two
        self.n_actions = n_actions
        self.chkpt_name = chkpt_name
        self.chkpt_dir = chkpt_dir

        # Join directory and file name as separate components; concatenating
        # them first produced 'tmp/ddpgactor_ddpg' with no separator.
        self.chkpt_file = os.path.join(self.chkpt_dir, self.chkpt_name + '_ddpg')

        # define layers
        self.input = nn.Linear(*self.input_dims, self.hl_one)
        self.hidden = nn.Linear(self.hl_one, self.hl_two)

        # define normalizers
        self.lnormi = nn.LayerNorm(self.hl_one)
        self.lnormh = nn.LayerNorm(self.hl_two)

        # define mu
        self.mu = nn.Linear(self.hl_two, self.n_actions)

        # initialize layers
        # NOTE(review): nn.Linear stores weight as (out_features, in_features),
        # so size()[0] is the layer's *output* width even though the bound is
        # named fan_in. Left unchanged here - confirm which is intended.
        for layer in (self.input, self.hidden):
            fan_in = 1 / np.sqrt(layer.weight.data.size()[0])
            layer.weight.data.uniform_(-fan_in, fan_in)
            layer.bias.data.uniform_(-fan_in, fan_in)

        # small fixed bound for the output layer
        mu_fan_in = 3e-3
        self.mu.weight.data.uniform_(-mu_fan_in, mu_fan_in)
        self.mu.bias.data.uniform_(-mu_fan_in, mu_fan_in)

        # device: every visible CUDA device, or the CPU when there is none
        cuda_count = torch.cuda.device_count()
        if cuda_count > 0:
            self.device = [torch.device('cuda:%s' % d) for d in range(cuda_count)]
        else:
            self.device = [torch.device('cpu')]
        self.target_device = 0
        # move the parameters before the optimizer is built around them
        self.route_data(self)

        # define optimizer
        self.optimizer = optim.Adam(self.parameters(), lr=self.alpha)

    def route_data(self, data):
        """Move ``data`` (a tensor or module) to the currently targeted device."""
        return data.to(self.device[self.target_device])

    def forward(self, state):
        """Return the tanh-squashed action for ``state``."""
        x = F.relu(self.lnormi(self.input(state)))
        x = F.relu(self.lnormh(self.hidden(x)))
        # torch.tanh replaces the deprecated F.tanh
        return torch.tanh(self.mu(x))

    def save_checkpoint(self):
        """Write the network's state dict to ``chkpt_file``."""
        print("Saving checkpoint...")
        target_dir = os.path.dirname(self.chkpt_file)
        if target_dir:
            # torch.save does not create missing directories
            os.makedirs(target_dir, exist_ok=True)
        torch.save(self.state_dict(), self.chkpt_file)
        print("Checkpoint saved.")

    def load_checkpoint(self):
        """Restore the network's state dict from ``chkpt_file``."""
        print("Loading checkpoint...")
        # map_location lets a checkpoint saved on another device load here
        state = torch.load(self.chkpt_file, map_location=self.device[self.target_device])
        self.load_state_dict(state)
        print("Checkpoint loaded.")

In [9]:
class Agent():
    """DDPG agent: online and target actor/critic, replay buffer, OU noise.

    Parameters
    ----------
    alpha : float
        Actor learning rate.
    beta : float
        Critic learning rate.
    tau : float
        Blend factor for the soft target-network update.
    input_dims : sequence of int
        Shape of a single state.
    n_actions : int
        Number of action components.
    gamma : float
        Discount factor.
    hlOne, hlTwo : int
        Widths of the first and second layers of every network.
    buffer_size : int or float
        Replay buffer capacity; coerced to ``int``.
    batch_size : int
        Number of transitions sampled per learning step.
    """

    def __init__(self, alpha, beta, tau, input_dims, n_actions, gamma=0.99, hlOne=400, hlTwo=300, buffer_size=1e6, batch_size=64):
        self.alpha, self.beta, self.gamma, self.tau = alpha, beta, gamma, tau
        self.input_dims, self.n_actions = input_dims, n_actions
        self.hlOne, self.hlTwo = hlOne, hlTwo
        # The default 1e6 is a float; array sizes must be integers.
        self.buffer_size = int(buffer_size)
        self.batch_size = batch_size

        # Keyword arguments keep layer widths and action count from being
        # swapped (the networks take hl_one, hl_two *before* n_actions).
        self.actor = ActorNetwork(self.alpha, self.input_dims, hl_one=self.hlOne,
                                  hl_two=self.hlTwo, n_actions=self.n_actions, chkpt_name='actor')
        self.critic = CriticNetwork(self.beta, self.input_dims, hl_one=self.hlOne,
                                    hl_two=self.hlTwo, n_actions=self.n_actions, chkpt_name='critic')
        self.actor_prime = ActorNetwork(self.alpha, self.input_dims, hl_one=self.hlOne,
                                        hl_two=self.hlTwo, n_actions=self.n_actions, chkpt_name='target_actor')
        self.critic_prime = CriticNetwork(self.beta, self.input_dims, hl_one=self.hlOne,
                                          hl_two=self.hlTwo, n_actions=self.n_actions, chkpt_name='target_critic')

        self.buffer = ReplayBuffer(self.buffer_size, self.input_dims, self.n_actions)
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        # tau=1 makes the target networks exact copies of the online ones
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        """Return the actor's action for ``observation`` plus exploration noise.

        Returns a 1-D ``numpy.ndarray`` with one entry per action component.
        """
        self.actor.eval()
        with torch.no_grad():
            # add a batch dimension of 1 for the network
            state = torch.as_tensor(np.asarray(observation), dtype=torch.float32).unsqueeze(0)
            state = self.actor.route_data(state)
            mu = self.actor(state)
            # the noise object must be *called* to draw the next sample
            noise = torch.as_tensor(np.asarray(self.noise()), dtype=torch.float32)
            mu_prime = mu + self.actor.route_data(noise)
        self.actor.train()

        return mu_prime.detach().cpu().numpy()[0]

    def remember(self, state_null, action, reward, state_prime, done):
        """Store one transition in the replay buffer."""
        self.buffer.store_transition(state_null, action, reward, state_prime, done)

    def save_models(self):
        """Checkpoint all four networks."""
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.actor_prime.save_checkpoint()
        self.critic_prime.save_checkpoint()

    def load_models(self):
        """Restore all four networks from their checkpoints."""
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.actor_prime.load_checkpoint()
        self.critic_prime.load_checkpoint()

    def learn(self):
        """Run one critic update, one actor update and a soft target update.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if self.buffer.mem_cntr < self.batch_size:
            return

        # sample_replay returns the next states second, before the actions
        null_states, prime_states, actions, rewards, terminal = \
        self.buffer.sample_replay(self.batch_size)

        route = self.actor.route_data
        null_states = route(torch.as_tensor(null_states, dtype=torch.float32))
        prime_states = route(torch.as_tensor(prime_states, dtype=torch.float32))
        actions = route(torch.as_tensor(actions, dtype=torch.float32))
        rewards = route(torch.as_tensor(rewards, dtype=torch.float32))
        # must be boolean to act as a mask rather than as integer indices
        terminal = route(torch.as_tensor(terminal, dtype=torch.bool))

        # The target y is a constant for the critic update: no gradient may
        # flow into the target networks.
        with torch.no_grad():
            Q_prime = self.critic_prime(prime_states, self.actor_prime(prime_states)).view(-1)
            # no future value after a terminal transition
            Q_prime[terminal] = 0.0
            y = (rewards + self.gamma * Q_prime).view(self.batch_size, 1)

        # critic: regress Q(s, a) towards y
        self.critic.optimizer.zero_grad()
        Q = self.critic(null_states, actions)
        critic_loss = F.mse_loss(Q, y)
        critic_loss.backward()
        self.critic.optimizer.step()

        # actor: ascend the critic's value of the actor's own actions
        self.actor.optimizer.zero_grad()
        actor_loss = torch.mean(-self.critic(null_states, self.actor(null_states)))
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        """Blend the online weights into the target networks.

        Each target tensor becomes ``tau * online + (1 - tau) * target``.
        ``tau`` defaults to ``self.tau``; ``tau=1`` is a hard copy.
        """
        if tau is None:
            tau = self.tau

        network_pairs = ((self.actor_prime, self.actor), (self.critic_prime, self.critic))
        for target_network, null_network in network_pairs:
            online_state = null_network.state_dict()
            target_state = target_network.state_dict()
            blended = {
                name: (tau * online_state[name].clone()) + ((1 - tau) * target_state[name].clone())
                for name in online_state
            }
            target_network.load_state_dict(blended)