In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


In [2]:
class OUActionNoise(object):
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck process).

    Every call advances the discretised process by one step::

        x_next = x_prev + theta * (mu - x_prev) * dt
                 + sigma * sqrt(dt) * N(0, 1)

    and returns ``x_next``, which has the same shape as ``mu``.

    Args:
        mu: Mean the process reverts to (array-like; converted to a float
            ndarray so that ``mu.shape`` is always available).
        sigma: Scale of the random term.
        theta: Mean-reversion rate.
        dt: Step size of the discretisation.
        x0: Optional starting value; zeros shaped like ``mu`` when ``None``.
    """

    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu = np.asarray(mu, dtype=float)
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new sample."""
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        x = self.x_prev + drift + diffusion
        self.x_prev = x
        return x

    def reset(self):
        """Restart the process from ``x0`` (or from zeros when ``x0`` is None)."""
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)
        

In [6]:
class HERbuffer(object):
    """Holds the most recent transition only (one slot, no history).

    Args:
        inp_shape: Shape used to allocate the placeholder state arrays.
        num_actions: Length of the placeholder action array.

    TODO: goal relabelling is not implemented; the original notebook only
    contained a commented-out ``manipulate_buffer`` sketch that read a desired
    goal out of ``new_state_mem``.
    """

    def __init__(self, inp_shape, num_actions):
        self.inp_shape = inp_shape
        self.num_actions = num_actions
        self.state_mem = np.zeros(inp_shape)
        self.new_state_mem = np.zeros(inp_shape)
        self.action_mem = np.zeros(num_actions)
        self.reward_mem = 0
        self.terminal_mem = False

    def store_transition(self, state, action, reward, new_state, done=False):
        """Overwrite the single stored transition.

        ``terminal_mem`` holds ``1 - int(done)``: 1 while the episode is
        running and 0 once it has ended.
        """
        self.state_mem = state
        self.action_mem = action
        self.new_state_mem = new_state
        self.reward_mem = reward
        self.terminal_mem = 1 - int(done)

    def get_her_buffer(self, state, action, reward, new_state, done=False):
        """Store the given transition and hand the stored fields back.

        Returns:
            Tuple ``(state, action, reward, new_state, terminal)`` where
            ``terminal`` is the stored ``1 - int(done)`` value.
        """
        self.store_transition(state, action, reward, new_state, done)

        return (
            self.state_mem,
            self.action_mem,
            self.reward_mem,
            self.new_state_mem,
            self.terminal_mem,
        )


In [11]:
import gym

In [12]:
# Instantiate the 'FetchSlide-v1' Gym environment used by the cells below.
env = gym.make('FetchSlide-v1')

In [13]:
# Reset the environment and display the initial observation it returns
# (a dict with 'observation', 'achieved_goal' and 'desired_goal' arrays).
s = env.reset()
s

{'observation': array([ 9.95644936e-01,  7.48909349e-01,  4.12685879e-01,  9.21860028e-01,
         6.63677386e-01,  4.14022562e-01, -7.37849079e-02, -8.52319625e-02,
         1.33668285e-03, -2.02409822e-06,  1.46269158e-03, -5.32964268e-03,
         1.24548653e-04, -1.88202233e-02,  1.18415598e-03, -5.54487049e-05,
         7.46713768e-05,  1.86450993e-02, -4.64845349e-04, -4.83036132e-03,
        -1.22686993e-03,  5.01588816e-05, -2.30984296e-06,  4.63429471e-07,
         5.47231104e-05]),
 'achieved_goal': array([0.92186003, 0.66367739, 0.41402256]),
 'desired_goal': array([1.38655201, 0.88080161, 0.41401894])}

In [15]:
# Single-transition buffer sized for a 25-element state and a 4-element action.
her = HERbuffer(25,4)

In [None]:
for i in range(10):
    s = env.reset()
    # TODO: the original assignment was left without a right-hand side; a
    # randomly sampled action is a placeholder until a policy is wired in.
    a = env.action_space.sample()

In [41]:
# Scratch check: read the last-row, fifth-column entry of a 5x5 zero matrix.
a = np.zeros(shape=(5, 5))
a[-1, 4]

0.0

In [37]:
# NOTE(review): for this environment `env.reset()` returns a dict of arrays, so
# passing `s` straight to the network fails (see the AttributeError recorded
# under this cell); the 'observation' entry would have to be extracted and
# converted to a tensor first.
# NOTE(review): `actor` is not defined in any visible cell — TODO confirm
# where it is supposed to come from.
s = env.reset()
actor(s)

AttributeError: 'dict' object has no attribute 'dim'

In [7]:
class ReplayBuffer(object):
    """Fixed-size ring buffer of transitions with uniform random sampling.

    Args:
        max_size: Number of transitions kept; older ones are overwritten.
        inp_shape: Shape of one state, given either as an int (flat vector
            length) or as a sequence of ints.
        num_actions: Length of one action vector.
    """

    def __init__(self, max_size, inp_shape, num_actions):
        self.mem_size = max_size
        self.inp_shape = inp_shape
        self.num_actions = num_actions
        self.mem_cntr = 0

        # Accept a bare int as well as a shape sequence so the same value can
        # be handed to the buffer and to the networks.
        state_shape = (inp_shape,) if np.isscalar(inp_shape) else tuple(inp_shape)

        self.state_mem = np.zeros((self.mem_size, *state_shape))
        self.new_state_mem = np.zeros((self.mem_size, *state_shape))
        self.action_mem = np.zeros((self.mem_size, self.num_actions))
        self.reward_mem = np.zeros(self.mem_size)
        # Stores 1 - done, i.e. the multiplier for the bootstrapped value.
        self.terminal_mem = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, new_state, done=False):
        """Write one transition into the next slot, wrapping around when full."""
        index = self.mem_cntr % self.mem_size
        self.state_mem[index] = state
        self.action_mem[index] = action
        self.new_state_mem[index] = new_state
        self.reward_mem[index] = reward
        # Write the slot, not the attribute: assigning to self.terminal_mem
        # would replace the whole array with a scalar.
        self.terminal_mem[index] = 1 - int(done)
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly, with replacement.

        Returns:
            Tuple ``(states, actions, new_states, rewards, terminals)`` of
            arrays whose first axis has length ``batch_size``.

        Raises:
            ValueError: From ``np.random.choice`` when nothing has been stored
                yet and ``batch_size`` is non-zero.
        """
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)
        states = self.state_mem[batch]
        new_states = self.new_state_mem[batch]
        actions = self.action_mem[batch]
        rewards = self.reward_mem[batch]
        terminals = self.terminal_mem[batch]

        return states, actions, new_states, rewards, terminals


In [35]:
class ActorNw(nn.Module):
    """Deterministic policy network: state -> action in [-1, 1] per dimension.

    Two fully connected layers, each followed by LayerNorm and ReLU, feed a
    linear output layer that is squashed with ``tanh``.

    Args:
        name: Used to build the checkpoint file name (``<name>_ddpg``).
        alpha: Learning rate of the Adam optimizer owned by this network.
        inp_dims: State vector length, as an int or a one-element sequence.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        num_actions: Length of the action vector.
        action_bound: Stored only; ``forward`` does not scale by it.
        batch_size: Stored only.
        chkp_dir: Directory the checkpoint file lives in.

    Raises:
        ValueError: If ``inp_dims`` is a sequence with other than one entry.
    """

    def __init__(self, name, alpha, inp_dims, fc1_dims, fc2_dims, num_actions, action_bound=1, batch_size=64, chkp_dir='tmp/ddpg'):
        super(ActorNw, self).__init__()
        self.lr = alpha
        self.name = name
        self.inp_dims = inp_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.num_actions = num_actions
        self.action_bound = action_bound
        self.batch_size = batch_size
        self.chkp_file = os.path.join(chkp_dir, name+'_ddpg')

        # Accept both `25` and `(25,)` so the same value can be shared with
        # the critic and the replay buffer.
        if isinstance(inp_dims, (tuple, list)):
            (in_features,) = inp_dims
        else:
            in_features = inp_dims

        # NOTE(review): nn.Linear stores its weight as (out_features,
        # in_features), so size()[0] is the layer's output width. The DDPG
        # paper scales by fan-in instead; kept unchanged here — confirm which
        # one is intended.
        self.fc1 = nn.Linear(in_features, self.fc1_dims)
        f1_bound = 1./np.sqrt(self.fc1.weight.data.size()[0])
        nn.init.uniform_(self.fc1.weight.data, -f1_bound, f1_bound)
        nn.init.uniform_(self.fc1.bias.data, -f1_bound, f1_bound)
        self.bn1 = nn.LayerNorm(self.fc1_dims)

        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        f2_bound = 1./np.sqrt(self.fc2.weight.data.size()[0])
        nn.init.uniform_(self.fc2.weight.data, -f2_bound, f2_bound)
        nn.init.uniform_(self.fc2.bias.data, -f2_bound, f2_bound)
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        # Small output weights keep the initial tanh outputs close to zero.
        self.mu = nn.Linear(self.fc2_dims, self.num_actions)
        f3_bound = 0.003
        nn.init.uniform_(self.mu.weight.data, -f3_bound, f3_bound)
        nn.init.uniform_(self.mu.bias.data, -f3_bound, f3_bound)

        self.optimizer = optim.Adam(self.parameters(), lr=self.lr)
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state):
        """Map a state tensor (already on ``self.device``) to an action tensor."""
        x = self.fc1(state)
        x = F.relu(self.bn1(x))

        x = self.fc2(x)
        x = F.relu(self.bn2(x))

        x = self.mu(x)
        return torch.tanh(x)

    def save_checkpoint_to_file(self):
        """Write the network's state dict to ``self.chkp_file``."""
        print('############  Saving checkpoint  ##############')
        # torch.save does not create missing directories.
        directory = os.path.dirname(self.chkp_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        torch.save(self.state_dict(), self.chkp_file)

    def load_checkpoint_from_file(self):
        """Restore the network's state dict from ``self.chkp_file``."""
        print('#############  Loading checkpoint  ###############')
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        self.load_state_dict(torch.load(self.chkp_file, map_location=self.device))


In [25]:
class CriticNw(nn.Module):
    """Action-value network: (state, action) -> one scalar value per sample.

    The state passes through two fully connected layers (each followed by
    LayerNorm and ReLU); the action passes through one linear layer and ReLU.
    The two are added, passed through ReLU, and projected to one output.

    Args:
        name: Used to build the checkpoint file name (``<name>_ddpg``).
        beta: Learning rate of the Adam optimizer owned by this network.
        inp_dims: State vector length, as an int or a one-element sequence.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        num_actions: Length of the action vector.
        action_bound: Stored only.
        batch_size: Stored only.
        chkp_dir: Directory the checkpoint file lives in.

    Raises:
        ValueError: If ``inp_dims`` is a sequence with other than one entry.
    """

    def __init__(self, name, beta, inp_dims, fc1_dims, fc2_dims, num_actions, action_bound=1, batch_size=64, chkp_dir='tmp/ddpg'):
        super(CriticNw, self).__init__()
        self.lr = beta
        self.name = name
        self.inp_dims = inp_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.num_actions = num_actions
        self.action_bound = action_bound
        self.batch_size = batch_size
        self.chkp_file = os.path.join(chkp_dir, name+'_ddpg')

        # Accept both `25` and `(25,)` so the same value can be shared with
        # the actor and the replay buffer.
        if isinstance(inp_dims, (tuple, list)):
            (in_features,) = inp_dims
        else:
            in_features = inp_dims

        # NOTE(review): nn.Linear stores its weight as (out_features,
        # in_features), so size()[0] is the layer's output width. The DDPG
        # paper scales by fan-in instead; kept unchanged here — confirm which
        # one is intended.
        self.fc1 = nn.Linear(in_features, self.fc1_dims)
        f1_bound = 1./np.sqrt(self.fc1.weight.data.size()[0])
        nn.init.uniform_(self.fc1.weight.data, -f1_bound, f1_bound)
        nn.init.uniform_(self.fc1.bias.data, -f1_bound, f1_bound)
        self.bn1 = nn.LayerNorm(self.fc1_dims)

        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        f2_bound = 1./np.sqrt(self.fc2.weight.data.size()[0])
        nn.init.uniform_(self.fc2.weight.data, -f2_bound, f2_bound)
        nn.init.uniform_(self.fc2.bias.data, -f2_bound, f2_bound)
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        # Projects the action to the same width as the state features so the
        # two can be added element-wise in forward().
        self.action_value = nn.Linear(self.num_actions, fc2_dims)
        self.q = nn.Linear(self.fc2_dims, 1)
        f3_bound = 0.003
        nn.init.uniform_(self.q.weight.data, -f3_bound, f3_bound)
        nn.init.uniform_(self.q.bias.data, -f3_bound, f3_bound)

        self.optimizer = optim.Adam(self.parameters(), lr=self.lr)
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state, action):
        """Return the value estimate for each (state, action) pair.

        Both tensors must already be on ``self.device``.
        """
        state_value = self.fc1(state)
        state_value = F.relu(self.bn1(state_value))

        state_value = self.fc2(state_value)
        state_value = F.relu(self.bn2(state_value))

        action_value = F.relu(self.action_value(action))
        state_action_value = F.relu(torch.add(state_value, action_value))
        state_action_value = self.q(state_action_value)

        return state_action_value

    def save_checkpoint_to_file(self):
        """Write the network's state dict to ``self.chkp_file``."""
        print('############  Saving checkpoint  ##############')
        # torch.save does not create missing directories.
        directory = os.path.dirname(self.chkp_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        torch.save(self.state_dict(), self.chkp_file)

    def load_checkpoint_from_file(self):
        """Restore the network's state dict from ``self.chkp_file``."""
        print('#############  Loading checkpoint  ###############')
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        self.load_state_dict(torch.load(self.chkp_file, map_location=self.device))


In [18]:
class Agent(object):
    """DDPG agent: online actor/critic, their target copies and a replay buffer.

    Args:
        alpha: Learning rate handed to the actor networks.
        beta: Learning rate handed to the critic networks.
        input_dims: State dimensions, passed unchanged to the replay buffer
            and to every network.
        tau: Soft-update rate for the target networks.
        env: Accepted for interface compatibility; not used by this class.
        num_actions: Length of the action vector.
        layer1_size: Width of the first hidden layer of every network.
        layer2_size: Width of the second hidden layer of every network.
        gamma: Discount factor.
        batch_size: Number of transitions sampled per learning step.
        max_size: Capacity of the replay buffer.
    """

    def __init__(self, alpha, beta, input_dims, tau, env, num_actions, layer1_size, layer2_size, gamma=0.99, batch_size=64, max_size=100000):
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.tau = tau
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.memory = ReplayBuffer(max_size, input_dims, num_actions)

        self.actor = ActorNw('Actor', alpha, input_dims, layer1_size, layer2_size, num_actions)

        self.critic = CriticNw('Critic', beta, input_dims, layer1_size, layer2_size, num_actions)

        self.target_actor = ActorNw('TargetActor', alpha, input_dims, layer1_size, layer2_size, num_actions)

        self.target_critic = CriticNw('TargetCritic', beta, input_dims, layer1_size, layer2_size, num_actions)

        self.noise = OUActionNoise(mu=np.zeros(num_actions))

        # tau=1 makes the targets exact copies of the online networks.
        self.update_network_parameters(tau=1)

        # Frozen copies of the initial online weights; check_actor_params()
        # compares the current weights against these.
        self._initial_actor_params = self._snapshot(self.actor)
        self._initial_critic_params = self._snapshot(self.critic)

    @staticmethod
    def _snapshot(network):
        """Return detached clones of a network's named parameters."""
        return {name: param.detach().clone() for name, param in network.named_parameters()}

    @staticmethod
    def _soft_update(online, target, tau):
        """In place: target <- tau * online + (1 - tau) * target."""
        with torch.no_grad():
            for target_param, param in zip(target.parameters(), online.parameters()):
                target_param.copy_(tau * param + (1.0 - tau) * target_param)

    def choose_action(self, observation):
        """Return the actor's action for ``observation`` plus exploration noise.

        Args:
            observation: Array-like state accepted by ``torch.tensor``.

        Returns:
            The noisy action as a NumPy array.
        """
        self.actor.eval()
        observation = torch.tensor(observation, dtype=torch.float).to(self.actor.device)
        # Acting needs no gradients.
        with torch.no_grad():
            mu = self.actor.forward(observation)
        noise = torch.tensor(self.noise(), dtype=torch.float).to(self.actor.device)
        mu_prime = mu + noise
        self.actor.train()
        return mu_prime.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        """Run one critic update, one actor update and one target soft update.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, new_states, rewards, done = self.memory.sample_buffer(self.batch_size)

        device = self.critic.device
        states = torch.tensor(states, dtype=torch.float).to(device)
        actions = torch.tensor(actions, dtype=torch.float).to(device)
        new_states = torch.tensor(new_states, dtype=torch.float).to(device)
        # Column vectors so they broadcast against the critic's one-column output.
        rewards = torch.tensor(rewards, dtype=torch.float).to(device).view(-1, 1)
        # The buffer stores 1 - done, so this is 0 for terminal transitions.
        not_done = torch.tensor(done, dtype=torch.float).to(device).view(-1, 1)

        self.target_actor.eval()
        self.target_critic.eval()

        # Bellman target r + gamma * (1 - done) * Q_targ(s', mu_targ(s')).
        # It is a constant for the critic update, hence no_grad.
        with torch.no_grad():
            target_actions = self.target_actor.forward(new_states)
            next_value = self.target_critic.forward(new_states, target_actions).view(-1, 1)
            target_value = rewards + self.gamma * not_done * next_value

        # Gradient descent on the critic's squared error against the target.
        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_value = self.critic.forward(states, actions)
        critic_loss = F.mse_loss(critic_value, target_value)
        critic_loss.backward()
        self.critic.optimizer.step()
        self.critic.eval()

        # Gradient ascent on Q(s, mu(s)), written as descent on its negative.
        self.actor.train()
        self.actor.optimizer.zero_grad()
        mu_theta = self.actor.forward(states)
        actor_obj_function = -self.critic.forward(states, mu_theta)
        actor_obj_function = torch.mean(actor_obj_function)
        actor_obj_function.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        """Move both target networks toward their online counterparts.

        Args:
            tau: Interpolation weight of the online parameters; defaults to
                ``self.tau``. ``tau=1`` copies the online weights outright.
        """
        if tau is None:
            tau = self.tau

        # phi_targ <- tau * phi + (1 - tau) * phi_targ
        self._soft_update(self.critic, self.target_critic, tau)

        # theta_targ <- tau * theta + (1 - tau) * theta_targ
        self._soft_update(self.actor, self.target_actor, tau)

    def save_models(self):
        """Checkpoint all four networks."""
        self.actor.save_checkpoint_to_file()
        self.critic.save_checkpoint_to_file()
        self.target_actor.save_checkpoint_to_file()
        self.target_critic.save_checkpoint_to_file()

    def load_models(self):
        """Restore all four networks from their checkpoints."""
        self.actor.load_checkpoint_from_file()
        self.critic.load_checkpoint_from_file()
        self.target_actor.load_checkpoint_from_file()
        self.target_critic.load_checkpoint_from_file()

    def check_actor_params(self):
        """Print, per parameter, whether it still equals its initial value."""
        current_actor_dict = dict(self.actor.named_parameters())
        current_critic_dict = dict(self.critic.named_parameters())
        print('Checking Actor parameters')

        for param in current_actor_dict:
            print(param, torch.equal(self._initial_actor_params[param], current_actor_dict[param]))
        print('Checking critic parameters')
        for param in current_critic_dict:
            print(param, torch.equal(self._initial_critic_params[param], current_critic_dict[param]))