# **T3D** **Program** **Understanding**

# Initialisation

In [0]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque

## Initialize the Experience Replay Memory with a size of 1e6. Then populate it with new transitions

In [0]:
class ReplayBuffer(object):
  """Fixed-capacity store of transitions used for off-policy training.

  Each transition is a 5-tuple ``(state, next_state, action, reward, done)``.
  Once ``max_size`` transitions are stored, new ones overwrite the oldest
  entries in a circular fashion.
  """

  def __init__(self, max_size = 1e6):
    """Create an empty buffer.

    Args:
      max_size: Maximum number of transitions kept. Accepts a float such as
        ``1e6`` for convenience; it is stored as an ``int``.
    """
    self.storage = []
    # Keep the capacity integral so that ``ptr`` stays a valid list index.
    self.max_size = int(max_size)
    self.ptr = 0

  def add(self, transition):
    """Store one transition, overwriting the oldest one when full.

    Args:
      transition: Tuple ``(state, next_state, action, reward, done)``.
    """
    if len(self.storage) >= self.max_size:
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Args:
      batch_size: Number of transitions to draw.

    Returns:
      Tuple ``(states, next_states, actions, rewards, dones)`` of NumPy
      arrays. ``rewards`` and ``dones`` are reshaped to ``(batch_size, 1)``.

    Raises:
      ValueError: If the buffer is empty.
    """
    if not self.storage:
      raise ValueError("cannot sample from an empty replay buffer")
    ind = np.random.randint(0, len(self.storage), batch_size)
    batch_states, batch_next_states, batch_actions = [], [], []
    batch_rewards, batch_dones = [], []

    for i in ind:
      state, next_state, action, reward, done = self.storage[i]
      # np.asarray avoids a copy when possible and, unlike
      # np.array(..., copy=False), never raises when a copy is required.
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return (np.array(batch_states),
            np.array(batch_next_states),
            np.array(batch_actions),
            np.array(batch_rewards).reshape(-1, 1),
            np.array(batch_dones).reshape(-1, 1))


## Build one DNN for the Actor model and one for Actor Target

In [0]:
class Actor(nn.Module):
  """Policy network mapping a state to an action bounded by ``max_action``."""

  def __init__(self, state_dims, action_dim, max_action):
    """Build the three fully connected layers.

    Args:
      state_dims: Size of the state vector fed to the first layer.
      action_dim: Size of the action vector produced by the last layer.
      max_action: Scale applied to the tanh output, so every action
        component lies in ``[-max_action, max_action]``.
    """
    super().__init__()
    self.layer_1 = nn.Linear(state_dims, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)
    self.max_action = max_action

  def forward(self, x):
    """Return the action for state batch ``x``."""
    hidden = F.relu(self.layer_1(x))
    hidden = F.relu(self.layer_2(hidden))
    # tanh squashes to [-1, 1]; the scale stretches that to the action range.
    squashed = torch.tanh(self.layer_3(hidden))
    return self.max_action * squashed



## Build two DNNs for the two Critic models and two DNNs for the two Critic Targets

In [0]:
class Critic(nn.Module):
  """Twin Q-networks: two independent estimators of Q(state, action).

  Both networks take the concatenated state and action and each returns one
  scalar Q-value per sample.
  """

  def __init__(self, state_dims, action_dim):
    """Build the two three-layer critic networks.

    Args:
      state_dims: Size of the state vector.
      action_dim: Size of the action vector.
    """
    super(Critic, self).__init__() #Activate the inheritance
    #First Critic Network
    self.layer_1 = nn.Linear(state_dims + action_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    # A Q-value is a single scalar per (state, action) pair, so the output
    # width is 1, not action_dim.
    self.layer_3 = nn.Linear(300, 1)

    #Second Critic Network
    self.layer_4 = nn.Linear(state_dims + action_dim, 400)
    self.layer_5 = nn.Linear(400, 300)
    self.layer_6 = nn.Linear(300, 1)

  def forward(self, x, u): #x - state, u - action
    """Return ``(Q1, Q2)``, each of shape ``(batch, 1)``."""
    # dim=1 joins state and action along the feature axis, one row per sample.
    xu = torch.cat([x, u], 1)
    # Forward propagation on first critic
    x1 = self._first_head(xu)
    # Forward propagation on second critic
    x2 = F.relu(self.layer_4(xu))
    x2 = F.relu(self.layer_5(x2))
    x2 = self.layer_6(x2)

    return x1, x2

  def Q1(self, x, u): #x - state, u - action
    """Return only the first critic's Q-value, shape ``(batch, 1)``."""
    return self._first_head(torch.cat([x, u], 1))

  def _first_head(self, xu):
    """Run the first critic network on an already concatenated input."""
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    return self.layer_3(x1)
    




## Training process. Create a T3D class, initialize variables and get ready for step 4

In [0]:
#Selecting the device(CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#Building the whole training process into class
class T3D(object):
  """Holds the actor, the twin critic, their target copies and optimizers."""

  def __init__(self, state_dims, action_dim, max_action):
    """Create the networks on ``device`` and sync targets with their models.

    Args:
      state_dims: Size of the state vector.
      action_dim: Size of the action vector.
      max_action: Bound passed to the actor for scaling its output.
    """
    #Making sure our T3D class can run with any env
    self.actor = Actor(state_dims, action_dim, max_action).to(device) #GD
    self.actor_target = Actor(state_dims, action_dim, max_action).to(device) #Polyak Avg
    # state_dict must be *called*; load_state_dict expects the mapping it
    # returns, not the bound method.
    self.actor_target.load_state_dict(self.actor.state_dict())
    #Initialising with model weights to keep them same
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

    self.critic = Critic(state_dims, action_dim).to(device) #GD
    self.critic_target = Critic(state_dims, action_dim).to(device) #Polyak Avg
    self.critic_target.load_state_dict(self.critic.state_dict())
    #Initialising with model weights to keep them same
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

    # Kept on the instance so code working with this object can clip
    # actions to the same bound the actor uses.
    self.max_action = max_action

  def select_action(self, state):
    """Return the actor's action for a single state as a flat NumPy array.

    Args:
      state: NumPy array holding one state.
    """
    # The network expects a batch: one row holding all state features.
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    return self.actor(state).cpu().data.numpy().flatten()
    # Need to convert to numpy, remember clipping?


## Training the model

In [0]:
def train(self, replay_buffer, iterations, batch_size = 100, discount = 0.99,
          tau = 0.005, policy_noise = 0.2, noise_clip=0.5, policy_freq=2):
  """Run ``iterations`` training updates on the agent passed as ``self``.

  This is written as a module-level function that takes the agent as its
  first argument; it reads ``actor``, ``actor_target``, ``critic``,
  ``critic_target`` and the two optimizers from it.

  Args:
    self: Agent object holding the networks and optimizers.
    replay_buffer: Object whose ``sample(batch_size)`` returns
      ``(states, next_states, actions, rewards, dones)`` arrays.
    iterations: Number of update steps to perform.
    batch_size: Number of transitions sampled per step.
    discount: Discount factor applied to the target Q-value.
    tau: Polyak averaging coefficient for the target networks.
    policy_noise: Standard deviation of the noise added to target actions.
    noise_clip: Absolute bound applied to that noise.
    policy_freq: The actor and the target networks are updated once every
      ``policy_freq`` iterations.
  """
  # The actor stores the action bound it was built with; read it from there
  # so this function does not depend on an extra attribute on ``self``.
  max_action = self.actor.max_action

  for it in range(iterations):
    #Step - 4: We sample a batch of transitions (s, s', a, r, done) from memory
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
    state = torch.Tensor(batch_states).to(device)
    next_state = torch.Tensor(batch_next_states).to(device)
    action = torch.Tensor(batch_actions).to(device)
    reward = torch.Tensor(batch_rewards).to(device)
    done = torch.Tensor(batch_dones).to(device)

    # The target is a fixed regression label: build it without tracking
    # gradients so nothing flows back into the target networks.
    with torch.no_grad():
      #Step - 5: From the next state s', the actor target plays the next action a'
      next_action = self.actor_target(next_state)

      #Step - 6: We add clipped Gaussian noise to a' and clamp the result to
      # the range of values supported by the env
      noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
      next_action = (next_action + noise).clamp(-max_action, max_action)

      #Step - 7: The two critic targets take (s', a') as input and return
      # two Q-values, Qt1(s', a') and Qt2(s', a')
      target_Q1, target_Q2 = self.critic_target(next_state, next_action)

      #Step - 8: We keep the min of these two Q-values
      target_Q = torch.min(target_Q1, target_Q2)

      #Step - 9: Final target of the two critic models:
      #  Qt = r + gamma * (1 - done) * min(Qt1, Qt2)
      #  done: 0 = episode not over, 1 = episode over
      target_Q = reward + (1 - done) * discount * target_Q

    #Step - 10: The two critic models take (s, a) as input and return two Q-values
    current_Q1, current_Q2 = self.critic(state, action)

    #Step - 11: Compute the critic loss
    critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

    #Step - 12: We backpropagate this critic loss and update the parameters
    # of the two critic models with the Adam optimiser
    self.critic_optimizer.zero_grad() #Resetting the gradients to zero
    critic_loss.backward() #Computing the gradients
    self.critic_optimizer.step() #Performing the weight update

    #Step - 13: Once every policy_freq iterations, we update the actor by
    # gradient ascent on the output of the first critic model
    if it % policy_freq == 0:
      #This is the DPG part
      actor_loss = -(self.critic.Q1(state, self.actor(state)).mean())
      self.actor_optimizer.zero_grad()
      actor_loss.backward()
      self.actor_optimizer.step()

      #Step - 14: Still once every policy_freq iterations, we update the
      # weights of the actor target by Polyak averaging
      for param, target_param in zip(self.actor.parameters(),
                                     self.actor_target.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

      #Step - 15: Still once every policy_freq iterations, we update the
      # weights of the critic target by Polyak averaging
      for param, target_param in zip(self.critic.parameters(),
                                     self.critic_target.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    #T3D is done now



