### T3D Implementation Steps



In [0]:
### Initialization
# Import essential packages
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque



In [3]:
# Try to import the pybullet_envs package by name; if the import fails,
# install pybullet (presumably the distribution that provides it - confirm).
package = "pybullet_envs"
try:
    __import__(package)
except ImportError:
    # "!" is an IPython/Jupyter shell escape; this line only works in a notebook.
    !pip install pybullet

Collecting pybullet
  Downloading https://files.pythonhosted.org/packages/d8/ac/a422ab8d1c57ab3f43e573b5a5f532e6afd348d81308fe66a1ecb691548e/pybullet-2.7.1-cp36-cp36m-manylinux1_x86_64.whl (95.0MB)
     |████████████████████████████████| 95.0MB 62kB/s
Installing collected packages: pybullet
Successfully installed pybullet-2.7.1


# STEP 1 
### We initialize the Experience Replay Memory with a size of 1e6.
### Then we populate it with new transitions



In [0]:
class ReplayBuffer(object):
    """Fixed-capacity experience replay memory.

    Each stored transition is a 5-tuple
    ``(state, next_state, action, reward, done)``. Once ``max_size``
    transitions are held, every new transition overwrites the oldest one,
    so the buffer behaves like a ring.
    """

    def __init__(self, max_size=1e6):
        """Create an empty buffer.

        Args:
            max_size: Maximum number of transitions kept. A float such as
                ``1e6`` is accepted and truncated to an int.

        Raises:
            ValueError: If ``max_size`` is smaller than 1.
        """
        max_size = int(max_size)
        if max_size < 1:
            raise ValueError("max_size must be at least 1")
        self.storage = []
        # Kept as an int so that ``ptr`` is always a valid list index.
        self.max_size = max_size
        self.ptr = 0

    def add(self, transition):
        """Store one transition, overwriting the oldest entry when full.

        Args:
            transition: Tuple ``(state, next_state, action, reward, done)``.
        """
        if len(self.storage) >= self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            Tuple of five NumPy arrays ``(states, next_states, actions,
            rewards, dones)``. ``rewards`` and ``dones`` are reshaped to a
            single column.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.storage:
            raise ValueError("cannot sample from an empty ReplayBuffer")
        ind = np.random.randint(0, len(self.storage), batch_size)
        batch_states, batch_next_states, batch_actions = [], [], []
        batch_rewards, batch_dones = [], []
        for i in ind:
            state, next_state, action, reward, done = self.storage[i]
            # np.asarray avoids a copy when possible and, unlike
            # np.array(..., copy=False), does not raise on NumPy 2 when a
            # copy is unavoidable.
            batch_states.append(np.asarray(state))
            batch_next_states.append(np.asarray(next_state))
            batch_actions.append(np.asarray(action))
            batch_rewards.append(np.asarray(reward))
            batch_dones.append(np.asarray(done))
        return (
            np.array(batch_states),
            np.array(batch_next_states),
            np.array(batch_actions),
            np.array(batch_rewards).reshape(-1, 1),
            np.array(batch_dones).reshape(-1, 1),
        )


# STEP 2 
### Build one DNN for the Actor model and one for Actor Target


In [0]:
class Actor(nn.Module):
    """Policy network: maps a batch of states to a batch of actions.

    Three fully connected layers (400 and 300 hidden units, ReLU) followed
    by ``tanh`` scaled by ``max_action``, so every output component lies in
    ``[-max_action, max_action]``.
    """

    def __init__(self, state_dims, action_dim, max_action):
        """Build the layers.

        Args:
            state_dims: Size of the state vector fed to the first layer.
            action_dim: Size of the action vector produced by the last layer.
            max_action: Scale applied to the tanh output.
        """
        super(Actor, self).__init__()
        self.layer_1 = nn.Linear(state_dims, 400)
        self.layer_2 = nn.Linear(400, 300)
        # Must be named ``layer_3`` (lower case) to match forward().
        self.layer_3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, x):
        """Return the action for state batch ``x``."""
        x = F.relu(self.layer_1(x))
        x = F.relu(self.layer_2(x))
        x = self.max_action * torch.tanh(self.layer_3(x))
        return x



## 	STEP 3 
### Build two DNNs for the two Critic models and two DNNs for the two Critic Targets	


In [0]:

class Critic(nn.Module):
    """Twin Q-networks sharing one module.

    Both networks take the concatenated (state, action) pair and each
    returns one scalar Q-value per sample. Layers 1-3 form the first critic
    and layers 4-6 the second.
    """

    def __init__(self, state_dims, action_dim):
        """Build both critics.

        Args:
            state_dims: Size of the state vector.
            action_dim: Size of the action vector.
        """
        super(Critic, self).__init__()
        # First Critic Network
        self.layer_1 = nn.Linear(state_dims + action_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        # A Q-value is a single scalar per (state, action) pair.
        self.layer_3 = nn.Linear(300, 1)
        # Second Critic Network
        self.layer_4 = nn.Linear(state_dims + action_dim, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)

    def forward(self, x, u):
        """Return ``(Q1, Q2)`` for state batch ``x`` and action batch ``u``."""
        # dim=1 joins along the feature axis: one row per sample holding
        # the state features followed by the action features.
        xu = torch.cat([x, u], 1)
        # Forward propagation on the first critic
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        # Forward propagation on the second critic
        x2 = F.relu(self.layer_4(xu))
        x2 = F.relu(self.layer_5(x2))
        x2 = self.layer_6(x2)
        return x1, x2

    def Q1(self, x, u):
        """Return only the first critic's Q-value (used for the actor update)."""
        xu = torch.cat([x, u], 1)
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        return x1


### STEP 4-15 
### Training process. Create a T3D class, initialize variables and get ready for step 4		


In [0]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [0]:

# Building the whole training process into a class
class T3D(object):
    """Actor-critic agent with twin critics and delayed policy updates.

    Holds an actor and a twin critic, each paired with a target copy. The
    targets start with the same weights as the trained networks and then
    follow them through Polyak averaging (steps 4-15 of the procedure).
    """

    def __init__(self, state_dims, action_dim, max_action):
        """Build the four networks and the two optimizers.

        Args:
            state_dims: Size of the environment's state vector.
            action_dim: Size of the environment's action vector.
            max_action: Largest absolute action value; used to scale the
                actor output and to clamp the noisy target action.
        """
        # Actor is trained by gradient descent; its target by Polyak averaging.
        self.actor = Actor(state_dims, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dims, action_dim, max_action).to(device)
        # Start the target with the same weights as the model.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        # Same arrangement for the critic.
        self.critic = Critic(state_dims, action_dim).to(device)
        self.critic_target = Critic(state_dims, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    @staticmethod
    def _as_tensor(array):
        """Convert an array-like to a float32 tensor on ``device``."""
        return torch.as_tensor(array, dtype=torch.float32, device=device)

    @staticmethod
    def _soft_update(source, target, tau):
        """Polyak-average ``source`` parameters into ``target`` in place."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def select_action(self, state):
        """Return the actor's action for one state as a flat NumPy array.

        Args:
            state: Array-like state; it is reshaped to a batch of one.
        """
        state = self._as_tensor(np.asarray(state).reshape(1, -1))
        # No gradient is needed when only acting.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99,
              tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Run ``iterations`` training updates on batches from ``replay_buffer``.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns
                ``(states, next_states, actions, rewards, dones)``.
            iterations: Number of update iterations to run.
            batch_size: Number of transitions sampled per iteration.
            discount: Discount factor applied to the target Q-value.
            tau: Polyak averaging rate for the target networks.
            policy_noise: Standard deviation of the Gaussian noise added to
                the target action.
            noise_clip: Absolute bound applied to that noise.
            policy_freq: The actor and both targets are updated once every
                ``policy_freq`` iterations.
        """
        for it in range(iterations):
            # STEP 4: sample a batch of transitions (s, s', a, r, done).
            (batch_states, batch_next_states, batch_actions,
             batch_rewards, batch_dones) = replay_buffer.sample(batch_size)
            state = self._as_tensor(batch_states)
            next_state = self._as_tensor(batch_next_states)
            action = self._as_tensor(batch_actions)
            reward = self._as_tensor(batch_rewards)
            done = self._as_tensor(batch_dones)

            # The target is a constant for the critic loss, so it is
            # computed without tracking gradients.
            with torch.no_grad():
                # STEP 5: the actor target plays the next action a' from s'.
                next_action = self.actor_target(next_state)

                # STEP 6: add clipped Gaussian noise to a' and clamp the
                # result to the range supported by the environment.
                noise = torch.randn_like(next_action) * policy_noise
                noise = noise.clamp(-noise_clip, noise_clip)
                next_action = (next_action + noise).clamp(
                    -self.max_action, self.max_action)

                # STEP 7: the two critic targets evaluate (s', a').
                target_Q1, target_Q2 = self.critic_target(next_state, next_action)

                # STEP 8: keep the minimum of the two Q-values.
                target_Q = torch.min(target_Q1, target_Q2)

                # STEP 9: Qt = r + gamma * min(Qt1, Qt2), with the bootstrap
                # term dropped when the episode is over (done == 1).
                target_Q = reward + (1 - done) * discount * target_Q

            # STEP 10: the two critic models evaluate (s, a).
            current_Q1, current_Q2 = self.critic(state, action)

            # STEP 11: compute the critic loss.
            critic_loss = (F.mse_loss(current_Q1, target_Q)
                           + F.mse_loss(current_Q2, target_Q))

            # STEP 12: backpropagate the critic loss and update both critics.
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # STEP 13: once every policy_freq iterations, update the actor
            # by gradient ascent on the first critic's output.
            if it % policy_freq == 0:
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # STEP 14: in the same delayed step, update the actor
                # target by Polyak averaging.
                self._soft_update(self.actor, self.actor_target, tau)

                # STEP 15: likewise update the critic target.
                self._soft_update(self.critic, self.critic_target, tau)
