In [0]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque


In [0]:
class ReplayBuffer(object):
    """Fixed-capacity store of transitions used for off-policy training.

    Each transition is a 5-tuple ``(state, next_state, action, reward, done)``.
    Once ``max_size`` transitions are stored, new ones overwrite the oldest
    entries in a circular fashion.
    """

    def __init__(self, max_size=1e6):
        """Create an empty buffer.

        Args:
            max_size: maximum number of transitions kept. Coerced to ``int``
                so the float default ``1e6`` still yields an integer capacity
                and an integer write pointer.

        Raises:
            ValueError: if ``max_size`` is not at least 1.
        """
        capacity = int(max_size)
        if capacity < 1:
            raise ValueError("max_size must be at least 1")
        self.storage = []
        self.max_size = capacity
        self.ptr = 0  # index of the oldest entry, i.e. the next one to overwrite

    def add(self, transition):
        """Store one transition, overwriting the oldest entry when full."""
        if len(self.storage) >= self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Tuple ``(states, next_states, actions, rewards, dones)`` of NumPy
            arrays. ``rewards`` and ``dones`` are reshaped to column vectors
            of shape ``(batch_size, 1)``.

        Raises:
            ValueError: if the buffer is empty.
        """
        if not self.storage:
            raise ValueError("cannot sample from an empty replay buffer")
        ind = np.random.randint(0, len(self.storage), size=batch_size)
        batch = [self.storage[i] for i in ind]
        # np.asarray avoids a copy when possible and, unlike
        # np.array(..., copy=False), does not raise under NumPy 2 when a copy
        # is unavoidable (e.g. Python scalars or lists).
        states, next_states, actions, rewards, dones = (
            np.array([np.asarray(transition[field]) for transition in batch])
            for field in range(5)
        )
        return (
            states,
            next_states,
            actions,
            rewards.reshape(-1, 1),
            dones.reshape(-1, 1),
        )



In [0]:
class Actor(nn.Module):
    """Deterministic policy network mapping a state to a bounded action."""

    def __init__(self, state_dims, action_dim, max_action):
        """Build the three-layer policy network.

        Args:
            state_dims: size of the state vector fed to the first layer.
            action_dim: size of the action vector produced by the last layer.
            max_action: scale applied to the tanh output, so every action
                component lies in ``[-max_action, max_action]``.
        """
        super(Actor, self).__init__()  # register this object as an nn.Module
        self.layer_1 = nn.Linear(state_dims, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, x):
        """Return the action for state batch ``x``."""
        hidden = F.relu(self.layer_1(x))
        hidden = F.relu(self.layer_2(hidden))
        # tanh squashes to [-1, 1]; scaling maps that onto the action range.
        return self.max_action * torch.tanh(self.layer_3(hidden))



In [0]:
class Critic(nn.Module):
    """Twin Q-networks: two independent estimators of Q(state, action)."""

    def __init__(self, state_dims, action_dim):
        """Build both Q-networks.

        Args:
            state_dims: size of the state vector.
            action_dim: size of the action vector.
        """
        super(Critic, self).__init__()  # register this object as an nn.Module
        # First critic network. A Q-value is a single scalar per
        # (state, action) pair, so the output layer has exactly one unit.
        self.layer_1 = nn.Linear(state_dims + action_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)
        # Second critic network, same architecture, separate weights.
        self.layer_4 = nn.Linear(state_dims + action_dim, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)

    def forward(self, x, u):
        """Return ``(Q1, Q2)`` for state batch ``x`` and action batch ``u``."""
        # dim=1 joins state and action along the feature axis, one row per sample.
        xu = torch.cat([x, u], 1)
        return self._first_head(xu), self._second_head(xu)

    def Q1(self, x, u):
        """Return only the first critic's Q-value (used for the actor update)."""
        xu = torch.cat([x, u], 1)
        return self._first_head(xu)

    def _first_head(self, xu):
        """Forward pass of the first critic on the concatenated input."""
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        return self.layer_3(x1)

    def _second_head(self, xu):
        """Forward pass of the second critic on the concatenated input."""
        x2 = F.relu(self.layer_4(xu))
        x2 = F.relu(self.layer_5(x2))
        return self.layer_6(x2)
    



In [0]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Wrap the whole TD3 training process in a class

class T3D(object):
    """TD3 agent holding the actor, the twin critic, their targets and optimizers."""

    def __init__(self, state_dims, action_dim, max_action):
        """Create the networks so the agent can work with any environment.

        Args:
            state_dims: size of the environment's state vector.
            action_dim: size of the environment's action vector.
            max_action: largest absolute value of any action component.
        """
        # Online actor (trained by gradient descent) and its target copy
        # (updated by Polyak averaging). The target starts with the same
        # weights as the online network.
        self.actor = Actor(state_dims, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dims, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        # Online critic and its target copy, initialised the same way.
        self.critic = Critic(state_dims, action_dim).to(device)
        self.critic_target = Critic(state_dims, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state):
        """Return the actor's action for a single state as a flat NumPy array."""
        actor_device = next(self.actor.parameters()).device
        # The actor expects a batch, so present the state as one row of
        # features: shape (1, state_dims).
        state = torch.as_tensor(
            np.asarray(state), dtype=torch.float32, device=actor_device
        ).reshape(1, -1)
        with torch.no_grad():  # inference only, no graph needed
            return self.actor(state).cpu().numpy().flatten()

In [0]:
def _polyak_update(source, target, tau):
    """Blend ``source`` weights into ``target``: target <- tau*source + (1-tau)*target."""
    with torch.no_grad():
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.copy_(tau * param + (1 - tau) * target_param)


def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005,
          policy_noise=0.2, noise_clip=0.5, policy_freq=2):
    """Run ``iterations`` TD3 update steps on the agent ``self``.

    Args:
        self: agent exposing ``actor``, ``actor_target``, ``critic``,
            ``critic_target``, ``actor_optimizer``, ``critic_optimizer`` and
            ``max_action``.
        replay_buffer: object whose ``sample(batch_size)`` returns
            ``(states, next_states, actions, rewards, dones)`` arrays.
        iterations: number of critic updates to perform.
        batch_size: number of transitions sampled per update.
        discount: discount factor (gamma) applied to the next-state value.
        tau: Polyak averaging coefficient for the target networks.
        policy_noise: standard deviation of the Gaussian noise added to the
            target action.
        noise_clip: absolute bound applied to that noise.
        policy_freq: the actor and the target networks are updated once every
            ``policy_freq`` iterations.
    """
    # Keep sampled tensors on the same device as the networks.
    dev = next(self.actor.parameters()).device

    for it in range(iterations):
        # Step 4: sample a batch of transitions (s, s', a, r, done) from memory.
        batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = \
            replay_buffer.sample(batch_size)
        state = torch.as_tensor(batch_states, dtype=torch.float32, device=dev)
        next_state = torch.as_tensor(batch_next_states, dtype=torch.float32, device=dev)
        action = torch.as_tensor(batch_actions, dtype=torch.float32, device=dev)
        reward = torch.as_tensor(batch_rewards, dtype=torch.float32, device=dev)
        done = torch.as_tensor(batch_dones, dtype=torch.float32, device=dev)

        # The target is a fixed regression label, so it is built without
        # recording a computational graph.
        with torch.no_grad():
            # Step 5: from the next state s', the actor target plays a'.
            next_action = self.actor_target(next_state)
            # Step 6: add clipped Gaussian noise to a' and clamp the result
            # to the range of actions supported by the environment.
            noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
            # Step 7: both critic targets evaluate (s', a').
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            # Steps 8-9: keep the minimum and form
            # Qt = r + (1 - done) * gamma * min(Qt1, Qt2);
            # done is 1 when the episode ended, which drops the bootstrap term.
            target_Q = reward + (1 - done) * discount * torch.min(target_Q1, target_Q2)

        # Step 10: both critics evaluate the sampled (s, a).
        current_Q1, current_Q2 = self.critic(state, action)

        # Step 11: sum of the two critics' losses against the shared target.
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Step 12: backpropagate the critic loss and update both critics.
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Step 13: once every policy_freq iterations, update the actor by
        # gradient ascent on the first critic's output (the DPG part).
        if it % policy_freq == 0:
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Steps 14-15: on the same delayed schedule, move both target
            # networks toward the online networks by Polyak averaging.
            _polyak_update(self.actor, self.actor_target, tau)
            _polyak_update(self.critic, self.critic_target, tau)


# train() lives in its own notebook cell, outside the T3D class body. Bind it
# as a method when T3D is defined in this kernel so that both
# ``policy.train(...)`` and ``train(policy, ...)`` work.
if "T3D" in globals():
    T3D.train = train