# Twin-Delayed DDPG

## Installing the packages

In [1]:
!pip install pybullet==2.5.0
!pip uninstall gym -y
!pip install gym==0.22 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pybullet==2.5.0
  Downloading pybullet-2.5.0.tar.gz (47.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 47.4/47.4 MB 13.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pybullet
  Building wheel for pybullet (setup.py) ... done
  Created wheel for pybullet: filename=pybullet-2.5.0-cp38-cp38-linux_x86_64.whl size=65238954 sha256=efc71dc58084cc874274f5765319165ff03fe493b273336a00dbab54609bc88a
  Stored in directory: /root/.cache/pip/wheels/ad/9c/6f/a0e7cef8650c9437167b9cee2290dd9bcf61b47253e8f1d54d
Successfully built pybullet
Installing collected packages: pybullet
Successfully installed pybullet-2.5.0
Found existing installation: gym 0.25.2
Uninstalling gym-0.25.2:
  Successfully uninstalled gym-0.25.2
Looking in indexes: https://pypi.org/simple, https://u

## Importing the libraries

In [2]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
from torch.nn.modules import flatten
import torch.nn.functional as F
from gym import wrappers
#from torch.autograd import Variable
from collections import deque
import copy




In [3]:
class ReplayBuffer(object):
  """Fixed-capacity store of transitions with uniform random sampling.

  A transition is a 5-tuple ``(state, next_state, action, reward, done)``.
  Once ``max_size`` transitions are held, each new one overwrites the oldest.
  """

  def __init__(self, max_size=1e6):
    """Create an empty buffer.

    Args:
      max_size: Maximum number of transitions kept. Floats such as ``1e6``
        are accepted and truncated to an int so the write pointer stays a
        valid list index.

    Raises:
      ValueError: If ``max_size`` is smaller than 1.
    """
    self.storage = []
    self.max_size = int(max_size)
    if self.max_size < 1:
      raise ValueError("max_size must be at least 1")
    self.ptr = 0

  def add(self, transition):
    """Append ``transition``, or overwrite the oldest entry when full."""
    if len(self.storage) >= self.max_size:
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Returns:
      A tuple ``(states, next_states, actions, rewards, dones)`` of NumPy
      arrays stacked along a new first axis; ``rewards`` and ``dones`` are
      reshaped to ``(batch_size, 1)``.

    Raises:
      ValueError: If the buffer is empty.
    """
    if not self.storage:
      raise ValueError("cannot sample from an empty ReplayBuffer")
    ind = np.random.randint(0, len(self.storage), size=batch_size)
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
    for i in ind:
      state, next_state, action, reward, done = self.storage[i]
      # np.asarray avoids a copy when possible and, unlike
      # np.array(..., copy=False), never raises when a copy is required.
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return (
        np.array(batch_states),
        np.array(batch_next_states),
        np.array(batch_actions),
        np.array(batch_rewards).reshape(-1, 1),
        np.array(batch_dones).reshape(-1, 1),
    )

In [4]:
class Actor(nn.Module):
  """Policy network: maps a batch of states to actions bounded by ``max_action``.

  Two ReLU hidden layers (400 and 300 units) are followed by a tanh output
  layer whose result is scaled by ``max_action``.
  """

  def __init__(self, state_dim, action_dim, max_action):
    """Build the three linear layers for the given state/action sizes."""
    super().__init__()
    self.max_action = max_action
    self.fc1 = nn.Linear(state_dim, 400)
    self.fc2 = nn.Linear(400, 300)
    self.fc3 = nn.Linear(300, action_dim)

  def forward(self, x):
    """Return the action for each row of ``x``."""
    hidden = x
    for layer in (self.fc1, self.fc2):
      hidden = F.relu(layer(hidden))
    # tanh keeps the raw output in [-1, 1]; scaling stretches it to the action range
    return self.max_action * torch.tanh(self.fc3(hidden))


In [5]:
class Critic(nn.Module):
  """Two independent Q-networks evaluated on the same (state, action) input.

  Each network has two ReLU hidden layers (400 and 300 units) and a single
  linear output unit.
  """

  def __init__(self, state, action_space):
    """Build both networks; ``state`` and ``action_space`` are feature counts."""
    super().__init__()
    in_features = state + action_space

    # First Q-network
    self.first_fc1 = nn.Linear(in_features, 400)
    self.first_fc2 = nn.Linear(400, 300)
    self.first_fc3 = nn.Linear(300, 1)
    # Second Q-network
    self.sec_fc1 = nn.Linear(in_features, 400)
    self.sec_fc2 = nn.Linear(400, 300)
    self.sec_fc3 = nn.Linear(300, 1)

  def _first_head(self, state_action):
    """Run the first Q-network on an already concatenated input."""
    hidden = F.relu(self.first_fc1(state_action))
    hidden = F.relu(self.first_fc2(hidden))
    return self.first_fc3(hidden)

  def _second_head(self, state_action):
    """Run the second Q-network on an already concatenated input."""
    hidden = F.relu(self.sec_fc1(state_action))
    hidden = F.relu(self.sec_fc2(hidden))
    return self.sec_fc3(hidden)

  def forward(self, state, action):
    """Return the pair ``(Q1, Q2)`` for the given states and actions."""
    # Join state and action features along dim 1
    state_action = torch.cat([state, action], 1)
    q_first = self._first_head(state_action)
    q_second = self._second_head(state_action)
    return q_first, q_second

  def Q1(self, state, action):
    """Return only the first network's estimate."""
    return self._first_head(torch.cat([state, action], 1))

In [6]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Building the whole Training Process into a class

class TD3(object):
  """Twin-Delayed DDPG agent.

  Holds an actor and a twin critic (``actor_model`` / ``critic_model``), a
  deep-copied target network for each, and one Adam optimizer per trained
  network. Relies on the module-level ``Actor``, ``Critic`` and ``device``.
  """

  def __init__(self, state_dim, action_dim, max_action):
    """Build the networks on ``device``, their target copies and the optimizers."""
    self.actor_model = Actor(state_dim, action_dim, max_action).to(device)
    self.critic_model = Critic(state_dim, action_dim).to(device)
    # Targets start as exact copies of the trained networks
    self.actor_target = copy.deepcopy(self.actor_model)
    self.critic_target = copy.deepcopy(self.critic_model)
    self.critic_optimizer = torch.optim.Adam(self.critic_model.parameters(), lr=3e-4)
    self.actor_optimizer = torch.optim.Adam(self.actor_model.parameters(), lr=3e-4)
    self.max_action = max_action

  def select_action(self, state):
    """Return the actor's action for one ``state`` (a NumPy array) as a flat NumPy array."""
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    return self.actor_model(state).cpu().data.numpy().flatten()

  def train(self, replay_buffer: "ReplayBuffer", iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5,
            policy_update_freq=2):
    """Run ``iterations`` gradient steps on batches sampled from ``replay_buffer``.

    Args:
      replay_buffer: Object whose ``sample(batch_size)`` returns
        ``(states, next_states, actions, rewards, dones)`` arrays.
      iterations: Number of critic updates to perform.
      batch_size: Transitions sampled per update.
      discount: Discount factor applied to the bootstrapped target value.
      tau: Interpolation weight of the soft target-network update.
      policy_noise: Standard deviation of the noise added to target actions.
      noise_clip: Absolute bound applied to that noise.
      policy_update_freq: The actor and the targets are updated once every
        this many critic updates.
    """
    for it in range(iterations):
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)

      # Noisy target action. clamp() is not in-place, so its result must be
      # kept for the noise to actually be bounded by noise_clip.
      noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
      noise = noise.clamp(-noise_clip, noise_clip)
      next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

      # Bootstrapped target from the smaller of the two target-critic estimates;
      # detached so no gradient flows into the target networks.
      target_Q1, target_Q2 = self.critic_target(next_state, next_action)
      target_Q = torch.min(target_Q1, target_Q2)
      target = (reward + (1 - done) * discount * target_Q).detach()

      # Both critics regress onto the full target (reward included), not the
      # bare next-state value.
      curr_Q1, curr_Q2 = self.critic_model(state, action)
      critic_loss = F.mse_loss(curr_Q1, target) + F.mse_loss(curr_Q2, target)

      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      if it % policy_update_freq == 0:
        # Negated because the optimizer minimises while the actor maximises Q1
        actor_loss = -self.critic_model.Q1(state, self.actor_model(state)).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self._soft_update(self.actor_model, self.actor_target, tau)
        self._soft_update(self.critic_model, self.critic_target, tau)

  @staticmethod
  def _soft_update(model, target, tau):
    """Move ``target`` parameters towards ``model``: tau * model + (1 - tau) * target."""
    for model_param, target_param in zip(model.parameters(), target.parameters()):
      target_param.data.copy_(model_param.data * tau + (1 - tau) * target_param.data)

  @staticmethod
  def _checkpoint_prefix(filename, directory):
    """Return the path prefix shared by all checkpoint files."""
    if directory is None:
      return filename
    return os.path.join(directory, filename)

  def save(self, filename, directory=None):
    """Write the actor, critic and optimizer state dicts.

    Files are named ``<prefix>_critic``, ``<prefix>_critic_optimizer``,
    ``<prefix>_actor`` and ``<prefix>_actor_optimizer``, where ``<prefix>`` is
    ``filename``, joined onto ``directory`` when one is given.
    """
    prefix = self._checkpoint_prefix(filename, directory)
    torch.save(self.critic_model.state_dict(), prefix + "_critic")
    torch.save(self.critic_optimizer.state_dict(), prefix + "_critic_optimizer")
    torch.save(self.actor_model.state_dict(), prefix + "_actor")
    torch.save(self.actor_optimizer.state_dict(), prefix + "_actor_optimizer")

  def load(self, filename, directory=None):
    """Restore the state written by ``save`` and rebuild both target networks."""
    prefix = self._checkpoint_prefix(filename, directory)
    # map_location lets a checkpoint saved on another device load onto `device`
    self.critic_model.load_state_dict(torch.load(prefix + "_critic", map_location=device))
    self.critic_optimizer.load_state_dict(torch.load(prefix + "_critic_optimizer", map_location=device))
    self.critic_target = copy.deepcopy(self.critic_model)
    self.actor_model.load_state_dict(torch.load(prefix + "_actor", map_location=device))
    self.actor_optimizer.load_state_dict(torch.load(prefix + "_actor_optimizer", map_location=device))
    self.actor_target = copy.deepcopy(self.actor_model)
        

## We make a function that evaluates the policy by calculating its average reward over 10 episodes

In [7]:
def evaluate_policy(policy, eval_episodes=10):
  """Play ``eval_episodes`` full episodes on the global ``env`` and report the mean return.

  Actions come from ``policy.select_action``. The summed reward of all
  episodes divided by ``eval_episodes`` is printed and returned.
  """
  reward_sum = 0.
  for _episode in range(eval_episodes):
    observation, finished = env.reset(), False
    while not finished:
      chosen_action = policy.select_action(np.array(observation))
      observation, step_reward, finished, _info = env.step(chosen_action)
      reward_sum += step_reward
  mean_reward = reward_sum / eval_episodes
  print ("---------------------------------------")
  print ("Average Reward over the Evaluation Step: %f" % (mean_reward))
  print ("---------------------------------------")
  return mean_reward

## We set the parameters

In [9]:
# --- Experiment setup ---
env_name = "AntBulletEnv-v0"  # Id of the environment to train on (any continuous-action environment can be used)
seed = 0  # Random seed
save_models = True  # Whether the trained models are written to disk

# --- Schedule, in environment timesteps ---
start_timesteps = 1e4  # Actions are sampled at random before this many timesteps, and taken from the policy afterwards
eval_freq = 5e3  # Timesteps between two policy evaluations
max_timesteps = 5e5  # Total number of timesteps to run

# --- Learning hyperparameters ---
expl_noise = 0.1  # Standard deviation of the Gaussian noise added to the policy's actions while collecting data
batch_size = 100  # Transitions per training batch
discount = 0.99  # Discount factor gamma
tau = 0.05  # Target network update rate (NOTE(review): TD3.train defaults to 0.005 - confirm 0.05 is intended)
policy_noise = 0.2  # Standard deviation of the Gaussian noise added to the target actions during training
noise_clip = 0.5  # Bound on the absolute value of that target-action noise
policy_freq = 2  # Training iterations between two updates of the Actor model

## We create a file name for the two saved models: the Actor and Critic models

In [10]:
# Prefix shared by the saved model and result files: TD3_<environment>_<seed>
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print (
    "---------------------------------------",
    "Settings: %s" % (file_name),
    "---------------------------------------",
    sep="\n",
)

---------------------------------------
Settings: TD3_AntBulletEnv-v0_0
---------------------------------------


## We create the folders in which the results and the trained models will be saved

In [11]:
# Create the output folders up front. exist_ok=True makes this a no-op when
# they already exist, without a separate check-then-create step.
os.makedirs("./results", exist_ok=True)
if save_models:
  os.makedirs("./pytorch_models", exist_ok=True)

## We create the PyBullet environment

In [12]:
# Build the environment named by env_name
env = gym.make(env_name)

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


## We set seeds and we get the necessary information on the states and actions in the chosen environment

In [13]:
# Seed NumPy, PyTorch and the environment with the same value
np.random.seed(seed)
torch.manual_seed(seed)
env.seed(seed)
# Sizes and action bound the networks are built from
action_dim = env.action_space.shape[0]
state_dim = env.observation_space.shape[0]
max_action = float(env.action_space.high[0])

## We create the TD3 agent (the Actor and Critic models together with their target networks)

In [14]:
# The agent is sized from the environment's state and action dimensions
policy = TD3(state_dim=state_dim, action_dim=action_dim, max_action=max_action)

## We create the Experience Replay memory

In [15]:
# Empty experience replay memory with the default capacity spelled out
replay_buffer = ReplayBuffer(max_size=1e6)

## We define a list where all the evaluation results over 10 episodes are stored

In [None]:
# The first entry is the score of the untrained policy
evaluations = [evaluate_policy(policy, eval_episodes=10)]

## We create a new directory in which the final results (videos of the agent) will be stored

In [17]:
def mkdir(base, name):
    """Create the directory ``base/name`` if needed and return its path.

    Missing parent directories are created as well. ``exist_ok=True`` makes
    the call succeed when the directory already exists, without a separate
    existence check that another process could invalidate.
    """
    path = os.path.join(base, name)
    os.makedirs(path, exist_ok=True)
    return path
# Video recording is off here; set to True to wrap the environment in a Monitor
save_env_vid = False
# Output locations: exp/brs and exp/brs/monitor
work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')
# Episode step limit, read from the environment before any wrapping
max_episode_steps = env._max_episode_steps
if save_env_vid:
  env = wrappers.Monitor(env, monitor_dir, force = True)
  env.reset()

## We initialize the variables

In [18]:
# All counters start at zero
total_timesteps = timesteps_since_eval = episode_num = 0
# Starting in the "done" state makes the training loop reset the environment on its first pass
done = True
# Wall-clock time at which the run started
t0 = time.time()

## Training

In [None]:
# We start the main loop, which runs for max_timesteps (500,000) timesteps
while total_timesteps < max_timesteps:

  # If the episode is done
  if done:

    # If we are not at the very beginning, we train the model for as many
    # iterations as the finished episode had timesteps
    if total_timesteps != 0:
      print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
      policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

    # Every eval_freq timesteps we evaluate the policy and store the results
    if timesteps_since_eval >= eval_freq:
      timesteps_since_eval %= eval_freq
      evaluations.append(evaluate_policy(policy))
      # The model folder only exists when save_models is set, and save() takes
      # a single path prefix, so the folder is joined in here.
      if save_models:
        policy.save(os.path.join("./pytorch_models", file_name))
      np.save("./results/%s" % (file_name), evaluations)

    # When the training step is done, we reset the state of the environment
    obs = env.reset()

    # Set the Done to False
    done = False

    # Set rewards and episode timesteps to zero
    episode_reward = 0
    episode_timesteps = 0
    episode_num += 1

  # Before start_timesteps (10000) timesteps, we play random actions
  if total_timesteps < start_timesteps:
    action = env.action_space.sample()
  else: # After that, we switch to the model
    action = policy.select_action(np.array(obs))
    # If the exploration noise is not 0, we add noise to the action and clip it to the action bounds
    if expl_noise != 0:
      action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

  # The agent performs the action in the environment, then reaches the next state and receives the reward
  new_obs, reward, done, _ = env.step(action)

  # An episode cut off by the step limit is stored as not done, so the
  # critic still bootstraps from its last state
  done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)

  # We increase the total reward
  episode_reward += reward

  # We store the new transition into the Experience Replay memory (ReplayBuffer)
  replay_buffer.add((obs, new_obs, action, reward, done_bool))

  # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
  obs = new_obs
  episode_timesteps += 1
  total_timesteps += 1
  timesteps_since_eval += 1

# We add the last policy evaluation to our list of evaluations and we save our model
evaluations.append(evaluate_policy(policy))
if save_models: policy.save(os.path.join("./pytorch_models", file_name))
np.save("./results/%s" % (file_name), evaluations)

## Inference

In [None]:
!pip uninstall gym -y
!pip install gym==0.22 
!pip install pybullet
import gym
from gym import wrappers
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
from torch.nn.modules import flatten
import torch.nn.functional as F
from gym import wrappers
#from torch.autograd import Variable
from collections import deque
import copy

def mkdir(base, name):
    """Create the directory ``base/name`` if needed and return its path.

    Missing parent directories are created as well. ``exist_ok=True`` makes
    the call succeed when the directory already exists, without a separate
    existence check that another process could invalidate.
    """
    path = os.path.join(base, name)
    os.makedirs(path, exist_ok=True)
    return path
# Working directory for this run: exp/brs
work_dir = mkdir(base='exp', name='brs')

In [None]:
class ReplayBuffer(object):
  """Fixed-capacity store of transitions with uniform random sampling.

  A transition is a 5-tuple ``(state, next_state, action, reward, done)``.
  Once ``max_size`` transitions are held, each new one overwrites the oldest.
  """

  def __init__(self, max_size=1e6):
    """Create an empty buffer.

    Args:
      max_size: Maximum number of transitions kept. Floats such as ``1e6``
        are accepted and truncated to an int so the write pointer stays a
        valid list index.

    Raises:
      ValueError: If ``max_size`` is smaller than 1.
    """
    self.storage = []
    self.max_size = int(max_size)
    if self.max_size < 1:
      raise ValueError("max_size must be at least 1")
    self.ptr = 0

  def add(self, transition):
    """Append ``transition``, or overwrite the oldest entry when full."""
    if len(self.storage) >= self.max_size:
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Returns:
      A tuple ``(states, next_states, actions, rewards, dones)`` of NumPy
      arrays stacked along a new first axis; ``rewards`` and ``dones`` are
      reshaped to ``(batch_size, 1)``.

    Raises:
      ValueError: If the buffer is empty.
    """
    if not self.storage:
      raise ValueError("cannot sample from an empty ReplayBuffer")
    ind = np.random.randint(0, len(self.storage), size=batch_size)
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
    for i in ind:
      state, next_state, action, reward, done = self.storage[i]
      # np.asarray avoids a copy when possible and, unlike
      # np.array(..., copy=False), never raises when a copy is required.
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return (
        np.array(batch_states),
        np.array(batch_next_states),
        np.array(batch_actions),
        np.array(batch_rewards).reshape(-1, 1),
        np.array(batch_dones).reshape(-1, 1),
    )

class Actor(nn.Module):
  """Policy network: maps a batch of states to actions bounded by ``max_action``.

  Two ReLU hidden layers (400 and 300 units) are followed by a tanh output
  layer whose result is scaled by ``max_action``.
  """

  def __init__(self, state_dim, action_dim, max_action):
    """Build the three linear layers for the given state/action sizes."""
    super().__init__()
    self.max_action = max_action
    self.fc1 = nn.Linear(state_dim, 400)
    self.fc2 = nn.Linear(400, 300)
    self.fc3 = nn.Linear(300, action_dim)

  def forward(self, x):
    """Return the action for each row of ``x``."""
    hidden = x
    for layer in (self.fc1, self.fc2):
      hidden = F.relu(layer(hidden))
    # tanh keeps the raw output in [-1, 1]; scaling stretches it to the action range
    return self.max_action * torch.tanh(self.fc3(hidden))


class Critic(nn.Module):
  """Two independent Q-networks evaluated on the same (state, action) input.

  Each network has two ReLU hidden layers (400 and 300 units) and a single
  linear output unit.
  """

  def __init__(self, state, action_space):
    """Build both networks; ``state`` and ``action_space`` are feature counts."""
    super().__init__()
    in_features = state + action_space

    # First Q-network
    self.first_fc1 = nn.Linear(in_features, 400)
    self.first_fc2 = nn.Linear(400, 300)
    self.first_fc3 = nn.Linear(300, 1)
    # Second Q-network
    self.sec_fc1 = nn.Linear(in_features, 400)
    self.sec_fc2 = nn.Linear(400, 300)
    self.sec_fc3 = nn.Linear(300, 1)

  def _first_head(self, state_action):
    """Run the first Q-network on an already concatenated input."""
    hidden = F.relu(self.first_fc1(state_action))
    hidden = F.relu(self.first_fc2(hidden))
    return self.first_fc3(hidden)

  def _second_head(self, state_action):
    """Run the second Q-network on an already concatenated input."""
    hidden = F.relu(self.sec_fc1(state_action))
    hidden = F.relu(self.sec_fc2(hidden))
    return self.sec_fc3(hidden)

  def forward(self, state, action):
    """Return the pair ``(Q1, Q2)`` for the given states and actions."""
    # Join state and action features along dim 1
    state_action = torch.cat([state, action], 1)
    q_first = self._first_head(state_action)
    q_second = self._second_head(state_action)
    return q_first, q_second

  def Q1(self, state, action):
    """Return only the first network's estimate."""
    return self._first_head(torch.cat([state, action], 1))

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Building the whole Training Process into a class

class TD3(object):
  """Twin-Delayed DDPG agent.

  Holds an actor and a twin critic (``actor_model`` / ``critic_model``), a
  deep-copied target network for each, and one Adam optimizer per trained
  network. Relies on the module-level ``Actor``, ``Critic`` and ``device``.
  """

  def __init__(self, state_dim, action_dim, max_action):
    """Build the networks on ``device``, their target copies and the optimizers."""
    self.actor_model = Actor(state_dim, action_dim, max_action).to(device)
    self.critic_model = Critic(state_dim, action_dim).to(device)
    # Targets start as exact copies of the trained networks
    self.actor_target = copy.deepcopy(self.actor_model)
    self.critic_target = copy.deepcopy(self.critic_model)
    self.critic_optimizer = torch.optim.Adam(self.critic_model.parameters(), lr=3e-4)
    self.actor_optimizer = torch.optim.Adam(self.actor_model.parameters(), lr=3e-4)
    self.max_action = max_action

  def select_action(self, state):
    """Return the actor's action for one ``state`` (a NumPy array) as a flat NumPy array."""
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    return self.actor_model(state).cpu().data.numpy().flatten()

  def train(self, replay_buffer: "ReplayBuffer", iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5,
            policy_update_freq=2):
    """Run ``iterations`` gradient steps on batches sampled from ``replay_buffer``.

    Args:
      replay_buffer: Object whose ``sample(batch_size)`` returns
        ``(states, next_states, actions, rewards, dones)`` arrays.
      iterations: Number of critic updates to perform.
      batch_size: Transitions sampled per update.
      discount: Discount factor applied to the bootstrapped target value.
      tau: Interpolation weight of the soft target-network update.
      policy_noise: Standard deviation of the noise added to target actions.
      noise_clip: Absolute bound applied to that noise.
      policy_update_freq: The actor and the targets are updated once every
        this many critic updates.
    """
    for it in range(iterations):
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)

      # Noisy target action. clamp() is not in-place, so its result must be
      # kept for the noise to actually be bounded by noise_clip.
      noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
      noise = noise.clamp(-noise_clip, noise_clip)
      next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

      # Bootstrapped target from the smaller of the two target-critic estimates;
      # detached so no gradient flows into the target networks.
      target_Q1, target_Q2 = self.critic_target(next_state, next_action)
      target_Q = torch.min(target_Q1, target_Q2)
      target = (reward + (1 - done) * discount * target_Q).detach()

      # Both critics regress onto the full target (reward included), not the
      # bare next-state value.
      curr_Q1, curr_Q2 = self.critic_model(state, action)
      critic_loss = F.mse_loss(curr_Q1, target) + F.mse_loss(curr_Q2, target)

      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      if it % policy_update_freq == 0:
        # Negated because the optimizer minimises while the actor maximises Q1
        actor_loss = -self.critic_model.Q1(state, self.actor_model(state)).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self._soft_update(self.actor_model, self.actor_target, tau)
        self._soft_update(self.critic_model, self.critic_target, tau)

  @staticmethod
  def _soft_update(model, target, tau):
    """Move ``target`` parameters towards ``model``: tau * model + (1 - tau) * target."""
    for model_param, target_param in zip(model.parameters(), target.parameters()):
      target_param.data.copy_(model_param.data * tau + (1 - tau) * target_param.data)

  @staticmethod
  def _checkpoint_prefix(filename, directory):
    """Return the path prefix shared by all checkpoint files."""
    if directory is None:
      return filename
    return os.path.join(directory, filename)

  def save(self, filename, directory=None):
    """Write the actor, critic and optimizer state dicts.

    Files are named ``<prefix>_critic``, ``<prefix>_critic_optimizer``,
    ``<prefix>_actor`` and ``<prefix>_actor_optimizer``, where ``<prefix>`` is
    ``filename``, joined onto ``directory`` when one is given.
    """
    prefix = self._checkpoint_prefix(filename, directory)
    torch.save(self.critic_model.state_dict(), prefix + "_critic")
    torch.save(self.critic_optimizer.state_dict(), prefix + "_critic_optimizer")
    torch.save(self.actor_model.state_dict(), prefix + "_actor")
    torch.save(self.actor_optimizer.state_dict(), prefix + "_actor_optimizer")

  def load(self, filename, directory=None):
    """Restore the state written by ``save`` and rebuild both target networks."""
    prefix = self._checkpoint_prefix(filename, directory)
    # map_location lets a checkpoint saved on another device load onto `device`
    self.critic_model.load_state_dict(torch.load(prefix + "_critic", map_location=device))
    self.critic_optimizer.load_state_dict(torch.load(prefix + "_critic_optimizer", map_location=device))
    self.critic_target = copy.deepcopy(self.critic_model)
    self.actor_model.load_state_dict(torch.load(prefix + "_actor", map_location=device))
    self.actor_optimizer.load_state_dict(torch.load(prefix + "_actor_optimizer", map_location=device))
    self.actor_target = copy.deepcopy(self.actor_model)
        

def evaluate_policy(policy, eval_episodes=10):
  """Play ``eval_episodes`` full episodes on the global ``env`` and report the mean return.

  Actions come from ``policy.select_action``. The summed reward of all
  episodes divided by ``eval_episodes`` is printed and returned.
  """
  reward_sum = 0.
  for _episode in range(eval_episodes):
    observation, finished = env.reset(), False
    while not finished:
      chosen_action = policy.select_action(np.array(observation))
      observation, step_reward, finished, _info = env.step(chosen_action)
      reward_sum += step_reward
  mean_reward = reward_sum / eval_episodes
  print ("---------------------------------------")
  print ("Average Reward over the Evaluation Step: %f" % (mean_reward))
  print ("---------------------------------------")
  return mean_reward

# Environment and seed identifying the trained model to load
env_name = "AntBulletEnv-v0"
seed = 0

# Same file-name prefix as the one used when the model was saved
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

eval_episodes = 10
save_env_vid = True
env = gym.make(env_name)
max_episode_steps = env._max_episode_steps
# This cell is meant to run on its own, so the monitor folder is created
# here rather than relying on a variable left over from the training cells
monitor_dir = mkdir(work_dir, 'monitor')
if save_env_vid:
  env = wrappers.Monitor(env, monitor_dir, force = True)
  env.reset()
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
policy = TD3(state_dim, action_dim, max_action)
# load() takes a single path prefix, so the model folder is joined in here
policy.load(os.path.join('./pytorch_models', file_name))
_ = evaluate_policy(policy, eval_episodes=eval_episodes)