<a href="https://colab.research.google.com/github/gitHubAndyLee2020/OpenAI_Gym_RL_Algorithms_Database/blob/main/TD3_Module.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### TD3

> About

- Consists of Actor and two Q networks.
- For continuous action space.
- Actor: state -> action value
- Q Network: state + action -> expected reward

> Model Dependency

- Actor + Target Q Networks (discounted reward) -> Q Networks; Q Network No.1 (delayed for stability) -> Actor; Q Networks (update) -> Target Q Networks

> Pro

- Stability and Robustness

> Con

- Sample Inefficiency

```
class Replay_buffer():
  def __init__(self, max_size=capacity):
    - Initialize transition storage

  def push(self, data):
    - Insert the data into the storage; if the storage is full, replace the oldest data in the storage

  def sample(self, batch_size):
    - Select some number of random indexes from the storage, and return the arrays for state, next state, action, reward, and done for those indexes
```

```
class Actor(nn.Module):
  def __init__(self, state_dim, action_dim, max_action):
    - Map state -> hidden layer -> action values

  def forward(self, state):
    - Feed the input state through the neural network and transform the range of the output action values to be between +-max action value
```

```
class Critic(nn.Module):
  # The model is named Critic in the codebase, but it's essentially a Q network

  def __init__(self, state_dim, action_dim):
    - Map state + action -> hidden layer -> expected reward

  def forward(self, state, action):
    - Feed the input state and action through the neural network and return the output expected reward
```

```
class TD3():
  def __init__(self, state_dim, action_dim, max_action):
    - Initialize the Actor, Critic No.1, Critic No.2, Actor Target, Critic No.1 Target, Critic No.2 Target networks
    - Initialize the optimizers for Actor, Critic No.1, Critic No.2 networks
    - Copy the weights from Actor, Critic No.1, Critic No.2 to their target networks
    - Initialize the transition storage

  def select_action(self, state):
    - Feed the state to Actor network and return the resulting action value

  def update(self, num_iteration):
    - For some number of iterations, repeat the following update loop
    # Update Loop
    - 1. Select sample state, action, next state, done, and reward tensors from the transition storage
    - 2. Get the action for the next state by feeding next state to the target Actor network, and add noise (tensor with same dimension as action, normally distributed with clipped values) to stabilize training
    - 3. Feed the next state and next action from above to each target Critic No.1 and No.2, and take the minimum Q value, and then calculate the discounted reward using formula target_Q = reward + (1 - done) * gamma * target_Q (the (1 - done) factor removes the bootstrapped term for terminal transitions)
    # Adding noise (likely to be sub-optimal) and taking the minimum of the Q value prevents the target_Q from being too high, preventing the Q networks from overpredicting
    - 4. Get the current Q value by feeding state and action to each Critic No.1 and No.2, and use the MSE between the current Q value and the target Q value as the loss to backpropagate through the Critic networks (this guides the Critic Q networks to make more accurate predictions)
    - 5. Update the Actor network only once every few iterations (every policy_delay iterations). This is to make sure the Critic networks are accurate enough before updating the Actor network, which helps to stabilise the training
    - 6. Use the negated expected value from Critic No.1 as Actor loss; the Q value represents how well the Actor is expected to be performing; the negation is needed because the optimizer minimizes the loss, so minimizing the negated Q value maximizes the expected reward (this holds for any environment, not only Pendulum-v1)
    - 7. Soft-update the target networks: blend a tau fraction of the Actor, Critic No.1, and Critic No.2 weights into their target counterparts (target = (1 - tau) * target + tau * source)

  def save(self):
    - Save the weights of the networks

  def load(self):
    - Load weights into the networks
```

```
def main():
  - If the mode is test, for some number of iterations, play the game until the end, and print the average steps and reward
  - Otherwise if the mode is train, for some number of iterations, run the training loop
  - In each training loop, collect transition data from the environment, and update the model when the storage fills up. Terminate the current training loop if the game ends and start the next training loop
```

In [None]:
from collections import namedtuple
from itertools import count

import os, sys, random
import numpy as np

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
from tensorboardX import SummaryWriter

# Hard-coded variables
# Module-level hyperparameters and run settings read directly (as globals) by
# the classes and by main() below. Entries marked "not referenced" are not used
# by any code visible in this file.
mode = 'train'  # 'train' or 'test'; any other value makes main() raise NameError
env_name = "Pendulum-v1"  # Gym environment id passed to gym.make
tau = 0.005  # interpolation factor of the soft target-network update
target_update_interval = 1  # not referenced in the code visible here
iteration = 5  # number of episodes played in test mode
learning_rate = 3e-4  # not referenced in the code visible here (the optimizers are built with Adam defaults)
gamma = 0.99  # discount factor applied to the bootstrapped target Q value
capacity = 50000  # replay buffer size; training updates start once it holds capacity - 1 transitions
num_iteration = 100000  # number of training episodes run by main()
batch_size = 100  # transitions sampled from the replay buffer per gradient step
seed = 1  # only mentioned by the commented-out seeding lines below
num_hidden_layers = 2  # not referenced in the code visible here
sample_frequency = 256  # not referenced in the code visible here
activation = 'Relu'  # not referenced in the code visible here
render = False  # when True, training episodes are rendered once the episode index reaches render_interval
log_interval = 50  # a checkpoint is saved every log_interval training episodes
load = False  # when True, saved weights are loaded before training starts
render_interval = 100  # first training episode index at which rendering may happen
policy_noise = 0.2  # std of the Gaussian noise added to the target policy's action
noise_clip = 0.5  # the target-policy noise is clamped to [-noise_clip, noise_clip]
policy_delay = 2  # actor and target networks are updated every policy_delay-th loop iteration
exploration_noise = 0.1  # std of the Gaussian noise added to actions while collecting training data
max_episode = 2000  # despite the name, used in main() as the per-episode step limit
print_log = 5  # an episode summary is printed every print_log training episodes

# Set seeds
# (seeding is currently disabled, so runs are not reproducible)
# env.seed(seed)
# torch.manual_seed(seed)
# np.random.seed(seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU when one is available
script_name = "td3"
env = gym.make(env_name)  # environment is created at import time

# Dimensions are taken from the first axis of the observation/action spaces.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])  # upper bound of the first action component; used as a symmetric bound
min_Val = torch.tensor(1e-7).float().to(device) # small constant; not referenced in the code visible here

# Used both as the TensorBoard log directory and as the prefix for checkpoint
# file names. With the values above this evaluates to './exptd3Pendulum-v1./'.
directory = './exp' + script_name + env_name +'./'
'''
Implementation of TD3 with pytorch
Original paper: https://arxiv.org/abs/1802.09477
Not the author's implementation !
'''

class Replay_buffer():
    '''
    Fixed-capacity store of transitions that overwrites the oldest entry once full.

    Code based on:
    https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
    Expects tuples of (state, next_state, action, reward, done)
    '''
    def __init__(self, max_size=capacity):
        '''
        Create an empty buffer.

        :param max_size: maximum number of transitions kept at once.
        '''
        self.storage = []
        self.max_size = max_size
        # Index of the oldest transition, i.e. the next slot to overwrite once
        # the buffer is full.
        self.ptr = 0

    def push(self, data):
        '''
        Store one transition.

        While the buffer is below capacity the transition is appended; once it
        is full the oldest transition is overwritten (ring-buffer behaviour).

        :param data: tuple of (state, next_state, action, reward, done).
        '''
        if len(self.storage) == self.max_size:
            self.storage[self.ptr] = data
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(data)

    def sample(self, batch_size):
        '''
        Draw `batch_size` stored transitions uniformly at random, with replacement.

        :param batch_size: number of transitions to draw.
        :return: tuple of numpy arrays (state, next_state, action, reward, done);
                 reward and done are reshaped to column vectors of shape (batch_size, 1).
        :raises ValueError: raised by numpy when the buffer is empty.
        '''
        ind = np.random.randint(0, len(self.storage), size=batch_size)
        x, y, u, r, d = [], [], [], [], []

        for i in ind:
            X, Y, U, R, D = self.storage[i]
            # np.asarray avoids a copy when the value is already an ndarray and
            # copies otherwise. np.array(..., copy=False) must not be used here:
            # under NumPy 2 it raises ValueError whenever a copy is required
            # (e.g. for a Python float reward).
            x.append(np.asarray(X))
            y.append(np.asarray(Y))
            u.append(np.asarray(U))
            r.append(np.asarray(R))
            d.append(np.asarray(D))

        return np.array(x), np.array(y), np.array(u), np.array(r).reshape(-1, 1), np.array(d).reshape(-1, 1)


class Actor(nn.Module):
    '''
    Policy network mapping a batch of states to bounded actions.

    Two ReLU hidden layers (400 and 300 units) feed a tanh output layer whose
    result is scaled so every action component lies in [-max_action, max_action].
    '''

    def __init__(self, state_dim, action_dim, max_action):
        '''
        :param state_dim: size of the state vector.
        :param action_dim: size of the action vector.
        :param max_action: scale applied to the tanh output.
        '''
        super().__init__()
        self.max_action = max_action

        # Layer names are part of the checkpoint format (state_dict keys).
        self.fc1 = nn.Linear(state_dim, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, action_dim)

    def forward(self, state):
        '''Return the action for each state in the batch.'''
        hidden = self.fc1(state).relu()
        hidden = self.fc2(hidden).relu()
        squashed = self.fc3(hidden).tanh()
        return squashed * self.max_action


class Critic(nn.Module):
    '''
    Q-network mapping a (state, action) batch to one scalar value per row.

    The state and action are concatenated along the feature axis and passed
    through two ReLU hidden layers (400 and 300 units) and a linear output.
    '''

    def __init__(self, state_dim, action_dim):
        '''
        :param state_dim: size of the state vector.
        :param action_dim: size of the action vector.
        '''
        super().__init__()

        # Layer names are part of the checkpoint format (state_dict keys).
        self.fc1 = nn.Linear(state_dim + action_dim, 400)
        self.fc2 = nn.Linear(400, 300)
        self.fc3 = nn.Linear(300, 1)

    def forward(self, state, action):
        '''Return the Q value, shape (batch, 1), for each state-action pair.'''
        features = torch.cat((state, action), dim=1)
        for hidden_layer in (self.fc1, self.fc2):
            features = F.relu(hidden_layer(features))
        return self.fc3(features)


class TD3():
    '''
    Twin Delayed DDPG (TD3) agent.

    Holds an actor, two critics, a target copy of each, their optimizers, the
    replay buffer and a TensorBoard writer. Hyperparameters (tau, gamma,
    batch_size, policy_noise, noise_clip, policy_delay, capacity) and the
    `device` / `directory` settings are read from module-level globals.
    '''

    def __init__(self, state_dim, action_dim, max_action):
        '''
        :param state_dim: size of the state vector.
        :param action_dim: size of the action vector.
        :param max_action: bound used to scale actor outputs and clamp target actions.
        '''
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.critic_1 = Critic(state_dim, action_dim).to(device)
        self.critic_1_target = Critic(state_dim, action_dim).to(device)
        self.critic_2 = Critic(state_dim, action_dim).to(device)
        self.critic_2_target = Critic(state_dim, action_dim).to(device)

        # NOTE(review): the module-level `learning_rate` is not passed here, so
        # all three optimizers run with Adam's default learning rate. Left
        # unchanged to keep training dynamics identical; confirm the intent.
        self.actor_optimizer = optim.Adam(self.actor.parameters())
        self.critic_1_optimizer = optim.Adam(self.critic_1.parameters())
        self.critic_2_optimizer = optim.Adam(self.critic_2.parameters())

        # Target networks start as exact copies of their online counterparts.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_1_target.load_state_dict(self.critic_1.state_dict())
        self.critic_2_target.load_state_dict(self.critic_2.state_dict())

        self.max_action = max_action
        self.memory = Replay_buffer(capacity)
        self.writer = SummaryWriter(directory)
        # Global-step counters for the TensorBoard loss curves.
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        # Number of completed update() calls.
        self.num_training = 0

    def select_action(self, state):
        '''
        Return the actor's action for a single state.

        :param state: numpy array holding one state; reshaped to a batch of one.
        :return: flat numpy array with the (noise-free) action.
        '''
        state = torch.tensor(state.reshape(1, -1)).float().to(device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def _soft_update(self, source, target):
        '''Blend `source` weights into `target`: target = (1 - tau) * target + tau * source.'''
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(((1 - tau) * target_param.data) + tau * param.data)

    def update(self, num_iteration):
        '''
        Run `num_iteration` gradient steps on mini-batches from the replay buffer.

        Both critics are updated on every step. The actor and all three target
        networks are updated only on steps whose loop index is a multiple of
        `policy_delay`.

        :param num_iteration: number of mini-batch updates to perform.
        '''
        if self.num_training % 500 == 0:
            print("====================================")
            print("model has been trained for {} times...".format(self.num_training))
            print("====================================")
        for i in range(num_iteration):
            x, y, u, r, d = self.memory.sample(batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # The target is a constant for the critic losses, so it is computed
            # without building an autograd graph through the target networks.
            with torch.no_grad():
                # Select next action according to target policy (with clipped noise):
                noise = torch.empty_like(action).normal_(0, policy_noise)
                noise = noise.clamp(-noise_clip, noise_clip)
                next_action = self.actor_target(next_state) + noise
                next_action = next_action.clamp(-self.max_action, self.max_action)

                # Compute target Q-value from the smaller of the two target critics;
                # (1 - done) removes the bootstrapped term for terminal transitions.
                target_Q1 = self.critic_1_target(next_state, next_action)
                target_Q2 = self.critic_2_target(next_state, next_action)
                target_Q = torch.min(target_Q1, target_Q2)
                target_Q = reward + (1 - done) * gamma * target_Q

            # Optimize Critic 1:
            current_Q1 = self.critic_1(state, action)
            loss_Q1 = F.mse_loss(current_Q1, target_Q)
            self.critic_1_optimizer.zero_grad()
            loss_Q1.backward()
            self.critic_1_optimizer.step()
            self.writer.add_scalar('Loss/Q1_loss', loss_Q1, global_step=self.num_critic_update_iteration)

            # Optimize Critic 2:
            current_Q2 = self.critic_2(state, action)
            loss_Q2 = F.mse_loss(current_Q2, target_Q)
            self.critic_2_optimizer.zero_grad()
            loss_Q2.backward()
            self.critic_2_optimizer.step()
            self.writer.add_scalar('Loss/Q2_loss', loss_Q2, global_step=self.num_critic_update_iteration)

            # Advance once per critic step so each logged loss gets its own
            # global_step (previously every step of one call shared a value).
            self.num_critic_update_iteration += 1

            # Delayed policy updates:
            if i % policy_delay == 0:
                # Compute actor loss: minimizing -Q maximizes the critic's value
                # of the actor's own actions.
                actor_loss = - self.critic_1(state, self.actor(state)).mean()

                # Optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()
                self.writer.add_scalar('Loss/actor_loss', actor_loss, global_step=self.num_actor_update_iteration)

                # Let the target networks slowly track the online networks.
                self._soft_update(self.actor, self.actor_target)
                self._soft_update(self.critic_1, self.critic_1_target)
                self._soft_update(self.critic_2, self.critic_2_target)

                self.num_actor_update_iteration += 1
        self.num_training += 1

    def save(self):
        '''Write the weights of all six networks to files prefixed with `directory`.'''
        # Make sure the checkpoint directory exists before writing into it.
        os.makedirs(directory, exist_ok=True)
        torch.save(self.actor.state_dict(), directory+'actor.pth')
        torch.save(self.actor_target.state_dict(), directory+'actor_target.pth')
        torch.save(self.critic_1.state_dict(), directory+'critic_1.pth')
        torch.save(self.critic_1_target.state_dict(), directory+'critic_1_target.pth')
        torch.save(self.critic_2.state_dict(), directory+'critic_2.pth')
        torch.save(self.critic_2_target.state_dict(), directory+'critic_2_target.pth')
        print("====================================")
        print("Model has been saved...")
        print("====================================")

    def load(self):
        '''
        Load the weights of all six networks from files prefixed with `directory`.

        Tensors are mapped onto the current `device`, so a checkpoint written
        on a GPU machine can be loaded on a CPU-only one.
        '''
        self.actor.load_state_dict(torch.load(directory + 'actor.pth', map_location=device))
        self.actor_target.load_state_dict(torch.load(directory + 'actor_target.pth', map_location=device))
        self.critic_1.load_state_dict(torch.load(directory + 'critic_1.pth', map_location=device))
        self.critic_1_target.load_state_dict(torch.load(directory + 'critic_1_target.pth', map_location=device))
        self.critic_2.load_state_dict(torch.load(directory + 'critic_2.pth', map_location=device))
        self.critic_2_target.load_state_dict(torch.load(directory + 'critic_2_target.pth', map_location=device))
        print("====================================")
        print("model has been loaded...")
        print("====================================")


def main():
    '''
    Train or evaluate a TD3 agent on the module-level environment.

    The branch is chosen by the module-level `mode`:

    - 'test': load saved weights and play `iteration` rendered episodes,
      printing each episode's return and final step index.
    - 'train': collect transitions with exploration noise, run gradient
      updates once the replay buffer is (almost) full, log the episode return
      to TensorBoard and save a checkpoint every `log_interval` episodes.

    The environment is used through the classic gym API: `reset()` returns the
    observation and `step()` returns a 4-tuple (observation, reward, done, info).

    :raises NameError: if `mode` is neither 'test' nor 'train'.
    '''
    agent = TD3(state_dim, action_dim, max_action)

    if mode == 'test':
        agent.load()
        for i in range(iteration):
            # Reset per episode so the printed value is this episode's return,
            # not a running total over all test episodes.
            ep_r = 0
            state = env.reset()
            for t in count():
                action = agent.select_action(state)
                next_state, reward, done, _ = env.step(np.float32(action))
                ep_r += reward
                env.render()
                if done or t == 2000:
                    print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t))
                    break
                state = next_state

    elif mode == 'train':
        print("====================================")
        print("Collection Experience...")
        print("====================================")
        if load:
            agent.load()
        for i in range(num_iteration):
            ep_r = 0
            state = env.reset()
            # max_episode is the per-episode step limit; looping over it keeps
            # the loop bound and the end-of-episode check below in agreement.
            for t in range(max_episode):

                # Perturb the deterministic action with Gaussian exploration
                # noise, then clip it back into the valid action range.
                action = agent.select_action(state)
                action = action + np.random.normal(0, exploration_noise, size=env.action_space.shape[0])
                action = action.clip(env.action_space.low, env.action_space.high)
                next_state, reward, done, _ = env.step(action)
                ep_r += reward
                if render and i >= render_interval:
                    env.render()
                # np.float was removed in NumPy 1.24; the builtin float is equivalent.
                agent.memory.push((state, next_state, action, reward, float(done)))
                # Updates start only once the buffer is (almost) full, then run
                # after every environment step.
                if len(agent.memory.storage) >= capacity - 1:
                    agent.update(10)

                state = next_state
                if done or t == max_episode - 1:
                    agent.writer.add_scalar('ep_r', ep_r, global_step=i)
                    if i % print_log == 0:
                        print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t))
                    break

            # Report the buffer fill level once every 10 episodes. The original
            # condition `i+1 % 10 == 0` parsed as `i + 1 == 0` and never fired.
            if (i + 1) % 10 == 0:
                print('Episode {},  The memory size is {} '.format(i, len(agent.memory.storage)))

            if i % log_interval == 0:
                agent.save()

    else:
        raise NameError("mode wrong!!!")

# Run the train/test entry point only when this file is executed as a script.
if __name__ == '__main__':
    main()