In [0]:
# !pip install kivy
# !pip install torchsummary

In [2]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torchvision import datasets, transforms
from collections import deque
from torchsummary import summary
import autocar_env

[INFO   ] [Logger      ] Record log in /root/.kivy/logs/kivy_20-04-19_25.txt
[INFO   ] [Kivy        ] v1.11.1
[INFO   ] [Kivy        ] Installed at "/usr/local/lib/python3.6/dist-packages/kivy/__init__.py"
[INFO   ] [Python      ] v3.6.9 (default, Nov  7 2019, 10:44:02) 
[GCC 8.3.0]
[INFO   ] [Python      ] Interpreter at "/usr/bin/python3"


## Step 1: We initialize the Experience Replay memory

In [0]:
class ReplayBuffer(object):
    """Fixed-capacity experience replay memory.

    Transitions are stored as ``(state, next_state, action, reward, done)``
    tuples. Once ``max_size`` transitions are held, every new transition
    overwrites the oldest stored one (circular buffer).
    """

    def __init__(self, max_size=1e6):
        """Create an empty buffer.

        Args:
            max_size: Maximum number of transitions kept. Float values such
                as ``1e6`` are accepted and converted to ``int``.

        Raises:
            ValueError: If ``max_size`` is smaller than 1.
        """
        # Keep the capacity as an int so the write pointer stays an exact integer index
        self.max_size = int(max_size)
        if self.max_size < 1:
            raise ValueError("max_size must be at least 1, got %r" % (max_size,))
        self.storage = []
        self.ptr = 0

    def add(self, transition):
        """Store one transition, overwriting the oldest entry once the buffer is full.

        Args:
            transition: A ``(state, next_state, action, reward, done)`` tuple.
        """
        if len(self.storage) >= self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly at random, with replacement.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            A 5-tuple of NumPy arrays ``(states, next_states, actions, rewards,
            dones)``; ``rewards`` and ``dones`` are reshaped to ``(-1, 1)``.

        Raises:
            ValueError: If the buffer holds no transitions.
        """
        if not self.storage:
            raise ValueError("cannot sample from an empty ReplayBuffer")
        ind = np.random.randint(0, len(self.storage), size=batch_size)
        batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
        for i in ind:
            state, next_state, action, reward, done = self.storage[i]
            # np.asarray only copies when it has to; np.array(..., copy=False)
            # raises under NumPy >= 2.0 whenever a copy is unavoidable
            batch_states.append(np.asarray(state))
            batch_next_states.append(np.asarray(next_state))
            batch_actions.append(np.asarray(action))
            batch_rewards.append(np.asarray(reward))
            batch_dones.append(np.asarray(done))
        return (
            np.array(batch_states),
            np.array(batch_next_states),
            np.array(batch_actions),
            np.array(batch_rewards).reshape(-1, 1),
            np.array(batch_dones).reshape(-1, 1),
        )

In [0]:
def conv2d_block(in_channels, out_channels):
    """Build a block of two unpadded 3x3 convolutions.

    Layer order: BatchNorm -> Conv -> ReLU -> BatchNorm -> Conv -> ReLU ->
    BatchNorm -> Dropout2d(0.1). Each unpadded 3x3 convolution shrinks the
    spatial size by 2, so the block shrinks height and width by 4.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by both convolutions.

    Returns:
        An ``nn.Sequential`` holding the eight layers listed above.
    """
    kernel = (3, 3)
    layers = [
        nn.BatchNorm2d(in_channels),
        nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel, padding=0),
        nn.ReLU(),
        nn.BatchNorm2d(out_channels),
        nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel, padding=0),
        nn.ReLU(),
        nn.BatchNorm2d(out_channels),
        nn.Dropout2d(0.1),
    ]
    return nn.Sequential(*layers)

def transition_block(in_channels, out_channels):
    """Build a channel-reducing, down-sampling transition block.

    Layer order: 1x1 Conv -> ReLU -> 2x2 MaxPool (stride 2) -> Dropout2d(0.1).
    The pooling halves height and width.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by the 1x1 convolution.

    Returns:
        An ``nn.Sequential`` holding the four layers listed above.
    """
    pointwise = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=(1, 1), padding=0)
    stages = (
        pointwise,
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
        nn.Dropout2d(0.1),
    )
    return nn.Sequential(*stages)

In [0]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Step 2: We build one neural network for the Actor model and one neural network for the Actor target

In [0]:
class Actor(nn.Module):
    """Convolutional policy network producing one bounded action value per input.

    The input passes through a 3x3 convolution stem, a ``conv2d_block``, a
    ``transition_block``, a second ``conv2d_block`` and a 1x1 convolution,
    is globally average-pooled to 10 features, and is mapped by two linear
    layers to a single value squashed into ``[-max_action, max_action]``.

    Args:
        state_dim: Number of input channels of the state tensor.
        action_dim: Accepted for interface compatibility; not read here, the
            output width is fixed at 1.
        max_action: Scale applied to the tanh output.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        # Stem: state_dim channels -> 16 feature maps
        stem_conv = nn.Conv2d(in_channels=state_dim, out_channels=16, kernel_size=(3, 3), padding=0)
        self.convblock1 = nn.Sequential(stem_conv, nn.ReLU())
        self.convblock2 = conv2d_block(in_channels=16, out_channels=16)
        self.transitionblock = transition_block(in_channels=16, out_channels=10)
        self.convblock3 = conv2d_block(in_channels=10, out_channels=16)
        # 1x1 convolution bringing the channel count down to the 10 features fed to fc1
        squeeze_conv = nn.Conv2d(in_channels=16, out_channels=10, kernel_size=(1, 1), padding=0)
        self.convblock4 = nn.Sequential(squeeze_conv, nn.ReLU(), nn.BatchNorm2d(10))
        self.fc1 = nn.Linear(10, 10)
        self.fc2 = nn.Linear(10, 1)
        self.max_action = max_action

    def forward(self, x):
        """Return the action for a batch of states, with shape ``(batch, 1)``."""
        features = x
        for stage in (self.convblock1, self.convblock2, self.transitionblock,
                      self.convblock3, self.convblock4):
            features = stage(features)
        # Global average pooling makes the head independent of the spatial size
        pooled = torch.flatten(F.adaptive_avg_pool2d(features, (1, 1)), 1)
        hidden = F.relu(self.fc1(pooled))
        return self.max_action * torch.tanh(self.fc2(hidden))

In [7]:
# Sanity check: build an Actor with one input channel and print its layer-by-layer summary for a 1x32x32 input
model = Actor(1,1,1).to(device)
summary(model, input_size=(1, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 30, 30]             160
              ReLU-2           [-1, 16, 30, 30]               0
       BatchNorm2d-3           [-1, 16, 30, 30]              32
            Conv2d-4           [-1, 16, 28, 28]           2,320
              ReLU-5           [-1, 16, 28, 28]               0
       BatchNorm2d-6           [-1, 16, 28, 28]              32
            Conv2d-7           [-1, 16, 26, 26]           2,320
              ReLU-8           [-1, 16, 26, 26]               0
       BatchNorm2d-9           [-1, 16, 26, 26]              32
        Dropout2d-10           [-1, 16, 26, 26]               0
           Conv2d-11           [-1, 10, 26, 26]             170
             ReLU-12           [-1, 10, 26, 26]               0
        MaxPool2d-13           [-1, 10, 13, 13]               0
        Dropout2d-14           [-1, 10,

## Step 3: We build two neural networks for the two Critic models and two neural networks for the two Critic targets

In [0]:
class Critic(nn.Module):
    """Twin Q-networks used by TD3.

    Two independent networks with the same architecture each map a
    ``(state, action)`` pair to one Q-value. Each network runs the state
    through its own convolutional stack, global-average-pools it to 10
    features, concatenates the action, and applies its own two linear layers.

    Args:
        state_dim: Accepted for interface compatibility; not read here, the
            first convolution is fixed at one input channel.
        action_dim: Accepted for interface compatibility; not read here, the
            heads expect exactly one action value (``10 + 1`` inputs).
    """

    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        # Defining the first Critic neural network
        self.convblock1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3, 3), padding=0),
            nn.ReLU())
        self.convblock2 = conv2d_block(in_channels=16, out_channels=16)
        self.transitionblock1 = transition_block(in_channels=16, out_channels=10)
        self.convblock3 = conv2d_block(in_channels=10, out_channels=16)
        self.convblock4 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=10, kernel_size=(1, 1), padding=0),
            nn.ReLU(),
            nn.BatchNorm2d(10)
        )
        # Not used by the forward pass (adaptive pooling is used instead);
        # kept so the module's attribute set is unchanged
        self.avgPool2d1 = nn.AvgPool2d(9)
        self.fc1 = nn.Linear(10+1, 10)
        self.fc2 = nn.Linear(10, 1)
        # Defining the second Critic neural network
        self.convblock5 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3, 3), padding=0),
            nn.ReLU())
        self.convblock6 = conv2d_block(in_channels=16, out_channels=16)
        self.transitionblock2 = transition_block(in_channels=16, out_channels=10)
        self.convblock7 = conv2d_block(in_channels=10, out_channels=16)
        self.convblock8 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=10, kernel_size=(1, 1), padding=0),
            nn.ReLU(),
            nn.BatchNorm2d(10)
        )
        # Not used by the forward pass; kept for the same reason as avgPool2d1
        self.avgPool2d2 = nn.AvgPool2d(9)
        self.fc3 = nn.Linear(10 + 1, 10)
        self.fc4 = nn.Linear(10, 1)

    @staticmethod
    def _q_value(x, u, conv_stack, fc_hidden, fc_out):
        """Run one critic: conv stack -> global pool -> concat action -> two linear layers."""
        for stage in conv_stack:
            x = stage(x)
        x = F.adaptive_avg_pool2d(x, (1, 1))
        x = torch.flatten(x, 1)
        # The action is appended to the pooled state features before the linear head
        x = torch.cat([x, u], 1)
        x = F.relu(fc_hidden(x))
        return fc_out(x)

    def _q2(self, x, u):
        """Q-value of the second critic, using its own stack and its own head (fc3/fc4)."""
        stack = (self.convblock5, self.convblock6, self.transitionblock2,
                 self.convblock7, self.convblock8)
        # The second critic must use fc3/fc4; reusing fc1/fc2 would tie the two
        # critics together and leave fc3/fc4 untrained
        return self._q_value(x, u, stack, self.fc3, self.fc4)

    def forward(self, x, u):
        """Return the Q-values of both critics for a batch of (state, action) pairs.

        Args:
            x: Batch of states, with one channel as required by the first convolution.
            u: Batch of actions of shape ``(batch, 1)``.

        Returns:
            Tuple ``(q1, q2)``, each of shape ``(batch, 1)``.
        """
        return self.Q1(x, u), self._q2(x, u)

    def Q1(self, x, u):
        """Return only the first critic's Q-value (used for the actor update)."""
        stack = (self.convblock1, self.convblock2, self.transitionblock1,
                 self.convblock3, self.convblock4)
        return self._q_value(x, u, stack, self.fc1, self.fc2)

## Steps 4 to 15: Training Process

In [0]:
# Building the whole Training Process into a class

class TD3(object):
    """Twin Delayed DDPG agent: an actor, twin critics, their targets and the training loop.

    Args:
        state_dim: Forwarded to the ``Actor`` and ``Critic`` constructors.
        action_dim: Forwarded to the ``Actor`` and ``Critic`` constructors.
        max_action: Largest action magnitude; used to scale the actor output
            and to clamp the smoothed target action.
    """

    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        # Targets start as exact copies of the online networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state):
        """Return the actor's action for a single state as a flat NumPy array.

        Args:
            state: A single 2-D observation; batch and channel axes are added here.

        Returns:
            1-D NumPy array holding the actor output.
        """
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0).unsqueeze(0)
        # No gradient is needed when acting, so skip building the autograd graph.
        # NOTE(review): the actor is left in training mode here (dropout and
        # batch statistics active), as before - confirm this is intended.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Run ``iterations`` TD3 updates on batches drawn from ``replay_buffer``.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns
                ``(states, next_states, actions, rewards, dones)`` arrays.
            iterations: Number of gradient updates to perform.
            batch_size: Number of transitions per update.
            discount: Discount factor applied to the target Q-value.
            tau: Polyak averaging rate for the target networks.
            policy_noise: STD of the Gaussian noise added to the target action.
            noise_clip: The target-action noise is clamped to ``[-noise_clip, noise_clip]``.
            policy_freq: The actor and the targets are updated every ``policy_freq`` iterations.
        """
        for it in range(iterations):

            # Step 4: We sample a batch of transitions (s, s', a, r) from the memory
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
            state = torch.Tensor(batch_states).to(device)
            next_state = torch.Tensor(batch_next_states).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # Add the channel axis expected by the convolutional networks
            next_state = next_state.unsqueeze(1)
            state = state.unsqueeze(1)

            # The whole target is a constant for the critic loss, so it is built without autograd
            with torch.no_grad():
                # Step 5: From the next state s', the Actor target plays the next action a'
                next_action = self.actor_target(next_state).squeeze(1)

                # Step 6: We add Gaussian noise to this next action a' and we clamp it in a range of values supported by the environment
                noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
                noise = noise.clamp(-noise_clip, noise_clip)
                next_action = (next_action + noise).clamp(-self.max_action, self.max_action).unsqueeze(1)

                # Step 7: The two Critic targets take each the couple (s', a') as input and return two Q-values Qt1(s',a') and Qt2(s',a') as outputs
                target_Q1, target_Q2 = self.critic_target(next_state, next_action)

                # Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
                target_Q = torch.min(target_Q1, target_Q2)

                # Step 9: We get the final target of the two Critic models, which is: Qt = r + gamma * min(Qt1, Qt2), where gamma is the discount factor
                target_Q = reward + (1 - done) * discount * target_Q

            # Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
            current_Q1, current_Q2 = self.critic(state, action.unsqueeze(1))

            # Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with the Adam optimizer
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 13: Once every policy_freq iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
            if it % policy_freq == 0:
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Step 14: Still once every policy_freq iterations, we update the weights of the Actor target by polyak averaging
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

                # Step 15: Still once every policy_freq iterations, we update the weights of the Critic target by polyak averaging
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    # Making a save method to save a trained model
    def save(self, filename, directory):
        """Write the actor and critic weights to ``<directory>/<filename>_actor.pth`` and ``..._critic.pth``."""
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    # Making a load method to load a pre-trained model
    def load(self, filename, directory):
        """Load actor and critic weights written by ``save`` and resynchronise the targets.

        ``map_location=device`` lets a checkpoint saved on a GPU be loaded on
        a CPU-only machine (and vice versa).
        """
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename), map_location=device))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename), map_location=device))
        # Target weights are not saved; without this they would keep their
        # unrelated initial values if training resumed after loading
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

## We set the parameters

In [0]:
env_name = "AutoCarEnv" # Name of the environment (set it to any continuous environment you want)
seed = 0 # Random seed number
start_timesteps = 1e4 # Number of initial timesteps during which actions are sampled randomly; after that the policy network chooses them
eval_freq = 5e3 # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 5e5 # Total number of iterations/timesteps (reassigned to 50000 at the top of the training cell)
save_models = True # Whether to save the trained model (also controls creation of the ./pytorch_models folder)
expl_noise = 0.1 # Exploration noise - STD value of the Gaussian noise added to the policy's action while interacting with the environment
batch_size = 100 # Size of the batch
discount = 0.99 # Discount factor gamma, applied to the target Q-value during training
tau = 0.005 # Target network update rate
policy_noise = 0.2 # STD of the Gaussian noise added to the Actor target's next action during training
noise_clip = 0.5 # The Gaussian noise added to the Actor target's next action is clamped to [-noise_clip, noise_clip]
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated

## We create a file name for the two saved models: the Actor and Critic models

In [11]:
# Base name shared by the saved Actor and Critic files
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
banner = "---------------------------------------"
print(banner)
print("Settings: %s" % (file_name))
print(banner)

---------------------------------------
Settings: TD3_AutoCarEnv_0
---------------------------------------


## We create the folders in which the results and the trained models will be saved

In [0]:
# Create the output folders. exist_ok=True makes the cell safe to re-run and
# removes the gap between checking for the folder and creating it.
os.makedirs("./results", exist_ok=True)
if save_models:
    os.makedirs("./pytorch_models", exist_ok=True)

## We create the AutoCar environment

In [0]:
# Instantiate the environment class provided by the imported autocar_env module
env = autocar_env.AutoCarEnv()

## We set seeds and we get the necessary information on the states and actions in the chosen environment

In [0]:
# Seed every random number generator available to the notebook so runs are repeatable
env.seed(seed)
random.seed(seed)  # the stdlib generator is imported by this notebook but was previously left unseeded
torch.manual_seed(seed)
np.random.seed(seed)
# Read the problem dimensions exposed by the environment
state_dim = env.state_dim
action_dim = env.action_dim
max_action = env.max_action

## We create the TD3 policy (the Actor and Critic models together with their targets)

In [0]:
# Build the TD3 agent (actor, twin critics, their targets and optimizers) from the environment's dimensions
policy = TD3(state_dim, action_dim, max_action)

## We create the Experience Replay memory

In [0]:
# Replay memory created with the class's default capacity (max_size=1e6)
replay_buffer = ReplayBuffer()

## We initialize the variables

In [0]:
# Number of environment steps taken so far across all episodes
total_timesteps = 0
# Steps since the last evaluation; incremented by the training loop but not otherwise read in the visible cells
timesteps_since_eval = 0
# Number of episodes started so far
episode_num = 0
# Start in the "done" state so the first loop iteration resets the environment
done = True
# Wall-clock start time; not read in the visible cells
t0 = time.time()

## Training

In [0]:
# NOTE: this reassignment replaces the max_timesteps value set in the parameters cell
max_timesteps = 50000
# Main loop: runs until max_timesteps environment steps have been taken
while total_timesteps < max_timesteps:

    # If the episode is done
    if done:
        # If we are not at the very beginning, we train the model for as many
        # iterations as the finished episode had timesteps
        if total_timesteps != 0:
            print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
            policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

        # When the training step is done, we reset the state of the environment
        obs = env.reset()
        # Set the Done to False
        done = False

        # Set rewards and episode timesteps to zero
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    # Before start_timesteps timesteps, we play random actions
    if total_timesteps < start_timesteps:
        action = env.sample()
    else:  # After start_timesteps timesteps, we switch to the model
        action = policy.select_action(obs)
        # If the expl_noise parameter is not 0, we add noise to the action and we clip it
        if expl_noise != 0:
            action = (action + np.random.normal(0, expl_noise, size=action_dim)).clip(env.low, env.high)
        # select_action returns a flattened 1-element array. Unwrap it whether or
        # not noise was added, so the environment and the replay buffer always
        # receive the same scalar form (previously this only happened when
        # expl_noise != 0).
        action = action[0]

    # The agent performs the action in the environment, then reaches the next state and receives the reward
    new_obs, reward, done, _ = env.step(action)

    # A step that hits the episode step limit is stored as non-terminal (0)
    done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)

    # We increase the total reward
    episode_reward += reward

    # We store the new transition into the Experience Replay memory (ReplayBuffer)
    replay_buffer.add((obs, new_obs, action, reward, done_bool))

    # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
    obs = new_obs
    episode_timesteps += 1
    total_timesteps += 1
    timesteps_since_eval += 1

    # Cut the episode short once its cumulative reward drops below -2000; the
    # transition stored above keeps the done flag it was given before this cut-off
    if episode_reward < -2000:
        done = True

Total Timesteps: 1157 Episode Num: 1 Reward: -2001.7000000000028
Total Timesteps: 1795 Episode Num: 2 Reward: -1232.4000000000005
Total Timesteps: 2963 Episode Num: 3 Reward: -2001.0000000000002
Total Timesteps: 3105 Episode Num: 4 Reward: -272.1999999999999
Total Timesteps: 3749 Episode Num: 5 Reward: -1065.6000000000008
Total Timesteps: 3813 Episode Num: 6 Reward: -136
Total Timesteps: 4834 Episode Num: 7 Reward: -2000.600000000001
Total Timesteps: 5943 Episode Num: 8 Reward: -2001.1000000000015
Total Timesteps: 7042 Episode Num: 9 Reward: -2000.7000000000014
Total Timesteps: 8109 Episode Num: 10 Reward: -2001.5000000000005
Total Timesteps: 9159 Episode Num: 11 Reward: -2001.6000000000006
Total Timesteps: 9620 Episode Num: 12 Reward: -888.7000000000005
Total Timesteps: 10648 Episode Num: 13 Reward: -2000.2000000000003
Total Timesteps: 11713 Episode Num: 14 Reward: -2000.9
Total Timesteps: 12714 Episode Num: 15 Reward: -2002
Total Timesteps: 13883 Episode Num: 16 Reward: -2001.4000000