# Advantage Actor-Critic


In [1]:
import sklearn.preprocessing
import numpy as np
import random
import time
import gym
import csv
import os
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import torch
from collections import namedtuple, deque
import random
import sys, time
import argparse
import IPython

In [2]:
class NoiseProcess:
    def __init__(self, action_space, theta,sigma,decay,min_sigma):
        action_shape     = action_space.shape
        self.theta       = theta
        self.sigma       = sigma
        self.sigma_decay = decay
        self.min_sigma   = min_sigma

        self.dt = 0.01

        self.prev_x = np.zeros(action_shape)
        self.mean   = np.zeros(action_shape)

    def sample(self):
        x = self.prev_x + self.theta * self.dt * (self.mean - self.prev_x) + \
            self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)

        self.prev_x = x
        return x

    def decay(self):
        self.sigma = max(self.min_sigma, self.sigma - self.sigma_decay)

class NormalNoiseProcess:
    def __init__(self, action_space, var, decay, min_sigma):
        action_shape = action_space.shape

        self.mean   = np.zeros(action_shape)
        self.sigma = var
        self.sigma_decay = decay
        self.min_sigma = min_sigma

    def sample(self):
        return np.random.normal(loc = self.mean, scale=self.sigma, size=self.mean.shape)

    def decay(self):
        self.sigma = max(self.min_sigma, self.sigma - self.sigma_decay)

In [3]:
Sequence = namedtuple("Sequence", \
                ["state", "action", "reward", "next_state", "done"])

class Memory:
    def __init__(self, size):
        self.size = size
        self.data = deque(maxlen=size)
        self.max_entry = 0

    def push(self, sequence):
        self.data.append(sequence)
        self.max_entry = len(self.data)

    def sample(self, num_samples):
        samples = random.sample(self.data, num_samples)

        # convert to single sequence of samples for batch processing
        s, a, r, s1, d = [], [], [], [], []
        for sample in samples:
            s.append(sample.state)
            a.append(sample.action)
            r.append([sample.reward])
            s1.append(sample.next_state)
            d.append([sample.done])

        return Sequence(torch.tensor(s).float(),
                        torch.tensor(a).float(),
                        torch.tensor(r).float(),
                        torch.tensor(s1).float(),
                        torch.tensor(d))

In [4]:
class Actor(torch.nn.Module):
    def __init__(self, obs_size, action_space, l1_size=400, l2_size=300):
        super(Actor, self).__init__()
        self.action_space = action_space

        self.layer1 = torch.nn.Linear(obs_size, l1_size)
        self.layer2 = torch.nn.Linear(l1_size, l2_size)
        self.layer3 = torch.nn.Linear(l2_size, action_space.shape[0])

        # Initialization and batch norm ideas from
        # https://github.com/philtabor/Youtube-Code-Repository/tree/master/ReinforcementLearning/PolicyGradient/DDPG/lunar-lander/pytorch
        f1 = 1./np.sqrt(self.layer1.weight.data.size()[0])
        nn.init.uniform_(self.layer1.weight.data, -f1, f1)
        nn.init.uniform_(self.layer1.bias.data, -f1, f1)

        f2 = 1./np.sqrt(self.layer2.weight.data.size()[0])
        nn.init.uniform_(self.layer2.weight.data, -f2, f2)
        nn.init.uniform_(self.layer2.bias.data, -f2, f2)

        f3 = 0.003 # specified in the paper
        nn.init.uniform_(self.layer3.weight.data, -f3, f3)
        nn.init.uniform_(self.layer3.bias.data, -f3, f3)

        self.action_space = action_space

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x) # Don't use relu on last layer!

        x = torch.tanh(x) * torch.from_numpy(self.action_space.high).float()
        return x

    def take_action(self, state, added_noise=None):
        state_x = torch.from_numpy(state).float()
        action = self.forward(state_x).detach().numpy()

        if added_noise is not None:
            action += added_noise

        return action.clip(min=self.action_space.low, max=self.action_space.high) # TODO: clip action?


class Critic(torch.nn.Module):
    def __init__(self, obs_size, action_size, l1_size=400, l2_size=300):
        super(Critic, self).__init__()
        self.layer1 = torch.nn.Linear(obs_size, l1_size)
        self.layer2 = torch.nn.Linear(l1_size+action_size, l2_size)
        self.layer3 = torch.nn.Linear(l2_size, 1)

        # Initialization and batch norm ideas from
        # https://github.com/philtabor/Youtube-Code-Repository/tree/master/ReinforcementLearning/PolicyGradient/DDPG/lunar-lander/pytorch
        f1 = 1./np.sqrt(self.layer1.weight.data.size()[0])
        nn.init.uniform_(self.layer1.weight.data, -f1, f1)
        nn.init.uniform_(self.layer1.bias.data, -f1, f1)

        f2 = 1./np.sqrt(self.layer2.weight.data.size()[0])
        nn.init.uniform_(self.layer2.weight.data, -f2, f2)
        nn.init.uniform_(self.layer2.bias.data, -f2, f2)

        f3 = 0.0003 # specified in the paper
        nn.init.uniform_(self.layer3.weight.data, -f3, f3)
        nn.init.uniform_(self.layer3.bias.data, -f3, f3)


    def forward(self, x, a):
        layer1_out = self.layer1(x)
        layer1_bn = F.relu(layer1_out)

        layer2_out = self.layer2(torch.cat([layer1_bn, a], dim=1))
        layer2_bn = F.relu(layer2_out)

        q_value = self.layer3(layer2_bn)

        return q_value

In [8]:
class DDPG:
    def __init__(self, opt):
        self.params = opt
        self.start_time = time.time()
        self.training_timesteps = 0
        self.last_mean = 1E6
        self.last_var = 1E6
        self.update_params()
        self.reset()

    def update_params(self):
        # self.parameters = {
        #     "Environment Name"            : self.params['env_name'],
        #     "MAX_EPISODES"                : self.params['max_episodes'],
        #     "MEM_SIZE"                    : self.params['mem_size'],
        #     "MEMORY_MIN"                  : self.params['mem_min'],
        #     "BATCH_SIZE"                  : self.params['batch_size'],
        #     "GAMMA"                       : self.params['gamma'],
        #     "TAU"                         : self.params['tau'],
        #     "LEARNING_RATE_ACTOR"         : self.params['lr_actor'],
        #     "LEARNING_RATE_CRITIC"        : self.params['lr_critic'],
        #     "NOISE_TYPE"                  : self.params['noise_type'],
        #     "OU_NOISE_THETA"              : self.params['ou_noise_theta'],
        #     "OU_NOISE_SIGMA"              : self.params['ou_noise_sigma'],
        #     "NORMAL_VAR"                  : self.params['normal_noise_var'],
        #     "NORMAL_DECAY"                : self.params['normal_noise_decay'],
        #     "MIN_NORMAL_VAR"              : self.params['min_normal_noise'],
        #     "start time"                  : self.start_time,
        #     "L1_SIZE"                     : self.params['l1_size'],
        #     "L2_SIZE"                     : self.params['l2_size'],
        #     "OU_NOISE_SIGMA_DECAY_PER_EPS": self.params['ou_noise_decay'],
        #     "MIN_OU_NOISE_SIGMA"          : self.params['min_ou_noise_sigma'],
        #     "Save Freq"                   : self.params['save_freq'],
        #     "Print Freq"                  : self.params['print_freq'],
        #     "Save Actor Freq"             : self.params['save_actor_freq'],
        #     "LastMeanError"               : self.last_mean,
        #     "LastVarError"                : self.last_var,
        #     "Training Timesteps"          : self.training_timesteps,
        #     }
        

        self.parameters = {
            "Environment Name"            : self.params.env_name,
            "MAX_EPISODES"                : self.params.max_episodes,
            "MEM_SIZE"                    : self.params.mem_size,
            "MEMORY_MIN"                  : self.params.mem_min,
            "BATCH_SIZE"                  : self.params.batch_size,
            "GAMMA"                       : self.params.gamma,
            "TAU"                         : self.params.tau,
            "LEARNING_RATE_ACTOR"         : self.params.lr_actor,
            "LEARNING_RATE_CRITIC"        : self.params.lr_critic,
            "NOISE_TYPE"                  : self.params.noise_type,
            "OU_NOISE_THETA"              : self.params.ou_noise_theta,
            "OU_NOISE_SIGMA"              : self.params.ou_noise_sigma,
            "NORMAL_VAR"                  : self.params.normal_noise_var,
            "NORMAL_DECAY"                : self.params.normal_noise_decay,
            "MIN_NORMAL_VAR"              : self.params.min_normal_noise,
            "start time"                  : self.start_time,
            "L1_SIZE"                     : self.params.l1_size,
            "L2_SIZE"                     : self.params.l2_size,
            "OU_NOISE_SIGMA_DECAY_PER_EPS": self.params.ou_noise_decay,
            "MIN_OU_NOISE_SIGMA"          : self.params.min_ou_noise_sigma,
            "Save Freq"                   : self.params.save_freq,
            "Print Freq"                  : self.params.print_freq,
            "Save Actor Freq"             : self.params.save_actor_freq,
            "LastMeanError"               : self.last_mean,
            "LastVarError"                : self.last_var,
            "Training Timesteps"          : self.training_timesteps,
            }


    def reset(self):
        self.envname = self.parameters["Environment Name"]
        self.env = gym.make(self.parameters["Environment Name"])
        self.env.reset()

        t = time.localtime()
        if not self.params.load_from:
            self.name_suffix = "_" + self.env.spec.id[0:3] +"_"+ str(t.tm_mon) + "_" + str(t.tm_mday) + "_" + \
                    str(t.tm_hour) + "_" + str(t.tm_min)
        else:
            self.name_suffix = self.params.load_from

        obs_size    = self.env.observation_space.shape[0]
        action_size = self.env.action_space.shape[0]

        self.actor        = Actor(obs_size, self.env.action_space, self.params.l1_size, self.params.l2_size)
        self.critic       = Critic(obs_size, action_size, self.params.l1_size, self.params.l2_size)

        self.target_actor = Actor(obs_size, self.env.action_space, self.params.l1_size, self.params.l2_size)
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic= Critic(obs_size, action_size, self.params.l1_size, self.params.l2_size)
        self.target_critic.load_state_dict(self.critic.state_dict())

        self.actor_optimizer = optim.Adam(self.actor.parameters(), self.params.lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), self.params.lr_critic, weight_decay=0.01)

        self.memory = Memory(self.params.mem_size)

        self.start_time = time.time()
        self.solved = None
        self.training_timesteps = 0

        if self.params.noise_type == "ou":
            self.noise = NoiseProcess(self.env.action_space,
                                        self.params.ou_noise_theta,
                                        self.params.ou_noise_sigma,
                                        self.params.ou_noise_decay,
                                        self.params.min_ou_noise_sigma)
        elif self.params.noise_type == "normal":
            self.noise = NormalNoiseProcess(self.env.action_space,
                                             self.params.normal_noise_var,
                                             self.params.normal_noise_decay,
                                             self.params.min_normal_noise)
        else:
            raise("Invalid noise type provided")

        self.folder_name = self.params.exp_name + self.name_suffix

    def fill_memory(self):
        fill_steps = 0
        while fill_steps < self.params.mem_min:
            state = self.env.reset()
            done = False

            ep_steps = 0
            while not done and ep_steps < self.env._max_episode_steps:
                ep_steps += 1
                noise_to_add = self.noise.sample()
                action = self.actor.take_action(state, noise_to_add)
                next_state, reward, done, _ = self.env.step(action)

                self.memory.push( \
                    Sequence(state, action, reward, next_state, done))

                state = next_state
                fill_steps += 1

    def train(self):
        print("Starting job: \n", self.parameters)

        training_episode_rewards = []
        test_episode_rewards = {"mean":[], "var":[]}
        actor_loss, critic_loss = torch.tensor(1E6), torch.tensor(1E6)

        self.fill_memory()
        self.training_timesteps = 0

        for episode_num in range(self.params.max_episodes):
            state = self.env.reset()
            done = False
            step_scores = []

            ep_steps = 0
            while not done and ep_steps < self.env._max_episode_steps:
                ep_steps += 1
                self.training_timesteps += 1
                noise_to_add = self.noise.sample()
                action = self.actor.take_action(state, noise_to_add)
                next_state, reward, done, _ = self.env.step(action)

                step_scores.append(float(reward))

                self.memory.push( \
                    Sequence(state, action, reward, next_state, done))

                state = next_state

                if self.memory.max_entry > self.params.mem_min:
                    actor_loss, critic_loss = self.update_networks()

            training_episode_rewards.append(sum(step_scores))
            self.noise.decay()

            print("Episode: ", episode_num, " / ", self.params.max_episodes,
                  " | Score: ", np.array(sum(step_scores)).round(4))

            if episode_num % self.params.print_freq == 0:
                average_episode_score = sum(training_episode_rewards[-self.params.print_freq:])/float(self.params.print_freq)
                print("\nEpisode: ", episode_num, " / ", self.params.max_episodes,
                      " | Avg Score: ",
                      np.array(average_episode_score).round(4),
                      " | Elapsed time [s]: ",
                      round((time.time() - self.start_time), 2),
                      )
                print("Actor loss: ", actor_loss.detach().numpy().round(4).item(),
                        "critic_loss: ", critic_loss.detach().numpy().round(4).item())

            if episode_num % self.params.save_freq == 0:
                print("\nAverage metric at iteration ", episode_num)
                average, variance = self.compute_average_metric()
                test_episode_rewards["mean"].append(average)
                test_episode_rewards["var"].append(variance)

                if episode_num%self.params.save_actor_freq == 0:
                    self.save_experiment("eps_"+str(episode_num) + "_of_"+str(self.params.max_episodes),
                                                                            training_episode_rewards,
                                                                            test_episode_rewards,
                                                                            save_actor=True)
                else:
                    self.save_experiment("eps_"+str(episode_num) + "_of_"+str(self.params.max_episodes))

                self.check_if_solved(average, episode_num)


        print("Finished training. Training time: ",
                    round((time.time() - self.start_time), 2) )
        print("Episode Scores: \n", training_episode_rewards)
        self.env.close()
        self.save_experiment("eps_"+str(self.params.max_episodes) + "_of_"+str(self.params.max_episodes),
                                                                training_episode_rewards,
                                                                test_episode_rewards,
                                                                save_actor=True, save_critic=True, )

        return True

    def check_if_solved(self, average, episode_num):
        solved = True
        if "mountaincar" in self.env.spec.id.lower() and average > 90.0:
            solved = True

        elif "lunarlander" in self.env.spec.id.lower() and average > 200.0:
            solved = True

        elif "bipedal" in self.env.spec.id.lower() and average > 300.0:
            solved = True

        if solved and self.solved is None:
            self.solved = episode_num
            print(self.env.spec.id, "solved after ", self.solved)

        return solved

    # mini-batch sample and update networks
    def update_networks(self):
        batch = self.memory.sample(self.params.batch_size)

        with torch.no_grad(): # Don't need gradient for target networks
            target_q = batch.reward + self.params.gamma * torch.mul(\
                                self.target_critic(batch.next_state,
                                    self.target_actor(batch.next_state)), (~batch.done).float()).detach()

        critic_q = self.critic(batch.state, batch.action)
        critic_loss = F.mse_loss(critic_q, target_q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = -self.critic(batch.state, self.actor(batch.state)).mean() # gradient ascent for highest Q value

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # soft update
        for param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()):
            target_param.data.copy_(self.params.tau * param.data + (1 - self.params.tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(), self.target_actor.parameters()):
            target_param.data.copy_(self.params.tau * param.data + (1 - self.params.tau) * target_param.data)

        return actor_loss, critic_loss

    def compute_average_metric(self):
        num_to_test = 25
        rewards = np.zeros(num_to_test)

        for demo_ind in range(num_to_test):
            rewards[demo_ind] = self.demonstrate()

        print("Evaluation over ", num_to_test, "episodes.\n\t",
                                " Mean: ", rewards.mean(),
                                " | Variance: ", rewards.var())
        self.last_mean = rewards.mean()
        self.last_var = rewards.var()

        return rewards.mean(), rewards.var()

    def demonstrate(self):
        state = self.env.reset()
        done = False
        rewards = 0.0

        ep_steps = 0
        while not done and ep_steps < self.env._max_episode_steps:
            ep_steps += 1

            action = self.actor.take_action(state, None)
            next_state, reward, done, _ = self.env.step(action)
            rewards += reward

            state = next_state

        self.env.reset()
        return rewards

    def save_experiment(self, experiment_name,
                            training_episode_rewards=None,
                            test_episode_rewards=None,
                            save_actor=False,
                            save_critic=False):

        self.update_params()
        experiment_name = experiment_name + "_" + self.params.exp_name + self.name_suffix

        if self.folder_name not in os.listdir("experiments/"):
            os.mkdir("experiments/" + self.folder_name)
            print("made directory: ")
        save_location = "experiments/" + self.folder_name + "/" + experiment_name

        if save_actor:
            torch.save(self.actor.state_dict(), save_location + "actor")
        if save_critic:
            torch.save(self.critic.state_dict(), save_location + "critic")

        with open(save_location  + ".csv", "w") as file:
            w = csv.writer(file)
            for key, val in self.parameters.items():
                w.writerow([key, val, "\n"])

        if training_episode_rewards is not None:
            np.save(save_location + "_train_rewards", training_episode_rewards)

        if test_episode_rewards is not None:
            np.save(save_location + "_test_rewards_mean", test_episode_rewards["mean"])
            np.save(save_location + "_test_rewards_var", test_episode_rewards["var"])

In [11]:
class Struct:
    def __init__(self, **entries):
        self.__dict__.update(entries)

if __name__ == "__main__":
    opt = {
            "env_name": 'MountainCarContinuous-v0',
            "exp_name": 'MountainCar',
            "max_episodes": 25000,
            "mem_size": 100000,
            "mem_min": 1000,
            "batch_size": 64,
            "gamma": 0.99,
            "tau": 0.001,
            "lr_actor": 1e-4,
            "lr_critic": 1e-3,
            "l1_size": 400,
            "l2_size": 300,
            "noise_type": 'ou',
            "ou_noise_theta": 0.15,
            "ou_noise_sigma": 0.2,
            "normal_noise_var": 0.2,
            "normal_noise_decay": 0.,
            "min_normal_noise": 0.2,
            "ou_noise_decay": 0.,
            "min_ou_noise_sigma": 0.15,
            "save_freq": 10.,
            "print_freq": 50,
            "save_actor_freq": 500,
            'load_from': False
            }

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device is ", device)

    ddpg = DDPG(Struct(**opt))
    ddpg.train()

    # if not opt.load_from:
    #     ddpg.train()

    # else:
    #     # Use this to save or load networks. Assumes you are loading from experiments/ subdirectory.
    #     # Example Usage:
    #     # $ python ddpg.py --load_from quicklunarlander/finished_quick0_quick_Lun_11_28_15_27
    #     ddpg.load_experiment()
    #     IPython.embed()

Device is  cpu
Starting job: 
 {'Environment Name': 'MountainCarContinuous-v0', 'MAX_EPISODES': 25000, 'MEM_SIZE': 100000, 'MEMORY_MIN': 1000, 'BATCH_SIZE': 64, 'GAMMA': 0.99, 'TAU': 0.001, 'LEARNING_RATE_ACTOR': 0.0001, 'LEARNING_RATE_CRITIC': 0.001, 'NOISE_TYPE': 'ou', 'OU_NOISE_THETA': 0.15, 'OU_NOISE_SIGMA': 0.2, 'NORMAL_VAR': 0.2, 'NORMAL_DECAY': 0.0, 'MIN_NORMAL_VAR': 0.2, 'start time': 1686133551.9442987, 'L1_SIZE': 400, 'L2_SIZE': 300, 'OU_NOISE_SIGMA_DECAY_PER_EPS': 0.0, 'MIN_OU_NOISE_SIGMA': 0.15, 'Save Freq': 10.0, 'Print Freq': 50, 'Save Actor Freq': 500, 'LastMeanError': 1000000.0, 'LastVarError': 1000000.0, 'Training Timesteps': 0}
Episode:  0  /  25000  | Score:  -3.05

Episode:  0  /  25000  | Avg Score:  -0.061  | Elapsed time [s]:  57.73
Actor loss:  0.014299999922513962 critic_loss:  9.999999747378752e-05

Average metric at iteration  0
Evaluation over  25 episodes.
	  Mean:  -0.10855104856245643  | Variance:  4.5942089073653236e-10
made directory: 
MountainCarContin