<a href="https://colab.research.google.com/github/zoikorda/self-driving-cars/blob/main/SoftActorCritic_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rltorch
!pip install highway-env
!pip install stable-baselines==2.10.0
!pip install gym
!pip install highway-env
!pip install gym pyvirtualdisplay
!apt-get update
!apt-get install -y xvfb python-opengl ffmpeg -y

In [None]:
import torch
torch.cuda.is_available()

In [3]:
import os
import argparse
from datetime import datetime

import torch
from torch.optim import Adam
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optimizer
import numpy as np
from torch.distributions import Normal
from rltorch.network import create_linear_network
from torch.utils.tensorboard import SummaryWriter
from rltorch.memory import MultiStepMemory, PrioritizedMemory
#import wandb
from collections import deque
from random import sample, random
from torch.autograd import Variable

from dataclasses import dataclass
from typing import Any
from random import sample

import gym
import highway_env
import time

import matplotlib.pyplot as plt
import numpy as np

@dataclass
class Sarsd:
  """Plain record holding the fields of one environment transition.

  NOTE(review): no use of this class is visible in this part of the file;
  the field meanings below are inferred from their names — confirm before
  relying on them.
  """
  # Presumably the observation seen before acting.
  state: Any
  # NOTE(review): annotated as int, but the agent in this file produces
  # array-valued actions — confirm the intended type.
  action: int
  # Presumably the scalar reward returned by the environment step.
  reward: float
  # Presumably the observation returned by the environment step.
  next_state: Any
  # Presumably the done flag with time-limit terminations masked out.
  masked_done: bool
  # Presumably the raw done flag reported by the environment.
  episode_done: bool

#state, action, reward, next_state, masked_done, episode_done=done))
class ReplayBuffer():
    """Fixed-capacity circular replay buffer backed by NumPy arrays.

    Transitions are written in insertion order; once ``max_size``
    transitions have been stored, the oldest slots are overwritten.

    Args:
        max_size: Capacity of the buffer. Float values such as ``1e6`` are
            accepted and truncated to ``int``.
        input_shape: Shape of one observation vector (e.g. ``(6,)``).
        n_actions: Number of action dimensions.
    """

    def __init__(self, max_size, input_shape, n_actions):
        # NumPy shapes must be integers, so coerce float capacities (1e6).
        self.memory_size = int(max_size)
        # Total number of insertions so far (not capped at memory_size).
        self.memory_counter = 0
        self.state_memory = np.zeros((self.memory_size, *input_shape))
        self.next_state_memory = np.zeros((self.memory_size, *input_shape))
        self.action_memory = np.zeros((self.memory_size, n_actions))
        self.reward_memory = np.zeros(self.memory_size)
        # ``np.bool`` was removed in NumPy 1.24; ``np.bool_`` is the
        # supported scalar type on every NumPy version.
        self.done_memory = np.zeros(self.memory_size, dtype=np.bool_)

    def insert(self, state, action, reward, next_state, episode_done):
        """Store one transition, overwriting the oldest slot when full.

        Args:
            state: Mapping whose ``"observation"`` entry is the observation.
            action: Action vector of length ``n_actions``.
            reward: Scalar reward.
            next_state: Mapping whose ``"observation"`` entry is the next
                observation.
            episode_done: Done flag stored for this transition.
        """
        # Wrap around so that new samples overwrite the oldest ones.
        index = self.memory_counter % self.memory_size

        self.state_memory[index] = state["observation"]
        self.next_state_memory[index] = next_state["observation"]
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.done_memory[index] = episode_done

        self.memory_counter += 1

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly with replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of
            float32 CPU tensors. ``rewards`` and ``dones`` have shape
            ``(batch_size,)``.

        Raises:
            ValueError: If no transition has been stored yet.
        """
        # Only the filled part of the arrays may be sampled.
        max_memory = min(self.memory_counter, self.memory_size)
        if max_memory == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        batch = np.random.choice(max_memory, batch_size)

        def _to_float_tensor(array):
            # Explicit dtype conversion handles both float64 and bool arrays.
            return torch.as_tensor(array[batch], dtype=torch.float32)

        states = _to_float_tensor(self.state_memory)
        next_states = _to_float_tensor(self.next_state_memory)
        actions = _to_float_tensor(self.action_memory)
        rewards = _to_float_tensor(self.reward_memory)
        dones = _to_float_tensor(self.done_memory)

        return states, actions, rewards, next_states, dones

#num_steps = 2
# Module-level "parking-v0" environment, created once when this cell runs.
env = gym.make("parking-v0")

#replay_buffer = ReplayBuffer(num_steps, env.observation_space["observation"].shape, env.action_space.shape[0])


class ReplayBuffer2:
    """Bounded FIFO store of transitions with uniform sampling.

    Backed by a ``deque`` whose ``maxlen`` is ``buffer_size``, so appending
    to a full buffer silently drops the oldest entry.
    """

    def __init__(self, buffer_size=100000):
        self.buffer = deque(maxlen=buffer_size)
        self.buffer_size = buffer_size

    def append(self, sars):
        """Add one transition record to the end of the buffer."""
        self.buffer.append(sars)

    def sample(self, num_samples):
        """Return ``num_samples`` distinct stored records chosen at random."""
        stored = len(self.buffer)
        assert num_samples <= stored
        return sample(self.buffer, num_samples)

In [4]:
# Visualization
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.notebook import trange
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
from gym.wrappers import Monitor
import base64

# IO
from pathlib import Path

# Start a pyvirtualdisplay virtual display (visible=0, 1400x900).
# Presumably needed so environment rendering works on a headless
# machine such as Colab — TODO confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

def show_video(path):
    """Embed every ``*.mp4`` file found directly in ``path`` in the notebook.

    Each file is read, base64-encoded and inlined as a ``<video>`` element;
    the elements are joined with ``<br>`` and shown through IPython's
    display machinery.
    """
    template = '''<video alt="{}" autoplay 
                      loop controls style="height: 400px;">
                      <source src="data:video/mp4;base64,{}" type="video/mp4" />
                 </video>'''
    fragments = [
        template.format(
            clip, base64.b64encode(clip.read_bytes()).decode('ascii'))
        for clip in Path(path).glob("*.mp4")
    ]
    markup = "<br>".join(fragments)
    ipythondisplay.display(ipythondisplay.HTML(data=markup))

In [6]:
class BaseNetwork(nn.Module):
    """``nn.Module`` base class adding state-dict save/load helpers."""

    def save(self, path):
        """Write this module's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    def load(self, path):
        """Load a ``state_dict`` saved with :meth:`save` into this module.

        The checkpoint is first mapped to CPU so that a file written on a
        GPU machine can be read on a CPU-only one; ``load_state_dict`` then
        copies the values into this module's parameters on whatever device
        they currently live.
        """
        state = torch.load(path, map_location="cpu")
        self.load_state_dict(state)


class QNetwork(BaseNetwork):
    """Single Q-function mapping a concatenated (state, action) row to a scalar.

    The network is built by rltorch's ``create_linear_network``
    (https://github.com/ku2482/rltorch/blob/master/rltorch/network/builder.py)
    with ``num_inputs + num_actions`` inputs and one output.
    """

    def __init__(self, num_inputs, num_actions, hidden_units=[256, 256],
                 initializer='xavier'):
        super().__init__()

        input_dim = num_inputs + num_actions
        self.Q = create_linear_network(
            input_dim, 1, hidden_units=hidden_units, initializer=initializer)

    def forward(self, x):
        """Return the Q estimate for the already-concatenated input ``x``."""
        return self.Q(x)


class TwinnedQNetwork(BaseNetwork):
    """Pair of independently built Q-networks evaluated on the same input."""

    def __init__(self, num_inputs, num_actions, hidden_units=[256, 256],
                 initializer='xavier'):
        super().__init__()

        # Q1 is constructed before Q2, as in the original initialisation order.
        self.Q1 = QNetwork(num_inputs, num_actions, hidden_units, initializer)
        self.Q2 = QNetwork(num_inputs, num_actions, hidden_units, initializer)

    def forward(self, states, actions):
        """Concatenate states and actions along dim 1 and return ``(q1, q2)``."""
        joint = torch.cat([states, actions], dim=1)
        return self.Q1(joint), self.Q2(joint)


class GaussianPolicy(BaseNetwork):
    """Tanh-squashed diagonal Gaussian policy.

    The underlying network (built by rltorch's ``create_linear_network``,
    https://github.com/ku2482/rltorch/blob/master/rltorch/network/builder.py)
    outputs ``2 * num_actions`` values per state, split into a mean half and
    a log-std half.
    """

    LOG_STD_MAX = 2
    LOG_STD_MIN = -20
    eps = 1e-6

    def __init__(self, num_inputs, num_actions, hidden_units=[256, 256],
                 initializer='xavier'):
        super().__init__()

        output_dim = num_actions * 2
        self.policy = create_linear_network(
            num_inputs, output_dim, hidden_units=hidden_units,
            initializer=initializer)

    def forward(self, states):
        """Return ``(mean, log_std)`` with ``log_std`` clamped to the class bounds."""
        raw = self.policy(states)
        mean, log_std = torch.chunk(raw, 2, dim=-1)
        clamped_log_std = torch.clamp(
            log_std, min=self.LOG_STD_MIN, max=self.LOG_STD_MAX)
        return mean, clamped_log_std

    def sample(self, states):
        """Sample squashed actions for ``states``.

        Returns:
            Tuple ``(actions, entropies, tanh(means))``: reparameterised
            samples passed through ``tanh``, the negated summed log-probs
            (kept as a column via ``keepdim=True``), and the squashed means.
        """
        mu, log_sigma = self.forward(states)
        gaussian = Normal(mu, log_sigma.exp())

        # Reparameterised draw, then squash with tanh.
        pre_tanh = gaussian.rsample()
        squashed = torch.tanh(pre_tanh)

        # Log-prob of the Gaussian sample minus the tanh change-of-variable
        # term; eps keeps the log argument away from zero.
        squash_term = torch.log(1 - squashed.pow(2) + self.eps)
        log_probs = gaussian.log_prob(pre_tanh) - squash_term
        entropies = -log_probs.sum(dim=1, keepdim=True)

        return squashed, entropies, torch.tanh(mu)

In [7]:
class RunningMeanStats:
    """Mean over the most recent ``n`` appended values."""

    def __init__(self, n=10):
        # The deque's maxlen discards the oldest value once n are held.
        self.stats = deque(maxlen=n)
        self.n = n

    def append(self, x):
        """Record a new value."""
        self.stats.append(x)

    def get(self):
        """Return ``np.mean`` of the values currently held."""
        window = self.stats
        return np.mean(window)

def to_batch(state, action, reward, next_state, done, device):
    """Turn one transition into batch-of-one float tensors on ``device``.

    Returns:
        Tuple ``(state, action, reward, next_state, done)``. ``state`` and
        ``next_state`` gain a leading batch axis, ``action`` is viewed as
        ``(1, -1)``, and ``reward`` / ``done`` become ``(1, 1)`` tensors.
    """
    def _vector_row(values):
        return torch.FloatTensor(values).unsqueeze(0).to(device)

    def _scalar_row(value):
        return torch.FloatTensor([value]).unsqueeze(0).to(device)

    return (
        _vector_row(state),
        torch.FloatTensor([action]).view(1, -1).to(device),
        _scalar_row(reward),
        _vector_row(next_state),
        _scalar_row(done),
    )

def hard_update(target, source):
    """Copy every entry of ``source``'s state dict into ``target``."""
    source_state = source.state_dict()
    target.load_state_dict(source_state)

def grad_false(network):
    """Switch off gradient tracking for every parameter of ``network``."""
    for weight in network.parameters():
        weight.requires_grad_(False)

def soft_update(target, source, tau):
    """Blend ``source`` parameters into ``target`` in place.

    Each target parameter becomes ``target * (1 - tau) + source * tau``.
    Parameters are paired in the order ``parameters()`` yields them.
    """
    keep = 1.0 - tau
    for tgt, src in zip(target.parameters(), source.parameters()):
        blended = tgt.data * keep + src.data * tau
        tgt.data.copy_(blended)

def update_params(optimizer, network, loss, grad_clip=None, retain_graph=True):
    """Reset ``optimizer``'s gradients and enable autograd anomaly detection.

    NOTE(review): despite its name this function performs no backward pass
    and no optimizer step; ``network``, ``loss``, ``grad_clip`` and
    ``retain_graph`` are accepted but unused. Callers must call
    ``loss.backward()`` and ``optimizer.step()`` themselves.
    ``set_detect_anomaly(True)`` is a process-wide switch.
    """
    # The two statements are independent of each other.
    torch.autograd.set_detect_anomaly(True)
    optimizer.zero_grad()

In [11]:
class Agent():
    """Soft Actor-Critic agent for goal-style environments.

    The environment's observations are expected to be mappings with an
    ``"observation"`` entry, and its action space a continuous box.

    Args:
        env: Gym environment used for training.
        observation_shape: Shape of one observation vector. Defaults to
            ``env.observation_space["observation"].shape``.
        num_steps: Training stops once this many environment steps have
            been exceeded; also used as the replay buffer capacity.
        batch_size: Number of transitions per gradient update.
        lr: Learning rate of the actor, critic and temperature optimizers.
        hidden_units: Hidden layer sizes of every network.
        memory_size: Accepted for configuration compatibility; unused.
        gamma: Discount factor.
        tau: Interpolation factor of the target-network soft update.
        entropy_tuning: Learn the entropy temperature when true, otherwise
            use the fixed ``ent_coef``.
        ent_coef: Fixed entropy temperature (ignored with entropy tuning).
        multi_step: The discount used in targets is ``gamma ** multi_step``.
        per: Prioritized replay flag; must be false (see Raises).
        alpha, beta, beta_annealing: Prioritized replay settings; unused.
        grad_clip: Optional max gradient norm applied before each step.
        updates_per_step: Gradient updates per environment step.
        start_steps: Number of initial steps taken with random actions.
        log_interval: Window of the running mean of episode rewards.
        target_update_interval: Learning steps between target updates.
        eval_interval: Environment steps between evaluations.
        cuda: Use CUDA when it is available.
        seed: Seed for torch, NumPy and the environment.
        verbose: Accepted for configuration compatibility; unused.

    Raises:
        NotImplementedError: If ``per`` is true. The replay buffer used
            here stores no priorities, so that path could never run.
    """

    # Step count at which a termination is treated as a time-limit cut-off.
    MAX_EPISODE_STEPS = 100

    def __init__(self, env, observation_shape=None, num_steps=20000, batch_size=256,
                 lr=0.0001, hidden_units=[256, 256], memory_size=1e6,
                 gamma=0.99, tau=0.005, entropy_tuning=True, ent_coef=0.2,
                 multi_step=1, per=False, alpha=0.6, beta=0.4,
                 beta_annealing=0.0001, grad_clip=None, updates_per_step=1,
                 start_steps=1000, log_interval=10, target_update_interval=1,
                 eval_interval=1000, cuda=True, seed=0, verbose=1):
        if per:
            raise NotImplementedError(
                "prioritized experience replay is not supported by ReplayBuffer")

        self.env = env
        self.device = torch.device("cuda" if cuda and torch.cuda.is_available() else "cpu")

        # Derive the shape from the environment actually passed in rather
        # than from a module-level one bound when the class was defined.
        if observation_shape is None:
            observation_shape = env.observation_space["observation"].shape

        self.replay_buffer = ReplayBuffer(num_steps, observation_shape, self.env.action_space.shape[0])

        self.num_steps = num_steps
        self.tau = tau
        self.per = per
        self.batch_size = batch_size
        self.start_steps = start_steps
        self.gamma_n = gamma ** multi_step
        self.entropy_tuning = entropy_tuning
        self.grad_clip = grad_clip
        self.updates_per_step = updates_per_step
        self.log_interval = log_interval
        self.target_update_interval = target_update_interval
        self.eval_interval = eval_interval

        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed)
        torch.backends.cudnn.deterministic = True  # It harms a performance.
        torch.backends.cudnn.benchmark = False

        num_inputs = self.env.observation_space["observation"].shape[0]
        num_actions = self.env.action_space.shape[0]

        self.actor_model = GaussianPolicy(
            num_inputs, num_actions,
            hidden_units=hidden_units).to(self.device)
        self.critic_model = TwinnedQNetwork(
            num_inputs, num_actions,
            hidden_units=hidden_units).to(self.device)
        self.critic_target_model = TwinnedQNetwork(
            num_inputs, num_actions,
            hidden_units=hidden_units).to(self.device).eval()

        # copy parameters of the learning network to the target network
        hard_update(self.critic_target_model, self.critic_model)
        # disable gradient calculations of the target network
        grad_false(self.critic_target_model)

        # Every optimizer honours the configured learning rate.
        self.actor_optimizer = Adam(self.actor_model.parameters(), lr=lr)
        self.Q1_optimizer = Adam(self.critic_model.Q1.parameters(), lr=lr)
        self.Q2_optimizer = Adam(self.critic_model.Q2.parameters(), lr=lr)

        if entropy_tuning:
            # Target entropy is -|A|.
            self.target_entropy = -torch.prod(torch.Tensor(
                self.env.action_space.shape).to(self.device)).item()
            # We optimize log(alpha), instead of alpha.
            self.log_alpha = torch.zeros(
                1, requires_grad=True, device=self.device)
            # Detached: alpha is a constant in the actor and critic losses.
            self.alpha = self.log_alpha.exp().detach()
            self.alpha_optim = Adam([self.log_alpha], lr=lr)
        else:
            # fixed alpha
            self.alpha = torch.tensor(ent_coef).to(self.device)

        self.train_rewards = RunningMeanStats(log_interval)

        self.steps = 0
        self.learning_steps = 0
        self.episodes = 0

    def run(self, fuck):
        """Train episode by episode.

        Args:
            fuck: Starting value of the episode counter. Training stops when
                the counter reaches 1000 or when more than ``num_steps``
                environment steps have been taken.
        """
        while fuck != 1000:
            fuck += 1
            self.train_episode()
            if self.steps > self.num_steps:
                break

    def is_update(self):
        """Return whether enough data has been gathered to start learning."""
        # Compare the number of stored transitions, not the buffer capacity.
        stored = self.replay_buffer.memory_counter
        return stored >= self.batch_size and self.steps >= self.start_steps

    def act(self, state):
        """Pick a random action during warm-up, otherwise a policy sample."""
        if self.start_steps > self.steps:
            action = self.env.action_space.sample()
        else:
            action = self.explore(state)
        return action

    def explore(self, state):
        """Return a stochastic action for ``state`` as a flat NumPy array."""
        state = torch.FloatTensor(state["observation"]).unsqueeze(0).to(self.device)
        with torch.no_grad():
            action, _, _ = self.actor_model.sample(state)
        return action.cpu().numpy().reshape(-1)

    def exploit(self, state):
        """Return the deterministic (squashed mean) action for ``state``."""
        state = torch.FloatTensor(state["observation"]).unsqueeze(0).to(self.device)
        with torch.no_grad():
            _, _, action = self.actor_model.sample(state)
        return action.cpu().numpy().reshape(-1)

    def calc_current_q(self, states, actions, rewards, next_states, dones):
        """Return both critics' estimates for the batch's (state, action) pairs."""
        curr_q1, curr_q2 = self.critic_model(states, actions)
        return curr_q1, curr_q2

    def calc_target_q(self, states, actions, rewards, next_states, dones):
        """Return the soft Bellman target as a ``(batch, 1)`` tensor.

        ``rewards`` and ``dones`` may be ``(batch,)`` or ``(batch, 1)``.
        """
        # Column shape keeps every term aligned with the critics' (B, 1)
        # outputs; otherwise broadcasting would mix samples.
        rewards = rewards.reshape(-1, 1)
        dones = dones.reshape(-1, 1)
        with torch.no_grad():
            next_actions, next_entropies, _ = self.actor_model.sample(next_states)
            next_q1, next_q2 = self.critic_target_model(next_states, next_actions)
            next_q = torch.min(next_q1, next_q2) + self.alpha * next_entropies
            # Element-wise mask: finished transitions do not bootstrap.
            target_q = rewards + (1.0 - dones) * self.gamma_n * next_q
        return target_q

    def calculate_critic_loss(self, batch, weights):
        """Return ``(q1_loss, q2_loss, td_errors, mean_q1, mean_q2)``."""
        current_q1, current_q2 = self.calc_current_q(*batch)
        target_q = self.calc_target_q(*batch)

        # Absolute TD errors of the first critic.
        errors = torch.abs(current_q1.detach() - target_q)
        # We log means of Q to monitor training.
        mean_q1 = current_q1.detach().mean().item()
        mean_q2 = current_q2.detach().mean().item()

        # Critic loss is the weighted mean squared TD error.
        q1_loss = torch.mean((current_q1 - target_q) ** 2 * weights)
        q2_loss = torch.mean((current_q2 - target_q) ** 2 * weights)
        return q1_loss, q2_loss, errors, mean_q1, mean_q2

    def calculate_actor_loss(self, batch, weights):
        """Return ``(actor_loss, entropies)`` for the batch's states."""
        states, actions, rewards, next_states, dones = batch

        # We re-sample actions to calculate expectations of Q.
        sampled_action, entropy, _ = self.actor_model.sample(states)

        # expectations of Q with clipped double Q technique
        q1, q2 = self.critic_model(states, sampled_action)
        q = torch.min(q1, q2)

        # The temperature is a constant here; it has its own loss.
        alpha = self.alpha.detach()
        # Policy objective is maximization of (Q + alpha * entropy).
        actor_loss = torch.mean((- q - alpha * entropy) * weights)
        return actor_loss, entropy

    def train_episode(self):
        """Run one training episode, learning and evaluating along the way."""
        self.episodes += 1
        episode_reward = 0
        episode_steps = 0
        done = False
        state = self.env.reset()

        while not done:
            action = self.act(state)
            next_state, reward, done, _ = self.env.step(action)
            self.steps += 1
            episode_steps += 1
            episode_reward += reward

            # ignore done if the agent reach time horizons
            # (set done=True only when the agent fails)
            if episode_steps >= self.MAX_EPISODE_STEPS:
                masked_done = False
            else:
                masked_done = done

            # Store the masked flag so that a time-limit cut-off still
            # bootstraps from the next state in the target computation.
            self.replay_buffer.insert(state, action, reward, next_state, episode_done=masked_done)

            if self.is_update():
                for _ in range(self.updates_per_step):
                    self.learn()

            if self.steps % self.eval_interval == 0:
                self.evaluate()

            state = next_state

        # We log running mean of training rewards.
        self.train_rewards.append(episode_reward)
        print(self.train_rewards.get(), self.steps)

        print(f'episode: {self.episodes:<4}  '
              f'episode steps: {episode_steps:<4}  '
              f'reward: {episode_reward:<5.1f}')

    def evaluate(self):
        """Run 10 deterministic episodes on a separate recorded environment.

        A fresh environment is used so that the training environment's
        current episode is left untouched and the Monitor wrapper actually
        records what is being evaluated.
        """
        print("NOW EVALUATE")
        episodes = 10
        returns = np.zeros((episodes,), dtype=np.float32)

        env = Monitor(gym.make("parking-v0"), './video', force=True,
                      video_callable=lambda episode: True)
        try:
            for i in range(episodes):
                state = env.reset()
                episode_reward = 0.
                done = False
                while not done:
                    action = self.exploit(state)
                    state, reward, done, _ = env.step(action)
                    episode_reward += reward
                returns[i] = episode_reward
        finally:
            # Closing flushes the recorded videos before they are displayed.
            env.close()

        mean_return = np.mean(returns)

        print('-' * 60)
        print(f'Num steps: {self.steps:<5}  '
              f'reward: {mean_return:<5.1f}')
        print('-' * 60)

        show_video('./video')

    def _optimize(self, optim, network, loss):
        """Zero gradients, back-propagate ``loss``, optionally clip, and step."""
        optim.zero_grad()
        loss.backward()
        if self.grad_clip is not None and network is not None:
            torch.nn.utils.clip_grad_norm_(network.parameters(), self.grad_clip)
        optim.step()

    def learn(self):
        """Perform one update of the critics, the actor and the temperature."""
        self.learning_steps += 1
        if self.learning_steps % self.target_update_interval == 0:
            soft_update(self.critic_target_model, self.critic_model, self.tau)

        # The buffer returns CPU tensors; the networks may live on the GPU.
        batch = tuple(t.to(self.device) for t in self.replay_buffer.sample(self.batch_size))
        weights = 1.

        # Critics first, so the actor loss below is computed against the
        # updated critics and its gradients never reach a critic step.
        q1_loss, q2_loss, errors, mean_q1, mean_q2 = self.calculate_critic_loss(batch, weights)
        self._optimize(self.Q1_optimizer, self.critic_model.Q1, q1_loss)
        self._optimize(self.Q2_optimizer, self.critic_model.Q2, q2_loss)

        actor_loss, entropies = self.calculate_actor_loss(batch, weights)
        self._optimize(self.actor_optimizer, self.actor_model, actor_loss)

        if self.entropy_tuning:
            entropy_loss = -torch.mean(self.log_alpha * (self.target_entropy - entropies).detach() * weights)
            self._optimize(self.alpha_optim, None, entropy_loss)
            self.alpha = self.log_alpha.exp().detach()

In [None]:
def run():
    """Train an agent on ``parking-v0``, then roll out and plot its returns.

    After training, the deterministic policy is stepped 10000 times on the
    agent's environment; an episode's return is recorded whenever the
    environment reports ``done`` or a truthy ``is_success`` in ``info``.
    """
    # You can define configs in the external json or yaml file.
    configs = dict(
        num_steps=20000,
        batch_size=256,
        lr=0.0003,
        hidden_units=[256, 256],
        memory_size=1e6,
        gamma=0.99,
        tau=0.005,
        entropy_tuning=True,
        ent_coef=0.2,  # It's ignored when entropy_tuning=True.
        multi_step=1,
        per=False,  # prioritized experience replay
        alpha=0.6,  # It's ignored when per=False.
        beta=0.4,  # It's ignored when per=False.
        beta_annealing=0.0001,  # It's ignored when per=False.
        grad_clip=None,
        updates_per_step=1,
        start_steps=10000,
        log_interval=10,
        target_update_interval=1,
        eval_interval=10000,
        seed=0,
    )

    training_env = gym.make("parking-v0")
    agent = Agent(env=training_env, **configs)

    first_episode = 0
    agent.run(first_episode)

    # Evaluate the agent
    obs = agent.env.reset()
    done = False
    rewards_total = []
    episode_reward = 0
    for _ in range(10000):
        with torch.no_grad():
            action = agent.exploit(obs)
        obs, reward, done, info = agent.env.step(action)
        episode_reward += reward

        succeeded = info.get('is_success', False)
        if not (done or succeeded):
            continue

        print("Reward:", episode_reward, "Success?", info.get('is_success', False))
        rewards_total.append(episode_reward)
        episode_reward = 0.0
        obs = agent.env.reset()

    print(rewards_total)

    # plot the scores
    figure = plt.figure()
    axes = figure.add_subplot(111)
    episode_numbers = np.arange(1, len(rewards_total) + 1)
    plt.plot(episode_numbers, rewards_total)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()

 
if __name__ == '__main__':
    run()