<a href="https://colab.research.google.com/github/zoikorda/self-driving-cars/blob/main/DQN_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install highway-env
!pip install stable-baselines==2.10.0
!pip install gym
!pip install highway-env
!pip install gym pyvirtualdisplay
!apt-get update
!apt-get install -y xvfb python-opengl ffmpeg -y

In [None]:
!pip install --user git+https://github.com/eleurent/rl-agents

In [3]:
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optimizer
import numpy as np
import random
#import rl-agents
#import wandb
from collections import deque
#from rl_agents.configuration import Configurable
from random import random
from torch.autograd import Variable

from dataclasses import dataclass
from typing import Any
from random import sample

#from rl_agents.agents.common.memory import Transition

import gym
import time
import matplotlib.pyplot as plt

In [4]:
from collections import namedtuple
# One replay-buffer record: the state acted in, the action taken, the reward
# received, the state reached, and whether the episode ended on that step.
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'reward', 'next_state', 'done'],
)

In [14]:
class Model(nn.Module):
  """
  Small fully-connected network with one hidden layer of 256 ReLU units.
  It maps a flat observation vector to `num_actions` output values and owns
  the Adam optimizer (lr=1e-4) that updates its parameters.
  :param observation_shape: shape of one observation; must have exactly one dimension
  :param num_actions: size of the output layer
  """

  def __init__(self, observation_shape, num_actions):
    super().__init__()
    assert len(observation_shape) == 1
    self.observation_shape = observation_shape
    self.num_actions = num_actions

    input_size = observation_shape[0]
    layers = [
      nn.Linear(input_size, 256),
      nn.ReLU(),
      nn.Linear(256, num_actions),
    ]
    self.net = nn.Sequential(*layers)

    # The optimizer lives on the model so training code can reach it as model.optimizer.
    self.optimizer = optimizer.Adam(self.net.parameters(), lr=1e-4)

  def forward(self, x):
    """Return the network output for input `x`."""
    outputs = self.net(x)
    return outputs

In [15]:
class ReplayBuffer2:
  """
  Fixed-capacity ring buffer that stores and samples transitions.
  Once `buffer_size` entries are stored, new transitions overwrite the oldest ones.
  :param buffer_size: maximum number of transitions kept
  :param transition_type: callable used to build the stored record from the insert() arguments
  """
  def __init__(self, buffer_size=100000, transition_type=Transition):
    self.buffer_size = buffer_size
    self.transition_type = transition_type
    self.buffer = []
    self.position = 0  # index of the slot the next insert() writes to

  def insert(self, *args):
    """
    Saves a transition
    :param args: the fields of the transition, passed as-is to `transition_type`
    """
    if len(self.buffer) < self.buffer_size:
      # Still filling up: grow by one slot and write into it.
      self.buffer.append(None)
      self.position = len(self.buffer) - 1
    elif len(self.buffer) > self.buffer_size:
      # The list is longer than the capacity (e.g. buffer_size was lowered): trim it.
      self.buffer = self.buffer[:self.buffer_size]
    self.buffer[self.position] = self.transition_type(*args)
    self.position = (self.position + 1) % self.buffer_size

  def sample(self, batch_size, num_steps=1, collapsed=True):
    """
    Samples a batch of transitions
    If num_steps>1 the batch will be composed of lists of successive transitions.
    :param collapsed: whether successive transitions must be collapsed into one n-step transition.
    :param batch_size: Minibatch size for each gradient update
    :param num_steps: number of successive transitions taken from each sampled start index
    :return: a list with `batch_size` entries (transitions, lists of transitions, or collapsed tuples)
    """
    assert batch_size <= len(self.buffer)
    # NB: `sample` below is the module-level random.sample, not this method.
    if num_steps == 1:
      return sample(self.buffer, batch_size)

    indexes = sample(range(len(self.buffer)), batch_size) # sample initial transition indexes
    # Slices that start near the end of the list are simply shorter than num_steps.
    # NOTE(review): once the ring buffer has wrapped, a slice that crosses `position`
    # mixes the newest and the oldest transitions — confirm whether that matters here.
    all_transitions = [self.buffer[i:i+num_steps] for i in indexes]
    if not collapsed:
      return all_transitions
    # Build a real list (not a lazy one-shot `map`) so every branch returns the same kind
    # of object and the batch can be measured, indexed and iterated more than once.
    return [self.collapse_n_steps(transitions) for transitions in all_transitions]

  def collapse_n_steps(self, transitions):
    """
    Collapse n transitions <s,a,r,s',d> of a trajectory into one transition <s0, a0, Sum(r), sp, dp>.
    We start from the initial state, perform the first action, and then the return estimate is formed by
    accumulating the discounted rewards along the trajectory until a terminal state or the end of the
    trajectory is reached.
    The discount factor is read from the module-level `gamma`.
    :param transitions: A non-empty list of n successive transitions
    :return: a plain tuple (state, action, cumulated_reward, next_state, done)
    """
    state, action, cumulated_reward, next_state, done = transitions[0]
    discount = 1
    for transition in transitions[1:]:
      if done:
        # The trajectory ended; later transitions belong to another episode.
        break
      _, _, reward, next_state, done = transition
      discount *= gamma
      cumulated_reward += discount*reward
    return state, action, cumulated_reward, next_state, done

In [16]:
def train1(model, batch, target, num_actions):
  """
  Run one gradient update of `model` on a minibatch and return the loss.
  The 1-step TD target is reward + gamma * max_a target(next_state)[a], where the
  bootstrap term is dropped for transitions that ended their episode. `gamma` is
  the module-level discount factor.
  :param model: online network; its `optimizer` attribute is used for the update
  :param batch: a Transition holding either per-sample sequences (states as dicts
                with an "observation" entry) or already-stacked tensors
  :param target: target network used to evaluate the next states
  :param num_actions: unused; kept so existing call sites keep working
  :return: the scalar MSE loss of this update, detached from the graph
  """

  #print(batch)
  print("THE FEATURES............")

  if not isinstance(batch.state, torch.Tensor):
    current_states = torch.stack(([torch.Tensor(s["observation"]) for s in batch.state]))
    #print(current_states.shape) # torch.Size([256, 6])
    actions = torch.stack(([torch.Tensor(s) for s in batch.action]))
    #print(actions.shape) # torch.Size([256, 2])
    rewards = torch.stack(([torch.Tensor([s]) for s in batch.reward]))
    #print(rewards.shape) # torch.Size([256, 1])
    next_states = torch.stack(([torch.Tensor(s["observation"]) for s in batch.next_state]))
    #print(next_states.shape) # torch.Size([256, 6])
    # Mask is 1 for non-terminal transitions and 0 for terminal ones.
    masks = torch.stack(([torch.Tensor([0]) if s else torch.Tensor([1]) for s in batch.done]))
    #print(masks.shape) # torch.Size([256, 1])
    batch = Transition(current_states, actions, rewards, next_states, masks)

  # Unpack from the batch so that a batch which already held tensors works as well
  # (previously the names below were only bound inside the conversion branch).
  # NOTE(review): a pre-stacked batch is assumed to carry the non-terminal mask
  # (1 = keep bootstrapping) in its last field, like the one built above — confirm with callers.
  current_states, actions, rewards, next_states, masks = batch

  loss_func = torch.nn.MSELoss()
  model.optimizer.zero_grad()
  q_values = model(current_states) # get current Q-values estimates
  #print(q_values.shape) # torch.Size([256, 2])
  # NOTE(review): the stored actions are cast to integer indices here, which only makes
  # sense for discrete action ids; the training loop stores action vectors — TODO confirm.
  q_values = torch.gather(q_values, dim=1, index=actions.long()) # retrieve the Q-values for the actions from the replay buffer
  #print(q_values)
  with torch.no_grad():
    best_values = target(next_states) # compute the next Q-values using the target network
    #print(best_values.shape) # torch.Size([256, 2])
    best_values, _ = best_values.max(dim=1) # follow greedy policy: use the one with the highest value
    best_values = best_values.reshape(-1,1) # avoid potential broadcast issue

    # 1-step TD target. `masks` is already 0 on terminal transitions, so it is used
    # directly; multiplying by (1-masks) bootstrapped only from terminal states.
    target_state_action_value = rewards + gamma * best_values * masks

  # plot the rewards of the sampled batch
  fig = plt.figure()
  fig.add_subplot(111)
  plt.plot(np.arange(1, len(rewards)+1), rewards)
  plt.ylabel('Score')
  plt.xlabel('Episode steps #')
  plt.show()

  loss = loss_func(q_values, target_state_action_value)
  # Apply the update: without backward() + step() the network weights never change.
  loss.backward()
  model.optimizer.step()
  # Detach so callers can log or plot the value without holding the autograd graph.
  return loss.detach()

In [17]:
# Module-level discount factor for future rewards. It is read as a global by
# ReplayBuffer2.collapse_n_steps and train1, so this cell must run before either is called.
gamma = 0.99 # the discount factor

def update_target_model(model, target):
  """
  Overwrite the target network's parameters with those of the online model.
  Intended to be called every "target_model_update" epochs.
  :param model: network whose state is copied
  :param target: network that receives the state
  """
  online_state = model.state_dict()
  target.load_state_dict(online_state)

In [18]:
from tqdm.notebook import trange
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
from gym.wrappers import Monitor
import base64

# IO
from pathlib import Path

In [19]:
# Start a hidden (visible=0) 1400x900 virtual display for this notebook session.
# Presumably required so the environment can render frames for video recording
# on a host without a screen — TODO confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

def show_video(path):
    """Display every ``*.mp4`` file found directly under *path* in the notebook output.

    Each file is read in full, base64-encoded and inlined into a ``<video>``
    tag; the tags are joined with ``<br>`` and rendered as a single HTML output.
    """
    video_tag = '''<video alt="{}" autoplay 
                      loop controls style="height: 400px;">
                      <source src="data:video/mp4;base64,{}" type="video/mp4" />
                 </video>'''
    tags = [
        video_tag.format(clip, base64.b64encode(clip.read_bytes()).decode('ascii'))
        for clip in Path(path).glob("*.mp4")
    ]
    page = "<br>".join(tags)
    ipythondisplay.display(ipythondisplay.HTML(data=page))

In [20]:
def sample_minibatch(memory, batch_size):
  """
  Draw `batch_size` transitions from `memory` and regroup them field by field.
  :param memory: replay buffer exposing `buffer` and `sample(batch_size)`
  :param batch_size: number of transitions to draw
  :return: None while the buffer holds fewer than `batch_size` entries, otherwise a
           single Transition whose fields are tuples with one entry per sampled transition
  """
  print("------------MEMORY------------------")
  stored = len(memory.buffer)
  print(stored, batch_size)
  if stored < batch_size:
    return None
  drawn = memory.sample(batch_size)
  # Transpose: list of (s, a, r, s', d) records -> (all s, all a, all r, all s', all d).
  columns = zip(*drawn)
  return Transition(*columns)

In [None]:
import highway_env
# Training script: collect transitions in "parking-v0" with an epsilon-greedy policy,
# train the online model from the replay buffer, and periodically sync the target model.
# The loop stops after 4 target-model updates (or on KeyboardInterrupt).
if __name__ == '__main__':
    test = False
    min_replay_buffer_size = 10000
    sample_size = 2500

    #epsilon_max = 1.0
    epsilon_min = 0.01

    epsilon_decay = 0.999995

    env_steps_before_train = 100
    target_model_update = 20 ###we use 20 for a faster training (more efficient with 150 episodes)
    env = gym.make("parking-v0")
    if test:
        # Wrap once, before the first reset. Re-wrapping on every step stacked
        # Monitors on top of each other without ever starting a recorded episode.
        env = Monitor(env, './video', force=True, video_callable=lambda episode: True)
    last_observation = env.reset()
    print(last_observation)
    print(env.observation_space["observation"].shape)

    base_model = Model(env.observation_space["observation"].shape, env.action_space.shape[0])

    target_model = Model(env.observation_space["observation"].shape, env.action_space.shape[0])
    update_target_model(base_model, target_model)

    replay_buffer = ReplayBuffer2()

    steps_since_train = 0
    epochs_since_target = 0
    # Starts negative so that epsilon stays >= 1 (pure exploration) while the buffer fills.
    step_num = -1 * min_replay_buffer_size

    episode_rewards = []
    rolling_reward = 0

    losses = []
    since=0  # number of target-model updates performed so far

    tq = tqdm()
    try:
        # NOTE(review): in test mode no training happens, so `since` never changes and
        # this loop only ends on KeyboardInterrupt.
        while since!=4:
            if test:
                time.sleep(0.05)

            tq.update(1)
            # Decay exploration but never below epsilon_min (previously defined and unused).
            eps = max(epsilon_min, epsilon_decay**(step_num))

            if test:
                eps = 0

            if random() < eps:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    # NOTE(review): the raw network output is used as the action vector.
                    # Converted to numpy so env.step and the replay buffer see the same
                    # array type as env.action_space.sample().
                    action = base_model(torch.Tensor(last_observation["observation"])).numpy()

            observation, reward, done, info = env.step(action)
            rolling_reward += reward
            reward = reward/100.0

            replay_buffer.insert(last_observation, action, reward, observation, done)
            last_observation = observation

            if done:
                print("IT IS DONE")
                episode_rewards.append(rolling_reward)
                print(rolling_reward)
                if test:
                    print(rolling_reward)
                rolling_reward = 0
                # Continue from the fresh episode's first observation; storing it in
                # `observation` left the agent acting on the terminal state.
                last_observation = env.reset()

            steps_since_train += 1
            step_num += 1

            if (not test) and len(replay_buffer.buffer) > min_replay_buffer_size and steps_since_train > env_steps_before_train:

                batch = sample_minibatch(replay_buffer, sample_size)
                loss1 = train1(base_model, batch, target_model, env.action_space.shape[0])
                losses.append(loss1)
                print("LOSS", loss1)
                episode_rewards = []
                epochs_since_target += 1
                print(epochs_since_target)
                if epochs_since_target > target_model_update:
                    print("Updating target model----------------------")
                    update_target_model(base_model, target_model)
                    epochs_since_target = 0
                    since += 1
                    if not isinstance(env, Monitor):
                        # Start recording once. The Monitor wrapper needs a reset before
                        # the next step, so the current episode is abandoned here.
                        env = Monitor(env, './video', force=True, video_callable=lambda episode: True)
                        last_observation = env.reset()
                        rolling_reward = 0
                    show_video('./video')
                steps_since_train = 0

    except KeyboardInterrupt:
        pass
    finally:
        # Release resources exactly once, after the loop; closing the environment on
        # every step shut the recorder down right after it was created.
        tq.close()
        env.close()
    show_video('./video')

In [None]:
# Evaluate the agent: run the trained model for 10000 environment steps and
# record the total reward of every episode that finishes (or succeeds).
rewards_total = []
episode_reward = 0
done = False
obs = env.reset()

for _ in range(10000):
  with torch.no_grad():
    action = base_model(torch.Tensor(obs["observation"]))
  obs, reward, done, info = env.step(action)
  episode_reward += reward

  # Keep stepping until the episode ends or the environment reports success.
  if not (done or info.get('is_success', False)):
    continue

  print("Reward:", episode_reward, "Success?", info.get('is_success', False))
  rewards_total.append(episode_reward)
  episode_reward = 0.0
  obs = env.reset()

In [None]:
print(rewards_total)

import matplotlib.pyplot as plt
import numpy as np

# Plot the total reward of each evaluation episode, numbering episodes from 1.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.arange(len(rewards_total)) + 1, rewards_total)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

In [None]:
# Plot losses (one entry per train1 call) on a logarithmic y-axis.
# The entries are torch tensors, which may still require grad and then cannot be
# converted by matplotlib, so turn them into plain floats first.
loss_values = [float(loss) for loss in losses]
plt.plot(loss_values)
plt.yscale("log")
plt.xlabel("episodes")
plt.ylabel("loss")
plt.show()