In [None]:
# Install environment and agent
!pip install highway-env
# TODO: we use the bleeding edge version because the current stable version does not support the latest gym>=0.21 versions. Revert back to stable at the next SB3 release.
!pip install git+https://github.com/DLR-RM/stable-baselines3



In [None]:
# Environment
import gymnasium as gym
import highway_env
# Agent
from stable_baselines3 import DQN

# Visualization utils
%load_ext tensorboard
import sys
from tqdm.notebook import trange
!pip install tensorboardx gym pyvirtualdisplay
!apt-get install -y xvfb ffmpeg
!git clone https://github.com/Farama-Foundation/HighwayEnv.git 2> /dev/null
sys.path.insert(0, '/content/HighwayEnv/scripts/')
from utils import record_videos, show_videos
from tqdm import trange
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import matplotlib.pyplot as plt
from copy import deepcopy


In [None]:
# Create the environment; `render_mode="rgb_array"` is requested at construction time.
env = gym.make("highway-fast-v0", render_mode="rgb_array")

config = dict(
    # Observation: 7 features on a grid spanning [-20, 20] on both axes with a step of 5,
    # i.e. 40 / 5 = 8 cells per axis -- consistent with the (7, 8, 8) reshape used by the agent.
    observation=dict(
        type="OccupancyGrid",
        vehicles_count=10,
        features=["presence", "x", "y", "vx", "vy", "cos_h", "sin_h"],
        features_range=dict(
            x=[-100, 100],
            y=[-100, 100],
            vx=[-20, 20],
            vy=[-20, 20],
        ),
        grid_size=[[-20, 20], [-20, 20]],
        grid_step=[5, 5],
        absolute=False,
    ),
    action=dict(type="DiscreteAction"),
    # Traffic / episode layout.
    lanes_count=3,
    vehicles_count=10,
    duration=20,  # [s]
    initial_spacing=0,
    # Reward shaping.
    collision_reward=-1,  # Reward received when colliding with a vehicle.
    right_lane_reward=0.5,  # Reward for driving on the right-most lanes, linearly mapped to zero for other lanes.
    high_speed_reward=0.1,  # Reward for driving at full speed, linearly mapped to zero for lower speeds
    # according to config["reward_speed_range"].
    lane_change_reward=0,
    reward_speed_range=[20, 30],  # [m/s] Range linearly mapped to [0, HighwayEnv.HIGH_SPEED_REWARD].
    # Simulation timing.
    simulation_frequency=5,  # [Hz]
    policy_frequency=1,  # [Hz]
    other_vehicles_type="highway_env.vehicle.behavior.IDMVehicle",
    # Rendering.
    screen_width=600,  # [px]
    screen_height=150,  # [px]
    centering_position=[0.3, 0.5],
    scaling=5.5,
    show_trajectories=True,
    render_agent=True,
    offscreen_rendering=False,
    disable_collision_checks=True,
)

# Apply the configuration to the underlying (unwrapped) environment, then reset and
# print the second element of the reset result (the info returned alongside the observation).
env.unwrapped.configure(config)
print(env.reset()[1])

In [None]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions used by the DQN agent.

    Once `capacity` transitions are stored, each new transition overwrites the
    oldest slot (tracked by `position`).
    """

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity (int): Maximum number of transitions kept at any time.
        """
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, state, action, reward, terminated, next_state):
        """Store one transition, overwriting the slot at `position` when full.

        Args:
            state: Current state of the environment.
            action: Action taken in the state.
            reward: Reward received after taking the action.
            terminated: Whether the episode ended with this transition.
            next_state: The state reached after the action.
        """
        transition = (state, action, reward, terminated, next_state)
        slot = self.position
        if len(self.memory) < self.capacity:
            # Still growing: reserve a new slot before writing into it.
            self.memory.append(None)
        self.memory[slot] = transition
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Draw `batch_size` stored transitions at random (with replacement).

        Args:
            batch_size (int): Number of transitions to draw.

        Returns:
            list: The sampled transitions.
        """
        batch = random.choices(self.memory, k=batch_size)
        return batch

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

class CNNDQN(nn.Module):
    """Convolutional Q-network: three 3x3 convolutions followed by a two-layer MLP head."""

    def __init__(self, input_shape, n_actions):
        """Build the convolutional trunk and the fully connected head.

        Args:
            input_shape (tuple): Shape of one observation; its first entry is
                used as the number of input channels.
            n_actions (int): Number of outputs (one Q-value per action).
        """
        super().__init__()

        # (in_channels, out_channels) of each convolution; kernel 3, stride 1 and
        # padding 1 leave the spatial size unchanged.
        channel_plan = ((input_shape[0], 32), (32, 64), (64, 64))
        trunk = []
        for in_channels, out_channels in channel_plan:
            trunk.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1))
            trunk.append(nn.ReLU())
        trunk.append(nn.Flatten())
        self.conv_layers = nn.Sequential(*trunk)

        # The head's input width is measured by probing the trunk with a dummy input.
        self.fc_input_dim = self.feature_size(input_shape)
        head = [
            nn.Linear(self.fc_input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        ]
        self.fc_layers = nn.Sequential(*head)

    def forward(self, x):
        """Compute the network output for a batch of observations.

        Args:
            x: Input tensor fed to the convolutional trunk.

        Returns:
            Output tensor of the fully connected head.
        """
        return self.fc_layers(self.conv_layers(x))

    def feature_size(self, input_shape):
        """Number of features produced by the convolutional trunk for one input.

        Args:
            input_shape (tuple): The shape of a single input (without batch dimension).

        Returns:
            int: Flattened feature count after the convolution layers.
        """
        probe = torch.zeros(1, *input_shape)
        features = self.conv_layers(probe)
        return features.view(1, -1).size(1)


class DQN:
    """A DQN agent with convolutional neural network for function approximation.

    The agent keeps an online Q-network, a periodically synchronised target
    network, a replay buffer, an Adam optimizer with a step learning-rate
    schedule, and an exponentially decaying epsilon for exploration.

    Observations may be passed flattened; they are reshaped to
    `observation_space.shape` before being fed to the networks.

    NOTE: this class reuses the name `DQN`, so it shadows the `DQN` imported from
    `stable_baselines3` earlier in the notebook.
    """

    def __init__(self, action_space, observation_space, gamma, batch_size, buffer_capacity, update_target_every, epsilon_start, decrease_epsilon_factor, epsilon_min, learning_rate, decay_rate, decay_step_size):
        """Initialize the DQN model.

        Args:
            action_space: The space of the available actions (needs `.n` and `.sample()`).
            observation_space: The space of the possible states (needs `.shape`).
            gamma (float): Discount factor for future rewards.
            batch_size (int): Size of the batch.
            buffer_capacity (int): Capacity of the replay buffer.
            update_target_every (int): Frequency of updating the target network.
            epsilon_start (float): Starting value of epsilon for epsilon-greedy strategy.
            decrease_epsilon_factor (int): The factor used to decrease epsilon.
            epsilon_min (float): The minimum value of epsilon.
            learning_rate (float): Learning rate for optimizer.
            decay_rate (float): Rate of decay for learning rate.
            decay_step_size (int): Step size for learning rate decay.
        """
        self.action_space = action_space
        self.observation_space = observation_space
        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.update_target_every = update_target_every
        self.epsilon_start = epsilon_start
        self.decrease_epsilon_factor = decrease_epsilon_factor
        self.epsilon_min = epsilon_min
        self.learning_rate = learning_rate
        self.decay_rate = decay_rate
        self.decay_step_size = decay_step_size
        self.reset()

    def _to_batch_tensor(self, state):
        """Reshape one (possibly flattened) observation into a (1, *obs_shape) float32 tensor."""
        obs_shape = tuple(self.observation_space.shape)
        array = np.reshape(np.asarray(state, dtype=np.float32), obs_shape)
        # torch.tensor copies, so stored transitions never alias the caller's array.
        return torch.tensor(array, dtype=torch.float32).unsqueeze(0)

    def update(self, state, action, reward, terminated, next_state):
        """Store one transition and, once enough are buffered, do one gradient step.

        Args:
            state: Observation before the action (any shape reshapeable to the observation shape).
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            terminated: Whether the transition ended the episode; disables bootstrapping.
            next_state: Observation after the action.

        Returns:
            float: The loss of the gradient step, or `np.inf` while the buffer
            holds fewer than `batch_size` transitions (no learning happens then).
        """
        # Every field is stored with a leading batch dimension of 1 so that a
        # sampled batch can be assembled with a plain torch.cat.
        state = self._to_batch_tensor(state)
        next_state = self._to_batch_tensor(next_state)
        action = torch.tensor([[action]], dtype=torch.int64)
        reward = torch.tensor([reward], dtype=torch.float32)
        terminated = torch.tensor([terminated], dtype=torch.float32)

        self.buffer.push(state, action, reward, terminated, next_state)

        if len(self.buffer) < self.batch_size:
            return np.inf  # Not enough samples to perform a batch update

        # Sample a batch and concatenate each field along the batch dimension.
        transitions = self.buffer.sample(self.batch_size)
        state_batch, action_batch, reward_batch, terminated_batch, next_state_batch = (
            torch.cat(field) for field in zip(*transitions)
        )

        # Q(s, a) of the taken actions from the online network.
        q_values = self.q_net(state_batch).gather(1, action_batch)
        # Bootstrap target from the target network; no gradient is needed for it.
        with torch.no_grad():
            next_q_values = self.target_net(next_state_batch).max(1)[0]
        targets = reward_batch + self.gamma * next_q_values * (1 - terminated_batch)

        loss = self.loss_function(q_values, targets.unsqueeze(1))

        # Backpropagation with gradient-norm clipping, then advance the LR schedule.
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q_net.parameters(), max_norm=1.0)
        self.optimizer.step()
        self.scheduler.step()

        # Periodically update the target network
        if self.n_steps % self.update_target_every == 0:
            self.target_net.load_state_dict(self.q_net.state_dict())

        self.decrease_epsilon()
        self.n_steps += 1

        return loss.item()

    def get_action(self, state, use_thompson_sampling=False):
        """Select an action for one observation.

        Args:
            state: Observation (any shape reshapeable to the observation shape).
            use_thompson_sampling: When truthy, the greedy action of the online
                network is returned. Despite the flag's name, no posterior
                sampling is performed. When falsy, an epsilon-greedy choice is made.

        Returns:
            The selected action: the greedy action index as an int, or whatever
            `action_space.sample()` returns on the exploration branch.
        """
        state = self._to_batch_tensor(state)
        explore = (not use_thompson_sampling) and np.random.rand() < self.epsilon
        if explore:
            return self.action_space.sample()
        # Action selection never needs gradients.
        with torch.no_grad():
            q_values = self.q_net(state)
        return torch.argmax(q_values).item()

    def get_q(self, state):
        """Return the online network's Q-values for all actions, without gradients.

        Args:
            state: Either a tensor already shaped for the network (passed through
                unchanged) or an array-like observation, which is reshaped to a
                batch of one.
        """
        if not torch.is_tensor(state):
            state = self._to_batch_tensor(state)
        with torch.no_grad():
            return self.q_net(state)

    def decrease_epsilon(self):
        """Multiply epsilon by exp(-1 / decrease_epsilon_factor), floored at epsilon_min."""
        self.epsilon = max(self.epsilon_min, self.epsilon * np.exp(-1.0 / self.decrease_epsilon_factor))
        self.n_eps += 1

    def reset(self):
        """(Re)create the buffer, networks, optimizer, scheduler and counters from scratch."""
        # int() so a numpy integer `n` is handled the same way as a plain int.
        n_actions = int(self.action_space.n)
        obs_size = self.observation_space.shape  # (channels, height, width)
        self.buffer = ReplayBuffer(self.buffer_capacity)
        print("obs_size", obs_size)
        print("n_actions", n_actions)
        self.q_net = CNNDQN(obs_size, n_actions)
        self.target_net = CNNDQN(obs_size, n_actions)
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.loss_function = nn.SmoothL1Loss()
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=self.learning_rate)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=self.decay_step_size, gamma=self.decay_rate)
        self.epsilon = self.epsilon_start
        self.n_steps = 0
        self.n_eps = 0


In [None]:
def eval_agent(agent, env, n_sim, use_thompson_sampling=False):
    """
    Monte Carlo evaluation of an agent: play `n_sim` full episodes and return their returns.

    Args:
        agent: Object exposing `get_action(state, use_thompson_sampling=...)`.
        env: Environment whose `reset()` returns `(state, info)` and whose `step()`
            returns `(state, reward, terminated, truncated, info)`.
        n_sim (int): Number of episodes to play.
        use_thompson_sampling: Forwarded unchanged to `agent.get_action`.

    Returns:
        Array of length `n_sim` holding the sum of rewards of each episode.
    """

    def play_one_episode():
        """Run a single episode to termination/truncation and return its reward sum."""
        observation, _ = env.reset()
        observation = observation.flatten()  # the agent is fed flattened observations
        episode_return = 0
        while True:
            action = agent.get_action(observation, use_thompson_sampling=use_thompson_sampling)
            observation, reward, terminated, truncated, _ = env.step(action)
            observation = observation.flatten()
            episode_return += reward
            if terminated or truncated:
                return episode_return

    episode_rewards = np.zeros(n_sim)
    for episode in range(n_sim):
        episode_rewards[episode] = play_one_episode()
    return episode_rewards


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Spaces of the configured environment; the agent sizes its networks from them.
action_space, observation_space = env.action_space, env.observation_space

# --- Q-learning ---
gamma = 0.9  # Discount factor (0.8 and 0.95 resulted in worse performance).
batch_size = 128  # Transitions per gradient step.
buffer_capacity = 20_000  # Replay buffer size.
update_target_every = 100  # Target-network sync period (better results than with 50 or 32).

# --- Exploration (epsilon-greedy) ---
epsilon_start = 0.9  # Initial exploration probability.
decrease_epsilon_factor = 500  # Controls how fast epsilon decays.
epsilon_min = 0.05  # Floor for epsilon.

# --- Optimizer and learning-rate schedule ---
learning_rate = 5e-4  # Initial learning rate.
decay_rate = 0.95  # Multiplicative learning-rate decay.
decay_step_size = 1000  # Scheduler steps between two decays.

# --- Run sizes ---
n_sim = 10  # Episodes per evaluation.
N_episodes = 1000  # Training episodes.

# Positional arguments for the agent, in the order expected by DQN.__init__.
arguments = (
    action_space,
    observation_space,
    gamma,
    batch_size,
    buffer_capacity,
    update_target_every,
    epsilon_start,
    decrease_epsilon_factor,
    epsilon_min,
    learning_rate,
    decay_rate,
    decay_step_size,
)

# Build the agent from the hyperparameters above.
agent = DQN(*arguments)

def train(env, agent, N_episodes, eval_every=10, reward_threshold=20, print_every=10, n_sim=10):
    """Train the agent, evaluating it periodically and stopping early on success.

    Args:
        env: The environment in which the agent operates.
        agent: The agent to train; must expose `get_action`, `update` and `epsilon`.
        N_episodes (int): Maximum number of training episodes.
        eval_every (int): An evaluation is run every `eval_every` episodes.
        reward_threshold (float): Training stops as soon as an evaluation's mean
            reward reaches this value.
        print_every (int): Currently unused; kept so existing calls keep working.
            Progress is printed after every evaluation.
        n_sim (int): Number of evaluation episodes per evaluation. This is an
            explicit parameter so the function does not depend on a notebook global.

    Returns:
        tuple: `(losses, mean_rewards)` where `losses` holds the value returned by
        `agent.update` for every environment step and `mean_rewards` holds one
        mean evaluation reward per evaluation.
    """
    losses = []  # One entry per environment step.
    mean_rewards = []  # One entry per evaluation.

    for ep in range(N_episodes):
        state, _ = env.reset()
        # Flatten and cast like every later state, so the agent always sees the same dtype.
        state = state.flatten().astype(np.float32)
        done = False

        while not done:
            # With probability 0.1 ask the agent for its `use_thompson_sampling` branch.
            use_thompson = np.random.rand() < 0.1
            action = agent.get_action(state, use_thompson_sampling=use_thompson)

            # Take the action in the environment and observe the next state and reward.
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_state = next_state.flatten().astype(np.float32)

            # Only `terminated` is passed on: a truncated episode did not really end.
            loss_val = agent.update(state, action, reward, terminated, next_state)
            losses.append(loss_val)

            state = next_state
            done = terminated or truncated

        if (ep + 1) % eval_every == 0:
            # Evaluate the agent's performance after 'eval_every' episodes.
            rewards = eval_agent(agent, env, n_sim)
            mean_reward = np.mean(rewards)
            mean_rewards.append(mean_reward)
            print(f"Episode = {ep + 1}, Reward = {mean_reward}, Loss = {loss_val}, Epsilon = {agent.epsilon}")

            if mean_reward >= reward_threshold:
                # Stop training if the mean reward exceeds the reward threshold.
                print("Reward threshold met, stopping training.")
                break

    return losses, mean_rewards

# Run the training loop
losses, mean_rewards = train(env, agent, N_episodes, print_every=10)

# Plot the training losses. `train` records one loss per environment step, so the
# title matches the x-axis: this is a per-step curve, not a per-episode one.
plt.plot(losses)
plt.title("Training Loss per Training Step")
plt.xlabel("Training Steps")
plt.ylabel("Loss")
plt.show()

# Evaluate the final policy of the agent after training.
rewards = eval_agent(agent, env, n_sim)
print("\nmean reward after training = ", np.mean(rewards))


In [None]:
def _first_peak_index(mean_rewards):
    """Index of the first interior point strictly above both neighbours, or 0 if none."""
    for i in range(1, len(mean_rewards) - 1):
        if mean_rewards[i - 1] < mean_rewards[i] > mean_rewards[i + 1]:
            return i
    return 0


def plot_mean_rewards(mean_rewards, eval_every=10):
    """
    Plot the mean rewards over training episodes at specified evaluation intervals.

    A dashed horizontal line marks the average of the rewards that come after the
    first local peak (or after the first evaluation when there is no peak). The
    line is omitted when no evaluation follows that point, e.g. for a single
    evaluation.

    Args:
        mean_rewards (list): Mean rewards at the evaluation points; a list or a
            numpy array.
        eval_every (int): Interval of episodes between evaluations.
    """
    # Set up the figure size for the plot
    plt.figure(figsize=(10, 5))

    # Episode number of every evaluation: eval_every, 2 * eval_every, ...
    eval_points = np.arange(eval_every, eval_every * len(mean_rewards) + 1, eval_every)

    # Plot mean rewards per evaluation interval
    plt.plot(eval_points, mean_rewards, label='Mean Reward per Eval')

    # len() rather than truthiness, so numpy arrays are accepted as well.
    if len(mean_rewards) > 0:
        rewards_after_peak = mean_rewards[_first_peak_index(mean_rewards) + 1:]
        # Averaging an empty tail would yield NaN, so skip the reference line then.
        if len(rewards_after_peak) > 0:
            mean_reward_after_peak = np.mean(rewards_after_peak)
            plt.axhline(y=mean_reward_after_peak, color='r', linestyle='--', label=f'Mean Reward After First Peak: {mean_reward_after_peak:.2f}')

    # Add title and labels with font sizes adjusted
    plt.title('Mean Rewards Over Training Episodes', fontsize=18)
    plt.xlabel('Episode Number', fontsize=12)
    plt.ylabel('Mean Reward', fontsize=12)

    # One x-tick roughly every 100 episodes. The stride is clamped to at least 1
    # because a slice step of 0 (eval_every > 100) raises ValueError.
    tick_interval = 100
    tick_stride = max(1, tick_interval // eval_every)
    x_ticks = eval_points[::tick_stride]
    plt.xticks(x_ticks)

    # Display the legend and grid, and adjust layout
    plt.legend()
    plt.grid(True)
    plt.tight_layout()  # Adjust layout to prevent clipping of tick labels
    plt.show()

In [None]:
# Initialize the environment with video recording capabilities
# Uncomment the appropriate line depending on the environment setup
# env = gym.make('highway-fast-v0', render_mode='rgb_array')
env = record_videos(env)  # Wraps the environment to record videos

# Run a number of episodes for testing the agent
for episode in trange(1, desc='Test episodes'):
    # Reset the environment and get the initial observation and info, marking 'done' as False
    (obs, info), done = env.reset(), False
    # Flatten the observation for compatibility with the agent's expected input format
    obs = obs.flatten()

    # Initialize a counter to limit the number of steps per episode
    i = 0
    # Loop until the episode is done or the step limit is reached
    while not done and i < 1000:
        # Ask for the greedy action explicitly. The flag used to receive
        # `agent.epsilon` positionally, which only worked because it was truthy.
        action = agent.get_action(obs, use_thompson_sampling=True)
        # Execute the action in the environment and observe the results
        obs, reward, terminated, truncated, info = env.step(int(action))
        # The episode is over on termination OR truncation (time limit);
        # stepping past a truncated episode is not valid.
        done = terminated or truncated
        # Flatten the observation again
        obs = obs.flatten()
        # Increment the step counter
        i += 1

# Close the environment to clean up resources
env.close()

In [None]:
# `show_videos` is imported from HighwayEnv's scripts/utils; presumably it renders the
# videos recorded by the previous cell inline -- TODO confirm against that helper.
show_videos()