In [None]:
from typing import Tuple

import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from torch import Tensor

torch.__version__

In [None]:
# Device-agnostic setup: prefer a CUDA GPU, then the MPS backend (macOS GPU),
# and fall back to the CPU. The checks run in that order, and the MPS check is
# only evaluated when CUDA is unavailable.
device = (
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

print(f'Device set to: {device}')

In [None]:
# --- Simulation ---------------------------------------------------------
TLS_ID = '209'         # key of the controlled traffic light in the API's 'tls' mapping

# --- Network / optimiser ------------------------------------------------
HIDDEN_SIZE = 64       # width of each hidden layer in SimpleNetwork
LEARNING_RATE = 1e-3   # learning rate handed to the Adam optimiser
BATCH_SIZE = 128       # transitions sampled from replay memory per optimisation step

# --- Q-learning ---------------------------------------------------------
GAMMA = 0.99           # discount applied to next-state values in optimize_model
TAU = 0.005            # presumably the target-network soft-update factor - TODO confirm usage

# --- Epsilon-greedy exploration -----------------------------------------
# select_action uses: EPS_END + (EPS_START - EPS_END) * exp(-steps_done / EPS_DECAY)
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1_000

# --- Training length ----------------------------------------------------
EPISODES = 1_000       # number of iterations of the outer training loop

In [None]:
import api_endpoints
# Start a simulation from the scenario config at the given path. The value
# returned by the API is kept as `simulation_id` and passed to the later
# api_endpoints calls in this notebook; the bare name on the last line just
# displays it as the cell output.
simulation_id = api_endpoints.start_simulation('scenarios/bologna/acosta/run.sumocfg')
simulation_id

In [None]:
# Fetch the initial simulation snapshot and keep the entry of the controlled
# traffic light. The entry is looked up through TLS_ID rather than a repeated
# '209' literal so the traffic light is configured in exactly one place.
response = api_endpoints.get_initial_data(simulation_id)
tls_209 = response['data']['tls'][TLS_ID]
tls_209

In [None]:
# Every lane contributes 7 values to the state vector (the same width that
# extract_state_tensor uses when it zero-pads an empty lane). Count the lanes
# first and multiply afterwards: this avoids building a 7x repeated copy of
# the container just to measure it, and it also works when 'lanes' is a
# mapping (a dict cannot be multiplied by an int).
n_observations = len(tls_209['lanes']) * 7
n_actions = 3 # Step, Change Phase, Change Program

print(f'Observations: {n_observations} | Actions: {n_actions}')

In [None]:
from collections import namedtuple, deque

# One recorded interaction step: the state acted upon, the action taken, the
# state that followed, and the reward received.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory:
    """Bounded FIFO buffer of ``Transition`` records."""

    def __init__(self, capacity):
        # A deque with maxlen discards its oldest entry once it is full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a ``Transition`` from ``args`` and append it to the buffer."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn at random without replacement."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
# Neural Network Architecture
import api_endpoints
class SimpleNetwork(nn.Module):
    """Fully connected network: two ReLU hidden layers followed by a linear output.

    Args:
        state_size: Number of input features.
        action_size: Number of output values (one per action).
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, state_size: int, action_size: int, hidden_size: int = 64):
        super().__init__()
        # The linear layers are created in input-to-output order and placed at
        # positions 0, 2 and 4 of the Sequential container.
        input_layer = nn.Linear(state_size, hidden_size)
        hidden_layer = nn.Linear(hidden_size, hidden_size)
        output_layer = nn.Linear(hidden_size, action_size)
        self.layers = nn.Sequential(
            input_layer,
            nn.ReLU(),
            hidden_layer,
            nn.ReLU(),
            output_layer,
        )

    def forward(self, state: torch.Tensor) -> torch.Tensor:
        """Map ``state`` to one output value per action."""
        outputs = self.layers(state)
        return outputs

In [None]:
def reward_func(states: dict) -> torch.Tensor:
    """Compute a scalar reward from per-lane metrics.

    Lanes with waiting vehicles (``max_waiting_time > 0``) are penalised by
    ``queue_length / (total_cars / occupancy) * max_waiting_time``; all other
    non-empty lanes add their ``average_speed``.

    Args:
        states: Mapping of lane id to that lane's metrics dict. A plain
            iterable of lane metrics dicts is accepted as well. Empty entries
            are ignored.

    Returns:
        A 0-dimensional float32 tensor on ``device``.
    """
    # Iterating a dict yields its keys (lane ids), not the metrics, so the
    # per-lane dicts have to be taken from .values().
    lanes = states.values() if isinstance(states, dict) else states

    total = 0.0
    for lane in lanes:
        if not lane:
            continue
        if lane['max_waiting_time'] > 0:
            occupancy = lane['occupancy']
            total_cars = lane['total_cars']
            # total_cars / occupancy is the denominator of the queue ratio; it
            # is undefined (or zero) when either value is 0, so such a lane
            # contributes nothing instead of raising ZeroDivisionError.
            if not occupancy or not total_cars:
                continue
            queue_length_percentage = lane['queue_length'] / (total_cars / occupancy)
            total -= queue_length_percentage * lane['max_waiting_time']
        else:
            total += lane['average_speed']
    # float32 rather than float16: half precision saturates at 65504 and keeps
    # only about three significant digits.
    return torch.tensor(total, dtype=torch.float32, device=device)

In [None]:
def extract_state_tensor(simulation_id: str, number_of_steps: int, tls_id: str, action: int = 0) -> tuple[torch.Tensor, torch.Tensor]:
    """Apply ``action`` to the simulation and return the resulting state and reward.

    Args:
        simulation_id: Identifier passed to the ``api_endpoints`` calls.
        number_of_steps: Number of simulation steps to advance.
        tls_id: Traffic light whose lane metrics are read from the response.
        action: 0 = step only, 1 = change phase, 2 = change program. Plain
            ints and single-element tensors are both accepted. Defaults to 0
            so callers that only want an observation can omit it.

    Returns:
        ``(state, reward)``: ``state`` is a flat float32 tensor holding the
        metric values of every lane (a lane without metrics contributes seven
        zeros); ``reward`` is whatever ``reward_func`` returns for the metrics.

    Raises:
        ValueError: If ``action`` is not 0, 1 or 2.
    """
    action = int(action)  # also unwraps a single-element action tensor
    if action == 0:
        response = api_endpoints.step_simulation(simulation_id, number_of_steps, tls_id)
    elif action == 1:
        response = api_endpoints.set_traffic_light_phase(simulation_id, tls_id, make_step=number_of_steps)
    elif action == 2:
        # TODO: Solve the issue with the program selection - (Either change in
        # cyclic ways or add additional output to the network)
        response = api_endpoints.switch_traffic_light_program(simulation_id, tls_id, make_step=number_of_steps)
    else:
        raise ValueError('Invalid action')

    metrics = response['vehicles_in_tls'][tls_id]['longest_waiting_time_car_in_lane']
    features = []
    for lane_metrics in metrics.values():
        lane_values = list(lane_metrics.values())
        # Lanes without metrics are zero-padded so every lane occupies seven
        # slots of the state vector.
        features.extend(lane_values if lane_values else [0] * 7)
    # float32 matches the default parameter dtype of nn.Linear; a float16
    # state cannot be multiplied with float32 weights.
    state = torch.tensor(features, dtype=torch.float32, device=device)

    reward = reward_func(metrics)

    return state, reward

In [None]:
def reset_simulation(simulation_id: str) -> torch.Tensor:
    """Reset the simulation and return the first observed state.

    Args:
        simulation_id: Identifier of the simulation to reset.

    Returns:
        The state tensor observed after advancing the freshly reset
        simulation by one step.

    Raises:
        Exception: If the API does not report ``'success'`` for the reset.
    """
    response = api_endpoints.reset_simulation(simulation_id)
    if response['status'] != 'success':
        raise Exception('Error resetting simulation')
    # extract_state_tensor needs an explicit action; 0 only steps the
    # simulation and leaves the traffic light untouched. The reward of this
    # warm-up step is not needed.
    state, _reward = extract_state_tensor(simulation_id, 1, TLS_ID, 0)
    return state

In [None]:
import json

# Sanity check: advance 20 steps with action 0 (plain step) and inspect the
# observation. extract_state_tensor returns a (state, reward) pair, so it is
# unpacked first; otherwise len() would measure the 2-tuple, not the state.
state, reward = extract_state_tensor(simulation_id, 20, TLS_ID, 0)
len(state), state

In [None]:
# Online network: its parameters are the ones handed to the optimiser.
policy_net = SimpleNetwork(
    state_size=n_observations,
    action_size=n_actions,
    hidden_size=HIDDEN_SIZE,
).to(device)

# Target network: same architecture, initialised as an exact copy of the
# online network's weights.
target_net = SimpleNetwork(
    state_size=n_observations,
    action_size=n_actions,
    hidden_size=HIDDEN_SIZE,
).to(device)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.Adam(params=policy_net.parameters(), lr=LEARNING_RATE)
memory = ReplayMemory(capacity=10000)

# Global counter of action selections; select_action increments it and uses
# it to decay the exploration threshold.
steps_done = 0

In [None]:
def select_action(state: torch.Tensor) -> torch.Tensor:
    """Choose an action epsilon-greedily and advance the global step counter.

    The exploration threshold decays from ``EPS_START`` towards ``EPS_END``
    as ``steps_done`` grows, at a rate set by ``EPS_DECAY``.

    Args:
        state: A single observation, either flat ``(n,)`` or already shaped
            ``(1, n)``; any floating dtype is accepted.

    Returns:
        A ``(1, 1)`` long tensor on ``device`` holding the chosen action index.
    """
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * np.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        # The network needs a (batch, features) float32 input: a flat state
        # would make max(1) fail, and a half-precision state cannot be
        # multiplied with float32 weights.
        batch = state.to(device=device, dtype=torch.float32).reshape(1, -1)
        # no_grad instead of inference_mode: the returned tensor is kept in
        # replay memory and reused later while gradients are being recorded,
        # so it should be an ordinary tensor.
        with torch.no_grad():
            return policy_net(batch).max(1).indices.view(1, 1)
    return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)

In [None]:
# Per-episode values that plot_durations reads and draws.
episode_durations = []

In [None]:
def _stack_states(states):
    """Stack single-state tensors into a float32 (batch, features) matrix on ``device``."""
    rows = [s.reshape(1, -1) for s in states]
    return torch.cat(rows).to(device=device, dtype=torch.float32)


def optimize_model():
    """Run one optimisation step on ``policy_net`` from a sampled replay batch.

    Returns immediately (``None``) while ``memory`` holds fewer than
    ``BATCH_SIZE`` transitions. Otherwise it samples a batch, regresses
    ``Q(s, a)`` of ``policy_net`` towards ``reward + GAMMA * max_a' Q_target(s', a')``
    with a Huber loss, clips gradient values to ``[-100, 100]`` and steps
    ``optimizer``. Transitions whose ``next_state`` is ``None`` are treated as
    final: their next-state value is 0.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Mask of transitions that have a successor state.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]

    # Normalise shapes and dtypes before concatenating: states may be stored
    # flat, and rewards may be 0-dimensional, which torch.cat rejects.
    state_batch = _stack_states(batch.state)
    action_batch = torch.cat([a.reshape(1, 1) for a in batch.action]).to(device=device, dtype=torch.long)
    reward_batch = torch.cat([r.reshape(1) for r in batch.reward]).to(device=device, dtype=torch.float32)

    # Q(s_t, a): evaluate all actions, then pick the column of the action taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the target network; stays 0 for final states. The guard
    # covers a batch made up solely of final states, where there is nothing
    # to concatenate.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    if non_final_next_states:
        # no_grad keeps the target out of the autograd graph while leaving
        # next_state_values an ordinary tensor usable in the loss below.
        with torch.no_grad():
            next_state_values[non_final_mask] = target_net(_stack_states(non_final_next_states)).max(1).values
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

In [None]:
import matplotlib
import matplotlib.pyplot as plt

# True when the active matplotlib backend name contains 'inline'. Only in that
# case is IPython's `display` module imported; plot_durations uses it under
# the same flag.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Switch matplotlib to interactive mode.
plt.ion()

def plot_durations(show_result=False):
    """Draw ``episode_durations`` on figure 1, plus a 100-episode moving average.

    Args:
        show_result: When false the figure is cleared first and titled
            'Training...'; when true it is titled 'Result' and, under
            IPython, the cell output is not cleared afterwards.
    """
    plt.figure(1)
    durations = torch.tensor(episode_durations, dtype=torch.float)
    if not show_result:
        plt.clf()
    plt.title('Result' if show_result else 'Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations.numpy())

    # Moving average over 100 episodes, left-padded with zeros so that it
    # lines up with the raw curve.
    window = 100
    if len(durations) >= window:
        rolling = durations.unfold(0, window, 1).mean(1).view(-1)
        padded = torch.cat((torch.zeros(window - 1), rolling))
        plt.plot(padded.numpy())

    plt.pause(0.001)  # give the GUI event loop a moment to redraw
    if not is_ipython:
        return
    display.display(plt.gcf())
    if not show_result:
        display.clear_output(wait=True)

In [None]:
from itertools import count

for i_episode in range(EPISODES):
    # States are kept as float32 rows of shape (1, n_observations): that is
    # the layout the network's max(1) and the batch concatenation expect.
    state = reset_simulation(simulation_id).to(device=device, dtype=torch.float32).reshape(1, -1)
    for t in range(1_000):
        # Select and perform an action
        action = select_action(state)
        observation, reward = extract_state_tensor(simulation_id, 1, TLS_ID, action.item())
        next_state = observation.to(device=device, dtype=torch.float32).reshape(1, -1)
        # torch.cat cannot concatenate 0-dimensional tensors, so the reward is
        # stored with shape (1,).
        reward = reward.to(device=device, dtype=torch.float32).reshape(1)

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()

        # Soft update of the target network: theta' <- TAU * theta + (1 - TAU) * theta'.
        # Without it the target network would keep its initial weights forever.
        target_state = target_net.state_dict()
        policy_state = policy_net.state_dict()
        for key in policy_state:
            target_state[key] = policy_state[key] * TAU + target_state[key] * (1 - TAU)
        target_net.load_state_dict(target_state)

        # NOTE(review): extract_state_tensor does not report whether the
        # simulation has finished, so each episode runs for the full step
        # budget. TODO: break early once the API exposes an end-of-simulation
        # signal for the current step.