In [1]:
import time
from typing import Tuple, Any, Union, Dict
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import os

from torch import Tensor
from torchinfo import summary
from api_endpoints import *
import nest_asyncio
nest_asyncio.apply()

torch.__version__



'2.2.2'

In [2]:
# Determine the best available device for PyTorch operations (Device Agnostic Code)
if torch.cuda.is_available():
    device = 'cuda' # GPU
elif torch.backends.mps.is_available():
    device = 'mps' # GPU for MacOS (Metal Programming Framework)
else:
    device = 'cpu' # CPU
    
# device='cpu'
print(f'Device set to: {device}')

Device set to: mps


In [3]:
# Constants
params = {
    "HIDDEN_SIZE": 64,
    "LEARNING_RATE": 1e-1,
    "EPS_START": 0.9,
    "EPS_END": 0.05,
    "EPS_DECAY": 1_000,
    "BATCH_SIZE": 128,
    "GAMMA": 0.80,
    "TAU": 0.005,
    "MEM_SIZE": 100_000,
    "EPISODES": 10
}
architecture = "SimpleNetwork with 3 layers"

In [4]:

# Check what data we have
simulation_id = start_simulation('scenarios/bologna/acosta/run.sumocfg',is_gui=False, params=params, architecture=architecture)
simulation_id

'51c33386-363d-48cc-b3c3-487ae310dad3'

In [5]:
response = get_initial_data(simulation_id)
tls_names = list(response['data']['tls'].keys())
selected_tls_name = random.choice(tls_names)
selected_tls = response['data']['tls'][selected_tls_name]
not_selected_tls_names = [name for name in tls_names if name != selected_tls_name]
selected_program_ids = [program['program_id']  for program in selected_tls["programs"]]
selected_program_ids.sort()
print("Selected tls:",selected_tls_name)

Selected tls: 210


In [6]:
response

{'data': {'params_count': 7,
  'tls': {'209': {'lanes': ['153_0', '187_0', '188_0', '189[1][1]_0', '88_0'],
    'programs': [{'phases': [{'duration': 42.0,
        'maxDur': 42.0,
        'minDur': 42.0,
        'state': 'GrGGGGg'},
       {'duration': 3.0, 'maxDur': 3.0, 'minDur': 3.0, 'state': 'yryyyyy'},
       {'duration': 42.0, 'maxDur': 42.0, 'minDur': 42.0, 'state': 'GGrGGrr'},
       {'duration': 3.0, 'maxDur': 3.0, 'minDur': 3.0, 'state': 'yyryyrr'}],
      'program_id': '0'},
     {'phases': [{'duration': 60.0,
        'maxDur': 117.0,
        'minDur': 45.0,
        'state': 'GrGGGGg'},
       {'duration': 60.0, 'maxDur': 3.0, 'minDur': 3.0, 'state': 'yrGGGyy'},
       {'duration': 60.0, 'maxDur': 7.0, 'minDur': 7.0, 'state': 'rrGGGrr'},
       {'duration': 60.0, 'maxDur': 3.0, 'minDur': 3.0, 'state': 'rryyyrr'},
       {'duration': 60.0, 'maxDur': 3.0, 'minDur': 3.0, 'state': 'rrrrrrr'},
       {'duration': 60.0, 'maxDur': 26.0, 'minDur': 26.0, 'state': 'rGrrrrr'},
       {

In [7]:
selected_tls

{'lanes': ['113_0',
  '113_1',
  '113_2',
  '117_0',
  '31_0',
  '31_1',
  '31_2',
  '34_0',
  '34_1',
  '34_2',
  '43[0]_2',
  '43[1]_0',
  '43[1]_1',
  '43[1]_2',
  '46_0',
  '46_1',
  '46_2'],
 'programs': [{'phases': [{'duration': 15.0,
     'maxDur': 15.0,
     'minDur': 15.0,
     'state': 'rrrGGGGGrrrrrrGGrrrr'},
    {'duration': 3.0,
     'maxDur': 3.0,
     'minDur': 3.0,
     'state': 'rrryyyyyrrrrrryyrrrr'},
    {'duration': 18.0,
     'maxDur': 18.0,
     'minDur': 18.0,
     'state': 'GGgrrrrrGGgrrrrrGGGG'},
    {'duration': 3.0,
     'maxDur': 3.0,
     'minDur': 3.0,
     'state': 'yygrrrrryygrrrrryyyy'},
    {'duration': 6.0,
     'maxDur': 6.0,
     'minDur': 6.0,
     'state': 'rrGrrrrrrrGrrrrrGGGG'},
    {'duration': 3.0,
     'maxDur': 3.0,
     'minDur': 3.0,
     'state': 'rryrrrrrrryrrrrryyyy'},
    {'duration': 18.0,
     'maxDur': 18.0,
     'minDur': 18.0,
     'state': 'GrrrrrrrrrrGGGrrGGGG'},
    {'duration': 3.0,
     'maxDur': 3.0,
     'minDur': 3.0,
    

In [6]:
n_observations = len(selected_tls['lanes'] * 7)
programs_count = len(selected_program_ids)
n_actions = 2 + programs_count # Step, Change Phase, Change Program

print(f'Observations: {n_observations} | Actions: {n_actions}')

Observations: 119 | Actions: 4


In [7]:
from collections import namedtuple, deque

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, state, action, next_state, reward):
        """Save a transition"""
        # Convert action to tensor if it's an integer
        if isinstance(action, int):
            action = torch.tensor([[action]], device=device, dtype=torch.long)
        self.memory.append(Transition(state, action, next_state, reward))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [8]:
# Neural Network Atchitecture
import api_endpoints
class SimpleNetwork(nn.Module):
    def __init__(self, state_size: int, action_size: int, hidden_size: int = 64):
        super().__init__()
        self.layer1 = nn.Linear(state_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, action_size)
        
    def forward(self, state: torch.Tensor) -> torch.Tensor:
        return self.layer3(nn.functional.relu(self.layer2(nn.functional.relu(self.layer1(state)))))

In [9]:
def reward_func(states: dict, cars_that_left: int) -> torch.Tensor:
    penalty = cars_that_left * 10
    for lane in states.values():
        if not lane:
            continue
        if lane.get('max_waiting_time',0.) > 0:
            queue_length_percentage = lane['queue_length'] / (lane['total_cars'] / lane['occupancy'])
            penalty -= queue_length_percentage * lane['max_waiting_time']
        else:
            penalty += lane['average_speed']
    return torch.tensor(penalty, dtype=torch.float32, device=device)

In [10]:
from enum import Enum


class Action(Enum):
    STEP = 0
    NEXT_PHASE = 1
    SWITCH_PROGRAM = 2

def extract_state_tensor(simulation_id: str, number_of_steps: int, tls_id: str, action: int) -> tuple[Any, Any, Any]:
    if action == Action.STEP:
        response = api_endpoints.step_simulation(simulation_id, number_of_steps, tls_id) # Make step
    elif action == Action.NEXT_PHASE:
        response = api_endpoints.set_traffic_light_phase(tls_id,simulation_id, make_step=number_of_steps) # Set next phase
    else: # Switch Program
        selected_program_index  = action - 2
        if selected_program_index >= len(selected_program_ids):
            raise RuntimeError("Illegal action")
        selected_program = selected_program_ids[int(selected_program_index)]
        response = api_endpoints.switch_traffic_light_program(tls_id= tls_id,session_id=simulation_id,program_id=selected_program, make_step=number_of_steps)
        if not response:
            response = api_endpoints.step_simulation(simulation_id, number_of_steps, tls_id)

    # response = api_endpoints.step_simulation(simulation_id, number_of_steps, tls_id)
    # print(response)
    is_ended = response['is_ended']
    metrics = response['vehicles_in_tls'][tls_id]['longest_waiting_time_car_in_lane']
    cars_that_left = response['cars_that_left']
    extracted_data = []
    for lane in metrics:
        values = list(metrics[lane].values())
        if values:
            extracted_data.extend([float(x) for x in metrics[lane].values()])
        else:
            extracted_data.extend([0. for _ in range(7)])
    state = torch.tensor(extracted_data, dtype=torch.float32, device=device)
    
    # print(metrics)
    reward = reward_func(metrics, cars_that_left)
    return state, reward, is_ended

In [11]:
def reset_simulation(simulation_id: str) -> torch.Tensor:
    response = api_endpoints.reset_simulation(simulation_id)
    if response['status'] != 'success':
        raise Exception('Error resetting simulation')
    for name in not_selected_tls_names:
        api_endpoints.switch_traffic_light_program(name,simulation_id,"0",forced=True)
    state, reward, _ = extract_state_tensor(simulation_id, 1, selected_tls_name,0)
    return state

In [12]:
policy_net = SimpleNetwork(n_observations, n_actions, params.HIDDEN_SIZE).to(device)
target_net = SimpleNetwork(n_observations, n_actions, params.HIDDEN_SIZE).to(device)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.Adam(policy_net.parameters(), lr=params.LEARNING_RATE)
memory = ReplayMemory(params.MEM_SIZE)

steps_done = 0

In [13]:
def select_action(state: torch.Tensor) -> int:
    global steps_done
    sample = random.random()
    eps_threshold = params.EPS_END + (params.EPS_START - params.EPS_END) * np.exp(-1. * steps_done / params.EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.inference_mode():
            # Get the action as an integer
            return policy_net(state.float()).max(1).indices.item()
    else:
        # Randomly select an action and return as integer
        return random.randrange(n_actions)

In [14]:
episode_durations = []

In [15]:
def optimize_model():
    if len(memory) < params.BATCH_SIZE:
        return 0
    transitions = memory.sample(params.BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action).to(torch.int64)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1).values
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(params.BATCH_SIZE, device=device)
    with torch.inference_mode():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * params.GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 1000)
    optimizer.step()
    return loss

In [16]:
import matplotlib
import matplotlib.pyplot as plt

is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

def plot_durations(show_result=False):
    plt.figure(1)
    durations_t = torch.tensor(episode_durations, dtype=torch.float32)
    if show_result:
        plt.title('Result')
    else:
        plt.clf()
        plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        if not show_result:
            display.display(plt.gcf())
            display.clear_output(wait=True)
        else:
            display.display(plt.gcf())

In [17]:
from itertools import count
from tqdm import tqdm
steps_count = 5
for i_episode in tqdm(range(params.EPISODES)):
    state = reset_simulation(simulation_id)
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    accumulated_loss = 0
    steps_done = 0
    for t in count():
        # Select and perform an action
        # print(f"Selecting action with state: {state}")
        action = select_action(state)
        # print(f'Getting next state and reward with action:{action}')
        observation, reward, is_ended = extract_state_tensor(simulation_id, steps_count, selected_tls_name, action)
        reward = torch.tensor([reward], device=device)
        if is_ended:
            next_state = None
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
        # Store the transition in memory
        memory.push(state, action, next_state, reward)

          # Move to the next state
        state = next_state
        
        
        # Perform one step of the optimization (on the target network)
        loss = optimize_model()
        accumulated_loss += loss
        
        # Soft update of the target network's weights
        # θ′ ← τ θ + (1 −τ )θ′
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()

        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)
        if is_ended:
            episode_durations.append(t + 1)
            plot_durations()
            break
        # time.sleep(0.01)
    print(f'Episode: {i_episode}/{params.EPISODES} | Loss: {accumulated_loss/t}')

100%|██████████| 10/10 [22:20<00:00, 134.06s/it]

Episode: 9/10 | Loss: 23.261978149414062





<Figure size 640x480 with 0 Axes>

In [18]:
stop_simulation(simulation_id)

{'status': 'success'}

In [19]:
simulation_id

'ce0b92b0-b682-45a4-b8d1-8335c86ffcc5'