In [2]:
import numpy as np
import math
from tqdm import tqdm

In [3]:
from flow.controllers import RLController, IDMController, StaticLaneChanger, ContinuousRouter
from flow.core.experiment import Experiment
from flow.core.params import SumoParams, EnvParams, NetParams, \
    SumoCarFollowingParams
from flow.core.params import VehicleParams, InitialConfig
from flow.envs.loop.loop_accel import AccelEnv, ADDITIONAL_ENV_PARAMS
from flow.scenarios.figure_eight import Figure8Scenario, ADDITIONAL_NET_PARAMS

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.distributions import Categorical
from torch.autograd import Variable

In [5]:
torch.cuda.is_available()

True

# Initialize the Environment

In [6]:
# SumoParams
sim_params = SumoParams(sim_step=0.1, render=False)


# Vehicles Setting
vehicles = VehicleParams()

vehicles.add(
    veh_id="rl",
    acceleration_controller=(RLController, {}),
    routing_controller=(ContinuousRouter, {}),
    car_following_params=SumoCarFollowingParams(
        speed_mode="obey_safe_speed",
    ),
    num_vehicles=1)

vehicles.add(
    veh_id="idm",
    acceleration_controller=(IDMController, {}),
    lane_change_controller=(StaticLaneChanger, {}),
    routing_controller=(ContinuousRouter, {}),
    car_following_params=SumoCarFollowingParams(
        speed_mode="obey_safe_speed",
    ),
    initial_speed=0,
    num_vehicles=14)


# Additional Env params
HORIZON = 1500

additional_env_params = {
    "target_velocity": 20,
    "max_accel": 3,
    "max_decel": 3,
    "sort_vehicles": False
}
env_params = EnvParams(
    horizon=HORIZON, additional_params=additional_env_params)


# Additional Net params
additional_net_params = {
    "radius_ring": 30,
    "lanes": 1,
    "speed_limit": 30,
    "resolution": 40
}
net_params = NetParams(
    no_internal_links=False, additional_params=additional_net_params)


## Initial config
initial_config = InitialConfig(spacing="uniform")


## Scenario
exp_tag = "figure-eight-control"

scenario = Figure8Scenario(
    exp_tag,
    vehicles,
    net_params,
    initial_config=initial_config)

# Environment

In [7]:
class AccelEnv_torch(AccelEnv):
    def reset(self):
        obs = super().reset()
        return torch.tensor(obs).float().cuda()
        
    def step(self, action):
        new_obs, reward, done, info = super().step(action)
        new_obs = torch.tensor(new_obs).float().cuda()
        return new_obs, reward, done, info

In [8]:
env = AccelEnv_torch(env_params, sim_params, scenario)

env.reset()

tensor([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 6.9524e-04, 9.0580e-02, 1.5725e-01,
        2.2391e-01, 2.9058e-01, 3.5725e-01, 4.2391e-01, 4.9058e-01, 5.5725e-01,
        6.2391e-01, 6.9058e-01, 7.5725e-01, 8.2391e-01, 8.9058e-01, 9.5725e-01],
       device='cuda:0')

# States, Actions, Rewards, Terminates

* States: The state consists of the velocities and absolute position of all vehicles in the network. This assumes a constant number of vehicles.

(speed x num_vehicles), (position x num_vehicles)

* Actions: Actions are a list of acceleration for each rl vehicles, bounded by the maximum accelerations and decelerations specified in EnvParams.

accel x num_rl_vehicles

* Rewards: The reward function is the two-norm of the distance of the speed of the vehicles in the network from the "target_velocity" term. For a description of the reward, see: flow.core.rewards.desired_speed

* Termination: A rollout is terminated if the time horizon is reached or if two vehicles collide into one another.

# REINFORCE

In [9]:
pi = Variable(torch.FloatTensor([math.pi])).cuda()

def normal(x, mu, sigma_sq):
    a = (-1*(Variable(x)-mu).pow(2)/(2*sigma_sq)).exp()
    b = 1/(2*sigma_sq*pi.expand_as(sigma_sq)).sqrt()
    return a*b

In [44]:
class Policy(nn.Module):
    def __init__(self, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space.shape[0]
        
        self.linear1 = nn.Linear(num_inputs, 128)
        self.dropout = nn.dropout(p=0.6)
        self.linear2_mu = nn.Linear(128, num_outputs)
        self.linear2_sigma = nn.Linear(128, num_outputs)
    
    def forward(self, inputs):
        x = inputs
        x = F.relu(self.linear1(x))
        mu = self.linear2_mu(x)
        sigma_sq = self.linear2_sigma(x)
        sigma_sq = F.softplus(sigma_sq)  # make it positive
        
        return mu, sigma_sq

class REINFORCE:
    def __init__(self, hidden_size, num_inputs, action_space):
        self.action_space = action_space
        self.model = Policy(hidden_size, num_inputs, action_space)
        self.model = self.model.cuda()
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
    
    def select_action(self, state):
        mu, sigma_sq = self.model(Variable(state).cuda())
        eps = torch.randn(mu.size())
        action = (mu + sigma_sq.sqrt()*Variable(eps).cuda()).data
        prob = normal(action, mu, sigma_sq)
        entropy = -0.5*((2*pi.expand_as(sigma_sq)+sigma_sq).log()+1)
        
        log_prob = prob.log()
        return action, log_prob, entropy

In [45]:
def generate_session(_agent, t_max=1000):
    s_0 = env.reset()
    states, actions, rewards, log_probs, entropies = [], [], [], [], []
    done = False

    for t in range(t_max):
        # Get actions and convert to numpy array
        action, log_prob, entropy = _agent.select_action(s_0)
        s_1, r, done, _ = env.step(action.cpu())

        states.append(s_0)
        rewards.append(r)
        actions.append(action)
        log_probs.append(log_prob)
        entropies.append(entropy)
        s_0 = s_1
           
        if done:
            break
    
    states = torch.stack(states)
    actions = torch.cat(actions)
    log_probs = torch.cat(log_probs)
    entropies = torch.cat(entropies)
    
    return states, actions, rewards, log_probs, entropies

In [46]:
def discouted_rewards(rewards, gamma=0.99):
    r = np.array([gamma**i * rewards[i] 
                  for i in range(len(rewards))])
    # Reverse the array direction for cumsum and then
    # revert back to the original order
    r = r.cumsum()[::-1].copy()
    return torch.tensor(r).float().cuda()

In [47]:
def train_on_session(_agent, states, actions, rewards, 
                     log_probs, entropies, gamma=0.99):
    agent.optimizer.zero_grad()
    returns = discouted_rewards(rewards)
    loss = (-returns * log_probs - 0.0001 * entropies).mean()
    loss.backward() #retain_graph=True)
    agent.optimizer.step()
    
    return np.sum(rewards)

In [48]:
hidden_size = 16
num_inputs = env.observation_space.shape[0]
action_space = env.action_space

agent = REINFORCE(16, num_inputs, action_space)

In [49]:
%%time

for i in tqdm(range(30)):
    rewards = [train_on_session(agent, *generate_session(agent, t_max=1000))
        for _ in range(100)]  # generate new sessions
    
    print("mean reward:%.3f" % (np.mean(rewards)))


  0%|          | 0/30 [00:00<?, ?it/s][A
  3%|▎         | 1/30 [01:16<36:57, 76.46s/it][A

mean reward:15.076



  7%|▋         | 2/30 [02:17<33:31, 71.82s/it][A

mean reward:15.048



 10%|█         | 3/30 [03:05<29:08, 64.77s/it][A

mean reward:17.508



 13%|█▎        | 4/30 [03:44<24:44, 57.10s/it][A

mean reward:12.386



 17%|█▋        | 5/30 [04:37<23:16, 55.87s/it][A

mean reward:14.958



 20%|██        | 6/30 [05:55<24:59, 62.48s/it][A

mean reward:15.838



 23%|██▎       | 7/30 [06:53<23:26, 61.14s/it][A

mean reward:13.971



 27%|██▋       | 8/30 [07:39<20:45, 56.59s/it][A

mean reward:15.656



 30%|███       | 9/30 [08:19<18:04, 51.64s/it][A

mean reward:15.070



 33%|███▎      | 10/30 [09:08<16:52, 50.64s/it][A

mean reward:17.160



 37%|███▋      | 11/30 [09:46<14:52, 46.99s/it][A

mean reward:14.421



 40%|████      | 12/30 [10:39<14:37, 48.75s/it][A

mean reward:18.139



 43%|████▎     | 13/30 [11:26<13:41, 48.33s/it][A

mean reward:14.771



 47%|████▋     | 14/30 [12:18<13:07, 49.22s/it][A

mean reward:16.564



 50%|█████     | 15/30 [13:09<12:25, 49.70s/it][A

mean reward:16.417



 53%|█████▎    | 16/30 [13:48<10:52, 46.64s/it][A

mean reward:14.065



 57%|█████▋    | 17/30 [14:50<11:07, 51.35s/it][A

mean reward:20.219



 60%|██████    | 18/30 [15:38<10:03, 50.30s/it][A

mean reward:17.018



 63%|██████▎   | 19/30 [16:39<09:47, 53.43s/it][A

mean reward:19.509



 67%|██████▋   | 20/30 [17:26<08:34, 51.47s/it][A

mean reward:15.580



 70%|███████   | 21/30 [18:11<07:25, 49.49s/it][A

mean reward:15.525



 73%|███████▎  | 22/30 [18:54<06:20, 47.54s/it][A

mean reward:15.699



 77%|███████▋  | 23/30 [19:31<05:11, 44.48s/it][A

mean reward:14.337



 80%|████████  | 24/30 [20:14<04:24, 44.00s/it][A

mean reward:16.332



 83%|████████▎ | 25/30 [20:57<03:38, 43.71s/it][A

mean reward:16.654



 87%|████████▋ | 26/30 [22:03<03:21, 50.40s/it][A

mean reward:20.501



 90%|█████████ | 27/30 [22:55<02:32, 50.77s/it][A

mean reward:14.330



 93%|█████████▎| 28/30 [23:46<01:41, 50.88s/it][A

mean reward:16.277



 97%|█████████▋| 29/30 [24:36<00:50, 50.78s/it][A

mean reward:17.523



100%|██████████| 30/30 [25:27<00:00, 50.68s/it][A

mean reward:16.109
CPU times: user 20min 10s, sys: 1min 18s, total: 21min 28s
Wall time: 25min 27s


# Test

In [100]:
sim_params = SumoParams(sim_step=0.1, render=True)
env = AccelEnv_torch(env_params, sim_params, scenario)

env.reset()

tensor([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 6.9524e-04, 9.0580e-02, 1.5725e-01,
        2.2391e-01, 2.9058e-01, 3.5725e-01, 4.2391e-01, 4.9058e-01, 5.5725e-01,
        6.2391e-01, 6.9058e-01, 7.5725e-01, 8.2391e-01, 8.9058e-01, 9.5725e-01],
       device='cuda:0')

In [226]:
env.step(1)

(tensor([0.3592, 0.3547, 0.3545, 0.3547, 0.3527, 0.3398, 0.2742, 0.3227, 0.3619,
         0.3546, 0.3551, 0.3550, 0.3529, 0.3408, 0.3299, 0.1683, 0.2703, 0.3369,
         0.4032, 0.4698, 0.5361, 0.5981, 0.6497, 0.7319, 0.8035, 0.8699, 0.9366,
         1.0033, 0.0361, 0.0913], device='cuda:0'),
 0.515206968561324,
 False,
 {})

In [24]:
def test(_agent, t_max=1000):
    s_0 = env.reset()
    states, actions, rewards, log_probs, entropies = [], [], [], [], []
    done = False

    for t in range(t_max):
        # Get actions and convert to numpy array
        action, log_prob, entropy = _agent.select_action(s_0)
        s_1, r, done, _ = env.step(action.cpu())

        s_0 = s_1
           
        if done:
            break

In [32]:
test(agent)