In [31]:
import numpy as np
import math

In [2]:
from flow.controllers import RLController, IDMController, StaticLaneChanger, ContinuousRouter
from flow.core.experiment import Experiment
from flow.core.params import SumoParams, EnvParams, NetParams, \
    SumoCarFollowingParams
from flow.core.params import VehicleParams, InitialConfig
from flow.envs.loop.loop_accel import AccelEnv, ADDITIONAL_ENV_PARAMS
from flow.scenarios.figure_eight import Figure8Scenario, ADDITIONAL_NET_PARAMS

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.autograd import Variable

In [4]:
torch.cuda.is_available()

True

# Initialize the Environment

In [21]:
# SumoParams
sim_params = SumoParams(render=True)


# Vehicles Setting
vehicles = VehicleParams()

vehicles.add(
    veh_id="rl",
    acceleration_controller=(RLController, {}),
    routing_controller=(ContinuousRouter, {}),
    car_following_params=SumoCarFollowingParams(
        speed_mode="obey_safe_speed",
    ),
    num_vehicles=1)

vehicles.add(
    veh_id="idm",
    acceleration_controller=(IDMController, {}),
    lane_change_controller=(StaticLaneChanger, {}),
    routing_controller=(ContinuousRouter, {}),
    car_following_params=SumoCarFollowingParams(
        speed_mode="obey_safe_speed",
    ),
    initial_speed=0,
    num_vehicles=14)


# Additional Env params
HORIZON = 1500

additional_env_params = {
    "target_velocity": 20,
    "max_accel": 3,
    "max_decel": 3,
    "sort_vehicles": False
}
env_params = EnvParams(
    horizon=HORIZON, additional_params=additional_env_params)


# Additional Net params
additional_net_params = {
    "radius_ring": 30,
    "lanes": 1,
    "speed_limit": 30,
    "resolution": 40
}
net_params = NetParams(
    no_internal_links=False, additional_params=additional_net_params)


## Initial config
initial_config = InitialConfig(spacing="uniform")


## Scenario
exp_tag = "figure-eight-control"

scenario = Figure8Scenario(
    exp_tag,
    vehicles,
    net_params,
    initial_config=initial_config)

# Environment

In [11]:
env = AccelEnv(env_params, sim_params, scenario)

# States, Actions, Rewards, Terminates

* States: The state consists of the velocities and absolute position of all vehicles in the network. This assumes a constant number of vehicles.

(speed x num_vehicles), (position x num_vehicles)

* Actions: Actions are a list of acceleration for each rl vehicles, bounded by the maximum accelerations and decelerations specified in EnvParams.

accel x num_rl_vehicles

* Rewards: The reward function is the two-norm of the distance of the speed of the vehicles in the network from the "target_velocity" term. For a description of the reward, see: flow.core.rewards.desired_speed

* Termination: A rollout is terminated if the time horizon is reached or if two vehicles collide into one another.

In [59]:
print(env.action_space)
print(env.observation_space.shape[0])
print(env.sorted_ids)

Box(1,)
30
['rl_0', 'idm_0', 'idm_1', 'idm_2', 'idm_3', 'idm_4', 'idm_5', 'idm_6', 'idm_7', 'idm_8', 'idm_9', 'idm_10', 'idm_11', 'idm_12', 'idm_13']


In [16]:
state = env.reset()
print(state)

[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 6.95237622e-04
 9.05795302e-02 1.57246197e-01 2.23912864e-01 2.90579530e-01
 3.57246197e-01 4.23912864e-01 4.90579530e-01 5.57246197e-01
 6.23912864e-01 6.90579530e-01 7.57246197e-01 8.23912864e-01
 8.90579530e-01 9.57246197e-01]


# REINFORCE

In [55]:
pi = Variable(torch.FloatTensor([math.pi])).cuda()

def normal(x, mu, sigma_sq):
    a = (-1*(Variable(x)-mu).pow(2)/(2*sigma_sq)).exp()
    b = 1/(2*sigma_sq*pi.expand_as(sigma_sq)).sqrt()
    return a*b

In [75]:
class Policy(nn.Module):
    def __init__(self, hidden_size, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space.shape[0]
        
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2_mu = nn.Linear(hidden_size, num_outputs)
        self.linear2_sigma = nn.Linear(hidden_size, num_outputs)
    
    def forward(self, inputs):
        x = inputs
        x = F.relu(self.linear1(x))
        mu = self.linear2_mu(x)
        sigma_sq = self.linear2_sigma(x)
        sigma_sq = F.softplus(sigma_sq)  # make it positive
        
        return mu, sigma_sq

In [77]:
class REINFORCE:
    def __init__(self, hidden_size, num_inputs, action_space):
        self.action_space = action_space
        self.model = Policy(hidden_size, num_inputs, action_space)
        self.model = self.model.cuda()
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
    
    def select_action(self, state):
        mu, sigma_sq = self.model(Variable(state).cuda())
        eps = torch.randn(mu.size())
        action = (mu + sigma_sq.sqrt()*Variable(eps).cuda()).data
        prob = normal(action, mu, sigma_sq)
        entropy = -0.5*((2*pi.expand_as(sigma_sq)*sigma_eq).log()+1)
        
        log_prob = prob.log()
        return action, log_prob, entropy

In [73]:
hidden_size = 16
num_inputs = env.observation_space.shape[0]
action_space = env.action_space

agent = REINFORCE(16, num_inputs, action_space)

In [74]:
agent.model(torch.tensor(state).float().cuda())

(tensor([-0.4686], device='cuda:0', grad_fn=<AddBackward0>),
 tensor([-0.2907], device='cuda:0', grad_fn=<AddBackward0>))

In [78]:
def generate_session(_agent, t_max=1000):
    s_0 = env.reset()
    states, actions, rewards = [], [], []
    done = False

    for t in range(t_max):
        # Get actions and convert to numpy array
        action = _agent.select_action(s_0)
        s_1, r, done, _ = env.step(action)

        states.append(s_0)
        rewards.append(r)
        actions.append(action)
        s_0 = s_1
           
        if done:
            break
    
    return states, actions, rewards

In [79]:
generate_session(agent)

TypeError: Variable data has to be a tensor, but got numpy.ndarray

In [58]:
def discouted_rewards(rewards, gamma=0.99):
    r = np.array([gamma**i * rewards[i] 
                  for i in range(len(rewards))])
    # Reverse the array direction for cumsum and then
    # revert back to the original order
    r = r.cumsum()[::-1].copy()
    return r

In [25]:
def train_on_session(_agent, _optimizer, states, actions, rewards, gamma=0.99):
    _optimizer.zero_grad()
    state_tensor = torch.FloatTensor(states)
    reward_tensor = torch.FloatTensor(discount_rewards(rewards))
    # Actions are used as indices, must be LongTensor
    action_tensor = torch.LongTensor(actions)

    # Calculate loss
    prob = _agent.predict(state_tensor)
    logprob = torch.log(
        _agent.predict(state_tensor))
    selected_logprobs = logprob[np.arange(len(action_tensor)), action_tensor]
    selected_probs = prob[np.arange(len(action_tensor)), action_tensor]
    entropy = - torch.sum(selected_probs * selected_logprobs)
    loss = -(reward_tensor * selected_logprobs).mean() - 0.001*entropy

    # Calculate gradients
    loss.backward()
    # Apply gradients
    _optimizer.step()
    
    return np.sum(rewards)