In [1]:
import numpy as np
import time
import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
from torch import optim

import scipy.stats as ss
import tensorboardX
from tensorboardX import SummaryWriter
import gym

import ray
from ray.experimental.queue import Queue

# sys.path.append(os.path.join(os.getcwd(), '..'))
# sys.path = list(set(sys.path))

# Name of the Flow benchmark to load; resolved below as the module flow.benchmarks.<benchmark_name>
benchmark_name = 'figureeight0'
print('benchmark: {}'.format(benchmark_name))

from flow.utils.registry import make_create_env
# Import the benchmark module by name. The non-empty fromlist makes __import__
# return the submodule itself instead of the top-level `flow` package.
benchmark = __import__(
    "flow.benchmarks.%s" % benchmark_name, fromlist=["flow_params"])
# Parameter dictionary exposed by the benchmark module
flow_params = benchmark.flow_params
# Horizon taken from the benchmark's env parameters
# (presumably the episode length in steps - TODO confirm against Flow's EnvParams)
HORIZON = flow_params['env'].horizon

def make_env(create_env):
    '''
    Wrap an environment factory in a zero-argument callable.
    The environment is only built when the returned callable is invoked.
    '''
    def _thunk():
        return create_env()

    return _thunk

benchmark: figureeight0


In [2]:
# Start a local Ray runtime capped at 3 CPUs with the web UI disabled.
# ignore_reinit_error allows this cell to be re-run while Ray is already initialised.
ray.init(num_cpus=3, include_webui=False, ignore_reinit_error=True)

Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-04-10_06-53-52_1594/logs.
Waiting for redis server at 127.0.0.1:49070 to respond...
Waiting for redis server at 127.0.0.1:57238 to respond...
Starting the Plasma object store with 13.355121049 GB memory using /dev/shm.


{'node_ip_address': '169.237.32.118',
 'object_store_addresses': ['/tmp/ray/session_2019-04-10_06-53-52_1594/sockets/plasma_store'],
 'raylet_socket_names': ['/tmp/ray/session_2019-04-10_06-53-52_1594/sockets/raylet'],
 'redis_address': '169.237.32.118:49070',
 'webui_url': ''}

In [7]:
def init_weights(m):
    '''
    Initialise a linear layer: weights drawn from N(0, 0.1), biases set to 0.1.
    Any module that is not nn.Linear is left untouched, so the function can be
    handed directly to Module.apply.
    '''
    if not isinstance(m, nn.Linear):
        return

    nn.init.normal_(m.weight, mean=0., std=0.1)
    # A layer built with bias=False exposes `bias` as None; there is nothing to fill then
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.1)

class NeuralNetwork(nn.Module):
    '''
    Neural network for continuous action space

    Fully connected stack: one Linear + ReLU pair per entry of fcnet_hiddens,
    followed by a final Linear layer that emits num_outputs values.
    '''
    def __init__(self, num_inputs, num_outputs, fcnet_hiddens, std=0.0):
        '''
        num_inputs: number of input features
        num_outputs: number of output values
        fcnet_hiddens: sequence of hidden layer widths (may be empty)
        std: not used by this class; kept so existing call sites stay valid
        '''
        super(NeuralNetwork, self).__init__()
        last_layer_size = num_inputs
        layers = []

        for size in fcnet_hiddens:
            layers.append(nn.Linear(last_layer_size, size))
            layers.append(nn.ReLU())
            last_layer_size = size

        # Use the running width rather than fcnet_hiddens[-1], so an empty
        # hidden list yields a single linear layer instead of an IndexError
        layers.append(nn.Linear(last_layer_size, num_outputs))

        self.actor = nn.Sequential(*layers)
        self.apply(init_weights)

    def forward(self, x):
        '''
        Return the raw network output for x (x is cast to float first)
        '''
        action_logits = self.actor(x.float())
        return action_logits

    
def sample_noise(model):
    '''
    Sample noise for each parameter of the neural net

    Returns a 1-D object array holding one standard-normal ndarray per
    parameter tensor, in model.parameters() order. Negating the result
    negates every contained array.

    The draws come from the global NumPy generator, one np.random.normal call
    per parameter in order, so np.random.seed(s) followed by this call
    reproduces the same noise in any process.
    '''
    shapes = [tuple(p.shape) for p in model.parameters()]

    # Fill an explicit object array: np.array() on arrays of different shapes
    # raises on current NumPy (and on older NumPy when leading dimensions match)
    nn_noise = np.empty(len(shapes), dtype=object)
    for idx, shape in enumerate(shapes):
        nn_noise[idx] = np.random.normal(size=shape)
    return nn_noise


@ray.remote
class Evaluator:
    '''
    Ray actor that owns one environment and one policy network and scores
    noise-perturbed versions of that policy on full episodes.
    '''
    def __init__(self, fcnet_hiddens, create_env):
        '''
        fcnet_hiddens: hidden layer widths forwarded to NeuralNetwork
        create_env: zero-argument callable that builds the environment
        '''
        self.env = create_env()
        # Read the spaces from this actor's own environment rather than from
        # a notebook-level `env` variable that may not exist in the worker
        num_inputs  = self.env.observation_space.shape[0]
        num_outputs = self.env.action_space.shape[0]
        self.max_accel = self.env.action_space.high[0]
        self.min_accel = self.env.action_space.low[0]
        self.model = NeuralNetwork(num_inputs, num_outputs, fcnet_hiddens)

    def evaluate_neuralnet(self):
        '''
        Evaluate an agent running it in the environment and computing the total reward
        '''
        obs = self.env.reset()
        game_reward = 0
        done = False

        # Pure inference: no autograd graph is needed for the rollout
        with torch.no_grad():
            while not done:
                # Output of the neural net
                net_output = self.model(torch.tensor(obs))
                # the action is the network output clipped to the action bounds
                action = torch.clamp(net_output, self.min_accel, self.max_accel)
                obs, reward, done, _ = self.env.step(action.detach().cpu().numpy())

                game_reward += reward

        return game_reward

    def evaluate_noisy_net(self, noise):
        '''
        Evaluate a noisy agent by adding the noise to the plain agent

        noise: one array per model parameter (as produced by sample_noise);
        each is scaled by the notebook-level STD_NOISE before being added.
        The unperturbed weights are restored before returning.
        '''
        # state_dict() hands back tensors that share storage with the live
        # parameters, so clone them; otherwise the in-place perturbation below
        # would also change the "backup" and the restore would be a no-op
        old_dict = {name: tensor.clone() for name, tensor in self.model.state_dict().items()}

        try:
            # add the noise to each parameter of the NN
            with torch.no_grad():
                for n, p in zip(noise, self.model.parameters()):
                    p.add_(torch.as_tensor(n * STD_NOISE, dtype=p.dtype))

            # evaluate the agent with the noise
            reward = self.evaluate_neuralnet()
        finally:
            # load the previous parameters (the ones without the noise), even if the rollout failed
            self.model.load_state_dict(old_dict)

        return reward

    def evaluate_seed_reward(self, act_params):
        '''
        Load act_params, draw a fresh seed, and evaluate the policy under the
        noise generated from that seed and under its mirror image.

        Returns [[pos_rew, neg_rew], seed]; the caller can rebuild the noise
        by seeding NumPy with `seed` and calling sample_noise.
        '''
        # load the actor params
        self.model.load_state_dict(act_params)

        # get a random seed
        seed = np.random.randint(int(1e6))
        # set the new seed
        np.random.seed(seed)

        noise = sample_noise(self.model)

        pos_rew = self.evaluate_noisy_net(noise)
        # Mirrored sampling
        neg_rew = self.evaluate_noisy_net(-noise)

        return [[pos_rew, neg_rew], seed]
    
    
def make_batch_results(model, envs, batch_size):
    '''
    Collect batch_size mirrored evaluations of `model` from the remote evaluators.

    model: network whose current weights are sent to the evaluators
    envs: list of Evaluator actor handles
    batch_size: number of evaluation results to collect

    Returns (batch_noise, batch_reward). Every collected result contributes
    two entries to each list: the noise and its negation, paired with the
    rewards obtained under each.

    Side effect: the global NumPy generator is re-seeded with each evaluator's
    seed in order to rebuild the noise locally.
    '''
    if batch_size > 0 and not envs:
        raise ValueError('make_batch_results needs at least one evaluator')

    batch_noise = []
    batch_reward = []
    # The model is not modified while the batch is being collected, so read the weights once
    params = model.state_dict()

    # In-flight task id -> evaluator running it
    pending = {}
    submitted = 0
    for ev in envs:
        if submitted >= batch_size:
            break
        pending[ev.evaluate_seed_reward.remote(params)] = ev
        submitted += 1

    for _ in range(batch_size):
        ready_ids, _not_ready = ray.wait(list(pending))
        ready_id = ready_ids[0]
        ev = pending.pop(ready_id)

        p_rews, p_seed = ray.get(ready_id)
        # Rebuild the evaluator's noise from its seed instead of shipping the arrays back
        np.random.seed(p_seed)
        noise = sample_noise(model)
        batch_noise.append(noise)
        batch_noise.append(-noise)

        batch_reward.append(p_rews[0])
        batch_reward.append(p_rews[1])

        # Keep the freed evaluator busy only while more results are still needed;
        # submitting past batch_size would leave unread episodes queued on the
        # actors and delay the next call
        if submitted < batch_size:
            pending[ev.evaluate_seed_reward.remote(params)] = ev
            submitted += 1

    return batch_noise, batch_reward

In [8]:
def normalized_rank(rewards):
    '''
    Rank the rewards and normalize them.

    Ranks are rescaled linearly so the smallest reward maps to -0.5 and the
    largest to 0.5 (ties share their average rank). With fewer than two
    rewards there is nothing to order, so the result is all zeros.
    '''
    ranked = ss.rankdata(rewards)
    if len(ranked) < 2:
        # (len - 1) would be 0 here and the division would produce NaN
        return np.zeros_like(ranked, dtype=float)

    norm = (ranked - 1) / (len(ranked) - 1)
    norm -= 0.5
    return norm


# Debug

In [131]:
# Hyperparameters
STD_NOISE = 0.05        # scale applied to the sampled parameter noise
BATCH_SIZE = 5          # evaluation results collected per iteration (each yields two mirrored samples)
LEARNING_RATE = 0.01    # Adam step size
MAX_ITERATIONS = 3      # number of training iterations
MAX_WORKERS = 3         # number of remote Evaluator actors
fcnet_hiddens = [100, 50, 25]   # hidden layer widths of the policy network

now = datetime.datetime.now()
date_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
# Writer name: one placeholder per value, so MAX_WORKERS is no longer silently dropped from the run name
writer_name = 'ASY_ES_{}_{}_{}_{}_{}_{}_{}'.format(
    benchmark_name, date_time, STD_NOISE, BATCH_SIZE, LEARNING_RATE, MAX_ITERATIONS, MAX_WORKERS)
print('Name:', writer_name)
writer = SummaryWriter(log_dir='../content/runs/'+writer_name)

Name: ASY_ES_figureeight0_10_16.0.47_0.05_5_0.01_3


In [132]:
# Create Environment
# make_create_env yields an environment factory together with the environment name
create_env, env_name = make_create_env(params=flow_params, version=0)
# One remote Evaluator actor per worker; each receives the factory and builds its own environment
envs = [Evaluator.remote(fcnet_hiddens, create_env) for i in range(MAX_WORKERS)]
# Local environment instance; its spaces give the policy network's input and output sizes
env = create_env()
num_inputs  = env.observation_space.shape[0]
num_outputs = env.action_space.shape[0]

 Starting SUMO on port 37791


In [130]:
%%time
# NOTE(review): `actor` is first assigned in the "Main" cell further down, so on a
# fresh kernel this debug cell raises NameError unless that cell has been run first.
batch_noise, batch_reward = make_batch_results(actor, envs, BATCH_SIZE)

CPU times: user 96 ms, sys: 112 ms, total: 208 ms
Wall time: 54 s


In [124]:
# Sanity check: make_batch_results appends two rewards (noise and mirrored noise) per collected result
len(batch_reward)

20

# Main

In [134]:
# Initialize the agent
actor = NeuralNetwork(num_inputs, num_outputs, fcnet_hiddens)
# Initialize the optimizer
optimizer = optim.Adam(actor.parameters(), lr=LEARNING_RATE)


# Execute the main loop MAX_ITERATIONS times
for n_iter in range(MAX_ITERATIONS):
    it_time = time.time()

    batch_noise, batch_reward = make_batch_results(actor, envs, BATCH_SIZE)

    # Print some stats
    mean_reward = np.mean(batch_reward)
    print(n_iter, 'Mean:',np.round(mean_reward, 2), 'Max:', np.round(np.max(batch_reward), 2), 'Time:', np.round(time.time()-it_time, 2))
    writer.add_scalar('reward', mean_reward, n_iter)

    # Rank the reward and normalize it
    batch_reward = normalized_rank(batch_reward)

    # Mirrored sampling returns two perturbations per collected result, so the
    # estimate is averaged over the actual number of samples, not BATCH_SIZE
    n_samples = len(batch_noise)

    th_update = []
    optimizer.zero_grad()
    # for each actor's parameter, accumulate reward * noise over every sample in the batch
    for idx, p in enumerate(actor.parameters()):
        upd_weights = np.zeros(p.data.shape)

        for n,r in zip(batch_noise, batch_reward):
            upd_weights += r*n[idx]

        upd_weights = upd_weights / (n_samples*STD_NOISE)
        # The optimizer minimises, so the negated ascent direction is stored as the
        # gradient; the cast to the parameter dtype keeps the grad compatible with p
        p.grad = torch.as_tensor(-upd_weights, dtype=p.dtype)
        th_update.append(np.mean(upd_weights))

    # Optimize the actor's NN
    optimizer.step()

    writer.add_scalar('loss', np.mean(th_update), n_iter)

# tensorboard --logdir content/runs --host localhost

0 Mean: 437.41 Max: 490.44 Time: 27.64
1 Mean: 382.67 Max: 522.5 Time: 36.06
2 Mean: 441.18 Max: 506.8 Time: 32.96
