In [1]:
import numpy as np
import time
import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
from torch import optim

import scipy.stats as ss
import tensorboardX
from tensorboardX import SummaryWriter
import gym

import ray
from ray.experimental.queue import Queue
import sys, os

# Make the parent directory importable (the `common` package is imported from there).
# The search path is de-duplicated while PRESERVING its order: converting it to a
# set (as was done before) shuffles the entries, which can change which module
# wins when the same name exists in more than one location.
_parent_dir = os.path.join(os.getcwd(), '..')
_ordered_path = []
for _entry in sys.path + [_parent_dir]:
    if _entry not in _ordered_path:
        _ordered_path.append(_entry)
sys.path = _ordered_path
from common.evaluate import make_vis_env, test_env

# Name of the Flow benchmark module to train on; it is also embedded in the
# TensorBoard run name built in the training cell.
benchmark_name = 'figureeight0'
print('benchmark: {}'.format(benchmark_name))

from flow.utils.registry import make_create_env
# Dynamically import flow.benchmarks.<benchmark_name>; passing `fromlist` makes
# __import__ return the leaf module instead of the top-level `flow` package.
benchmark = __import__(
    "flow.benchmarks.%s" % benchmark_name, fromlist=["flow_params"])
# Parameter dictionary exposed by the benchmark module; passed to make_create_env later.
flow_params = benchmark.flow_params
# Horizon read from the benchmark's 'env' entry (presumably the episode length in
# steps -- TODO confirm). It is not referenced by the other visible cells.
HORIZON = flow_params['env'].horizon

def make_env(create_env):
    '''
    Wrap an environment factory in a zero-argument thunk.

    Nothing is built when make_env is called; the environment is only
    created when the returned callable is invoked.

    :param create_env: zero-argument callable that builds an environment
    :return: callable that calls `create_env()` and returns its result
    '''
    def _thunk():
        # Deferred construction: build the environment on demand
        return create_env()

    return _thunk

benchmark: figureeight0


In [2]:
# Start Ray for this notebook session.
# NOTE: num_cpus is hard-coded to 63, the same value as MAX_WORKERS in the
# hyperparameter cell further down (one CPU per Evaluator actor). That constant is
# defined after this cell, so the two values must be kept in sync by hand.
# NOTE(review): ignore_reinit_error=True presumably lets this cell be re-run
# without raising when Ray is already initialised -- confirm against the Ray docs.
ray.init(num_cpus=63, include_webui=False, ignore_reinit_error=True)

Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-04-10_19-50-13_32872/logs.
Waiting for redis server at 127.0.0.1:28433 to respond...
Waiting for redis server at 127.0.0.1:36695 to respond...
Starting the Plasma object store with 20.0 GB memory using /dev/shm.


{'node_ip_address': '10.138.0.2',
 'object_store_addresses': ['/tmp/ray/session_2019-04-10_19-50-13_32872/sockets/plasma_store'],
 'raylet_socket_names': ['/tmp/ray/session_2019-04-10_19-50-13_32872/sockets/raylet'],
 'redis_address': '10.138.0.2:28433',
 'webui_url': ''}

In [4]:
def init_weights(m):
    '''
    Initialise a linear layer in place; every other module type is left untouched.

    Weights are drawn from N(0, 0.1^2) and the bias is set to the constant 0.1.
    Intended to be used through `module.apply(init_weights)`.

    :param m: any torch module; only `nn.Linear` instances are modified
    '''
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        # A layer built with bias=False has `bias is None`; skip it instead of crashing
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.1)

class NeuralNetwork(nn.Module):
    '''
    Fully connected policy network for a continuous action space.

    The network is a stack of Linear + ReLU layers (one pair per entry of
    `fcnet_hiddens`) followed by a final Linear layer producing `num_outputs`
    values. All linear layers are initialised with `init_weights`.
    '''
    def __init__(self, num_inputs, num_outputs, fcnet_hiddens, std=0.0):
        '''
        :param num_inputs: size of the observation vector
        :param num_outputs: size of the action vector
        :param fcnet_hiddens: iterable with the width of each hidden layer;
                              may be empty, giving a single linear layer
        :param std: unused; kept so existing callers keep working
        '''
        super(NeuralNetwork, self).__init__()
        last_layer_size = num_inputs
        layers = []

        for size in fcnet_hiddens:
            layers.append(nn.Linear(last_layer_size, size))
            layers.append(nn.ReLU())
            last_layer_size = size

        # Size the output layer from the tracked width rather than fcnet_hiddens[-1],
        # so an empty hidden list (or a non-indexable iterable) also works.
        layers.append(nn.Linear(last_layer_size, num_outputs))

        self.actor = nn.Sequential(*layers)
        self.apply(init_weights)

    def forward(self, x):
        '''
        Map an observation tensor to the raw (unclipped) action values.

        :param x: observation tensor; it is cast to float32 before the forward pass
        :return: tensor with the network output
        '''
        action_logits = self.actor(x.float())
        return action_logits

    
def sample_noise(model):
    '''
    Sample standard-normal noise for each parameter of the neural net.

    The draws use the global NumPy random state, one call per parameter in
    `model.parameters()` order, so re-seeding with the same seed reproduces
    exactly the same noise.

    :param model: torch module whose parameter shapes define the noise shapes
    :return: 1-D object array with one noise ndarray per parameter; negating or
             scaling the returned array is applied to every entry
    '''
    # Read the shapes directly from the tensors (no tensor -> numpy conversion needed)
    shapes = [tuple(p.shape) for p in model.parameters()]

    # The per-parameter arrays have different shapes. Fill an explicit object array
    # instead of calling np.array on the list: implicit ragged-array creation is an
    # error in recent NumPy versions.
    nn_noise = np.empty(len(shapes), dtype=object)
    for i, shape in enumerate(shapes):
        nn_noise[i] = np.random.normal(size=shape)
    return nn_noise


@ray.remote
class Evaluator:
    '''
    Ray actor that owns one environment and one policy network and evaluates
    perturbed copies of the policy on request.
    '''
    def __init__(self, fcnet_hiddens, create_env):
        '''
        :param fcnet_hiddens: hidden layer sizes of the policy network
        :param create_env: zero-argument callable that builds the environment
        '''
        self.env = create_env()
        num_inputs  = self.env.observation_space.shape[0]
        num_outputs = self.env.action_space.shape[0]
        # Bounds used to clip the network output; only the first action
        # dimension's bounds are read and they are applied to the whole action.
        self.max_accel = self.env.action_space.high[0]
        self.min_accel = self.env.action_space.low[0]
        self.model = NeuralNetwork(num_inputs, num_outputs, fcnet_hiddens)

    def evaluate_neuralnet(self):
        '''
        Evaluate an agent running it in the environment and computing the total reward

        :return: sum of the rewards collected until the environment reports `done`
        '''
        obs = self.env.reset()
        game_reward = 0

        # Pure inference: no autograd graph is needed for the rollout
        with torch.no_grad():
            while True:
                # Output of the neural net
                net_output = self.model(torch.tensor(obs))
                # the action is the value clipped returned by the neural_net
                action = torch.clamp(net_output, self.min_accel, self.max_accel)
                new_obs, reward, done, _ = self.env.step(action.detach().cpu().numpy())
                obs = new_obs

                game_reward += reward

                if done:
                    break

        return game_reward

    def evaluate_noisy_net(self, noise):
        '''
        Evaluate a noisy agent by adding the noise to the plain agent

        The model parameters are restored to their previous values before
        returning, even if the rollout raises.

        :param noise: sequence with one noise array per model parameter
        :return: total episode reward of the perturbed agent
        '''
        # state_dict() returns tensors that share storage with the live parameters,
        # so the in-place update below would also modify an un-cloned "backup" and
        # the noise would never be removed. Clone every tensor to get a real snapshot.
        saved_params = {name: tensor.clone()
                        for name, tensor in self.model.state_dict().items()}

        try:
            # add the noise to each parameter of the NN
            for n, p in zip(noise, self.model.parameters()):
                p.data += torch.FloatTensor(n * STD_NOISE)

            # evaluate the agent with the noise
            reward = self.evaluate_neuralnet()
        finally:
            # load the previous parameters (the ones without the noise)
            self.model.load_state_dict(saved_params)

        return reward

    def evaluate_seed_reward(self, act_params):
        '''
        Evaluate one mirrored pair of perturbations of the given parameters.

        :param act_params: state dict of the current (unperturbed) actor
        :return: [[reward with +noise, reward with -noise], seed], where `seed`
                 lets the caller regenerate the same noise with `sample_noise`
        '''
        # load the actor params
        self.model.load_state_dict(act_params)

        # get a random seed (integer bound: randint expects an int, not the float 1e6)
        seed = np.random.randint(int(1e6))
        # set the new seed
        np.random.seed(seed)

        noise = sample_noise(self.model)

        pos_rew = self.evaluate_noisy_net(noise)
        # Mirrored sampling
        neg_rew = self.evaluate_noisy_net(-noise)

        return [[pos_rew, neg_rew], seed]
    
    # def queue_get_put_results(self, params_queue, output_queue):
    #     while True:
    #         act_params = params_queue.get()
    #         if act_params != None:
    #             res = self.evaluate_seed_reward(act_params)
    #             output_queue.put(res)
    #         else:
    #             break
    
    
def make_batch_results(model, envs, batch_size):
    '''
    Collect `batch_size` mirrored evaluations from the pool of Evaluator actors.

    Every actor is given a task up front; as soon as one finishes its result is
    consumed and, if more results are still required, the same actor is given a
    new task. Workers return only the seed they used, so the noise is
    regenerated locally by re-seeding NumPy's global random state.

    :param model: current actor network (its state dict is sent to the workers)
    :param envs: list of Evaluator actor handles
    :param batch_size: number of worker results to collect; each one contributes
                       two noise/reward entries (+noise and -noise)
    :return: (batch_noise, batch_reward), two lists of length 2 * batch_size
    '''
    batch_noise = []
    batch_reward = []
    if batch_size <= 0:
        # Nothing to collect: do not start evaluations whose results would be dropped
        return batch_noise, batch_reward

    # Put the weights in the object store once and hand every task the same
    # reference, instead of serializing the state dict again for each task.
    params_ref = ray.put(model.state_dict())

    # in-flight result id -> actor computing it, so the finished actor is found
    # directly instead of scanning every actor for a matching id
    pending = {env.evaluate_seed_reward.remote(params_ref): env for env in envs}

    for i in range(batch_size):
        ready_ids, _ = ray.wait(list(pending))
        ready_id = ready_ids[0]
        worker = pending.pop(ready_id)

        p_rews, p_seed = ray.get(ready_id)
        # Rebuild the noise the worker used from its seed
        np.random.seed(p_seed)
        noise = sample_noise(model)
        batch_noise.append(noise)
        batch_noise.append(-noise)

        batch_reward.append(p_rews[0])
        batch_reward.append(p_rews[1])

        # Keep the worker busy only while more results are needed. A task submitted
        # after the last result is collected can never be read and would only delay
        # that actor's first task of the next iteration.
        if i < batch_size - 1:
            pending[worker.evaluate_seed_reward.remote(params_ref)] = worker

    return batch_noise, batch_reward


def normalized_rank(rewards):
    '''
    Rank the rewards and normalize them.

    Ranks are rescaled linearly so the lowest reward maps to -0.5 and the
    highest to +0.5 (ties share their average rank).

    :param rewards: sequence of scalar rewards
    :return: float array of centred ranks, same length as `rewards`; all zeros
             when there are fewer than two rewards
    '''
    ranked = ss.rankdata(rewards)
    if len(ranked) < 2:
        # A single sample has no relative rank; the formula below would be 0/0 = NaN
        return np.zeros(len(ranked))
    norm = (ranked - 1) / (len(ranked) - 1)
    norm -= 0.5
    return norm

# Debug

In [5]:
# Hyperparameters
# Scale applied to the sampled parameter noise before it is added to the weights
STD_NOISE = 0.02
# Number of worker results collected per iteration; every result holds a mirrored
# (+noise / -noise) pair, so each iteration uses 2 * BATCH_SIZE rewards
BATCH_SIZE = 50
# Learning rate of the Adam optimizer that applies the estimated gradient
LEARNING_RATE = 0.02
# Number of iterations of the main training loop
MAX_ITERATIONS = 300
# Number of Evaluator actors created (ray.init above is given the same CPU count)
MAX_WORKERS = 63
# Hidden layer widths of the policy network, in order from input to output
fcnet_hiddens = [100, 50, 25]

In [6]:
# Create Environment
# make_create_env returns an environment factory and the environment's name
create_env, env_name = make_create_env(params=flow_params, version=0)
# One remote Evaluator actor per worker; each actor builds its own environment
envs = [Evaluator.remote(fcnet_hiddens, create_env) for i in range(MAX_WORKERS)]
# A local environment is also created; in this cell it is only used to read the
# observation/action sizes needed to build the actor network
env = create_env()
num_inputs  = env.observation_space.shape[0]
num_outputs = env.action_space.shape[0]

 Starting SUMO on port 44063


# Main

In [8]:
# save the results
now = datetime.datetime.now()
date_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
# Writer name: one placeholder per hyperparameter. The earlier format string had
# six placeholders for seven arguments, so MAX_WORKERS was silently dropped.
writer_name = 'ASY_ES_{}_{}_{}_{}_{}_{}_{}/'.format(benchmark_name, date_time, str(STD_NOISE), str(BATCH_SIZE), str(LEARNING_RATE), str(MAX_ITERATIONS), str(MAX_WORKERS))
print('Name:', writer_name)
result_path = '../result/ES/'
writer = SummaryWriter(log_dir=result_path+writer_name)

# Initialize the agent
actor = NeuralNetwork(num_inputs, num_outputs, fcnet_hiddens)
# Initialize the optimizer
optimizer = optim.Adam(actor.parameters(), lr=LEARNING_RATE)


try:
    # Execute the main loop MAX_ITERATIONS times
    for n_iter in range(MAX_ITERATIONS):
        it_time = time.time()

        batch_noise, batch_reward = make_batch_results(actor, envs, BATCH_SIZE)

        # Print some stats
        print(n_iter, 'Mean:',np.round(np.mean(batch_reward), 2), 'Max:', np.round(np.max(batch_reward), 2), 'Time:', np.round(time.time()-it_time, 2))

        writer.add_scalar('mean_reward', np.mean(batch_reward), n_iter)
        writer.add_scalar('max_reward', np.max(batch_reward), n_iter)
        # NOTE(review): the tag is misspelled ('mediun'); it is kept unchanged so the
        # curve stays comparable with runs that were already logged under this name.
        writer.add_scalar('mediun_reward', np.median(batch_reward), n_iter)

        # Rank the reward and normalize it
        batch_reward = normalized_rank(batch_reward)

        th_update = []
        optimizer.zero_grad()
        # for each actor's parameter, and for each noise in the batch, update it by the reward * the noise value
        for idx, p in enumerate(actor.parameters()):
            upd_weights = np.zeros(p.data.shape)

            for n,r in zip(batch_noise, batch_reward):
                upd_weights += r*n[idx]

            # NOTE(review): the sum runs over 2 * BATCH_SIZE mirrored samples but is
            # divided by BATCH_SIZE; left as is because it only rescales the estimate.
            upd_weights = upd_weights / (BATCH_SIZE*STD_NOISE)
            # put the updated weight on the gradient variable so that afterwards the optimizer will use it
            # (negated: the optimizer minimises, while the reward should be maximised)
            p.grad = torch.FloatTensor(-upd_weights)
            th_update.append(np.mean(upd_weights))

        # Optimize the actor's NN
        optimizer.step()

        # Logged under 'loss': the mean of the per-parameter mean update
        writer.add_scalar('loss', np.mean(th_update), n_iter)

        if n_iter % 25 == 0:
            model_path = result_path + writer_name + 'checkpoint' + str(n_iter) + '.pt'
            torch.save(actor.state_dict(), model_path)

    # The periodic checkpoints only cover multiples of 25, so without this the
    # weights reached at the end of training would never be written to disk.
    torch.save(actor.state_dict(), result_path + writer_name + 'checkpoint_final.pt')
finally:
    # Flush and release the event file even if training is interrupted
    writer.close()
# tensorboard --logdir content/runs --host localhost

Name: ASY_ES_figureeight0_10_19.51.54_0.02_50_0.02_300/
0 Mean: 380.73 Max: 490.99 Time: 23.66
1 Mean: 454.59 Max: 535.95 Time: 46.9
2 Mean: 450.78 Max: 523.73 Time: 47.57
3 Mean: 440.21 Max: 532.63 Time: 50.74
4 Mean: 439.98 Max: 534.86 Time: 25.56
5 Mean: 434.18 Max: 511.34 Time: 45.87
6 Mean: 436.69 Max: 505.3 Time: 44.93
7 Mean: 433.6 Max: 501.87 Time: 45.97
8 Mean: 433.16 Max: 508.5 Time: 43.77
9 Mean: 442.39 Max: 510.46 Time: 26.88
10 Mean: 440.86 Max: 507.5 Time: 44.11
11 Mean: 428.08 Max: 496.68 Time: 43.34
12 Mean: 439.84 Max: 501.21 Time: 43.31
13 Mean: 398.64 Max: 513.08 Time: 39.98
14 Mean: 423.88 Max: 514.68 Time: 30.43
15 Mean: 465.65 Max: 520.82 Time: 43.93
16 Mean: 472.03 Max: 519.51 Time: 44.66
17 Mean: 465.08 Max: 525.84 Time: 44.34
18 Mean: 442.85 Max: 516.49 Time: 40.42
19 Mean: 440.98 Max: 530.14 Time: 32.24
20 Mean: 441.27 Max: 522.72 Time: 43.92
21 Mean: 456.12 Max: 534.9 Time: 43.63
22 Mean: 456.19 Max: 530.87 Time: 43.48
23 Mean: 453.54 Max: 550.84 Time: 37.73


203 Mean: 348.76 Max: 454.98 Time: 35.03
204 Mean: 345.26 Max: 458.64 Time: 35.87
205 Mean: 361.69 Max: 454.86 Time: 36.81
206 Mean: 343.75 Max: 457.26 Time: 34.4
207 Mean: 333.87 Max: 454.3 Time: 34.79
208 Mean: 340.94 Max: 456.06 Time: 33.16
209 Mean: 336.56 Max: 456.28 Time: 35.44
210 Mean: 340.7 Max: 455.93 Time: 35.16
211 Mean: 321.64 Max: 454.15 Time: 33.29
212 Mean: 329.86 Max: 456.64 Time: 34.17
213 Mean: 316.09 Max: 458.5 Time: 33.64
214 Mean: 322.6 Max: 455.34 Time: 35.03
215 Mean: 322.98 Max: 460.52 Time: 34.65
216 Mean: 313.43 Max: 457.43 Time: 35.13
217 Mean: 307.93 Max: 454.98 Time: 33.24
218 Mean: 298.7 Max: 458.38 Time: 30.77
219 Mean: 293.7 Max: 456.18 Time: 32.56
220 Mean: 266.61 Max: 458.93 Time: 32.15
221 Mean: 243.1 Max: 457.86 Time: 28.1
222 Mean: 215.88 Max: 460.95 Time: 29.37
223 Mean: 218.69 Max: 457.96 Time: 29.41
224 Mean: 180.59 Max: 450.48 Time: 25.38
225 Mean: 255.58 Max: 454.39 Time: 31.74
226 Mean: 243.81 Max: 456.46 Time: 27.61
227 Mean: 237.67 Max: 455

# Test

In [10]:
def evaluate_neuralnet(env, model):
    '''
    Play one full episode with `model` acting in `env` and return the episode return.

    At every step the network output is clipped to the bounds of the first
    action dimension before being sent to the environment.

    :param env: environment exposing reset(), step(action) and action_space.low/high
    :param model: callable mapping an observation tensor to an action tensor
    :return: sum of the per-step rewards until the environment signals `done`
    '''
    state = env.reset()
    total = 0
    upper = env.action_space.high[0]
    lower = env.action_space.low[0]

    finished = False
    while not finished:
        # Raw policy output for the current observation
        raw_action = model(torch.tensor(state))
        # Keep the action inside the allowed range
        bounded = torch.clamp(raw_action, lower, upper)
        state, step_reward, finished, _ = env.step(bounded.detach().cpu().numpy())
        total += step_reward

    return total

In [None]:
# Build an environment for the test run with make_vis_env (imported from
# common.evaluate; presumably a rendering-enabled version -- TODO confirm).
# This rebinds the global `env`, `num_inputs` and `num_outputs` set in the
# environment-creation cell, so later cells see the test environment's values.
env = make_vis_env(benchmark_name)
num_inputs  = env.observation_space.shape[0]
num_outputs = env.action_space.shape[0]

In [15]:
# Rebuild the network with the same architecture, then load saved weights into it
actor = NeuralNetwork(num_inputs, num_outputs, fcnet_hiddens)
# NOTE(review): this hard-coded path points at a different run than the training
# cell above (going by the run-name pattern: noise 0.05, lr 0.01, 100 iterations)
# -- confirm this is the checkpoint that is meant to be evaluated.
model_path = '../result/ES/ASY_ES_figureeight0_10_17.57.46_0.05_50_0.01_100/checkpoint90.pt'
# torch.load unpickles the file: only load checkpoints from a trusted source
actor.load_state_dict(torch.load(model_path))

# Run one episode; the returned total reward is the cell's displayed output
evaluate_neuralnet(env, actor)

462.29247035806213