In [1]:
import json
import os
import sys
import time

from alpyne.client.alpyne_client import AlpyneClient
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.utils import set_random_seed

sys.path.append("../..")
from thesis.envs.minimatrix import MiniMatrix
from thesis.callbacks.minimatrix import MiniMatrixCallback
from thesis.policies.agv_routing_attention import AgvRoutingFE

# Single seed for the whole run: handed to stable_baselines3's
# set_random_seed here and embedded in run_name further down.
seed = 42
set_random_seed(seed)

In [2]:
# Output locations (relative paths): models_dir receives the hyperparameter
# JSON and the model checkpoints, logdir is used as the TensorBoard log dir.
models_dir = "../../models/MiniMatrix_Routing_Attn"
logdir = "../../logs/MiniMatrix_Routing_Attn"
# fleetsize is forwarded to the MiniMatrix environments; max_fleetsize is
# forwarded to both the environments and the feature extractor.
fleetsize = 1
max_fleetsize = 10
# Run identifier: algorithm tag, fleet sizes, start time formatted as
# day_month-hour_minute_second, and the seed.
run_name = f"PPO-{fleetsize}-{max_fleetsize}-{time.strftime('%d_%m-%H_%M_%S')}-{seed}"

In [3]:
# Options forwarded to the MiniMatrix environments as `config_args`.
env_args = dict(
    reward_target=1,
    reward_distance=0.05,
    reward_block=-0.5,
    dispatchinginterval=60,
    routinginterval=2,
    withCollisions=True,
    # Currently disabled options, kept for reference:
    # blockTimeout = 5,
    # routingOnNode = False,
    # coordinates = False,
    includeNodesInReach=True,
)

# Keyword arguments unpacked into the PPO constructor.
ppo_args = dict(
    batch_size=256,
    ent_coef=0.002,
    target_kl=0.003,
    gamma=0.8,
)

# Keyword arguments for the AgvRoutingFE feature extractor.
fe_args = dict(
    max_fleetsize=max_fleetsize,
    embed_dim=32,
    n_heads=4,
    depth=6,
)

# No extra policy layers (the custom action head consumes the extracted
# features directly); the value function gets two hidden layers.
net_arch = [dict(pi=[], vf=[32, 16])]

# Everything needed to reproduce this run, stored next to the checkpoints.
hparams = dict(
    fleetsize=fleetsize,
    max_fleetsize=max_fleetsize,
    env_args=env_args,
    ppo_args=ppo_args,
    fe_args=fe_args,
    net_arch=net_arch,
)

# Create the output directory first: open(..., 'w') does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(models_dir, exist_ok=True)
with open(f"{models_dir}/{run_name}.json", 'w') as outfile:
    json.dump(hparams, outfile)

In [4]:
import torch as th
from typing import Tuple, Callable
from stable_baselines3.common.distributions import (
    BernoulliDistribution,
    CategoricalDistribution,
    DiagGaussianDistribution,
    Distribution,
    MultiCategoricalDistribution,
    StateDependentNoiseDistribution,
    make_proba_distribution,
)
from stable_baselines3.common.type_aliases import Schedule
from torch import nn

class CustomACPolicy(ActorCriticPolicy):
    """Actor-critic policy whose action head is applied once per fleet slot.

    The policy latent is reshaped to ``(..., max_fleetsize, embed_dim)`` using
    the sizes stored on the features extractor, and one shared linear head
    maps every ``embed_dim`` vector to 5 outputs. The per-slot outputs are
    then flattened into a single vector per sample.

    NOTE(review): this presumably matches a MultiDiscrete action space with 5
    choices per slot -- confirm against the MiniMatrix environment.
    """

    def _get_action_dist_from_latent(self, latent_pi: th.Tensor) -> Distribution:
        """Build the action distribution from the flat policy latent.

        :param latent_pi: policy latent whose last dimension must equal
            ``max_fleetsize * embed_dim`` of the features extractor.
        :return: the distribution produced by the parent implementation from
            the latent reshaped to ``(..., max_fleetsize, embed_dim)``.
        """
        extractor = self.features_extractor
        per_slot_shape = tuple(latent_pi.shape[:-1]) + (
            extractor.max_fleetsize,
            extractor.embed_dim,
        )
        # reshape() behaves like view() for contiguous tensors but also
        # accepts non-contiguous input instead of raising.
        return super()._get_action_dist_from_latent(latent_pi.reshape(per_slot_shape))

    def _build(self, lr_schedule: Schedule) -> None:
        """Build the networks, then install the per-slot action head.

        :param lr_schedule: learning-rate schedule; ``lr_schedule(1)`` is the
            initial learning rate, as in the parent implementation.
        """
        super()._build(lr_schedule)

        # NOTE(review): the parent passes the output of action_net to the
        # distribution as logits, so the Softmax here bounds those "logits"
        # to [0, 1] -- confirm this is intended.
        self.action_net = nn.Sequential(
            nn.Linear(self.features_extractor.embed_dim, 5, bias=False),
            nn.Softmax(-1),
            nn.Flatten(1),
        )

        # The parent created its optimizer before action_net was replaced, so
        # it only tracks the discarded head. Rebuild it (same construction as
        # the parent) so that the new head's weights are actually trained.
        self.optimizer = self.optimizer_class(
            self.parameters(), lr=lr_schedule(1), **self.optimizer_kwargs
        )




In [5]:
# One-element list; the same list object is passed to every environment
# as `counter`.
i = [0]

# Client for the exported MiniMatrix model archive.
client = AlpyneClient("../../envs/MiniMatrix.zip", port=51150)

# Four MiniMatrix environments, all constructed with the same keyword arguments.
env = make_vec_env(
    MiniMatrix,
    n_envs=4,
    env_kwargs={
        "max_seconds": 60 * 60,
        "fleetsize": fleetsize,
        "max_fleetsize": max_fleetsize,
        "config_args": env_args,
        "counter": i,
        "client": client,
    },
)

# PPO agent using the custom policy and the attention feature extractor.
model = PPO(
    policy=CustomACPolicy,
    env=env,
    tensorboard_log=logdir,
    device="cuda",
    policy_kwargs={
        "net_arch": net_arch,
        "features_extractor_class": AgvRoutingFE,
        "features_extractor_kwargs": fe_args,
    },
    **ppo_args,
)

  warn(f"Unzipping to temporary directory ({tmp_dir})")


In [6]:
# Number of environment steps requested per learn() call.
TIMESTEPS = 50000

# Train in rounds and checkpoint after each one. The loop variable is
# deliberately not named `i`: that name holds the counter list that was
# passed to the environments and must not be rebound to an int.
for round_idx in range(1, 15):
    try:
        # reset_num_timesteps=False continues the same run (and the same
        # TensorBoard log) instead of starting a new one every round.
        # Add callback=MiniMatrixCallback() here to enable the project callback.
        model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name=run_name)
    except KeyboardInterrupt:
        # Keep the progress made since the last checkpoint, then let the
        # interrupt propagate so the cell still stops as before.
        model.save(f"{models_dir}/{run_name}-{model.num_timesteps}-interrupted")
        raise
    model.save(f"{models_dir}/{run_name}-{TIMESTEPS * round_idx}")

KeyboardInterrupt: 