In [1]:
import json
import os
import sys
import time

from alpyne.client.alpyne_client import AlpyneClient
from gym import GoalEnv
from stable_baselines3 import DQN, HerReplayBuffer
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed

# Put the directory two levels up on the import path so `thesis` resolves.
sys.path.append("../..")
from thesis.envs.matrix_routing_multiagent import MatrixRoutingMA
from thesis.policies.ppo_ac_attention import AttentionACPolicy
from thesis.policies.routing_attention import RoutingFE_offPolicy

# Single seed for the whole run; it is also embedded in `run_name`.
seed = 42
# Seed the global RNGs through the Stable-Baselines3 helper.
set_random_seed(seed)

In [2]:
# Where checkpoints / hyper-parameter dumps and TensorBoard logs are written
# (paths are relative to this notebook's working directory).
models_dir = "../../models/MiniMatrix_Routing_MA"
logdir = "../../logs/MiniMatrix_Routing_MA"

# Fleet size used for this run, and the upper bound that is handed to both
# the environment and the feature extractor.
fleetsize, max_fleetsize = 6, 10

# Run label: algorithm tag, fleet sizes, wall-clock timestamp, seed.
run_name = f"HER-{fleetsize}-{max_fleetsize}-{time.strftime('%d_%m-%H_%M_%S')}-{seed}"

In [3]:
# Settings forwarded to the environment as `config_args`.
env_args = dict(
    reward_target=1,
    reward_distance=0.05,
    reward_block=-0.5,
    dispatchinginterval=120,
    routinginterval=2,
    withCollisions=True,
    # blockTimeout = ,
    includeNodesInReach=True,
    dispatcher_args=dict(distance=6),
)

# Extra keyword arguments splatted into the DQN constructor.
# Currently empty; the commented entries are candidate overrides.
alg_args = dict(
    # learning_rate = 3e-3,
    # buffer_size = 100000
)

# Keyword arguments for the features extractor class.
fe_args = dict(
    max_fleetsize=max_fleetsize,
    embed_dim=64,
    n_heads=8,
    depth=8,
)

# NOTE(review): recorded in the dump below, but the matching `net_arch`
# entry in the model's policy_kwargs is commented out — confirm intent.
net_arch = dict(qf=[], pi=[])

# Everything needed to reproduce this run, stored next to the checkpoints.
hparams = dict(
    fleetsize=fleetsize,
    max_fleetsize=max_fleetsize,
    env_args=env_args,
    alg_args=alg_args,
    fe_args=fe_args,
    net_arch=net_arch,
)

# open(..., 'w') does not create missing parent directories, so make sure
# the models directory exists before writing the hyper-parameter file.
os.makedirs(models_dir, exist_ok=True)
with open(f"{models_dir}/{run_name}.json", 'w') as outfile:
    json.dump(hparams, outfile, indent=3)

In [4]:
from gym import spaces
from alpyne.data.spaces import Observation
import numpy as np
import torch

class MatrixRoutingMAGoal(MatrixRoutingMA, GoalEnv):
    """Goal-conditioned variant of ``MatrixRoutingMA``.

    Observations are wrapped in the ``observation`` / ``desired_goal`` /
    ``achieved_goal`` dictionary layout, and the reward returned by ``step``
    is replaced by the sparse goal reward from ``compute_reward``.
    """

    def _convert_from_observation(self, observation: Observation):
        """Convert a simulator observation into the goal-env dictionary.

        The parent's flat observation is viewed as ``max_fleetsize`` rows.
        The goals are taken from row 0: columns 7:9 for the desired goal
        and columns 5:7 for the achieved goal.
        NOTE(review): presumably these columns hold target and current
        position of the acting agent — confirm against ``MatrixRoutingMA``.
        """
        flat = MatrixRoutingMA._convert_from_observation(self, observation=observation)
        per_row = flat.reshape((self.max_fleetsize, len(flat) // self.max_fleetsize))
        return dict(
            observation=per_row.flatten(),
            desired_goal=per_row[0, 7:9].flatten(),
            achieved_goal=per_row[0, 5:7].flatten(),
        )

    def _get_observation_space(self) -> spaces.Dict:
        """Return the dict space: parent space plus two 2-element goal boxes in [0, 1]."""
        base_space = MatrixRoutingMA._get_observation_space(self)
        return spaces.Dict(
            dict(
                observation=base_space,
                desired_goal=spaces.Box(low=0, high=1, shape=(2,)),
                achieved_goal=spaces.Box(low=0, high=1, shape=(2,)),
            )
        )

    def step(self, action):
        """Step the parent env, then overwrite its reward with the goal reward.

        Returns the usual ``(obs, reward, done, info)`` tuple; ``reward`` is
        a plain Python float.
        """
        obs, _, done, info = MatrixRoutingMA.step(self, action)
        reward = self.compute_reward(obs["achieved_goal"], obs["desired_goal"], info)
        return obs, float(reward), done, info

    def compute_reward(self, achieved_goal, desired_goal, info):
        """Sparse goal reward: 1 where every goal component matches, else 0.

        The comparison reduces over the last axis, so a single goal yields a
        NumPy scalar and a batch of goals yields one reward per row (hence
        no ``float`` return annotation).

        NOTE(review): 0.01 is the *relative* tolerance of ``np.isclose``
        (its third positional parameter); the absolute tolerance stays at
        NumPy's default. If an absolute tolerance was intended, switch to
        ``atol=0.01``.
        """
        goal_reached = np.all(np.isclose(achieved_goal, desired_goal, rtol=0.01), axis=-1)
        return np.multiply(1, goal_reached)

In [5]:


# Start the Alpyne client for the exported MiniMatrix model on a fixed port.
client = AlpyneClient("../../envs/MiniMatrix.zip", port=51142)

# `counter` used to be `i`, which is only bound by the training loop in a
# later cell and therefore raised NameError on a fresh kernel.
# NOTE(review): 0 is used as the index of this single environment — confirm
# against the meaning of `counter` in MatrixRoutingMA.
env = MatrixRoutingMAGoal(
    max_seconds=10 * 60,
    fleetsize=fleetsize,
    max_fleetsize=max_fleetsize,
    config_args=env_args,
    counter=0,
    client=client,
)

  warn(f"Unzipping to temporary directory ({tmp_dir})")


In [6]:
# DQN on the dict (goal-env) observation space with hindsight experience replay.
# `optimize_memory_usage=True` was dropped: Stable-Baselines3's
# dict-observation / HER replay buffers do not implement that layout, so the
# flag is at best ignored and otherwise rejected by the buffer.
model = DQN(
    "MultiInputPolicy",
    env,
    tensorboard_log=logdir,
    device="cuda",
    policy_kwargs=dict(
        # net_arch = net_arch,
        features_extractor_class=RoutingFE_offPolicy,
        features_extractor_kwargs=fe_args,
    ),
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=dict(
        # NOTE(review): same 10*60 figure as the env's `max_seconds`;
        # confirm it is a valid upper bound on steps per episode.
        max_episode_length=10 * 60,
    ),
    buffer_size=60000,
    **alg_args,
)

In [7]:
# Train in chunks of TIMESTEPS and write a checkpoint after every chunk.
# reset_num_timesteps=False keeps one continuous step counter / log across chunks.
TIMESTEPS = 50000
for i in range(1, 15):
    model.learn(
        total_timesteps=TIMESTEPS,
        reset_num_timesteps=False,
        tb_log_name=run_name,
    )
    # The checkpoint suffix is the nominal number of steps trained so far.
    model.save(f"{models_dir}/{run_name}-{TIMESTEPS * i}")

KeyboardInterrupt: 

: 