# Introduction

This notebook covers a robotic structure-assembly scenario (18 connection tasks) for a gantry robot system.

# Imports

In [1]:
from simple_stru_sampler import make_generator_params
from rl4co.envs import SDVRPEnv
from rl4co.models.zoo import AttentionModel
import torch
import random
import numpy as np

# Section 1: Data Generation
## Generate the dataset using the environment and a customized data generator

In [2]:
seed = 123
# Seed every random number generator used here (Python, NumPy, PyTorch CPU
# and all CUDA devices) with the same value before anything is sampled.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

params = make_generator_params(
    num_loc=18,
    outer_size=(8, 6),
    inner_size=(4, 2),
    vgap_range=(1.0, 2.0),
    hgap_range=(0.3, 0.5),
    # Pass equal_demand=False to switch it off; when omitted, the
    # DEFAULT_EQUAL_DEMAND value defined in the sampler file is used.
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = SDVRPEnv(generator_params=params)

######################## sample number ###############################
# Number of problem instances drawn for all evaluations below.
td_init = env.reset(batch_size=100).to(device)

## Show the samples

In [3]:
# Extract coords, capacity and demand from td_init
def get_tensor(td, *keys):
    """Return the entry of ``td`` stored under the first matching key.

    Args:
        td: Mapping-like object exposing ``keys()`` and ``td[key]``.
        *keys: Candidate key names, tried in the order given.

    Returns:
        ``td[k]`` for the first ``k`` in ``keys`` that is present in ``td``.

    Raises:
        KeyError: If none of the candidate keys is present.
    """
    present = [name for name in keys if name in td.keys()]
    if not present:
        raise KeyError(f"None of keys {keys} found in tensordict")
    return td[present[0]]

coords = get_tensor(td_init, "locs", "coords")          # [B, N+1, 2]; row 0 is used as the depot below
capacity = get_tensor(td_init, "capacity", "veh_capacity", "vehicle_capacity")  # [B] or [B, 1]

B, Np1, _ = coords.shape
N = Np1 - 1

# Bring capacity to an explicit [B, 1] column before scaling the demands.
# A flat [B] tensor would otherwise be broadcast against the *last* axis of
# demand ([B, N]), which either fails or silently mixes samples when N == B.
capacity_col = capacity.reshape(B, 1)
# Presumably demand is stored as a fraction of the vehicle capacity, so this
# converts it back to absolute units -- TODO confirm against the generator.
demand = get_tensor(td_init, "demand") * capacity_col   # [B, N]

for b in range(B):
    depot_xy = coords[b, 0]                    # [2]
    cities_xy = coords[b, 1:1+N]               # [N, 2]
    demands_b = demand[b]                      # [N]
    cap_b = float(capacity_col[b, 0])

    print(f"\n=== Sample {b+1} ===")
    print(f"vehicle_capacity: {cap_b:.1f}")
    print(f"depot: (x={depot_xy[0]:.4f}, y={depot_xy[1]:.4f})")
    for i in range(N):
        x, y = cities_xy[i]
        d = demands_b[i]
        print(f"city{i+1}: (x={x:.4f}, y={y:.4f}), demand={float(d):.1f}")


=== Sample 1 ===
vehicle_capacity: 100.0
depot: (x=6.1461, y=5.2430)
city1: (x=2.0905, y=3.7354), demand=10.0
city2: (x=2.5679, y=3.7354), demand=5.0
city3: (x=3.0453, y=3.7354), demand=5.0
city4: (x=3.5226, y=3.7354), demand=5.0
city5: (x=4.0000, y=3.7354), demand=10.0
city6: (x=4.4774, y=3.7354), demand=5.0
city7: (x=4.9547, y=3.7354), demand=5.0
city8: (x=5.4321, y=3.7354), demand=5.0
city9: (x=5.9095, y=3.7354), demand=10.0
city10: (x=2.0905, y=2.2646), demand=10.0
city11: (x=2.5679, y=2.2646), demand=5.0
city12: (x=3.0453, y=2.2646), demand=5.0
city13: (x=3.5226, y=2.2646), demand=5.0
city14: (x=4.0000, y=2.2646), demand=10.0
city15: (x=4.4774, y=2.2646), demand=5.0
city16: (x=4.9547, y=2.2646), demand=5.0
city17: (x=5.4321, y=2.2646), demand=5.0
city18: (x=5.9095, y=2.2646), demand=10.0

=== Sample 2 ===
vehicle_capacity: 100.0
depot: (x=1.1602, y=3.7228)
city1: (x=2.3431, y=3.6428), demand=20.0
city2: (x=2.7573, y=3.6428), demand=10.0
city3: (x=3.1715, y=3.6428), demand=10.0
ci

# Section 2: Model Generation

## Model Parameters

Default Model

In [4]:
# Model: default is AM with REINFORCE and greedy rollout baseline.
# Dataset sizes: 100,000 training instances and 10,000 validation instances.
model = AttentionModel(
    env,
    baseline="rollout",
    train_data_size=100_000,
    val_data_size=10_000,
)

# Show the encoder architecture of the freshly built policy.
print(model.policy.encoder)

AttentionModelEncoder(
  (init_embedding): VRPInitEmbedding(
    (init_embed): Linear(in_features=3, out_features=128, bias=True)
    (init_embed_depot): Linear(in_features=2, out_features=128, bias=True)
  )
  (net): GraphAttentionNetwork(
    (layers): Sequential(
      (0): MultiHeadAttentionLayer(
        (0): SkipConnection(
          (module): MultiHeadAttention(
            (Wqkv): Linear(in_features=128, out_features=384, bias=True)
            (out_proj): Linear(in_features=128, out_features=128, bias=True)
          )
        )
        (1): Normalization(
          (normalizer): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (2): SkipConnection(
          (module): MLP(
            (hidden_act): ReLU()
            (out_act): Identity()
            (lins): ModuleList(
              (0): Linear(in_features=128, out_features=512, bias=True)
              (1): Linear(in_features=512, out_features=128, bias=True)
            )
  

c:\Users\yizhe\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\parsing.py:210: Attribute 'env' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['env'])`.
c:\Users\yizhe\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\parsing.py:210: Attribute 'policy' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['policy'])`.


## Tour Length Result Output

In [7]:
# Greedy decoding with the still-untrained policy on the sampled instances.
greedy_kwargs = {"phase": "test", "decode_type": "greedy"}
policy = model.policy.to(device)
out = policy(td_init.clone(), env, **greedy_kwargs)

# Negated rewards of the initial random policy: per instance, then the mean.
print(out["reward"].neg())
print(out["reward"].mean().neg())

tensor([ 96.5627,  45.7740,  69.2506,  48.0979, 123.3519, 128.7827,  43.1433,
         44.7829,  75.0104,  43.1740,  38.9933,  57.1436,  91.6674, 130.3019,
         70.3541, 119.5200,  42.4448,  32.7610, 131.5775,  43.1267, 123.8015,
        128.6158,  76.0890, 100.5950,  54.0677,  38.6867, 117.8641,  63.3795,
         48.1995,  69.9687,  55.4374, 139.1095,  62.4533, 119.1206, 105.3495,
         82.5317, 101.8221,  32.4419,  92.5001, 102.7631, 102.3947, 103.3818,
         70.4980,  79.3057,  60.1825,  42.7861,  31.8065,  55.0106,  57.5473,
        123.6526,  48.7267,  49.5453, 101.2460, 133.6042, 124.0467, 111.6160,
        102.5531,  41.3902,  34.8526,  46.2340, 108.7042, 138.7007,  96.9082,
         77.8666,  45.4659,  76.3285,  49.2791, 100.1502,  36.5538, 104.4457,
         85.1877,  68.5477,  57.7371, 139.0618,  73.4711,  70.5543,  35.3551,
         85.8078,  65.0359,  32.1665,  65.8453, 103.4715,  44.2295,  54.4664,
         50.6192,  50.4323, 130.8782,  42.7872, 127.1791, 118.10

## Tour Details

In [None]:
# 0 -> "depot", others -> "city{k}"
def idx_to_name(i: int) -> str:
    """Return the display name of node ``i``: ``"depot"`` for 0, ``"city{i}"`` otherwise."""
    if i == 0:
        return "depot"
    return f"city{i}"

# split actions into several tours by depot
def split_by_depot(seq_idx):
    """Cut a flat action sequence into depot-to-depot tours.

    Args:
        seq_idx: Sequence of node indices; 0 is the depot and -1 marks padding.

    Returns:
        A list of tours, each a list that starts and ends with 0. Tours that
        contain no non-depot node (e.g. consecutive depot visits) are dropped.
    """
    # Drop padding entries; this also copies the input so it is never mutated.
    visits = [node for node in seq_idx if node != -1]
    # Make sure the trip starts and ends at the depot.
    if len(visits) == 0 or visits[0] != 0:
        visits.insert(0, 0)
    if visits[-1] != 0:
        visits.append(0)

    tours = []
    leg = visits[:1]
    for node in visits[1:]:
        leg.append(node)
        if node != 0:
            continue
        # Reached the depot: close the current leg, skipping empty 0-0 legs.
        interior = leg[1:-1]
        if any(stop != 0 for stop in interior):
            tours.append(leg)
        leg = [0]                  # the next leg starts from the depot again
    return tours

# get coordinates from td ("locs" preferred, "coords" as fallback)
def get_coords(td):
    """Return the coordinate entry of ``td``.

    The keys ``"locs"`` and ``"coords"`` are tried in that order, which is
    what the error message below announces.

    Args:
        td: Mapping-like object exposing ``keys()`` and ``td[key]``.

    Returns:
        The value stored under the first of the two keys that is present.

    Raises:
        KeyError: If neither ``"locs"`` nor ``"coords"`` is present.
    """
    for key in ("locs", "coords"):
        if key in td.keys():
            return td[key]
    raise KeyError("No coordinates found in tensordict (expected 'locs' or 'coords').")

# Calculate distance
def tour_length(seg, coords: torch.Tensor) -> float:
    """Sum of Euclidean distances between consecutive nodes of ``seg``.

    Args:
        seg: Sequence of integer row indices into ``coords``, in visiting order.
        coords: Tensor whose rows are point coordinates (indexed along dim 0).

    Returns:
        The total path length as a Python float; 0.0 for a single-node path.
    """
    order = torch.tensor(seg, dtype=torch.long, device=coords.device)
    path = coords.index_select(0, order)                 # one row per visited node
    hops = path[1:] - path[:-1]                          # displacement of each step
    hop_lengths = torch.sqrt(torch.sum(hops ** 2, dim=-1))
    return float(hop_lengths.sum().item())

# === calculate each tour and total length ===
# One action row per instance in td_init (output of the greedy run above).
actions = out["actions"].cpu().tolist()

for bi, (td, seq) in enumerate(zip(td_init, actions), start=1):
    coords = get_coords(td)                    # per-instance coordinates; index 0 is the depot

    print(f"trip {bi}:")
    tour_total = 0.0
    for si, seg in enumerate(split_by_depot(seq), start=1):
        length = tour_length(seg, coords)
        tour_total += length
        label = "-".join(idx_to_name(i) for i in seg)
        print(f"  tour {si}: {label} | length = {length:.2f}")
    print(f"  total: {tour_total:.2f}\n")

# Section 3: Train

## Checkpoint Generation

In [8]:
from lightning.pytorch.callbacks import ModelCheckpoint, RichModelSummary

# Checkpointing callback. With save_top_k=-1 a checkpoint is kept for every
# epoch (not only when the monitored metric improves); "val/reward" with
# mode="max" is still the metric registered for monitoring.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",            # save to checkpoints/
    filename="epoch_{epoch:03d}",     # save as epoch_XXX.ckpt
    # Without this, Lightning prefixes each formatted field with "<name>=",
    # producing "epoch_epoch=XXX.ckpt" instead of the intended "epoch_XXX.ckpt".
    auto_insert_metric_name=False,
    save_top_k=-1,                    # save all checkpoints
    save_last=True,                   # save the last model
    monitor="val/reward",             # monitor validation reward
    mode="max",                       # maximize validation reward
)

# Print model summary
rich_model_summary = RichModelSummary(max_depth=3)

# Callbacks list
callbacks = [checkpoint_callback, rich_model_summary]

## Trainer Details

In [9]:
from rl4co.utils.trainer import RL4COTrainer

# Train on the GPU when CUDA is available and fall back to the CPU otherwise,
# using the same rule that selects `device` for the evaluation tensors, so the
# notebook does not fail at this cell on a machine without a GPU.
trainer = RL4COTrainer(
    max_epochs=100,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    # logger=logger,
    callbacks=callbacks,
)

Using 16bit Automatic Mixed Precision (AMP)
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
c:\Users\yizhe\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\trainer\connectors\logger_connector\logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


## Training Process

In [10]:
# Run training of `model` with the trainer configured in the previous cell.
# No dataloaders are passed here, so data handling is left to the model itself.
trainer.fit(model)

c:\Users\yizhe\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\callbacks\model_checkpoint.py:881: Checkpoint directory C:\Users\yizhe\Desktop\DRL ISARC Code\checkpoints exists and is not empty.
val_file not set. Generating dataset instead
test_file not set. Generating dataset instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\yizhe\anaconda3\envs\rl4co\Lib\site-packages\lightning\pytorch\utilities\model_summary\model_summary.py:242: Precision 16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.


Output()

`Trainer.fit` stopped: `max_epochs=100` reached.


# Section 4: Trained Results

## Tour Length Result Output

Greedy

In [None]:
# Greedy rollouts of the trained model on the same instances (td_init) that
# were evaluated before training.
greedy_kwargs = {"phase": "test", "decode_type": "greedy"}
policy = model.policy.to(device)
out = policy(td_init.clone(), env, **greedy_kwargs)

# Negated reward per instance.
print(out["reward"].neg())

Sampling Top_p

In [None]:
from rl4co.utils.ops import batchify

num_samples, top_p = 32, 0.9

# Repeat every instance num_samples times and decode all copies by sampling.
bs = td_init.batch_size[0]
td_init_batched = batchify(td_init, num_samples)
out = model(td_init_batched.clone(), decode_type="sampling", top_p=top_p)

# Chunks of size bs are stacked along dim 1 -> one row per instance, one
# column per chunk; the max over dim 1 keeps the best reward per instance.
# Note: these are raw rewards, whereas the greedy cell prints -reward.
reward_chunks = out["reward"].split(bs)
rewards = torch.stack(reward_chunks, dim=1).max(dim=1).values
print(rewards)
print(rewards.mean().item())


Sampling Top_p + Temperature

In [None]:
# NOTE: this overwrites an attribute on the policy object and is not reset
# afterwards, so the value stays set for anything run after this cell.
model.policy.temperature = 2.0

num_samples, top_p = 32, 1

# Repeat every instance num_samples times and decode all copies by sampling.
bs = td_init.batch_size[0]
td_init_batched = batchify(td_init, num_samples)
out = model(td_init_batched.clone(), decode_type="sampling", top_p=top_p)

# Chunks of size bs are stacked along dim 1 -> one row per instance, one
# column per chunk; the max over dim 1 keeps the best reward per instance.
reward_chunks = out["reward"].split(bs)
rewards = torch.stack(reward_chunks, dim=1).max(dim=1).values
print(rewards)
print(rewards.mean().item())

## Tour Details

In [None]:
# 0 -> "depot", others -> "city{k}"
def idx_to_name(i: int) -> str:
    """Return the display name of node ``i``: ``"depot"`` for 0, ``"city{i}"`` otherwise."""
    if i == 0:
        return "depot"
    return f"city{i}"

# split actions into several tours by depot
def split_by_depot(seq_idx):
    """Cut a flat action sequence into depot-to-depot tours.

    Args:
        seq_idx: Sequence of node indices; 0 is the depot and -1 marks padding.

    Returns:
        A list of tours, each a list that starts and ends with 0. Tours that
        contain no non-depot node (e.g. consecutive depot visits) are dropped.
    """
    # Drop padding entries; this also copies the input so it is never mutated.
    visits = [node for node in seq_idx if node != -1]
    # Make sure the trip starts and ends at the depot.
    if len(visits) == 0 or visits[0] != 0:
        visits.insert(0, 0)
    if visits[-1] != 0:
        visits.append(0)

    tours = []
    leg = visits[:1]
    for node in visits[1:]:
        leg.append(node)
        if node != 0:
            continue
        # Reached the depot: close the current leg, skipping empty 0-0 legs.
        interior = leg[1:-1]
        if any(stop != 0 for stop in interior):
            tours.append(leg)
        leg = [0]                  # the next leg starts from the depot again
    return tours

# get coordinates from td ("locs" preferred, "coords" as fallback)
def get_coords(td):
    """Return the coordinate entry of ``td``.

    The keys ``"locs"`` and ``"coords"`` are tried in that order, which is
    what the error message below announces.

    Args:
        td: Mapping-like object exposing ``keys()`` and ``td[key]``.

    Returns:
        The value stored under the first of the two keys that is present.

    Raises:
        KeyError: If neither ``"locs"`` nor ``"coords"`` is present.
    """
    for key in ("locs", "coords"):
        if key in td.keys():
            return td[key]
    raise KeyError("No coordinates found in tensordict (expected 'locs' or 'coords').")

# Calculate distance
def tour_length(seg, coords: torch.Tensor) -> float:
    """Sum of Euclidean distances between consecutive nodes of ``seg``.

    Args:
        seg: Sequence of integer row indices into ``coords``, in visiting order.
        coords: Tensor whose rows are point coordinates (indexed along dim 0).

    Returns:
        The total path length as a Python float; 0.0 for a single-node path.
    """
    order = torch.tensor(seg, dtype=torch.long, device=coords.device)
    path = coords.index_select(0, order)                 # one row per visited node
    hops = path[1:] - path[:-1]                          # displacement of each step
    hop_lengths = torch.sqrt(torch.sum(hops ** 2, dim=-1))
    return float(hop_lengths.sum().item())

# === calculate each tour and total length ===
# `out` may come from one of the sampling cells, in which case it holds
# num_samples * batch rows (chunk r covers rows r*bs .. (r+1)*bs - 1, the same
# layout those cells rely on when they call out["reward"].split(bs)).
# Zipping such an `out` directly with td_init would silently stop after the
# first chunk and report the first sample of each instance, not the best one,
# so reduce to the highest-reward sample per instance first.
num_instances = td_init.batch_size[0]
action_rows = out["actions"].cpu()
if action_rows.shape[0] != num_instances:
    if action_rows.shape[0] % num_instances != 0:
        raise ValueError(
            f"out['actions'] has {action_rows.shape[0]} rows, which is not a multiple of "
            f"the {num_instances} instances in td_init"
        )
    repeats = action_rows.shape[0] // num_instances
    # [repeats, num_instances] -> index of the best sample for every instance
    best_sample = out["reward"].detach().cpu().reshape(repeats, num_instances).argmax(dim=0)
    action_rows = action_rows.reshape(repeats, num_instances, -1)[
        best_sample, torch.arange(num_instances)
    ]
actions = action_rows.tolist()

for bi, (td, seq) in enumerate(zip(td_init, actions), start=1):
    coords = get_coords(td)                    # per-instance coordinates; index 0 is the depot
    segs = split_by_depot(seq)

    print(f"trip {bi}:")
    tour_total = 0.0
    for si, seg in enumerate(segs, start=1):
        length = tour_length(seg, coords)
        tour_total += length
        names = [idx_to_name(i) for i in seg]
        print(f"  tour {si}: {'-'.join(names)} | length = {length:.2f}")
    print(f"  total: {tour_total:.2f}\n")

## Plot

In [None]:
# Plotting
# `out` may hold several sampled solutions per instance (num_samples * batch
# rows, chunked the same way as out["reward"].split(bs) in the sampling cells).
# Zipping that with td_init would silently render only the first sample of
# each instance, so keep the highest-reward sample per instance instead.
num_instances = td_init.batch_size[0]
plot_actions = out["actions"].cpu()
if plot_actions.shape[0] != num_instances:
    if plot_actions.shape[0] % num_instances != 0:
        raise ValueError(
            f"out['actions'] has {plot_actions.shape[0]} rows, which is not a multiple of "
            f"the {num_instances} instances in td_init"
        )
    repeats = plot_actions.shape[0] // num_instances
    best_sample = out["reward"].detach().cpu().reshape(repeats, num_instances).argmax(dim=0)
    plot_actions = plot_actions.reshape(repeats, num_instances, -1)[
        best_sample, torch.arange(num_instances)
    ]

for td, instance_actions in zip(td_init, plot_actions):
    env.render(td, instance_actions)