This notebook tests TorchRL functions that make tracking rewards, actions, etc. easier.

In [86]:
import torch
import torch.nn as nn

# Tensordict modules
from tensordict.nn import set_composite_lp_aggregate, TensorDictModule, TensorDictSequential
from tensordict import  TensorDictBase
from torch import multiprocessing

# Data collection
from torchrl.collectors import SyncDataCollector
from torch.distributions import Categorical
from torchrl.data.replay_buffers import ReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
from torchrl.data.replay_buffers.storages import LazyTensorStorage

#Env
from torchrl.envs import RewardSum, TransformedEnv, PettingZooWrapper, Compose, DoubleToFloat, StepCounter, ParallelEnv, EnvCreator, ExplorationType, set_exploration_type

# Utils
from torchrl.envs.utils import check_env_specs

# Multi-agent network
from torchrl.modules import MultiAgentMLP, ProbabilisticActor, TanhNormal

# Loss
from torchrl.objectives import ClipPPOLoss, ValueEstimators

# Utils
torch.manual_seed(0)
from matplotlib import pyplot as plt
from tqdm import tqdm


from til_environment import gridworld

In [87]:
# RewardNames supplies the reward-event keys used below. Import it explicitly so
# this cell runs on a fresh kernel (Restart & Run All) instead of relying on a
# name left over from an earlier session.
# NOTE(review): assumes RewardNames lives in til_environment.types — confirm
# against the installed til_environment package.
from til_environment.types import RewardNames

# Reward overrides handed to the gridworld environment via `rewards_dict`.
custom_rewards = {
    RewardNames.GUARD_CAPTURES: 100,
    RewardNames.SCOUT_CAPTURED: -100,
    RewardNames.SCOUT_RECON: 2,
    RewardNames.SCOUT_MISSION: 10,
    RewardNames.WALL_COLLISION: -5,       # experiment with this
    RewardNames.STATIONARY_PENALTY: -1,
    RewardNames.SCOUT_STEP: -0.1,         # small step penalty to encourage efficiency
    RewardNames.GUARD_STEP: -0.1,         # same for guards
}

# Render mode passed to the environment constructor.
RENDER_MODE = 'rgb_array'


In [88]:
"""
By default the env is AEC, so agents take turns and act one at a time.

TorchRL needs to train in parallel, meaning all agents submit their moves at the same time, so we use the provided parallel_env.

PettingZooWrapper => wraps the parallel env into a framework that is compatible with RL algorithms such as MAPPO and with transforms.

"""


def make_env():
    """Build a single TorchRL-compatible gridworld environment.

    The environment is assembled in three layers:

    1. the PettingZoo parallel gridworld env, configured from the
       notebook-level ``RENDER_MODE`` and ``custom_rewards``;
    2. a ``PettingZooWrapper`` that exposes it through the TorchRL env API;
    3. a ``TransformedEnv`` that adds episode-reward accumulation, a
       double-to-float cast and a step counter.

    Returns:
        TransformedEnv: the wrapped and transformed environment.
    """
    pettingzoo_env = gridworld.parallel_env(
        novice=False,
        debug=False,
        rewards_dict=custom_rewards,
        render_mode=RENDER_MODE,
    )

    # categorical_actions=False -> actions are one-hot encoded.
    # done_on_any=True -> the episode ends as soon as any agent is done.
    torchrl_env = PettingZooWrapper(
        env=pettingzoo_env,
        done_on_any=True,
        categorical_actions=False,
    )

    # Accumulate ("player", "reward") into ("player", "episode_reward").
    episode_return = RewardSum(
        out_keys=[("player", "episode_reward")],
        in_keys=[("player", "reward")],
    )

    # Transform order is kept as: reward sum -> dtype cast -> step counter.
    # NOTE(review): DoubleToFloat only affects double-precision entries; the
    # printed observation spec still reports int64 observations.
    transforms = Compose(episode_return, DoubleToFloat(), StepCounter())

    return TransformedEnv(torchrl_env, transforms)

"""
Reward sum transformation => 

reward_sum = RewardSum(
    in_keys=[("player", "reward")],
    out_keys=[("player", "episode_reward")]
)

Reads from td[player][reward] => transforms to => td[player][episode_reward]
"""

'\nReward sum transformation => \n\nreward_sum = RewardSum(\n    in_keys=[("player", "reward")],\n    out_keys=[("player", "episode_reward")]\n)\n\nReads from td[player][reward] => transforms to => td[player][episode_reward]\n'

In [89]:
# Batched environment built from `make_env`. Only one worker is used for now
# to keep things simple; `serial_for_single=True` is requested for that
# single-worker case.
env = ParallelEnv(
    create_env_fn=EnvCreator(make_env),
    num_workers=1,
    serial_for_single=True,
)

In [90]:
# Check that the keys added by the wrapper/transforms appear on the env:
# each row pairs the printed label with the env attribute it reports.
for label, attr in (
    ("action_keys:", "action_keys"),
    ("reward_keys:", "reward_keys"),
    ("done_keys:", "done_keys"),

    ("Action Spec:", "action_spec"),
    ("Observation Spec:", "observation_spec"),
    ("Reward Spec:", "reward_spec"),
    ("Done Spec:", "done_spec"),
):
    # Attributes are looked up one at a time, right before printing.
    print(label, getattr(env, attr))
     

action_keys: [('player', 'action')]
reward_keys: [('player', 'reward')]
done_keys: ['done', 'terminated', 'truncated', ('player', 'done'), ('player', 'terminated'), ('player', 'truncated')]
Action Spec: Composite(
    player: Composite(
        action: OneHot(
            shape=torch.Size([1, 4, 5]),
            space=CategoricalBox(n=5),
            device=cpu,
            dtype=torch.int64,
            domain=discrete),
        device=cpu,
        shape=torch.Size([1, 4])),
    device=cpu,
    shape=torch.Size([1]))
Observation Spec: Composite(
    player: Composite(
        observation: BoundedDiscrete(
            shape=torch.Size([1, 4, 572]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([1, 4, 572]), device=cpu, dtype=torch.int64, contiguous=True),
                high=Tensor(shape=torch.Size([1, 4, 572]), device=cpu, dtype=torch.int64, contiguous=True)),
            device=cpu,
            dtype=torch.int64,
            domain=discrete),
        e

### Decoding the above tensordict (dictionary of tensors)

```
action_keys: [('player', 'action')]

reward_keys: [('player', 'reward')]

done_keys: ['done', 'terminated', 'truncated', ('player', 'done'), ('player', 'terminated'), ('player', 'truncated')]

Action Spec: Composite(
    player: Composite(
        action: OneHot(
            shape=torch.Size([1, 4, 5]),
            space=CategoricalBox(n=5),
        )
    )
)

Observation Spec: Composite(
    player: Composite(
        observation: BoundedDiscrete(
            shape=torch.Size([1, 4, 572]),
        )
        episode_reward: UnboundedContinuous(
            shape=torch.Size([1, 4, 1]),
        )
    )
    step_count: BoundedDiscrete(
        shape=torch.Size([1, 1]),
    )
)

Reward Spec: Composite(
    player: Composite(
        reward: UnboundedContinuous(
            shape=torch.Size([1, 4, 1]),
        )
    )
)

Done Spec: Composite(
    done: Categorical(
        shape=torch.Size([1, 1]),
    )
    terminated: Categorical(
        shape=torch.Size([1, 1]),
    )
    truncated: Categorical(
        shape=torch.Size([1, 1]),
    )
    player: Composite(
        done: Categorical(
            shape=torch.Size([1, 4, 1]),
        )
        terminated: Categorical(
            shape=torch.Size([1, 4, 1]),
        )
        truncated: Categorical(
            shape=torch.Size([1, 4, 1]),
        )
    )
)
```


In [91]:
# ### Example of schema 

# TensorDict({
#     'player': TensorDict({
#         'observation': Tensor(shape=[1, 4, 572], dtype=torch.int64),
#         'action': Tensor(shape=[1, 4, 5], dtype=torch.int64),        # one-hot
#         'reward': Tensor(shape=[1, 4, 1], dtype=torch.float32),      # per-agent reward
#         'episode_reward': Tensor(shape=[1, 4, 1], dtype=torch.float32),  # cumulative reward
#         'done': Tensor(shape=[1, 4, 1], dtype=torch.bool),           # per-agent done
#         'terminated': Tensor(shape=[1, 4, 1], dtype=torch.bool),     # true if done normally
#         'truncated': Tensor(shape=[1, 4, 1], dtype=torch.bool),      # true if cutoff
#     }),
#     'step_count': Tensor(shape=[1, 1], dtype=torch.int64),           # steps taken in episode
#     'done': Tensor(shape=[1, 1], dtype=torch.bool),                  # global done
#     'terminated': Tensor(shape=[1, 1], dtype=torch.bool),            # global terminated
#     'truncated': Tensor(shape=[1, 1], dtype=torch.bool),             # global truncated
# })


# consider a single rollout 
# td = {
#   'player': {
#     'observation': tensor([
#         [  
#             [...572 obs for player_0...],
#             [...572 obs for player_1...],
#             [...572 obs for player_2...],
#             [...572 obs for player_3...]
#         ]
#     ]),  # shape = [1, 4, 572]

#     'action': tensor([
#         [  # 1 env, 4 agents
#             [0, 1, 0, 0, 0],  # player_0 chose action 1
#             [1, 0, 0, 0, 0],  # player_1 chose action 0
#             [0, 0, 1, 0, 0],  # player_2 chose action 2
#             [0, 0, 0, 1, 0],  # player_3 chose action 3
#         ]
#     ]),  # shape = [1, 4, 5]

#     'reward': tensor([
#         [
#             [1.0],  # player_0
#             [0.5],  # player_1
#             [0.0],  # player_2
#             [0.0]   # player_3
#         ]
#     ]),  # shape = [1, 4, 1]

#     'episode_reward': tensor([
#         [
#             [3.0],  # player_0 has earned 3.0 total so far
#             [2.5],  # player_1
#             [1.0],  # player_2
#             [0.0]   # player_3
#         ]
#     ]),  # shape = [1, 4, 1]

#     'done': tensor([
#         [
#             [False],
#             [False],
#             [False],
#             [True]   # player_3 is done (captured)
#         ]
#     ]),  # shape = [1, 4, 1]

#     'terminated': same as done
#     'truncated': all False unless cutoff by max steps
#   },

#   'step_count': tensor([[5]]),   # environment has run 5 steps so far

#   'done': tensor([[True]]),      # environment is done (because of `done_on_any`)
#   'terminated': tensor([[True]]),
#   'truncated': tensor([[False]])
# }


"""
flat-keys => top layer => such as 'player', 'done'...


to access nested infp => td['player']['done']

"""


"\nflat-keys => top layer => such as 'player', 'done'...\n\n\nto access nested infp => td['player']['done']\n\n"

In [92]:
# Sanity-check the batched env against its declared specs before building
# anything on top of it; on success TorchRL logs "check_env_specs succeeded!".
check_env_specs(env)

2025-05-27 17:00:39,240 [torchrl][INFO] check_env_specs succeeded!


PettingZoo environments are AEC by default, but this one also provides a parallel API (`parallel_env`).

1. Wrap the parallel_env in a PettingZooWrapper, which converts the PettingZoo format into a TorchRL-compatible env; that env is then wrapped in a TransformedEnv to attach transforms.


2. once wrapped TorchRL can access observations, actions, rewards, and done flags through a structured format called a TensorDict. 

3. Because roles are rotated dynamically every round, group-based functions such as RewardSum fail. Rather than implementing complex dynamic role allocation, try flag-based approaches first.