# Environment Solver

In [1]:
# Enable autoreloading of modules
%load_ext autoreload
%autoreload 2

In [2]:
ENV_ID = "CartPole-v1"
#ENV_ID = "LunarLander-v3"
#ALGORITHM = "reinforce"
ALGORITHM = "ppo"

Install packages:

In [3]:
from utils.environment import suppress_warnings

# Suppress common warnings
suppress_warnings()

Load secrets:

In [4]:
from tsilva_notebook_utils.colab import load_secrets_into_env

_ = load_secrets_into_env([
    'WANDB_API_KEY'
])

In [5]:
import torch.nn as nn
from tsilva_notebook_utils.gymnasium import build_env as _build_env, set_random_seed
from utils.config import load_config

# Load configuration from YAML files
CONFIG = load_config(ENV_ID, ALGORITHM)
print(f"Loaded config for {ENV_ID} with {ALGORITHM} algorithm:")
print(CONFIG)

Loaded config for CartPole-v1 with ppo algorithm:
RLConfig(env_id='CartPole-v1', seed=42, max_epochs=-1, gamma=0.99, lam=0.95, clip_epsilon=0.2, batch_size=256, train_rollout_steps=512, eval_interval=20, eval_episodes=5, reward_threshold=475, policy_lr=0.001, value_lr=0.001, hidden_dim=32, entropy_coef=0.01, shared_backbone=True, backbone_dim=64, normalize=False, mean_reward_window=100, rollout_interval=1, n_envs='auto', async_rollouts=True)


Build environment:

In [6]:
from tsilva_notebook_utils.gymnasium import log_env_info
from utils.environment import setup_environment

# Setup environment with configuration
build_env_fn = setup_environment(CONFIG)

# Test building env
env = build_env_fn(CONFIG.seed)
log_env_info(env)

Environment Info (SubprocVecEnv with 12 envs)
  Env ID: CartPole-v1
  Observation space: Box(low=[-4.8, -inf, -0.419, -inf], high=[4.8, inf, 0.419, inf], shape=(4,), dtype=float32)
  Action space: Discrete(2)
  Max episode steps: 500


Define models:

In [20]:
import numpy as np
from utils.training import create_agent, create_trainer
from tsilva_notebook_utils.torch import get_default_device

# Get environment dimensions
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n if hasattr(env.action_space, 'n') else env.action_space.shape[0]

# Debug device information
print(f"Default device: {get_default_device()}")

# Create agent using utility function
agent = create_agent(CONFIG, build_env_fn, obs_dim, act_dim, algorithm=ALGORITHM)

# Debug model devices
print(f"Policy model device: {next(agent.policy_model.parameters()).device}")
if hasattr(agent, 'value_model') and agent.value_model is not None:
    print(f"Value model device: {next(agent.value_model.parameters()).device}")
print(f"Rollout collector type: {type(agent.rollout_collector)}")

# Create trainer with W&B logging
trainer = create_trainer(CONFIG, project_name=ENV_ID, run_name=f"{ALGORITHM}-{CONFIG.seed}")

# Fit the model
trainer.fit(agent)

Default device: cuda
Policy model device: cuda:0
Value model device: cuda:0
Rollout collector type: <class 'utils.rollouts.SyncRolloutCollector'>
Policy model device: cuda:0
Value model device: cuda:0
Rollout collector type: <class 'utils.rollouts.SyncRolloutCollector'>


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


🔗 W&B Run: https://wandb.ai/tsilva/CartPole-v1/runs/z3enerfv
Waiting for initial rollout...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type              | Params | Mode 
-----------------------------------------------------------
0 | policy_model | SharedPolicyNet   | 4.6 K  | train
1 | value_model  | SharedValueNet    | 4.6 K  | train
2 | shared_model | SharedBackboneNet | 4.6 K  | eval 
-----------------------------------------------------------
4.6 K     Trainable params
0         Non-trainable params
4.6 K     Total params
0.018     Total estimated model params size (MB)
2         Modules in train mode
12        Modules in eval mode

  | Name         | Type              | Params | Mode 
-----------------------------------------------------------
0 | policy_model | SharedPolicyNet   | 4.6 K  | train
1 | value_model  | SharedValueNet    | 4.6 K  | train
2 | shared_model | SharedBackboneNet | 4.6 K  | eval 
-----------------------------------------------------------
4.6 K     Trainable params
0         Non-trainable params
4.6 K     Total params
0.018     

Training started at 2025-07-16 22:27:53


/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Early stopping at epoch 39 with eval mean reward 497.00 >= threshold 475
Training completed in 71.09 seconds (1.18 minutes)
Training completed in 71.09 seconds (1.18 minutes)


In [8]:
from utils.evaluation import evaluate_agent

# Evaluate agent and render episodes
results = evaluate_agent(
    agent, 
    build_env_fn, 
    n_episodes=8, 
    deterministic=True, 
    render=True,
    grid=(2, 2), 
    text_color=(0, 0, 0), 
    out_dir="./tmp"
)

print(f"Mean reward: {results['mean_reward']:.2f}")

  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists


Mean reward: 500.00


In [9]:
from utils.training import get_monitoring_info

# Get monitoring information
monitoring_info = get_monitoring_info()

print("Key metrics to watch on W&B dashboard:")
for metric in monitoring_info["primary_metrics"]:
    print(f"  - {metric}")

print("\nWarning conditions:")
for condition, action in monitoring_info["warning_conditions"].items():
    print(f"  - {condition}: {action}")

Key metrics to watch on W&B dashboard:
  - eval/mean_reward
  - train/mean_reward
  - epoch/explained_var
  - epoch/entropy
  - epoch/clip_fraction

  - epoch/clip_fraction > 0.5: Reduce policy_lr or clip_epsilon
  - epoch/approx_kl > 0.1: Reduce policy_lr
  - epoch/explained_var < 0.3: Increase value_lr or network size
  - epoch/entropy < 0.01: Increase entropy_coef
  - rollout/queue_miss > rollout/queue_updated: Check async collection


## Test Shared Backbone PPO

Test the new shared backbone feature for PPO:

In [11]:
# Test shared backbone PPO
from utils.config import load_config
from utils.training import create_agent

# Create a config with shared backbone enabled
shared_config = load_config(ENV_ID, "ppo")
shared_config.shared_backbone = True
shared_config.backbone_dim = [128, 64]  # Larger backbone with multiple layers
shared_config.hidden_dim = 32  # Smaller heads

print("Original PPO config:")
print(f"  shared_backbone: {getattr(CONFIG, 'shared_backbone', False)}")
print(f"  hidden_dim: {CONFIG.hidden_dim}")

print("\nShared backbone config:")
print(f"  shared_backbone: {shared_config.shared_backbone}")
print(f"  backbone_dim: {shared_config.backbone_dim}")
print(f"  hidden_dim: {shared_config.hidden_dim}")

# Create shared backbone agent
shared_agent = create_agent(shared_config, build_env_fn, obs_dim, act_dim, algorithm="ppo")

print(f"\nShared agent type: {type(shared_agent)}")
print(f"Uses shared backbone: {shared_agent.use_shared_backbone}")
print(f"Shared model type: {type(shared_agent.shared_model) if shared_agent.shared_model else None}")

# Compare parameter counts
original_params = sum(p.numel() for p in agent.policy_model.parameters()) + sum(p.numel() for p in agent.value_model.parameters())
shared_params = sum(p.numel() for p in shared_agent.shared_model.parameters()) if shared_agent.shared_model else 0

print(f"\nParameter comparison:")
print(f"  Original (separate models): {original_params:,} parameters")
print(f"  Shared backbone: {shared_params:,} parameters")
print(f"  Difference: {shared_params - original_params:+,} parameters")

Original PPO config:
  shared_backbone: True
  hidden_dim: 32

Shared backbone config:
  shared_backbone: True
  backbone_dim: [128, 64]
  hidden_dim: 32

Shared agent type: <class 'learners.ppo.PPOLearner'>
Uses shared backbone: True
Shared model type: <class 'utils.models.SharedBackboneNet'>

Parameter comparison:
  Original (separate models): 9,158 parameters
  Shared backbone: 13,155 parameters
  Difference: +3,997 parameters


In [13]:
# Test training the shared backbone agent
from utils.training import create_trainer

# Create a smaller config for testing
test_config = shared_config
test_config.max_epochs = 5

# Create trainer for shared backbone agent
shared_trainer = create_trainer(test_config, project_name=f"{ENV_ID}-shared", run_name=f"shared-ppo-{test_config.seed}")

print(f"Training shared backbone agent for {test_config.max_epochs} epochs...")
shared_trainer.fit(shared_agent)

/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


🔗 W&B Run: https://wandb.ai/tsilva/CartPole-v1-shared/runs/ivya1ika
Training shared backbone agent for 5 epochs...
Waiting for initial rollout...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type              | Params | Mode 
-----------------------------------------------------------
0 | policy_model | SharedPolicyNet   | 13.2 K | train
1 | value_model  | SharedValueNet    | 13.2 K | train
2 | shared_model | SharedBackboneNet | 13.2 K | train
-----------------------------------------------------------
13.2 K    Trainable params
0         Non-trainable params
13.2 K    Total params
0.053     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode


Training started at 2025-07-16 22:22:02


/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.
Exception in thread Thread-9 (_collect_loop):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/home/tsilva/repos/tsilva/gymnasium-solver/utils/rollouts.py", line 401, in _collect_loop
    trajectories, extras = collect_rollouts(
  File "/home/tsilva/repos/tsilva/gymnasium-solver/utils/rollouts.py", line 100, in collect_rollouts
    logits = policy_model(obs_t)
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/tsilva/repos/tsilva/gymnasi

Training completed in 10.77 seconds (0.18 minutes)


In [14]:
# Debug device placement
print("Device debugging:")
print(f"Policy model device: {next(shared_agent.policy_model.parameters()).device}")
print(f"Value model device: {next(shared_agent.value_model.parameters()).device}")
print(f"Shared model device: {next(shared_agent.shared_backbone.parameters()).device}")

# Check if the wrapper models reference the same underlying model
print(f"Policy wrapper shared_net is shared_backbone: {shared_agent.policy_model.shared_net is shared_agent.shared_backbone}")
print(f"Value wrapper shared_net is shared_backbone: {shared_agent.value_model.shared_net is shared_agent.shared_backbone}")

# Check rollout collector models
print(f"Rollout collector policy device: {next(shared_agent.rollout_collector.policy_model.parameters()).device}")
print(f"Rollout collector value device: {next(shared_agent.rollout_collector.value_model.parameters()).device}")

Device debugging:
Policy model device: cpu
Value model device: cpu
Shared model device: cpu
Policy wrapper shared_net is shared_backbone: True
Value wrapper shared_net is shared_backbone: True
Rollout collector policy device: cpu
Rollout collector value device: cpu


In [15]:
# Create a new shared backbone agent with device fixes
print("Creating new shared backbone agent...")
shared_agent_v2 = create_agent(shared_config, build_env_fn, obs_dim, act_dim, algorithm="ppo")

# Debug device placement for new agent
print("New agent device debugging:")
print(f"Policy model device: {next(shared_agent_v2.policy_model.parameters()).device}")
print(f"Value model device: {next(shared_agent_v2.value_model.parameters()).device}")
print(f"Shared model device: {next(shared_agent_v2.shared_backbone.parameters()).device}")
print(f"Rollout collector policy device: {next(shared_agent_v2.rollout_collector.policy_model.parameters()).device}")
print(f"Rollout collector value device: {next(shared_agent_v2.rollout_collector.value_model.parameters()).device}")

# Try a quick training test
test_config_v2 = shared_config
test_config_v2.max_epochs = 2
shared_trainer_v2 = create_trainer(test_config_v2, project_name=f"{ENV_ID}-shared-v2", run_name=f"shared-ppo-v2-{test_config_v2.seed}")

print(f"Testing training with new agent for {test_config_v2.max_epochs} epochs...")
shared_trainer_v2.fit(shared_agent_v2)

Creating new shared backbone agent...
New agent device debugging:
Policy model device: cuda:0
Value model device: cuda:0
Shared model device: cuda:0
Rollout collector policy device: cuda:0
Rollout collector value device: cuda:0


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


🔗 W&B Run: https://wandb.ai/tsilva/CartPole-v1-shared-v2/runs/q6z2bblw
Testing training with new agent for 2 epochs...
Waiting for initial rollout...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type              | Params | Mode 
-----------------------------------------------------------
0 | policy_model | SharedPolicyNet   | 13.2 K | train
1 | value_model  | SharedValueNet    | 13.2 K | train
2 | shared_model | SharedBackboneNet | 13.2 K | train
-----------------------------------------------------------
13.2 K    Trainable params
0         Non-trainable params
13.2 K    Total params
0.053     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode


Training started at 2025-07-16 22:23:33


/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.
Exception in thread Thread-11 (_collect_loop):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/home/tsilva/repos/tsilva/gymnasium-solver/utils/rollouts.py", line 412, in _collect_loop
    trajectories, extras = collect_rollouts(
  File "/home/tsilva/repos/tsilva/gymnasium-solver/utils/rollouts.py", line 100, in collect_rollouts
    logits = policy_model(obs_t)
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/tsilva/repos/tsilva/gymnas

Training completed in 3.57 seconds (0.06 minutes)


In [16]:
# Test the fixed shared backbone agent
print("Creating a new shared backbone agent with collect_rollouts device fix...")
shared_agent_v3 = create_agent(shared_config, build_env_fn, obs_dim, act_dim, algorithm="ppo")

# Test training for 2 epochs
test_config_v3 = shared_config
test_config_v3.max_epochs = 2
shared_trainer_v3 = create_trainer(test_config_v3, project_name=f"{ENV_ID}-shared-v3", run_name=f"shared-ppo-v3-{test_config_v3.seed}")

print(f"Testing training with device-fixed agent for {test_config_v3.max_epochs} epochs...")
shared_trainer_v3.fit(shared_agent_v3)

Creating a new shared backbone agent with collect_rollouts device fix...


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


🔗 W&B Run: https://wandb.ai/tsilva/CartPole-v1-shared-v3/runs/ski19pwz
Testing training with device-fixed agent for 2 epochs...
Waiting for initial rollout...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type              | Params | Mode 
-----------------------------------------------------------
0 | policy_model | SharedPolicyNet   | 13.2 K | train
1 | value_model  | SharedValueNet    | 13.2 K | train
2 | shared_model | SharedBackboneNet | 13.2 K | train
-----------------------------------------------------------
13.2 K    Trainable params
0         Non-trainable params
13.2 K    Total params
0.053     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode


Training started at 2025-07-16 22:24:27


/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.
Exception in thread Thread-13 (_collect_loop):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/home/tsilva/repos/tsilva/gymnasium-solver/utils/rollouts.py", line 415, in _collect_loop
    trajectories, extras = collect_rollouts(
  File "/home/tsilva/repos/tsilva/gymnasium-solver/utils/rollouts.py", line 109, in collect_rollouts
    value_model(obs_t).squeeze(-1)
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/tsilva/repos/tsilva/gymn

Training completed in 5.37 seconds (0.09 minutes)


In [17]:
# Test the final fix for shared backbone agent
print("Creating shared backbone agent with state dict device mapping fix...")
shared_agent_final = create_agent(shared_config, build_env_fn, obs_dim, act_dim, algorithm="ppo")

# Test training for 3 epochs to be sure
test_config_final = shared_config
test_config_final.max_epochs = 3
shared_trainer_final = create_trainer(test_config_final, project_name=f"{ENV_ID}-shared-final", run_name=f"shared-ppo-final-{test_config_final.seed}")

print(f"Testing training with final device fix for {test_config_final.max_epochs} epochs...")
shared_trainer_final.fit(shared_agent_final)

Creating shared backbone agent with state dict device mapping fix...


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


🔗 W&B Run: https://wandb.ai/tsilva/CartPole-v1-shared-final/runs/7ft0hlt3
Testing training with final device fix for 3 epochs...
Waiting for initial rollout...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type              | Params | Mode 
-----------------------------------------------------------
0 | policy_model | SharedPolicyNet   | 13.2 K | train
1 | value_model  | SharedValueNet    | 13.2 K | train
2 | shared_model | SharedBackboneNet | 13.2 K | train
-----------------------------------------------------------
13.2 K    Trainable params
0         Non-trainable params
13.2 K    Total params
0.053     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode


Training started at 2025-07-16 22:25:13


/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.
Exception in thread Thread-15 (_collect_loop):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/home/tsilva/repos/tsilva/gymnasium-solver/utils/rollouts.py", line 414, in _collect_loop
    trajectories, extras = collect_rollouts(
  File "/home/tsilva/repos/tsilva/gymnasium-solver/utils/rollouts.py", line 103, in collect_rollouts
    logits = policy_model(obs_t)
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/tsilva/repos/tsilva/gymnas

Training completed in 7.25 seconds (0.12 minutes)


In [18]:
# Test comprehensive fix for shared backbone agent
print("Creating shared backbone agent with comprehensive initialization and device fixes...")
shared_agent_comprehensive = create_agent(shared_config, build_env_fn, obs_dim, act_dim, algorithm="ppo")

# Test training for 5 epochs
test_config_comprehensive = shared_config
test_config_comprehensive.max_epochs = 5
shared_trainer_comprehensive = create_trainer(test_config_comprehensive, project_name=f"{ENV_ID}-shared-comprehensive", run_name=f"shared-ppo-comprehensive-{test_config_comprehensive.seed}")

print(f"Testing training with comprehensive fixes for {test_config_comprehensive.max_epochs} epochs...")
shared_trainer_comprehensive.fit(shared_agent_comprehensive)

Creating shared backbone agent with comprehensive initialization and device fixes...


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


🔗 W&B Run: https://wandb.ai/tsilva/CartPole-v1-shared-comprehensive/runs/37wz8owt
Testing training with comprehensive fixes for 5 epochs...
Waiting for initial rollout...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type              | Params | Mode 
-----------------------------------------------------------
0 | policy_model | SharedPolicyNet   | 13.2 K | train
1 | value_model  | SharedValueNet    | 13.2 K | train
2 | shared_model | SharedBackboneNet | 13.2 K | eval 
-----------------------------------------------------------
13.2 K    Trainable params
0         Non-trainable params
13.2 K    Total params
0.053     Total estimated model params size (MB)
2         Modules in train mode
14        Modules in eval mode


Training started at 2025-07-16 22:26:41


/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.
Exception in thread Thread-17 (_collect_loop):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/home/tsilva/repos/tsilva/gymnasium-solver/utils/rollouts.py", line 439, in _collect_loop
    trajectories, extras = collect_rollouts(
  File "/home/tsilva/repos/tsilva/gymnasium-solver/utils/rollouts.py", line 109, in collect_rollouts
    value_model(obs_t).squeeze(-1)
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/tsilva/repos/tsilva/gymn

Training completed in 10.58 seconds (0.18 minutes)


In [19]:
# Test shared backbone agent with synchronous rollouts
print("Creating shared backbone agent with synchronous rollouts...")
shared_agent_sync = create_agent(shared_config, build_env_fn, obs_dim, act_dim, algorithm="ppo")

print(f"Rollout collector type: {type(shared_agent_sync.rollout_collector)}")

# Test training for 3 epochs with sync rollouts
test_config_sync = shared_config
test_config_sync.max_epochs = 3
shared_trainer_sync = create_trainer(test_config_sync, project_name=f"{ENV_ID}-shared-sync", run_name=f"shared-ppo-sync-{test_config_sync.seed}")

print(f"Testing training with synchronous rollouts for {test_config_sync.max_epochs} epochs...")
shared_trainer_sync.fit(shared_agent_sync)

Creating shared backbone agent with synchronous rollouts...
Rollout collector type: <class 'utils.rollouts.SyncRolloutCollector'>


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


🔗 W&B Run: https://wandb.ai/tsilva/CartPole-v1-shared-sync/runs/7k6i8985
Testing training with synchronous rollouts for 3 epochs...
Waiting for initial rollout...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type              | Params | Mode 
-----------------------------------------------------------
0 | policy_model | SharedPolicyNet   | 13.2 K | train
1 | value_model  | SharedValueNet    | 13.2 K | train
2 | shared_model | SharedBackboneNet | 13.2 K | eval 
-----------------------------------------------------------
13.2 K    Trainable params
0         Non-trainable params
13.2 K    Total params
0.053     Total estimated model params size (MB)
2         Modules in train mode
14        Modules in eval mode


Training started at 2025-07-16 22:27:32


/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


Training completed in 4.75 seconds (0.08 minutes)
