# Environment Solver

In [9]:
# Enable autoreloading of modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Environment ID handed to load_config() and used as the W&B project name below.
# Swap which line is commented out to target a different environment.
ENV_ID = "CartPole-v1"
#ENV_ID = "LunarLander-v3"
# Algorithm name handed to load_config() and create_agent() below.
ALGORITHM = "reinforce"  # Change to "ppo" for the PPO algorithm
#ALGORITHM = "ppo"

Suppress noisy library warnings:

In [11]:
from utils.environment import suppress_warnings

# Suppress common warnings before the heavier imports and training cells run,
# so their output stays readable. Which warnings are filtered is decided inside
# utils.environment (not visible from this notebook).
suppress_warnings()

Load secrets:

In [12]:
from tsilva_notebook_utils.colab import load_secrets_into_env

# Load the Weights & Biases API key needed by the W&B-logging trainer created
# later in this notebook (presumably exported as an environment variable, per
# the helper's name — confirm against tsilva_notebook_utils).
# Binding the result to `_` keeps whatever the helper returns (possibly secret
# values) out of the cell output.
_ = load_secrets_into_env([
    'WANDB_API_KEY'
])

In [14]:
import torch.nn as nn
from tsilva_notebook_utils.gymnasium import build_env as _build_env, set_random_seed
from utils.config import load_config

# Resolve the run configuration for the environment/algorithm pair selected
# in the constants cell; the YAML lookup happens inside load_config.
CONFIG = load_config(ENV_ID, ALGORITHM)

# Echo the header line and the resolved configuration, one per line.
print(
    f"Loaded config for {ENV_ID} with {ALGORITHM} algorithm:",
    CONFIG,
    sep="\n",
)

Loaded config for CartPole-v1 with reinforce algorithm:
RLConfig(env_id='CartPole-v1', seed=42, max_epochs=-1, gamma=0.99, lam=0.95, clip_epsilon=0.2, batch_size=512, train_rollout_steps=2048, eval_interval=20, eval_episodes=5, reward_threshold=475, policy_lr=0.001, value_lr=0.001, hidden_dim=32, entropy_coef=0.02, shared_backbone=False, backbone_dim=64, normalize=False, mean_reward_window=100, rollout_interval=1, n_envs='auto', async_rollouts=True)


Build environment:

In [15]:
from tsilva_notebook_utils.gymnasium import log_env_info
from utils.environment import setup_environment

# Setup environment with configuration.
# setup_environment returns a factory; calling it with a seed builds an env.
# The factory is also handed to create_agent / evaluate_agent in later cells.
build_env_fn = setup_environment(CONFIG)

# Test building env. This instance is kept: the training cell reads its
# observation/action spaces to size the models.
# NOTE(review): no env.close() is visible in this notebook, and the recorded
# output shows a SubprocVecEnv with 12 envs — re-running this cell may leave
# worker processes behind; confirm whether an explicit close is needed.
env = build_env_fn(CONFIG.seed)
log_env_info(env)

Environment Info (SubprocVecEnv with 12 envs)
  Env ID: CartPole-v1
  Observation space: Box(low=[-4.8, -inf, -0.419, -inf], high=[4.8, inf, 0.419, inf], shape=(4,), dtype=float32)
  Action space: Discrete(2)
  Max episode steps: 500


Define models and train:

In [None]:
import numpy as np
from utils.training import create_agent, create_trainer
from tsilva_notebook_utils.torch import get_default_device

# Size the networks from the probe environment built in the previous cell.
obs_dim = env.observation_space.shape[0]
if hasattr(env.action_space, 'n'):
    # Action space exposes `n`: use it as the action dimension.
    act_dim = env.action_space.n
else:
    # Otherwise fall back to the first entry of the action space's shape.
    act_dim = env.action_space.shape[0]

# Report which device the helper considers the default.
print(f"Default device: {get_default_device()}")

# Build the agent for the selected algorithm.
agent = create_agent(CONFIG, build_env_fn, obs_dim, act_dim, algorithm=ALGORITHM)

# Report where the models ended up; the value model is optional, so it is
# only reported when the agent has one.
print(f"Policy model device: {next(agent.policy_model.parameters()).device}")
if getattr(agent, 'value_model', None) is not None:
    print(f"Value model device: {next(agent.value_model.parameters()).device}")
print(f"Rollout collector type: {type(agent.rollout_collector)}")

# Trainer with W&B logging: the project is named after the environment and
# the run after the algorithm and seed.
trainer = create_trainer(CONFIG, project_name=ENV_ID, run_name=f"{ALGORITHM}-{CONFIG.seed}")

# Run training.
trainer.fit(agent)

Default device: cuda
Policy model device: cuda:0
Rollout collector type: <class 'utils.rollouts.AsyncRolloutCollector'>
Policy model device: cuda:0
Rollout collector type: <class 'utils.rollouts.AsyncRolloutCollector'>


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


🔗 W&B Run: https://wandb.ai/tsilva/CartPole-v1/runs/np29htc7
Waiting for initial rollout...
Still waiting for rollout...
Still waiting for rollout...
Still waiting for rollout...
Still waiting for rollout...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type      | Params | Mode 
---------------------------------------------------
0 | policy_model | PolicyNet | 226    | train
---------------------------------------------------
226       Trainable params
0         Non-trainable params
226       Total params
0.001     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode

  | Name         | Type      | Params | Mode 
---------------------------------------------------
0 | policy_model | PolicyNet | 226    | train
---------------------------------------------------
226       Trainable params
0         Non-trainable params
226       Total params
0.001     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode


Training started at 2025-07-16 22:09:06


/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...


[WandbCleanup] run.finish() failed: 


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


Exception in thread Thread-10 (_collect_loop):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/home/tsilva/repos/tsilva/gymnasium-solver/utils/rollouts.py", line 384, in _collect_loop
    trajectories, extras = collect_rollouts(
  File "/home/tsilva/repos/tsilva/gymnasium-solver/utils/rollouts.py", line 111, in collect_rollouts
    next_obs, reward, done, infos = env.step(act_t.cpu().numpy())
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/stable_baselines3/common/vec_env/base_vec_env.py", line 222, in step
    return self.step_wait()
  File "/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/py

In [8]:
import os

from utils.evaluation import evaluate_agent

# Directory handed to evaluate_agent as `out_dir`.
# A previous run of this cell failed with
#   FileNotFoundError: ... 'tmp/video_<id>.mp4'
# so make sure the directory exists before evaluation starts.
# NOTE(review): if the error persists with the directory present, the video
# file itself is not being produced — check the recorder inside evaluate_agent.
EVAL_OUT_DIR = "./tmp"
os.makedirs(EVAL_OUT_DIR, exist_ok=True)

# Evaluate agent and render episodes
results = evaluate_agent(
    agent,
    build_env_fn,
    n_episodes=8,
    deterministic=True,
    render=True,
    grid=(2, 2),
    text_color=(0, 0, 0),
    out_dir=EVAL_OUT_DIR
)

print(f"Mean reward: {results['mean_reward']:.2f}")

  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists


FileNotFoundError: [Errno 2] No such file or directory: 'tmp/video_1434fdf94b52447790a1c638328ff041.mp4'

In [None]:
from utils.training import get_monitoring_info

# Get monitoring information.
# The result is indexed below by "primary_metrics" (an iterable of entries)
# and "warning_conditions" (a mapping iterated as condition -> action).
monitoring_info = get_monitoring_info()

# One bullet per metric worth watching.
print("Key metrics to watch on W&B dashboard:")
for metric in monitoring_info["primary_metrics"]:
    print(f"  - {metric}")

# One bullet per warning condition, paired with its associated action.
print("\nWarning conditions:")
for condition, action in monitoring_info["warning_conditions"].items():
    print(f"  - {condition}: {action}")

## Test Shared Backbone PPO

Test the new shared backbone feature for PPO:

In [None]:
# Test shared backbone PPO
#
# Compares a PPO agent with separate policy/value models against a PPO agent
# with a shared backbone. Both agents are built in this cell from PPO configs,
# so the comparison does not depend on which ALGORITHM the notebook trained:
# the trained `agent` may be a REINFORCE agent, and the recorded training
# output reports no value model for it.
#
# NOTE(review): the training cell reported an AsyncRolloutCollector on the
# agent, so create_agent may start background rollout collection — confirm
# whether these throwaway agents need explicit cleanup.
from utils.config import load_config
from utils.training import create_agent


def _count_params(model):
    """Return the total number of parameters in `model`, or 0 when it is None."""
    if model is None:
        return 0
    return sum(p.numel() for p in model.parameters())


# Baseline: PPO with separate models. It is fully built and measured before
# the shared config is touched, so the two configs cannot interfere even if
# load_config were to hand back the same object twice.
baseline_config = load_config(ENV_ID, "ppo")
baseline_config.shared_backbone = False

print("Original PPO config:")
print(f"  shared_backbone: {getattr(baseline_config, 'shared_backbone', False)}")
print(f"  hidden_dim: {baseline_config.hidden_dim}")

baseline_agent = create_agent(baseline_config, build_env_fn, obs_dim, act_dim, algorithm="ppo")
# The value model is looked up defensively: an agent without one contributes 0.
original_params = (
    _count_params(baseline_agent.policy_model)
    + _count_params(getattr(baseline_agent, "value_model", None))
)

# Create a config with shared backbone enabled
shared_config = load_config(ENV_ID, "ppo")
shared_config.shared_backbone = True
shared_config.backbone_dim = [128, 64]  # Larger backbone with multiple layers
shared_config.hidden_dim = 32  # Smaller heads

print("\nShared backbone config:")
print(f"  shared_backbone: {shared_config.shared_backbone}")
print(f"  backbone_dim: {shared_config.backbone_dim}")
print(f"  hidden_dim: {shared_config.hidden_dim}")

# Create shared backbone agent
shared_agent = create_agent(shared_config, build_env_fn, obs_dim, act_dim, algorithm="ppo")

# Compare against None explicitly: the truthiness of a model object is not a
# reliable "is it present" test (container modules can define a length).
shared_model = shared_agent.shared_model
print(f"\nShared agent type: {type(shared_agent)}")
print(f"Uses shared backbone: {shared_agent.use_shared_backbone}")
print(f"Shared model type: {type(shared_model) if shared_model is not None else None}")

# Compare parameter counts
shared_params = _count_params(shared_model)

print("\nParameter comparison:")
print(f"  Original (separate models): {original_params:,} parameters")
print(f"  Shared backbone: {shared_params:,} parameters")
print(f"  Difference: {shared_params - original_params:+,} parameters")