# Environment Solver

In [1]:
# Enable autoreloading of modules.
# Mode 2 makes IPython reload imported modules before executing each cell, so
# edits to the local helper modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Run selection: the algorithm to train with and the environment to solve.
# Both values are passed to load_config() and reused for W&B naming further down.
# To switch runs, swap the active line with its commented-out alternative.
ALGORITHM = "ppo"
#ALGORITHM = "reinforce"
ENV_ID = "CartPole-v1"
#ENV_ID = "LunarLander-v3"

Suppress common warnings:

In [3]:
from utils.environment import suppress_warnings

# Suppress common warnings.
# What gets silenced is decided inside utils.environment.suppress_warnings.
# NOTE(review): coverage is not complete -- the training cell's output still shows
# a PyTorch Lightning DataLoader `num_workers` warning.
suppress_warnings()

Load secrets:

In [4]:
from tsilva_notebook_utils.colab import load_secrets_into_env

# Request the Weights & Biases API key (presumably exported into os.environ by
# the helper -- confirm in tsilva_notebook_utils.colab). The return value is bound
# to `_` so the cell produces no output.
_ = load_secrets_into_env(["WANDB_API_KEY"])

In [5]:
import torch.nn as nn
from tsilva_notebook_utils.gymnasium import build_env as _build_env, set_random_seed
from utils.config import load_config

# Resolve the run configuration for the selected environment/algorithm pair
# (load_config reads it from the project's YAML files) and echo it for the record.
CONFIG = load_config(ENV_ID, ALGORITHM)
print(f"Loaded config for {ENV_ID} with {ALGORITHM} algorithm:", CONFIG, sep="\n")

Loaded config for CartPole-v1 with ppo algorithm:
RLConfig(env_id='CartPole-v1', seed=42, max_epochs=-1, gamma=0.99, lam=0.95, clip_epsilon=0.2, batch_size=256, train_rollout_steps=512, eval_interval=20, eval_episodes=5, reward_threshold=475, policy_lr=0.001, value_lr=0.001, hidden_dim=32, entropy_coef=0.01, shared_backbone=True, backbone_dim=64, normalize=False, mean_reward_window=100, rollout_interval=1, n_envs='auto', async_rollouts=True)


Build environment:

In [6]:
from tsilva_notebook_utils.gymnasium import log_env_info
from utils.environment import setup_environment

# Setup environment with configuration.
# setup_environment returns a factory (build_env_fn) that is called with a seed
# to build an environment; the same factory is later handed to the agent and to
# the evaluation helper.
build_env_fn = setup_environment(CONFIG)

# Test building env.
# This instance is only a probe: it is used to print the environment summary here
# and to read the observation/action space sizes in the model-definition cell.
env = build_env_fn(CONFIG.seed)
log_env_info(env)

Environment Info (SubprocVecEnv with 12 envs)
  Env ID: CartPole-v1
  Observation space: Box(low=[-4.8, -inf, -0.419, -inf], high=[4.8, inf, 0.419, inf], shape=(4,), dtype=float32)
  Action space: Discrete(2)
  Max episode steps: 500


Define the agent and train it:

In [7]:
import numpy as np
from utils.training import create_agent, create_trainer
from tsilva_notebook_utils.torch import get_default_device

# Get environment dimensions from the probe env built in the previous cell.
# Discrete action spaces expose `.n`; otherwise fall back to the first shape entry.
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n if hasattr(env.action_space, 'n') else env.action_space.shape[0]

# The probe env is not used past this point: the agent and the evaluation helper
# receive build_env_fn, not `env`. Close it now so its worker subprocesses do not
# stay alive (and hold CPU/memory) for the whole training run.
env.close()

# Debug device information
print(f"Default device: {get_default_device()}")

# Create agent using utility function
agent = create_agent(CONFIG, build_env_fn, obs_dim, act_dim, algorithm=ALGORITHM)

# Debug model devices
print(f"Policy model device: {next(agent.policy_model.parameters()).device}")
# Not every agent has a separate value model, so guard the lookup.
if hasattr(agent, 'value_model') and agent.value_model is not None:
    print(f"Value model device: {next(agent.value_model.parameters()).device}")
print(f"Rollout collector type: {type(agent.rollout_collector)}")

# Create trainer with W&B logging (project = environment id, run = "<algorithm>-<seed>")
trainer = create_trainer(CONFIG, project_name=ENV_ID, run_name=f"{ALGORITHM}-{CONFIG.seed}")

# Fit the model
trainer.fit(agent)

Default device: cuda
Policy model device: cuda:0
Value model device: cuda:0
Rollout collector type: <class 'utils.rollouts.SyncRolloutCollector'>


[34m[1mwandb[0m: Currently logged in as: [33mtsilva[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


🔗 W&B Run: https://wandb.ai/tsilva/CartPole-v1/runs/xo29l021
Waiting for initial rollout...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type              | Params | Mode 
-----------------------------------------------------------
0 | policy_model | SharedPolicyNet   | 4.6 K  | train
1 | value_model  | SharedValueNet    | 4.6 K  | train
2 | shared_model | SharedBackboneNet | 4.6 K  | eval 
-----------------------------------------------------------
4.6 K     Trainable params
0         Non-trainable params
4.6 K     Total params
0.018     Total estimated model params size (MB)
2         Modules in train mode
12        Modules in eval mode


Training started at 2025-07-17 13:20:14


/home/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Early stopping at epoch 59 with eval mean reward 500.00 >= threshold 475


0,1
epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
epoch/approx_kl,▂▄▃▂▄▁▂▃▂▂▅▂▃▂▂▅▇▆▄▆▃▃▂▄▆▄▆█▂▃▃▁▄▃▁▁▃▂▃▁
epoch/clip_fraction,▁▄▃▃▂▂▂▃▂▄▂▄▂▂▃▆▆▄▅▁▂▂▃▆▂▇█▂▃▃▃▂▁▂▃▄▄▁▁▂
epoch/entropy,█▇▇▆▅▄▃▃▃▃▃▄▃▃▃▂▂▂▂▂▂▂▂▂▂▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁
epoch/explained_var,▄▄▄▃▂▁▁▃▃▄▆▆▆▇██▇█████████████▇██▇▆▆▆▅▅▆
epoch/kl_div,▃▄▂▂▃▃▂▃▄▂▂▁▃▃▅█▅▂▆▁▄▂▃▂▇▄▆▃▃▄▅▄▃▁▂▂▃▃▂▂
epoch/policy_loss,▂▁▂▃▅▆▆▇▆▇▅▆▇▆▇▇▅▇█▇█▆▆▇▆▆▇▆█▆▇▇▆▇▇▇▇▇▇▇
epoch/total_loss,▃▆▆▆█▆▇▅▅▅▇▆▃▄▂▄▅▂▂▂▂▁▃▂▁▂▁▁▁▃▃▄▁▄▅▄▆▆▃▅
epoch/value_loss,▃▅▆▆▆▆▅▅▅▅▄▆█▄▅▂▃▄▅▂▂▂▂▂▁▁▂▂▁▁▃▂▄▂▁▅▅▆▅▅
eval/mean_reward,▁▅█

0,1
epoch,59.0
epoch/approx_kl,0.00275
epoch/clip_fraction,0.01904
epoch/entropy,0.4876
epoch/explained_var,0.41417
epoch/kl_div,0.00215
epoch/policy_loss,-0.0045
epoch/total_loss,55.96505
epoch/value_loss,55.96955
eval/mean_reward,500.0


Training completed in 98.76 seconds (1.65 minutes)


In [8]:
from utils.evaluation import evaluate_agent

# Run the trained agent for 8 episodes with deterministic actions and rendering on.
# The remaining keyword arguments (grid, text_color, out_dir) are passed through
# unchanged; see utils.evaluation.evaluate_agent for their exact meaning.
results = evaluate_agent(
    agent, build_env_fn,
    n_episodes=8, deterministic=True,
    render=True, grid=(2, 2), text_color=(0, 0, 0), out_dir="./tmp",
)

print(f"Mean reward: {results['mean_reward']:.2f}")

Mean reward: 500.00
