# Environment Solver

Make notebook autoreload when imported repo modules change:

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell runs; edits to the repo's modules then take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

Define which environment we want to solve and with which algorithm:

In [2]:
# Environment to solve; swap which ENV_ID line is commented to switch.
ENV_ID = "CartPole-v1"
#ENV_ID = "LunarLander-v3"
# Learning algorithm; the training cell below handles "ppo" and "reinforce".
#ALGO_ID = "reinforce"
ALGO_ID = "ppo"

Load secrets:

In [3]:
from tsilva_notebook_utils.colab import load_secrets_into_env
# Request the W&B API key needed by the W&B-logging trainer created later on.
# The helper's return value is not needed, so it is bound to `_` to keep it
# out of the cell's displayed output.
# NOTE(review): presumably this exports the secret into os.environ — confirm
# against tsilva_notebook_utils.
_ = load_secrets_into_env(['WANDB_API_KEY'])

Load training configuration:

In [4]:
from utils.config import load_config

# Resolve the training configuration for the selected environment/algorithm
# pair, then echo it so the hyperparameters are recorded in the notebook output.
CONFIG = load_config(ENV_ID, ALGO_ID)
print(
    f"Loaded config for {ENV_ID} with {ALGO_ID} algorithm:",
    CONFIG,
    sep="\n",
)

Loaded config for CartPole-v1 with ppo algorithm:
RLConfig(env_id='CartPole-v1', seed=42, max_epochs=-1, gamma=0.99, lam=0.95, clip_epsilon=0.2, batch_size=256, train_rollout_steps=512, eval_interval=20, eval_episodes=5, reward_threshold=475, policy_lr=0.001, value_lr=0.001, hidden_dims=(32,), entropy_coef=0.01, normalize=False, mean_reward_window=100, rollout_interval=1)


Build environment:

In [5]:
from tsilva_notebook_utils.gymnasium import log_env_info
from utils.environment import setup_environment
# setup_environment returns an environment factory; calling the factory with a
# seed yields a fresh environment instance (later cells reuse the factory to
# build separate training and evaluation environments).
build_env_fn = setup_environment(CONFIG) # TODO: consider getting rid of this method or moving everything inside it
# Reference environment used by later cells to read observation/action spaces.
env = build_env_fn(CONFIG.seed)
# Print a summary of the environment (id, spaces, max episode steps).
log_env_info(env)

Environment Info (DummyVecEnv with 1 envs)
  Env ID: CartPole-v1
  Observation space: Box(low=[-4.8, -inf, -0.419, -inf], high=[4.8, inf, 0.419, inf], shape=(4,), dtype=float32)
  Action space: Discrete(2)
  Max episode steps: 500


In [6]:
from utils.rollouts import SyncRolloutCollector
from utils.models import PolicyNet, ValueNet

# Size the networks from the environment's spaces: the first observation
# dimension is the input width; the action width is `n` when the action space
# exposes it, otherwise the first dimension of the action space's shape.
obs_dim = env.observation_space.shape[0]
if hasattr(env.action_space, 'n'):
    act_dim = env.action_space.n
else:
    act_dim = env.action_space.shape[0]

# The policy network is always built; the value network only for PPO.
policy_model = PolicyNet(obs_dim, act_dim, CONFIG.hidden_dims)
value_model = None
if ALGO_ID == "ppo":
    value_model = ValueNet(obs_dim, CONFIG.hidden_dims)

# Collector that gathers CONFIG.train_rollout_steps steps per rollout from a
# freshly built environment seeded with CONFIG.seed.
train_rollout_collector = SyncRolloutCollector(
    build_env_fn(CONFIG.seed),
    policy_model,
    value_model=value_model,
    n_steps=CONFIG.train_rollout_steps,
)

In [7]:
# Smoke-test the collector: gather one rollout before wiring it into training.
trajectories = train_rollout_collector.collect_rollouts() # TODO: consider making get rollout be async method even in sync rollout?
# Show (number of truthy entries, total entries) for the fourth rollout field.
# NOTE(review): index 3 presumably holds the per-step done flags — confirm
# against SyncRolloutCollector.collect_rollouts.
sum(1 for t in trajectories[3] if t), len(trajectories[3])

(21, 512)

Define models, build the rollout collectors and the learner, then train:

In [8]:
from utils.training import create_trainer
from tsilva_notebook_utils.torch import get_default_device # TODO: get rid of the tsilva_notebook_utils dependency
from utils.rollouts import SyncRolloutCollector # TODO: restore async functionality
from utils.models import PolicyNet, ValueNet
from learners.ppo import PPOLearner
from learners.reinforce import REINFORCELearner

# Normalise the algorithm id once and validate it before any environment or
# model is built. Every later decision uses `algo_id`, so the value network
# and the learner can never disagree (e.g. ALGO_ID = "PPO"), and an unknown id
# fails loudly instead of leaving `agent` undefined or stale from a prior run.
algo_id = ALGO_ID.lower()
if algo_id not in ("ppo", "reinforce"):
    raise ValueError(f"Unsupported ALGO_ID {ALGO_ID!r}: expected 'ppo' or 'reinforce'")

# TODO: make rollout collector clone models?
input_dim = env.observation_space.shape[0]
# Use `n` when the action space exposes it, otherwise fall back to the first
# dimension of its shape (same rule as the earlier rollout cell).
output_dim = env.action_space.n if hasattr(env.action_space, 'n') else env.action_space.shape[0]
policy_model = PolicyNet(input_dim, output_dim, CONFIG.hidden_dims)
# Only PPO trains a separate value network.
value_model = ValueNet(input_dim, CONFIG.hidden_dims) if algo_id == "ppo" else None # TODO: softcode this better

# Training rollouts: policy (and value model, if any) on an env seeded with CONFIG.seed.
train_rollout_collector = SyncRolloutCollector(
    build_env_fn(CONFIG.seed),
    policy_model,
    value_model=value_model,
    n_steps=CONFIG.train_rollout_steps
)
# Evaluation rollouts: same policy, separately seeded environment, no value model.
eval_rollout_collector = SyncRolloutCollector(
    # TODO: pass env factory and rebuild env on start/stop? this allows using same rollout collector for final evaluation
    build_env_fn(CONFIG.seed + 1000),  # Use a different seed for evaluation
    policy_model,
    n_steps=CONFIG.train_rollout_steps # TODO: change this
)

# Pick the learner matching the (already validated) algorithm id.
if algo_id == "ppo":
    agent = PPOLearner(CONFIG, train_rollout_collector, policy_model, value_model, eval_rollout_collector=eval_rollout_collector)
else:  # algo_id == "reinforce"
    agent = REINFORCELearner(CONFIG, train_rollout_collector, policy_model, eval_rollout_collector=eval_rollout_collector)

# Create trainer with W&B logging
# TODO: infer most args
trainer = create_trainer(CONFIG, project_name=ENV_ID, run_name=f"{ALGO_ID}-{CONFIG.seed}")

# Fit the model
trainer.fit(agent)

[34m[1mwandb[0m: Currently logged in as: [33mtsilva[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type      | Params | Mode 
---------------------------------------------------
0 | policy_model | PolicyNet | 226    | train
1 | value_model  | ValueNet  | 193    | train
---------------------------------------------------
419       Trainable params
0         Non-trainable params
419       Total params
0.002     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


🔗 W&B Run: https://wandb.ai/tsilva/CartPole-v1/runs/3jsx1fs1
Training started at 2025-07-22 15:26:58


/Users/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Users/tsilva/repos/tsilva/gymnasium-solver/.venv/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=10). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

0,1
epoch,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇██
epoch/approx_kl,█▇▆▃▂▂▄▄▆▆▅▃▄▃▁▁▂▃▃
epoch/clip_fraction,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/entropy,▁▂▃▄▄▅▅▆▆▆▇▇█▇█▇▇▇▆
epoch/explained_var,▆█▇▄▃▆▃▄▆▄▃▂▂▄▃▁▂▂▁
epoch/kl_div,▂▄▆▃▁▅▄▆▃▅▁▃▅▅▅▅▆▄█
epoch/policy_loss,▃▂▄▆▅▂▂▁▃▄▅▃▃▆▄▄▁█▂
epoch/value_loss,▁▁▂▂█▁▃▅▄▄▃▄▄▄▄▄▃▃▁
train/advantage_mean,▄▆█▁
train/advantage_std,▅█▆▁

0,1
epoch,19.0
epoch/approx_kl,1e-05
epoch/clip_fraction,0.0
epoch/entropy,0.68832
epoch/explained_var,0.00081
epoch/kl_div,0.00026
epoch/policy_loss,-0.0073
epoch/value_loss,35.77073
train/advantage_mean,0.00686
train/advantage_std,0.96531


AttributeError: 'SyncRolloutCollector' object has no attribute 'get_rollout'

In [None]:
from utils.evaluation import evaluate_agent

# Run the trained agent for 8 deterministic episodes with rendering enabled
# (2x2 grid, black overlay text, artifacts under ./tmp), then report the mean
# reward from the returned results mapping.
# TODO: evaluate agent should receive rollout collector as an argument, not the agent and build_env_fn
results = evaluate_agent(
    agent, build_env_fn,
    n_episodes=8, deterministic=True,
    render=True, grid=(2, 2), text_color=(0, 0, 0),
    out_dir="./tmp",
)

print(f"Mean reward: {results['mean_reward']:.2f}")