# Environment Solver

Enable autoreload so that edits to imported repo modules are picked up without restarting the kernel:

In [1]:
# Load IPython's autoreload extension and use mode 2, which reloads imported
# modules before each cell execution so edits to repo code take effect immediately.
%load_ext autoreload
%autoreload 2

Define which environment we want to solve and with which algorithm:

In [2]:
# Environment to solve (keep exactly one ENV_ID line uncommented).
ENV_ID = "CartPole-v1"
#ENV_ID = "LunarLander-v3"
# Learning algorithm (keep exactly one ALGO_ID line uncommented); the training
# cell further down dispatches on "ppo" and "reinforce".
#ALGO_ID = "reinforce"
ALGO_ID = "ppo"

Load secrets:

In [3]:
from tsilva_notebook_utils.colab import load_secrets_into_env
# Only the W&B API key is requested; presumably the helper exports it as an
# environment variable for the W&B logging used by the trainer — confirm against
# the helper. The result is bound to `_` so the cell displays nothing.
_ = load_secrets_into_env(['WANDB_API_KEY'])

Load training configuration:

In [4]:
from utils.config import load_config

# Resolve the training configuration for the selected environment/algorithm pair.
CONFIG = load_config(ENV_ID, ALGO_ID)

# Emit a header line followed by the configuration on its own line.
print(f"Loaded config for {ENV_ID} with {ALGO_ID} algorithm:", CONFIG, sep="\n")

Loaded config for CartPole-v1 with ppo algorithm:
RLConfig(env_id='CartPole-v1', seed=42, max_epochs=-1, gamma=0.99, lam=0.95, clip_epsilon=0.2, batch_size=256, train_rollout_steps=2048, eval_interval=20, eval_episodes=5, reward_threshold=475, policy_lr=0.001, value_lr=0.001, hidden_dims=(32,), entropy_coef=0.01, normalize=False, mean_reward_window=100, rollout_interval=1)


Build environment:

In [5]:
from tsilva_notebook_utils.gymnasium import log_env_info
from utils.environment import setup_environment
# setup_environment returns a factory; calling the factory with a seed yields an environment.
build_env_fn = setup_environment(CONFIG) # TODO: consider getting rid of this method or moving everything inside it
# This instance is used for logging here and for reading the observation/action
# spaces in later cells; the rollout collectors build their own environments.
env = build_env_fn(CONFIG.seed)
log_env_info(env)

Environment Info (SubprocVecEnv with 8 envs)
  Env ID: CartPole-v1
  Observation space: Box(low=[-4.8, -inf, -0.419, -inf], high=[4.8, inf, 0.419, inf], shape=(4,), dtype=float32)
  Action space: Discrete(2)
  Max episode steps: 500


In [6]:
from utils.rollouts import SyncRolloutCollector
from utils.models import PolicyNet, ValueNet

# Network sizes are derived from the environment's spaces. Action spaces that
# expose `.n` use it; otherwise the first dimension of the action shape is used.
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n if hasattr(env.action_space, 'n') else env.action_space.shape[0]

policy_model = PolicyNet(obs_dim, act_dim, CONFIG.hidden_dims)
# Compare case-insensitively, matching the learner dispatch in the training cell.
value_model = ValueNet(obs_dim, CONFIG.hidden_dims) if ALGO_ID.lower() == "ppo" else None

# No rollout collector is built in this cell: the "Define models" cell below
# creates `train_rollout_collector` itself, so a collector built here would only
# construct an extra environment (and its worker processes) that is discarded
# as soon as the name is rebound.

Define models and rollout collectors:

In [None]:
from utils.training import create_trainer
from utils.rollouts import SyncRolloutCollector # TODO: restore async functionality
from utils.models import PolicyNet, ValueNet
from learners.ppo import PPOLearner
from learners.reinforce import REINFORCELearner

# TODO: make rollout collector clone models?
# Network sizes come from the environment's spaces; `.n` assumes a discrete action space.
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n
policy_model = PolicyNet(input_dim, output_dim, CONFIG.hidden_dims)
# Only PPO gets a value network. The comparison is case-insensitive so that it
# agrees with the learner dispatch in the training cell, which lower-cases ALGO_ID;
# otherwise "PPO" would select the PPO learner but leave value_model as None.
value_model = ValueNet(input_dim, CONFIG.hidden_dims) if ALGO_ID.lower() == "ppo" else None # TODO: softcode this better

# Training collector: builds its own environment from the configured seed.
train_rollout_collector = SyncRolloutCollector(
    build_env_fn(CONFIG.seed),
    policy_model,
    value_model=value_model,
    n_steps=CONFIG.train_rollout_steps
)
# Evaluation collector: shares the same policy_model object as training, runs on a
# separately seeded environment and is created with deterministic=True.
eval_rollout_collector = SyncRolloutCollector(
    # TODO: pass env factory and rebuild env on start/stop? this allows using same rollout collector for final evaluation
    build_env_fn(CONFIG.seed + 1000),  # Use a different seed for evaluation
    policy_model,
    n_episodes=8,
    deterministic=True
)

In [None]:
# Collect evaluation rollouts before any training has run; collect_frames=True is
# passed to the collector — presumably so frames are captured for rendering, confirm.
trajectories, stats = eval_rollout_collector.collect(collect_frames=True)
stats # TODO: stats don't seem to match the number of episodes, why?
# NOTE(review): in the saved output n_steps is 208 while n_episodes * mean_ep_length
# is 8 * 12.5 = 100 — presumably n_steps also counts steps from episodes still
# running in the parallel envs when collection stopped; confirm in the collector.

  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
2025-07-23 12:39:57.920 python[30522:1240941] +[NSXPCSharedListener endpointForReply:withListenerName:replyErrorCode:]: an error occurred while attempting to obtain endpoint for listener 'ClientCallsAuxiliary': Connection invalid
2025-07-23 12:39:57.920 python[30518:1240937] +[NSXPCSharedListener endpointForReply:withListenerName:replyErrorCode:]: an error occurred while attempting to obtain endpoint for listener 'ClientCallsAuxiliary': Connection invalid
2025-07-23 12:39:57.921 python[30515:1240933] +[NSXP

{'n_episodes': 8,
 'n_steps': 208,
 'mean_ep_reward': 12.5,
 'mean_ep_length': 12.5}

In [None]:
# NOTE(review): every other project import in this notebook comes from `utils.*` or
# `learners.*` (e.g. `utils.evaluation` in the final cell) — confirm that a top-level
# `evaluation` module exists, otherwise this import fails on a fresh kernel.
from evaluation import render_rollouts
# NOTE(review): 16 episodes are requested although the collector was built with
# n_episodes=8 — confirm how render_rollouts reconciles the two.
render_rollouts(eval_rollout_collector, n_episodes=16)

In [None]:

# Pick the learner for the configured algorithm (case-insensitive).
algo_id = ALGO_ID.lower()
if algo_id == "ppo":
    agent = PPOLearner(CONFIG, train_rollout_collector, policy_model, value_model, eval_rollout_collector=eval_rollout_collector)
elif algo_id == "reinforce":
    agent = REINFORCELearner(CONFIG, train_rollout_collector, policy_model, eval_rollout_collector=eval_rollout_collector)
else:
    # Fail fast: without this branch `agent` would be undefined (or, worse, a stale
    # object left over from an earlier run of this cell) when training starts.
    raise ValueError(f"Unsupported ALGO_ID {ALGO_ID!r}; expected 'ppo' or 'reinforce'")

# Create trainer with W&B logging
# TODO: infer most args
trainer = create_trainer(CONFIG, project_name=ENV_ID, run_name=f"{ALGO_ID}-{CONFIG.seed}")

# Fit the model
trainer.fit(agent)

In [None]:
from utils.evaluation import evaluate_agent

# Evaluate agent and render episodes
# TODO: evaluate agent should receive rollout collector as an argument, not the agent and build_env_fn
# Options are gathered in one mapping and forwarded as keyword arguments.
eval_options = dict(
    n_episodes=8,
    deterministic=True,
    render=True,
    grid=(2, 2),
    text_color=(0, 0, 0),
    out_dir="./tmp",
)
results = evaluate_agent(agent, build_env_fn, **eval_options)

print(f"Mean reward: {results['mean_reward']:.2f}")