# Goal
create a notebook version of run_hw2.py to learn the details of every step

**questions**
1. (theoretical) Why is the discount factor in reward-to-go measured relative to the current time step, i.e. why is `reward_to_go[-1] = rewards[-1]` rather than `rewards[-1] * gamma**T`?

In [1]:
from cs285.agents.pg_agent import PGAgent

import os
import time
from dataclasses import dataclass
from typing import Optional

import gym
import numpy as np
import torch
from cs285.infrastructure import pytorch_util as ptu

from cs285.infrastructure import utils
from cs285.infrastructure.logger import Logger
from cs285.infrastructure.action_noise_wrapper import ActionNoiseWrapper

  from .autonotebook import tqdm as notebook_tqdm
Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _RESOURCEHANDLEPROTO = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _TENSORSHAPEPROTO_DIM = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.EnumValueDescriptor(
  _DATATYPE = _descriptor.EnumDescriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _TENSORPROTO = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _SUMMARYDESCRIPTION = _descri

In [2]:
# Fixed TrainingArgs class that works in Jupyter notebooks
@dataclass
class TrainingArgs:
    """Configuration for a policy-gradient training run, usable from notebooks.

    Holds the same settings the training script would otherwise read from the
    command line. If ``logdir`` is left as ``None``, constructing an instance
    creates a timestamped log directory on disk and stores its path in
    ``logdir``; pass an explicit ``logdir`` to skip that step.
    """

    # Required arguments
    env_name: str
    exp_name: str

    # Training parameters
    n_iter: int = 10

    # Policy gradient specific
    use_reward_to_go: bool = False
    use_baseline: bool = False
    baseline_learning_rate: float = 5e-3
    baseline_gradient_steps: int = 5
    gae_lambda: Optional[float] = None
    normalize_advantages: bool = False

    # Batch sizes
    batch_size: int = 1000  # steps collected per train iteration
    eval_batch_size: int = 400  # steps collected per eval iteration

    # Network parameters
    discount: float = 1.0
    learning_rate: float = 5e-3
    n_layers: int = 2
    layer_size: int = 64

    # Environment and logging
    ep_len: Optional[int] = None  # students shouldn't change this away from env's default
    seed: int = 1
    no_gpu: bool = False
    which_gpu: int = 0
    video_log_freq: int = -1
    scalar_log_freq: int = 1
    action_noise_std: float = 0

    # Computed properties
    logdir: Optional[str] = None

    def __post_init__(self):
        """Create the logging directory unless the caller supplied ``logdir``."""
        if self.logdir is None:
            self._setup_logdir()

    def _setup_logdir(self):
        """Create a timestamped log directory and store its path in ``logdir``.

        The directory name is ``q2_pg_<exp_name>_<env_name>_<timestamp>``. It
        lives under ``../../data`` relative to this file when ``__file__`` is
        defined, and under ``./data`` of the working directory otherwise.
        """
        logdir_prefix = "q2_pg_"  # keep for autograder

        try:
            # Running from a script: anchor the data directory to the script
            script_dir = os.path.dirname(os.path.realpath(__file__))
            data_path = os.path.join(script_dir, "../../data")
        except NameError:
            # __file__ is not defined (e.g. in notebooks): use the cwd instead
            data_path = os.path.join(os.getcwd(), "data")

        run_name = (
            logdir_prefix
            + self.exp_name
            + "_"
            + self.env_name
            + "_"
            + time.strftime("%d-%m-%Y_%H-%M-%S")
        )
        self.logdir = os.path.join(data_path, run_name)

        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(data_path, exist_ok=True)
        os.makedirs(self.logdir, exist_ok=True)

    @classmethod
    def from_dict(cls, config_dict: dict) -> 'TrainingArgs':
        """Create TrainingArgs from a dictionary of field names to values."""
        return cls(**config_dict)

    def to_dict(self) -> dict:
        """Return the fields as a ``{name: value}`` dict, in declaration order."""
        return {name: getattr(self, name) for name in self.__dataclass_fields__}

    def update(self, **kwargs):
        """Set one or more configuration fields by name.

        Only declared dataclass fields are accepted, so a method or other
        attribute (for example ``print_config``) can never be overwritten.
        All names are validated before any value is assigned; a failed call
        therefore leaves the configuration untouched.

        Raises:
            ValueError: if any keyword is not a declared field.
        """
        known_fields = self.__dataclass_fields__
        for key in kwargs:
            if key not in known_fields:
                raise ValueError(f"Unknown argument: {key}")
        for key, value in kwargs.items():
            setattr(self, key, value)

    def print_config(self):
        """Print every field and its current value as an aligned table."""
        print("Training Configuration:")
        print("-" * 40)
        for field in self.__dataclass_fields__:
            value = getattr(self, field)
            print(f"{field:25}: {value}")
        print("-" * 40)


# Predefined configurations for common experiments
def get_cartpole_config() -> TrainingArgs:
    """Return a fresh TrainingArgs preset for the CartPole baseline experiment.

    The preset runs 100 iterations on ``CartPole-v0`` and switches on the
    ``use_baseline``, ``use_reward_to_go`` and ``normalize_advantages`` flags.
    """
    preset = {
        "env_name": "CartPole-v0",
        "exp_name": "cartpole_baseline",
        "n_iter": 100,
        "batch_size": 1000,
        "eval_batch_size": 400,
        "learning_rate": 5e-3,
        "use_baseline": True,
        "use_reward_to_go": True,
        "normalize_advantages": True,
    }
    return TrainingArgs(**preset)

# Create the CartPole preset and print its fields.
# No logdir is passed, so constructing it also creates the log directory.
args = get_cartpole_config()
args.print_config()

Training Configuration:
----------------------------------------
env_name                 : CartPole-v0
exp_name                 : cartpole_baseline
n_iter                   : 100
use_reward_to_go         : True
use_baseline             : True
baseline_learning_rate   : 0.005
baseline_gradient_steps  : 5
gae_lambda               : None
normalize_advantages     : True
batch_size               : 1000
eval_batch_size          : 400
discount                 : 1.0
learning_rate            : 0.005
n_layers                 : 2
layer_size               : 64
ep_len                   : None
seed                     : 1
no_gpu                   : False
which_gpu                : 0
video_log_freq           : -1
scalar_log_freq          : 1
action_noise_std         : 0
logdir                   : /Users/xing.zhang/machine-learning/homework_fall2023/hw2/cs285/scripts/data/q2_pg_cartpole_baseline_CartPole-v0_01-11-2025_19-41-07
----------------------------------------


In [3]:
# Number of rollouts requested (and saved) per video-logging step
MAX_NVIDEO = 2

logger = Logger(args.logdir)

# Seed NumPy and PyTorch, then initialise the compute device
np.random.seed(args.seed)
torch.manual_seed(args.seed)
ptu.init_gpu(use_gpu=not args.no_gpu, gpu_id=args.which_gpu)

# Build the gym environment and record whether its action space is discrete
env = gym.make(args.env_name, render_mode=None)
discrete = isinstance(env.action_space, gym.spaces.Discrete)

# Wrap the env with action noise when requested (not valid for discrete actions)
if args.action_noise_std > 0:
    assert not discrete, f"Cannot use --action_noise_std for discrete environment {args.env_name}"
    env = ActionNoiseWrapper(env, args.seed, args.action_noise_std)

# An explicit ep_len takes priority; otherwise fall back to the env spec's limit
max_ep_len = args.ep_len if args.ep_len else env.spec.max_episode_steps

# Observation size, and action size (number of choices when discrete)
ob_dim = env.observation_space.shape[0]
if discrete:
    ac_dim = env.action_space.n
else:
    ac_dim = env.action_space.shape[0]

# Frame rate for saved videos: derived from the simulation timestep when the
# env exposes a `model`, otherwise read from the wrapped env's metadata
fps = 1 / env.model.opt.timestep if hasattr(env, "model") else env.env.metadata["render_fps"]

# Stand-alone policy network, used by the trajectory-sampling demo that follows
from cs285.networks.policies import MLPPolicyPG
actor = MLPPolicyPG(
    ac_dim,
    ob_dim,
    discrete,
    args.n_layers,
    args.layer_size,
    args.learning_rate,
)

########################
logging outputs to  /Users/xing.zhang/machine-learning/homework_fall2023/hw2/cs285/scripts/data/q2_pg_cartpole_baseline_CartPole-v0_01-11-2025_19-41-07
########################
Using CPU.


  logger.warn(
  deprecation(
  deprecation(


In [4]:
# Training needs two ingredients: data (trajs) and an agent.
# -----------------------------------------
# 1. data (trajs): a list holding one dictionary per sampled episode
example_batch_size = 10
example_ep_len = 3
sampled = utils.sample_trajectories(env, actor, example_batch_size, example_ep_len)
trajs, envsteps_this_batch = sampled
print(f"\ntrajs\n{trajs}")
print(f"\nenvstpes this batch\n{envsteps_this_batch}")

# Regroup by key: each key now maps to a list with one entry per trajectory
trajs_dict = {
    key: [episode[key] for episode in trajs]
    for key in trajs[0]
}

print(f"\ntrajs_dict\n{trajs_dict}")


trajs
[{'observation': array([[ 0.00742216,  0.01336038,  0.01530321, -0.02889315],
       [ 0.00768937,  0.20825957,  0.01472535, -0.3167087 ],
       [ 0.01185456,  0.4031687 ,  0.00839117, -0.60471165]],
      dtype=float32), 'image_obs': array([], dtype=uint8), 'reward': array([1., 1., 1.], dtype=float32), 'action': array([1., 1., 1.], dtype=float32), 'next_observation': array([[ 0.00768937,  0.20825957,  0.01472535, -0.3167087 ],
       [ 0.01185456,  0.4031687 ,  0.00839117, -0.60471165],
       [ 0.01991793,  0.5981723 , -0.00370306, -0.8947398 ]],
      dtype=float32), 'terminal': array([0., 0., 1.], dtype=float32)}, {'observation': array([[-0.00694073,  0.00904685,  0.04906083, -0.03109459],
       [-0.00675979,  0.20343216,  0.04843893, -0.30790362],
       [-0.00269115,  0.00765468,  0.04228086, -0.00034637]],
      dtype=float32), 'image_obs': array([], dtype=uint8), 'reward': array([1., 1., 1.], dtype=float32), 'action': array([1., 0., 0.], dtype=float32), 'next_observati

  if not isinstance(terminated, (bool, np.bool8)):


In [5]:
# Build the policy-gradient agent from the notebook configuration
agent_options = {
    "n_layers": args.n_layers,
    "layer_size": args.layer_size,
    "gamma": args.discount,
    "learning_rate": args.learning_rate,
    "use_baseline": args.use_baseline,
    "use_reward_to_go": args.use_reward_to_go,
    "normalize_advantages": args.normalize_advantages,
    "baseline_learning_rate": args.baseline_learning_rate,
    "baseline_gradient_steps": args.baseline_gradient_steps,
    "gae_lambda": args.gae_lambda,
}
agent = PGAgent(ob_dim, ac_dim, discrete, **agent_options)

# Trajectory fields handed to agent.update, in positional order
update_keys = ("observation", "action", "reward", "terminal")

total_envsteps = 0
start_time = time.time()

for itr in range(args.n_iter):
    print(f"\n********** Iteration {itr} ************")

    # Roll out the current policy: request `args.batch_size` transitions,
    # passing `max_ep_len` as the episode length limit
    trajs, envsteps_this_batch = utils.sample_trajectories(
        env, agent.actor, args.batch_size, max_ep_len
    )
    total_envsteps += envsteps_this_batch

    # trajs is a list with one dictionary per trajectory; regroup it into a
    # single dictionary mapping each key to a list of per-trajectory values
    trajs_dict = {key: [traj[key] for traj in trajs] for key in trajs[0]}

    # One agent update on the freshly sampled batch
    train_info = agent.update(*[trajs_dict[key] for key in update_keys])

    if itr % args.scalar_log_freq == 0:
        # Sample a separate batch with the updated policy for eval metrics
        print("\nCollecting data for eval...")
        eval_trajs, eval_envsteps_this_batch = utils.sample_trajectories(
            env, agent.actor, args.eval_batch_size, max_ep_len
        )

        # Train/eval metrics, extended with update info and bookkeeping
        logs = utils.compute_metrics(trajs, eval_trajs)
        logs.update(train_info)
        logs["Train_EnvstepsSoFar"] = total_envsteps
        logs["TimeSinceStart"] = time.time() - start_time
        if itr == 0:
            # Keep the first iteration's train return as a reference value
            logs["Initial_DataCollection_AverageReturn"] = logs["Train_AverageReturn"]

        # Echo every scalar and hand it to the logger
        for key, value in logs.items():
            print(f"{key} : {value}")
            logger.log_scalar(value, key, itr)
        print("Done logging...\n\n")

        logger.flush()

    # Video logging is off when video_log_freq is -1
    wants_video = args.video_log_freq != -1 and itr % args.video_log_freq == 0
    if not wants_video:
        continue

    print("\nCollecting video rollouts...")
    eval_video_trajs = utils.sample_n_trajectories(
        env, agent.actor, MAX_NVIDEO, max_ep_len, render=True
    )

    logger.log_trajs_as_videos(
        eval_video_trajs,
        itr,
        fps=fps,
        max_videos_to_save=MAX_NVIDEO,
        video_title="eval_rollouts",
    )


********** Iteration 0 ************

Collecting data for eval...
Eval_AverageReturn : 21.105262756347656
Eval_StdReturn : 7.690696716308594
Eval_MaxReturn : 36.0
Eval_MinReturn : 12.0
Eval_AverageEpLen : 21.105263157894736
Train_AverageReturn : 23.952381134033203
Train_StdReturn : 14.362570762634277
Train_MaxReturn : 74.0
Train_MinReturn : 9.0
Train_AverageEpLen : 23.952380952380953
Actor Loss : 0.0015725807752460241
Baseline Loss : 445.3271484375
Train_EnvstepsSoFar : 1006
TimeSinceStart : 0.09609627723693848
Initial_DataCollection_AverageReturn : 23.952381134033203
Done logging...



********** Iteration 1 ************

Collecting data for eval...


  scalar = float(scalar)


Eval_AverageReturn : 38.09090805053711
Eval_StdReturn : 18.951873779296875
Eval_MaxReturn : 75.0
Eval_MinReturn : 11.0
Eval_AverageEpLen : 38.09090909090909
Train_AverageReturn : 27.2702693939209
Train_StdReturn : 15.356393814086914
Train_MaxReturn : 73.0
Train_MinReturn : 11.0
Train_AverageEpLen : 27.27027027027027
Actor Loss : -0.003947851713746786
Baseline Loss : 449.7375793457031
Train_EnvstepsSoFar : 2015
TimeSinceStart : 0.18341517448425293
Done logging...



********** Iteration 2 ************

Collecting data for eval...
Eval_AverageReturn : 33.41666793823242
Eval_StdReturn : 18.459226608276367
Eval_MaxReturn : 86.0
Eval_MinReturn : 13.0
Eval_AverageEpLen : 33.416666666666664
Train_AverageReturn : 30.81818199157715
Train_StdReturn : 17.31255340576172
Train_MaxReturn : 91.0
Train_MinReturn : 13.0
Train_AverageEpLen : 30.818181818181817
Actor Loss : -0.005495672579854727
Baseline Loss : 556.9501953125
Train_EnvstepsSoFar : 3032
TimeSinceStart : 0.26929521560668945
Done logging...