## Setup

This notebook ports `run_hw1.py` into a step-by-step notebook so that the intermediate objects (environment, expert data, policies) can be inspected along the way.

## Configuration Guide

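To run different experiments, modify the configuration cell (the one that builds the `args` object). Each setting below is an attribute of `args`, for example `args.do_dagger = False`. When you change the environment, also update `args.expert_policy_file` and `args.expert_data` to the matching files listed under *Expert Data Files*: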

### For Behavior Cloning (Problem 1):
- Set `do_dagger = False`
- Set `n_iter = 1`
- Choose your environment: `env_name = 'Ant-v4'` (or Walker2d-v4, HalfCheetah-v4, Hopper-v4)
- Set `exp_name = 'bc_ant'` (or bc_walker, bc_cheetah, bc_hopper)

### For DAgger (Problem 2):
- Set `do_dagger = True`
- Set `n_iter = 10` (or more)
- Choose your environment
- Set `exp_name = 'dagger_ant'` (or dagger_walker, etc.)

### Available Environments:
- `'Ant-v4'`
- `'Walker2d-v4'`
- `'HalfCheetah-v4'`
- `'Hopper-v4'`

### Expert Data Files:
- Ant: `'cs285/policies/experts/Ant.pkl'` and `'cs285/expert_data/expert_data_Ant-v4.pkl'`
- Walker2d: `'cs285/policies/experts/Walker2d.pkl'` and `'cs285/expert_data/expert_data_Walker2d-v4.pkl'`
- HalfCheetah: `'cs285/policies/experts/HalfCheetah.pkl'` and `'cs285/expert_data/expert_data_HalfCheetah-v4.pkl'`
- Hopper: `'cs285/policies/experts/Hopper.pkl'` and `'cs285/expert_data/expert_data_Hopper-v4.pkl'`


In [1]:
import inspect
import os
import pickle
import time

import gym
import numpy as np
import torch
import torch.nn as nn
from IPython.display import display, Markdown

from cs285.infrastructure import pytorch_util as ptu
from cs285.infrastructure import utils
from cs285.infrastructure.logger import Logger
from cs285.infrastructure.replay_buffer import ReplayBuffer
from cs285.policies.MLP_policy import MLPPolicySL
from cs285.policies.loaded_gaussian_policy import LoadedGaussianPolicy

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  from .autonotebook import tqdm as notebook_tqdm
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _RESOURCEHANDLEPROTO = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _TENSORSHAPEPROTO_DIM = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.EnumValueDescriptor(
  _DATATYPE = _descriptor.EnumDescriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _TENSORPROTO = _descriptor.Descriptor(
  DESCRIPTOR = _descriptor.FileDescriptor(
  _descriptor.FieldDescriptor(
  _SUMMARYDESCRIPTION = _descri

In [2]:
# Number of evaluation rollouts to save as videos to TensorBoard.
MAX_NVIDEO = 2
# Placeholder cap on video length; it is overwritten with params['ep_len'] in the ENV section below.
MAX_VIDEO_LEN = 40

# Environment ids matching the "Available Environments" list in the configuration guide.
# NOTE(review): not referenced by any other visible cell.
MJ_ENV_NAMES = ["Ant-v4", "Walker2d-v4", "HalfCheetah-v4", "Hopper-v4"]

In [3]:
# Run every later cell from the hw1 project root so the relative `cs285/...`
# paths in the configuration resolve.
# The location can be overridden with the CS285_HW1_DIR environment variable;
# the default keeps the original author's checkout path so existing runs are unchanged.
HW1_DIR = os.environ.get(
    'CS285_HW1_DIR',
    '/Users/xing.zhang/machine-learning/homework_fall2023/hw1',
)
os.chdir(HW1_DIR)
os.getcwd()  # last expression of the cell, so the notebook displays the resulting directory

In [4]:
# Configuration - Set your parameters here
class Args:
    """Plain attribute container that can also be indexed like a dict.

    ``args['seed']`` reads the same value as ``args.seed`` and
    ``args['seed'] = 1`` is the same as ``args.seed = 1``.
    """

    def __getitem__(self, key):
        """Return the attribute named ``key``; a missing name raises AttributeError."""
        value = getattr(self, key)
        return value

    def __setitem__(self, key, val):
        """Store ``val`` as the attribute named ``key``."""
        setattr(self, key, val)

# Build the run configuration as attributes on a single Args instance.
args = Args()

# Expert data configuration
args.expert_policy_file = 'cs285/policies/experts/Ant.pkl'  # path relative to the current working directory
args.expert_data = 'cs285/expert_data/expert_data_Ant-v4.pkl'  # path relative to the current working directory
args.env_name = 'Ant-v4'  # choices: Ant-v4, Walker2d-v4, HalfCheetah-v4, Hopper-v4
args.exp_name = 'bc_ant'  # pick an experiment name
args.do_dagger = False  # Set to True for DAgger, False for Behavior Cloning
args.ep_len = 1000  # max episode length; a falsy value makes the ENV section use env.spec.max_episode_steps

# Training configuration
args.num_agent_train_steps_per_iter = 1000  # number of gradient steps for training policy (per iter in n_iter)
args.n_iter = 1  # must be 1 for behavior cloning and >1 for DAgger (asserted in the logging-directory cell)

# Batch sizes
args.batch_size = 1000  # training data collected (in the env) during each iteration
args.eval_batch_size = 1000  # eval data collected (in the env) for logging metrics
args.train_batch_size = 100  # number of sampled data points to be used per gradient/train step

# Network configuration
args.n_layers = 2  # depth of policy to be learned
args.size = 64  # width of each layer of policy to be learned
args.learning_rate = 5e-3  # LR for supervised learning

# Logging configuration
args.video_log_freq = 5  # log videos on iterations divisible by this value; -1 disables video logging
args.scalar_log_freq = 1  # log scalar metrics on iterations divisible by this value

# GPU configuration (passed to ptu.init_gpu in the seeding cell)
args.no_gpu = False
args.which_gpu = 0

# Replay buffer size, checkpointing, and random seed
args.max_replay_buffer_size = 1000000
args.save_params = False
args.seed = 1

print("Configuration loaded successfully!")
print(f"Environment: {args.env_name}")
print(f"Experiment: {args.exp_name}")
print(f"DAgger: {args.do_dagger}")
print(f"Iterations: {args.n_iter}")

# `vars(args)` returns the live attribute dict of `args`, not a copy: anything set
# on `args` afterwards (e.g. `args['logdir']` in the logging-directory cell) is
# also visible through `params`, and later cells rely on that.
params = vars(args)

Configuration loaded successfully!
Environment: Ant-v4
Experiment: bc_ant
DAgger: False
Iterations: 1


In [5]:
# Set up logging directory.
# The prefix identifies which problem the run belongs to, and the n_iter check
# rejects a configuration that does not match the chosen algorithm.
if args.do_dagger:
    logdir_prefix = 'q2_'  # The autograder uses the prefix `q2_`
    assert args.n_iter > 1, ('DAgger needs more than 1 iteration (n_iter>1) of training, to iteratively query the expert and train (after 1st warmstarting from behavior cloning).')
else:
    logdir_prefix = 'q1_'  # The autograder uses the prefix `q1_`
    assert args.n_iter == 1, ('Vanilla behavior cloning collects expert data just once (n_iter=1)')

# Create data directory
data_path = os.path.join(os.getcwd(), '../../data') # NOTE: replaced __file__ with os.getcwd()
# exist_ok=True replaces the separate exists() check, so re-running the cell
# (or a directory appearing between check and creation) cannot raise.
os.makedirs(data_path, exist_ok=True)

# Create log directory; the minute-resolution timestamp keeps runs apart.
logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%Y_%m_%d_%H_%M")
logdir = os.path.join(data_path, logdir)
# Stored on `args` so that `params['logdir']` is available to the logger cell.
args['logdir'] = logdir
os.makedirs(logdir, exist_ok=True)

print(f"Logging to: {logdir}")
print(f"Data path: {data_path}")

Logging to: /Users/xing.zhang/machine-learning/homework_fall2023/hw1/../../data/q1_bc_ant_Ant-v4_2025_09_14_23_06
Data path: /Users/xing.zhang/machine-learning/homework_fall2023/hw1/../../data


# Reproduce `run_training_loop`

The cells below re-create the body of `run_training_loop` from `run_hw1.py` step by step.

In [6]:
# Set random seeds for NumPy and PyTorch, then select the compute device.
seed = params['seed']
np.random.seed(seed)
torch.manual_seed(seed)
ptu.init_gpu(
    use_gpu=not params['no_gpu'],
    gpu_id=params['which_gpu']
)

# Set logger
# params['logdir'] only exists once the logging-directory cell has run.
logger = Logger(params['logdir'])
# Initial values only: both flags are recomputed at the start of every
# iteration of the training loop.
log_video = True 
log_metrics = True

GPU not detected. Defaulting to CPU.
########################
logging outputs to  /Users/xing.zhang/machine-learning/homework_fall2023/hw1/../../data/q1_bc_ant_Ant-v4_2025_09_14_23_06
########################


In [7]:
#############
## ENV
#############

# Build the environment without a renderer and seed its first reset.
env = gym.make(params['env_name'], render_mode = None)
env.reset(seed = seed)

# Maximum length for episodes: keep the configured value, falling back to the
# environment's own limit when the configured value is falsy (0 / None).
params['ep_len'] = params['ep_len'] or env.spec.max_episode_steps
MAX_VIDEO_LEN = params['ep_len']

# Action and observations
assert isinstance(env.action_space, gym.spaces.Box), "Environment must be continuous"
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.shape[0]
print(f"observation and action dims are {ob_dim} & {ac_dim}, respectively")

# Simulation timestep, will be used for video saving
# NOTE(review): kept as `in dir(env)` rather than `hasattr` -- on a gym wrapper
# that forwards attribute lookups the two can give different answers, which
# would change which fps source is used.
if 'model' in dir(env):
    fps = 1/env.model.opt.timestep
else:
    fps = env.env.metadata['render_fps']


#############
## AGENT
#############

# The policy that is trained to imitate the expert.
# Note the positional order: action dimension first, then observation dimension.
# TODO: Implement missing functions in this class.
actor = MLPPolicySL(
    ac_dim,
    ob_dim,
    params['n_layers'],
    params['size'],
    learning_rate=params['learning_rate'],
)

# replay buffer (presumably the argument is its maximum capacity -- confirm in ReplayBuffer)
replay_buffer = ReplayBuffer(params['max_replay_buffer_size'])

#######################
## LOAD EXPERT POLICY
#######################

print('Loading expert policy from...', params['expert_policy_file'])
expert_policy = LoadedGaussianPolicy(params['expert_policy_file'])
# Move the expert onto the device held by pytorch_util (`ptu.device`).
expert_policy.to(ptu.device)
print('Done restoring expert policy...')


#######################
## TRAINING LOOP
#######################

# init vars at beginning of training
total_envsteps = 0
start_time = time.time()

for itr in range(params['n_iter']):
    print("\n\n********** Iteration %i ************"%itr)

    # decide if videos should be rendered/logged at this iteration
    log_video = ((itr % params['video_log_freq'] == 0) and (params['video_log_freq'] != -1))
    # decide if metrics should be logged
    log_metrics = (itr % params['scalar_log_freq'] == 0)

    print("\nCollecting data to be used for training...")
    if itr == 0:
        # BC training from expert data.
        # The `with` block closes the file handle as soon as the pickle is read.
        # NOTE(security): pickle.load can execute arbitrary code, so only load
        # expert data files from a trusted source.
        with open(params['expert_data'], 'rb') as expert_data_file:
            paths = pickle.load(expert_data_file)
        envsteps_this_batch = 0
    else:
        # DAGGER training from sampled data relabeled by expert
        assert params['do_dagger']
        # TODO: collect `params['batch_size']` transitions
        # HINT: use utils.sample_trajectories
        # TODO: implement missing parts of utils.sample_trajectory
        # `TODO` is the assignment's unfilled placeholder: this branch cannot
        # run until it is replaced with the actual data collection call.
        paths, envsteps_this_batch = TODO

#         # relabel the collected obs with actions from a provided expert policy
#         if params['do_dagger']:
#             print("\nRelabelling collected observations with labels from an expert policy...")

#             # TODO: relabel collected observations (from our policy) with labels from expert policy
#             # HINT: query the policy (using the get_action function) with paths[i]["observation"]
#             # and replace paths[i]["action"] with these expert labels
#             paths = TODO

#     total_envsteps += envsteps_this_batch
#     # add collected data to replay buffer
#     replay_buffer.add_rollouts(paths)

#     # train agent (using sampled data from replay buffer)
#     print('\nTraining agent using sampled data from replay buffer...')
#     training_logs = []
#     for _ in range(params['num_agent_train_steps_per_iter']):

#         # TODO: sample some data from replay_buffer
#         # HINT1: how much data = params['train_batch_size']
#         # HINT2: use np.random.permutation to sample random indices
#         # HINT3: return corresponding data points from each array (i.e., not different indices from each array)
#         # for imitation learning, we only need observations and actions.  
#         ob_batch, ac_batch = TODO

#         # use the sampled data to train an agent
#         train_log = actor.update(ob_batch, ac_batch)
#         training_logs.append(train_log)

#     # log/save
#     print('\nBeginning logging procedure...')
#     if log_video:
#         # save eval rollouts as videos in tensorboard event file
#         print('\nCollecting video rollouts eval')
#         eval_video_paths = utils.sample_n_trajectories(
#             env, actor, MAX_NVIDEO, MAX_VIDEO_LEN, True)

#         # save videos
#         if eval_video_paths is not None:
#             logger.log_paths_as_videos(
#                 eval_video_paths, itr,
#                 fps=fps,
#                 max_videos_to_save=MAX_NVIDEO,
#                 video_title='eval_rollouts')

#     if log_metrics:
#         # save eval metrics
#         print("\nCollecting data for eval...")
#         eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(
#             env, actor, params['eval_batch_size'], params['ep_len'])

#         logs = utils.compute_metrics(paths, eval_paths)
#         # compute additional metrics
#         logs.update(training_logs[-1]) # Only use the last log for now
#         logs["Train_EnvstepsSoFar"] = total_envsteps
#         logs["TimeSinceStart"] = time.time() - start_time
#         if itr == 0:
#             logs["Initial_DataCollection_AverageReturn"] = logs["Train_AverageReturn"]

#         # perform the logging
#         for key, value in logs.items():
#             print('{} : {}'.format(key, value))
#             logger.log_scalar(value, key, itr)
#         print('Done logging...\n\n')

#         logger.flush()

#     if params['save_params']:
#         print('\nSaving agent params')
#         actor.save('{}/policy_itr_{}.pt'.format(params['logdir'], itr))


observation and action dims are 27 & 8, respectively
Loading expert policy from... cs285/policies/experts/Ant.pkl
obs (1, 111) (1, 111)
Done restoring expert policy...


********** Iteration 0 ************

Collecting data to be used for training...


  deprecation(
  deprecation(


# Learning notes
- Environment
- Data: Paths
- Policy 

In [8]:
# Learn 1: five key elements of environment: observation space, action space, reward, transition, termination
# A separate name is used so this exploratory environment (created with
# new_step_api=True) does not replace the seeded training `env` from the ENV section.
demo_env = gym.make(params['env_name'], render_mode = None, new_step_api=True)

display(Markdown("**element 1: observation space**"))
print(demo_env.observation_space)


display(Markdown("**element 2: action space**"))
print(demo_env.action_space)


display(Markdown("**element 3-5: reward, transition, termination in one step funciton**"))
sample_action = demo_env.action_space.sample()
demo_env.reset()
results = demo_env.step(sample_action) # results is a tuple
# One label per element of the step result; zip() pairs them positionally.
# NOTE(review): with new_step_api=True the fourth element should be gym's
# `truncated` flag -- confirm against the installed gym version.
result_names = ['next_observation', 'reward', 'terminated', 'terminated due to max step hit', 'info']
for result_name, value in zip(result_names, results):
    print(result_name)
    print(value, end = '\n\n')


# check the code of step on the innermost (unwrapped) environment
base = demo_env.unwrapped
print(base.__class__.__name__, ' & ', base.__module__)
#print(inspect.getsourcefile(base.__class__))
print(inspect.getsource(base.step))   # shows the actual termination math

**element 1: observation space**

Box(-inf, inf, (27,), float64)


**element 2: action space**

Box(-1.0, 1.0, (8,), float32)


**element 3-5: reward, transition, termination in one step funciton**

next_observation
[  0.71492013   0.99619409  -0.02240941  -0.04421852  -0.07169297
   0.05356397   0.4980131    0.08537807  -0.39134828   0.13119301
  -0.45138469   0.02877128   0.48645838  -0.18309693   0.12990067
   0.32806806  -0.63449701  -0.44161018  -1.19474622  -0.78696082
  12.46278956   0.67844818  -7.34644238   3.57685879 -12.73319225
   3.48154923  14.11832739]

reward
-0.33614342386594953

terminated
False

terminated due to max step hit
False

info
{'reward_forward': -0.09296982462278551, 'reward_ctrl': -1.243173599243164, 'reward_survive': 1.0, 'x_position': -0.0083139772792265, 'y_position': 0.05900007904906596, 'distance_from_origin': 0.059582980338310755, 'x_velocity': -0.09296982462278551, 'y_velocity': 0.1243988547619973, 'forward_reward': -0.09296982462278551}

AntEnv  &  gym.envs.mujoco.ant_v4
    def step(self, action):
        xy_position_before = self.get_body_com("torso")[:2].copy()
        self.do_simulation(action, self.frame_skip)
        xy_position_after =

In [9]:
# `paths` is a list; each entry is a dict whose values expose a `.shape`.
first_path = paths[0]
print(f"size of paths is {len(paths)}")
print(f"each step of the path is a dict with keys: {list(first_path.keys())}")
for field_name in first_path:
    print(f"shape of {field_name}: {first_path[field_name].shape}")

# Sanity check: observation i must equal next_observation i-1 within a path.
observations_after_first = first_path['observation'][1:]
next_observations_before_last = first_path['next_observation'][:-1]
assert np.array_equal(observations_after_first, next_observations_before_last)

size of paths is 2
each step of the path is a dict with keys: ['observation', 'action', 'reward', 'next_observation', 'terminal']
shape of observation: (1000, 27)
shape of action: (1000, 8)
shape of reward: (1000,)
shape of next_observation: (1000, 27)
shape of terminal: (1000,)


In [10]:
# Policy: actor and expert_policy
# `nn` comes from the import cell at the top of the notebook, so this cell
# no longer carries its own mid-notebook import.

# both policies are nn.Module
assert isinstance(expert_policy, nn.Module)
assert isinstance(actor, nn.Module)

print(f"actor type: {type(actor)}")
print(f"expert policy type: {type(expert_policy)}")

actor type: <class 'cs285.policies.MLP_policy.MLPPolicySL'>
expert policy type: <class 'cs285.policies.loaded_gaussian_policy.LoadedGaussianPolicy'>


In [15]:
def learn_policy(policy: nn.Module):
    """Print a summary of a policy network.

    Three sections are printed: the module's own repr, the total and
    trainable parameter counts, and one line per named parameter with its
    shape and ``requires_grad`` flag.

    Args:
        policy: the module to describe.

    Returns:
        None; everything is written to stdout.
    """
    # Walk the parameters once and reuse the result for every section.
    named_tensors = list(policy.named_parameters())
    sizes = [tensor.numel() for _, tensor in named_tensors]
    trainable_sizes = [tensor.numel() for _, tensor in named_tensors if tensor.requires_grad]

    # architecture
    print("=== Policy Architecture ===")
    print(policy)

    # total count of parameters
    print("\n=== Parameter Counts ===")
    print(f"Total parameters:     {sum(sizes):,}")
    print(f"Trainable parameters: {sum(trainable_sizes):,}")

    # layers
    print("\n=== Layer by Layer ===")
    print(f"all parameters: {sizes}")
    print(f"all trainable parameters: {trainable_sizes}")

    for param_name, tensor in named_tensors:
        summary_line = f"{param_name:30} shape={tuple(tensor.shape)} requires_grad={tensor.requires_grad}"
        print(summary_line)

# Summarize both networks with the same helper so their structures can be compared.
display(Markdown("# expert policy"))
learn_policy(expert_policy)
display(Markdown("# actor"))
learn_policy(actor)

# expert policy

=== Policy Architecture ===
LoadedGaussianPolicy(
  (non_lin): Tanh()
  (hidden_layers): ModuleList(
    (0): Linear(in_features=64, out_features=111, bias=True)
    (1): Linear(in_features=64, out_features=64, bias=True)
  )
  (output_layer): Linear(in_features=8, out_features=64, bias=True)
)

=== Parameter Counts ===
Total parameters:     12,070
Trainable parameters: 12,070

=== Layer by Layer ===
all parameters: [111, 111, 7104, 64, 4096, 64, 512, 8]
all trainable parameters: [111, 111, 7104, 64, 4096, 64, 512, 8]
obs_norm_mean                  shape=(1, 111) requires_grad=True
obs_norm_std                   shape=(1, 111) requires_grad=True
hidden_layers.0.weight         shape=(64, 111) requires_grad=True
hidden_layers.0.bias           shape=(64,) requires_grad=True
hidden_layers.1.weight         shape=(64, 64) requires_grad=True
hidden_layers.1.bias           shape=(64,) requires_grad=True
output_layer.weight            shape=(8, 64) requires_grad=True
output_layer.bias          

# actor

=== Policy Architecture ===
MLPPolicySL(
  (mean_net): Sequential(
    (0): Linear(in_features=27, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): Tanh()
    (4): Linear(in_features=64, out_features=8, bias=True)
  )
)

=== Parameter Counts ===
Total parameters:     6,480
Trainable parameters: 6,480

=== Layer by Layer ===
all parameters: [8, 1728, 64, 4096, 64, 512, 8]
all trainable parameters: [8, 1728, 64, 4096, 64, 512, 8]
logstd                         shape=(8,) requires_grad=True
mean_net.0.weight              shape=(64, 27) requires_grad=True
mean_net.0.bias                shape=(64,) requires_grad=True
mean_net.2.weight              shape=(64, 64) requires_grad=True
mean_net.2.bias                shape=(64,) requires_grad=True
mean_net.4.weight              shape=(8, 64) requires_grad=True
mean_net.4.bias                shape=(8,) requires_grad=True
