## Setup

You will need to make a copy of this notebook in your Google Drive before you can edit the homework files. You can do so with **File &rarr; Save a copy in Drive**.

In [None]:
#@title mount your Google Drive
#@markdown Your work will be stored in a folder called `hw_16831` by default to prevent Colab instance timeouts from deleting your edits.

# `os` is imported here because the following setup cells use it for path checks.
import os
from google.colab import drive
# Mount Google Drive at /content/gdrive; the later cells build their Drive paths under this mount point.
drive.mount('/content/gdrive')

In [None]:
#@title set up mount symlink

# Shell-escaped form of the Drive folder, kept for any `!`/`%` command that
# interpolates it. A raw string is required: '\ ' is not a valid Python escape
# sequence and a plain string literal triggers a warning on newer Python versions.
DRIVE_PATH = r'/content/gdrive/My\ Drive/hw_16831'
# Same location without the shell escape, for use from Python.
DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\', '')
if not os.path.exists(DRIVE_PYTHON_PATH):
  # os.mkdir (not makedirs) on purpose: if the parent is missing, i.e. Drive is
  # not mounted, this fails loudly instead of creating a local-only folder.
  os.mkdir(DRIVE_PYTHON_PATH)

## the space in `My Drive` causes some issues,
## make a symlink to avoid this
SYM_PATH = '/content/hw_16831'
# lexists() also detects a dangling link, which exists() reports as missing
# and which would then make the link creation fail.
if not os.path.lexists(SYM_PATH):
  os.symlink(DRIVE_PYTHON_PATH, SYM_PATH)

In [None]:
#@title apt install requirements

#@markdown Run each section with Shift+Enter

#@markdown Double-click on section headers to show code.

!apt update
!apt install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        gnupg2 \
        make \
        cmake \
        ffmpeg \
        swig \
        libz-dev \
        unzip \
        zlib1g-dev \
        libglfw3 \
        libglfw3-dev \
        libxrandr2 \
        libxinerama-dev \
        libxi6 \
        libxcursor-dev \
        libgl1-mesa-dev \
        libgl1-mesa-glx \
        libglew-dev \
        libosmesa6-dev \
        lsb-release \
        ack-grep \
        patchelf \
        wget \
        xpra \
        xserver-xorg-dev \
        xvfb \
        python3-opengl \
        ffmpeg > /dev/null 2>&1

In [None]:
#@title clone homework repo
#@markdown Note that this is the same codebase from homework 1,
#@markdown so you may need to move your old `hw_16831`
#@markdown folder in order to clone the repo again.

#@markdown **Don't delete your old work though!**
#@markdown You will need it for this assignment.

# Work inside the symlinked Drive folder (SYM_PATH) so the checkout is stored on Google Drive.
%cd $SYM_PATH
!git clone https://github.com/LeCAR-Lab/16831-F25-HW.git
# The remaining commands run relative to the hw2 folder of the checkout.
%cd 16831-F25-HW/hw2
%pip install -r requirements.txt

# A "subprocess exited with error" message for numpy during the install above
# is expected; numpy is installed manually in the next cell.
# Editable install of the package located in the current directory.
%pip install -e .

In [None]:
!pip install numpy

In [None]:
# Download mujoco from source (fix from hw1)
!wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz
!tar xzvf mujoco210-linux-x86_64.tar.gz
!mkdir -p ~/.mujoco
!mv mujoco210 ~/.mujoco/mujoco210
!rm mujoco*

%pip install -U mujoco
%pip install -U 'mujoco-py<2.2,>=2.1'
%pip install -U pyvirtualdisplay
%pip install -U gym-notebook-wrapper
%pip install -U "cython<3"

In [None]:
# set env variables
import os

# Location of the MuJoCo 2.1.0 install created by the download cell.
os.environ['MUJOCO_PY_MUJOCO_PATH'] = '/root/.mujoco/mujoco210'

# Extend LD_LIBRARY_PATH without assuming it is already set (an in-place `+=`
# on os.environ raises KeyError for an unset variable) and without appending
# the same directory again when this cell is re-run.
_ld_current = os.environ.get('LD_LIBRARY_PATH', '')
_ld_entries = _ld_current.split(':') if _ld_current else []
for _lib_dir in ('/root/.mujoco/mujoco210/bin', '/usr/lib/nvidia'):
    if _lib_dir not in _ld_entries:
        _ld_entries.append(_lib_dir)
os.environ['LD_LIBRARY_PATH'] = ':'.join(_ld_entries)

In [None]:
!pip install tensorboardx
!pip install cement==2.10.14
!pip install box2d box2d-py pygame --pre

In [None]:
#@title set up virtual display

from pyvirtualdisplay import Display

# Start a non-visible (visible=0) 1400x900 virtual display; presumably this gives
# env.render() in the later cells something to draw to on a headless Colab machine.
# NOTE(review): the name `display` shadows IPython's built-in `display()` helper
# for the rest of the session -- confirm no later cell relies on that helper.
display = Display(visible=0, size=(1400, 900))
display.start()

In [None]:
#@title test virtual display

#@markdown If you see a video of a four-legged ant fumbling about, setup is complete!
import numpy as np
# Compatibility shim: alias `np.bool8` when the installed numpy no longer provides it.
# It is applied before `import gym`, presumably because gym still references that name.
if not hasattr(np, "bool8"):
    np.bool8 = np.bool_

import gym
import gnwrapper

# Wrap the Ant environment so the rendered frames can be shown inline via env.display().
env = gnwrapper.LoopAnimation(gym.make('Ant-v4'))

# The initial observation is not used below; reset() is called to start an episode.
observation = env.reset()
# Take up to 100 random actions, rendering a frame after each step.
for i in range(100):
    # NOTE(review): this unpacks four values from env.step(); newer gym APIs return
    # five (obs, reward, terminated, truncated, info) -- confirm against the installed version.
    obs, rew, term, _ = env.step(env.action_space.sample())
    env.render()
    # Stop early once the environment reports that the episode is over.
    if term:
      break

env.display()
env.close()

## Editing Code

To edit code, click the folder icon on the left menu. Navigate to the corresponding file (`hw_16831/...`). Double-click a file to open an editor. Colab times out after about 12 hours while it is active (and sooner if you close your browser window). We sync your edits to Google Drive so that you won't lose your work in the event of an instance timeout, but you will need to re-mount your Google Drive and re-install packages with every new instance.

## Run Policy Gradients

In [None]:
# fix error with autoreload

#@title imports
import importlib
import os
import time
import rob831.policies.MLP_policy
import rob831.agents.pg_agent
import rob831.infrastructure.rl_trainer
import rob831.infrastructure.utils

# Manually reload the homework modules so that re-running this cell picks up
# edits made to those files (used in place of the autoreload extension disabled below).
importlib.reload(rob831.policies.MLP_policy)
importlib.reload(rob831.agents.pg_agent)
importlib.reload(rob831.infrastructure.rl_trainer)
importlib.reload(rob831.infrastructure.utils)

# Imported after the reloads so these names are bound to the reloaded modules' classes.
from rob831.infrastructure.rl_trainer import RL_Trainer
from rob831.agents.pg_agent import PGAgent

# autoreload is intentionally left disabled (see the note at the top of this cell).
#%load_ext autoreload
#%autoreload 2

In [18]:
#@title runtime arguments

class Args:
  """Run configuration for the notebook-driven policy-gradient experiment.

  Every setting is a plain class attribute; the `#@param` annotations expose
  them as Colab form fields. The dunder methods below additionally allow
  dict-style access (`args['n_iter']`, `args['logdir'] = ...`, `'seed' in args`),
  which is how the rest of this notebook reads and extends the configuration.
  """

  def __getitem__(self, key):
    """Return the attribute named `key`; a missing name raises AttributeError (not KeyError)."""
    return getattr(self, key)

  def __setitem__(self, key, val):
    """Set (or create) the attribute named `key` on this instance."""
    setattr(self, key, val)

  def __contains__(self, key):
    """Return True when an attribute named `key` exists."""
    return hasattr(self, key)

  env_name = 'CartPole-v0' #@param
  exp_name = 'q1_sb_rtg_na' #@param

  #@markdown main parameters of interest
  n_iter = 100 #@param {type: "integer"}

  ## PDF will tell you how to set ep_len
  ## and discount for each environment
  ep_len = 200 #@param {type: "integer"}
  discount = 0.95 #@param {type: "number"}

  reward_to_go = True #@param {type: "boolean"}
  nn_baseline = False #@param {type: "boolean"}
  gae_lambda = None #@param {type: "number"}
  dont_standardize_advantages = False #@param {type: "boolean"}

  #@markdown batches and steps
  batch_size = 1000 #@param {type: "integer"}
  eval_batch_size = 400 #@param {type: "integer"}

  num_agent_train_steps_per_iter = 1 #@param {type: "integer"}
  learning_rate =  5e-3 #@param {type: "number"}

  #@markdown MLP parameters
  n_layers = 2 #@param {type: "integer"}
  size = 64 #@param {type: "integer"}

  #@markdown system
  save_params = False #@param {type: "boolean"}
  no_gpu = False #@param {type: "boolean"}
  which_gpu = 0 #@param {type: "integer"}
  seed = 1 #@param {type: "integer"}

  action_noise_std = 0 #@param {type: "number"}

  #@markdown logging
  ## default is to not log video so
  ## that logs are small enough to be
  ## uploaded to Gradescope
  video_log_freq =  -1#@param {type: "integer"}
  scalar_log_freq =  1#@param {type: "integer"}


args = Args()

# The hw1 infrastructure reads `train_batch_size`; mirror `batch_size` under that name.
args['train_batch_size'] = args['batch_size']

# Warn when video logging is switched on: those event files get too large to submit.
video_logging_enabled = args['video_log_freq'] > 0
if video_logging_enabled:
  import warnings
  video_warning = '\n'.join([
      '',
      'Logging videos will make eventfiles too',
      'large for the autograder. Set video_log_freq = -1',
      'for the runs you intend to submit.',
  ])
  warnings.warn(video_warning)

In [19]:
#@title create directory for logging

# NOTE(review): the clone cell checks the repo out under
# /content/hw_16831/16831-F25-HW/hw2, while this path is /content/hw_16831/hw2/data
# -- confirm this is the intended log location.
data_path = '''/content/hw_16831/hw2/data'''

# exist_ok=True replaces the separate exists() check: re-running the cell (or a
# directory created in between the check and the call) can no longer raise.
os.makedirs(data_path, exist_ok=True)

# One directory per run: <exp_name>_<env_name>_<day-month-year_hour-minute-second>.
logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
# Stored on args so the trainer configuration carries the log location.
args['logdir'] = logdir
os.makedirs(logdir, exist_ok=True)

In [20]:
## define policy gradient trainer

class PG_Trainer(object):
    """Configure and drive an ``RL_Trainer`` for the policy-gradient agent.

    ``params`` must support ``params[key]`` reads and writes (this notebook
    passes an ``Args`` instance). The very same object is kept on the trainer
    and extended in place with ``agent_class``, ``agent_params`` and
    ``batch_size_initial`` before it is handed to ``RL_Trainer``.
    """

    def __init__(self, params):
        """Assemble the agent settings from ``params`` and build the RL trainer."""
        # Flat agent configuration; the keys are grouped as
        # network/optimizer -> advantage estimation -> training.
        agent_params = {
            # network / optimizer
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            # advantage estimation ("dont_standardize" is inverted into a positive flag)
            'gamma': params['discount'],
            'standardize_advantages': not params['dont_standardize_advantages'],
            'reward_to_go': params['reward_to_go'],
            'nn_baseline': params['nn_baseline'],
            'gae_lambda': params['gae_lambda'],
            # training
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
        }

        # `params` is shared with the caller, not copied: the entries added
        # here are visible on the object that was passed in.
        self.params = params
        params['agent_class'] = PGAgent
        params['agent_params'] = agent_params
        params['batch_size_initial'] = params['batch_size']

        self.rl_trainer = RL_Trainer(params)

    def run_training_loop(self):
        """Train for ``params['n_iter']`` iterations, using the agent's actor to collect and to evaluate."""
        trainer = self.rl_trainer
        n_iter = self.params['n_iter']
        trainer.run_training_loop(
            n_iter,
            collect_policy=trainer.agent.actor,
            eval_policy=trainer.agent.actor,
        )

In [None]:
# Before running the below cell for Q3 (LunarLander), please modify lunar_lander

## run training
# NOTE(review): the `#@title imports` line below matches the title of the imports cell
# and looks copied from it; this cell reloads the modules and then runs training.
#@title imports
import importlib
import os
import time
import rob831.policies.MLP_policy
import rob831.agents.pg_agent
import rob831.infrastructure.rl_trainer
import rob831.infrastructure.utils

# patch numpy for newer versions: alias `np.bool8` when it is no longer provided
import numpy as np
if not hasattr(np, "bool8"):
    np.bool8 = np.bool_

import gym.envs.box2d.lunar_lander as lunar_lander

# auto-reload lunar lander file after modifying
importlib.reload(lunar_lander)

# Reload the homework modules so the latest edits to those files are used for this run.
importlib.reload(rob831.policies.MLP_policy)
importlib.reload(rob831.agents.pg_agent)
importlib.reload(rob831.infrastructure.rl_trainer)
importlib.reload(rob831.infrastructure.utils)

# Re-import after the reloads so these names are bound to the reloaded modules' classes.
from rob831.infrastructure.rl_trainer import RL_Trainer
from rob831.agents.pg_agent import PGAgent

# Show where this run logs to, then build the trainer from `args` and start training.
# `args` and `PG_Trainer` come from the cells above, so those must have been run first.
print(args.logdir)
trainer = PG_Trainer(args)
trainer.run_training_loop()

## HW2 experiment execution guide
Complete the setup cells above, then execute each section below in order to reproduce the results required in **hw2_new.pdf**.
The code cells call `run_hw2.py` with the exact hyper-parameters requested by the assignment so you only need to run them sequentially in Colab.

> After launches finish, use the TensorBoard cell at the bottom of the notebook to inspect learning curves and capture the plots for your write-up.


### Section 5.1 – CartPole variance study
Runs six configurations (three for each of the small and large batch sizes) that vary reward-to-go (`-rtg`) and whether advantage standardization is disabled (`-dsa`).
Collect the average-return curves from TensorBoard for both the small (`b = 1500`) and large (`b = 6000`) batches.


In [None]:
#@title Run CartPole variance sweep (Section 5.1)
import subprocess

# (batch size, experiment-name prefix): small batch first, then large batch.
cartpole_batches = [(1500, "q1_sb"), (6000, "q1_lb")]
# (CLI flags, experiment-name suffix) for the three variants run per batch size.
cartpole_variants = [
    ("-dsa", "no_rtg_dsa"),
    ("-rtg -dsa", "rtg_dsa"),
    ("-rtg", "rtg_na"),
]

# Expand the two tables into the six run_hw2.py invocations (all variants of
# the small batch, followed by all variants of the large batch).
cartpole_commands = [
    f"python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b {batch} {flags} --exp_name {prefix}_{suffix}"
    for batch, prefix in cartpole_batches
    for flags, suffix in cartpole_variants
]

# Run sequentially; check=True aborts the sweep as soon as one run fails.
for cartpole_cmd in cartpole_commands:
    print("\n>>> Running: " + cartpole_cmd)
    subprocess.run(cartpole_cmd, shell=True, check=True)


### Section 5.2 – InvertedPendulum hyper-parameter search
Search over batch sizes and learning rates until you find the smallest `b*` and largest `r*` that reach the optimal return of 1000 within 100 iterations.
Record the command, iteration reaching 1000, and final return in your write-up.


In [None]:
#@title Sweep InvertedPendulum configurations (Section 5.2)
import subprocess

invpend_batch_sizes = [1000, 2000, 4000, 8000, 16000]
invpend_learning_rates = [0.03, 0.02, 0.015, 0.01, 0.005]

# Command template; {b} and {lr} are filled in per run and also appear in the experiment name.
invpend_template = (
    "python rob831/scripts/run_hw2.py --env_name InvertedPendulum-v4"
    " --ep_len 1000 --discount 0.92 -n 100 -l 2 -s 64"
    " -b {b} -lr {lr} -rtg --exp_name q2_b{b}_lr{lr}"
)

# Outer loop: batch size. Inner loop: every learning rate for that batch size.
# check=True aborts the sweep as soon as one run fails.
for b in invpend_batch_sizes:
    for rate in invpend_learning_rates:
        run_cmd = invpend_template.format(b=b, lr=rate)
        print("\n>>> Running: " + run_cmd)
        subprocess.run(run_cmd, shell=True, check=True)
    print("Completed sweep for batch size {}.".format(b))


### Section 7.1 – LunarLander with neural baseline
Before running this cell, apply the PDF's modification to `lunar_lander.py`.
The command below trains with reward-to-go and a neural network baseline; capture the learning curve and report the final return.


In [None]:
#@title Train LunarLanderContinuous-v4 (Section 7.1)
import subprocess

# Single run: reward-to-go plus neural-network baseline; the arguments are
# listed one group per line and joined with single spaces.
lunar_command = " ".join([
    "python rob831/scripts/run_hw2.py",
    "--env_name LunarLanderContinuous-v4",
    "--ep_len 1000",
    "--discount 0.99",
    "-n 100 -l 2 -s 64",
    "-b 10000 -lr 0.005",
    "--reward_to_go --nn_baseline",
    "--exp_name q3_b10000_r0.005",
])

print(">>> Running: " + lunar_command)
# check=True raises if the training script exits with a non-zero status.
subprocess.run(lunar_command, shell=True, check=True)


### Section 7.2 – HalfCheetah hyper-parameter study
First sweep over the candidate batch sizes and learning rates, then rerun the best configuration with and without reward-to-go / neural baseline to create the ablation plots.


In [None]:
#@title Sweep HalfCheetah configurations (Section 7.2)
import subprocess

cheetah_batch_sizes = [15000, 35000, 55000]
cheetah_learning_rates = [0.005, 0.01, 0.02]

# Command template; {b} and {lr} are filled in per run and also appear in the experiment name.
cheetah_template = (
    "python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4"
    " --ep_len 150 --discount 0.95 -n 100 -l 2 -s 32"
    " -b {b} -lr {lr} --reward_to_go --nn_baseline"
    " --exp_name q4_search_b{b}_lr{lr}"
)

# Outer loop: batch size. Inner loop: every learning rate for that batch size.
# check=True aborts the sweep as soon as one run fails.
for b in cheetah_batch_sizes:
    for rate in cheetah_learning_rates:
        run_cmd = cheetah_template.format(b=b, lr=rate)
        print("\n>>> Running: " + run_cmd)
        subprocess.run(run_cmd, shell=True, check=True)
    print("Completed sweep for batch size {}.".format(b))


After reviewing the search runs (e.g., via TensorBoard), fill in `cheetah_b_star` and `cheetah_r_star` with the best performing configuration, then run the ablation cell.


In [None]:
#@title HalfCheetah ablations with the best hyper-parameters
import subprocess

cheetah_b_star = 35000  # TODO: replace with the batch size that achieved the highest final return
cheetah_r_star = 0.01   # TODO: replace with the learning rate that achieved the highest final return

# Part of the command line shared by all four ablation runs.
cheetah_base = (
    "python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 --discount 0.95"
    f" -n 100 -l 2 -s 32 -b {cheetah_b_star} -lr {cheetah_r_star}"
)
# Experiment-name stem shared by all four runs.
cheetah_run_name = f"q4_b{cheetah_b_star}_r{cheetah_r_star}"

# (extra CLI flags, experiment-name suffix): plain, reward-to-go only,
# baseline only, and both. The flags carry their own leading space so the
# plain run has no doubled space before --exp_name.
cheetah_ablations = [
    ("", ""),
    (" -rtg", "_rtg"),
    (" --nn_baseline", "_nnbaseline"),
    (" -rtg --nn_baseline", "_rtg_nnbaseline"),
]

cheetah_ablation_commands = [
    f"{cheetah_base}{extra_flags} --exp_name {cheetah_run_name}{name_suffix}"
    for extra_flags, name_suffix in cheetah_ablations
]

# Run sequentially; check=True aborts as soon as one run fails.
for ablation_cmd in cheetah_ablation_commands:
    print("\n>>> Running: " + ablation_cmd)
    subprocess.run(ablation_cmd, shell=True, check=True)


### Section 8 – Hopper with generalized advantage estimation
Train Hopper with reward-to-go, a neural baseline, and the four λ values from the PDF.
Summarize how λ influences learning speed and final return.


In [None]:
#@title Run Hopper GAE sweep (Section 8)
import subprocess

hopper_lambdas = [0.0, 0.95, 0.99, 1.0]

# Command template; {lam} is the GAE lambda and also ends the experiment name.
hopper_template = (
    "python rob831/scripts/run_hw2.py --env_name Hopper-v4"
    " --ep_len 1000 --discount 0.99 -n 300 -l 2 -s 32"
    " -b 2000 -lr 0.001 -rtg --nn_baseline"
    " --gae_lambda {lam} --exp_name q5_b2000_r0.001_gae{lam}"
)

# One run per lambda value; check=True aborts the sweep as soon as one run fails.
for gae_lambda in hopper_lambdas:
    run_cmd = hopper_template.format(lam=gae_lambda)
    print("\n>>> Running: " + run_cmd)
    subprocess.run(run_cmd, shell=True, check=True)


### Recommended execution order recap
1. CartPole variance sweep (`Run CartPole variance sweep`).
2. InvertedPendulum search (`Sweep InvertedPendulum configurations`).
3. LunarLander baseline run (`Train LunarLanderContinuous-v4`).
4. HalfCheetah sweep (`Sweep HalfCheetah configurations`).
5. HalfCheetah ablations (`HalfCheetah ablations with the best hyper-parameters`).
6. Hopper GAE sweep (`Run Hopper GAE sweep`).
7. Launch TensorBoard (existing cell at the end of the notebook) to visualize and export plots.


In [None]:
#@markdown You can visualize your runs with tensorboard from within the notebook

## requires tensorflow==2.3.0
# NOTE(review): no cell visible in this notebook installs that tensorflow version -- confirm the requirement still applies.
# The --logdir below is the same directory as `data_path` in the "create directory for logging" cell.
# NOTE(review): runs launched through run_hw2.py may write their logs elsewhere -- verify they show up under this directory.
%load_ext tensorboard
%tensorboard --logdir /content/hw_16831/hw2/data