## Setup

You will need to make a copy of this notebook in your Google Drive before you can edit the homework files. You can do so with **File &rarr; Save a copy in Drive**.

In [None]:
#@title mount your Google Drive
#@markdown Your work will be stored in a folder called `hw_16831` by default to prevent Colab instance timeouts from deleting your edits.

# Mount Google Drive at /content/gdrive so the homework folder lives on Drive
# rather than on the Colab instance's own disk.
import os
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
#@title set up mount symlink

# Shell-escaped form of the Drive folder: the backslash protects the space in
# `My Drive` when the path is interpolated into a shell command. The raw string
# keeps that backslash without relying on Python's handling of the
# unrecognized escape sequence `\ `, which triggers a warning on recent versions.
DRIVE_PATH = r'/content/gdrive/My\ Drive/hw_16831'
# Same folder without the shell escaping, for use from Python.
DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\', '')
# Create the homework folder from Python so no shell quoting is involved and
# re-running the cell is harmless.
os.makedirs(DRIVE_PYTHON_PATH, exist_ok=True)

## the space in `My Drive` causes some issues,
## make a symlink to avoid this
SYM_PATH = '/content/hw_16831'
# lexists also reports a dangling symlink, which os.path.exists treats as
# missing (and which would make a second link attempt fail).
if not os.path.lexists(SYM_PATH):
  os.symlink(DRIVE_PYTHON_PATH, SYM_PATH)

In [None]:
#@title apt install requirements

#@markdown Run each section with Shift+Enter

#@markdown Double-click on section headers to show code.

# Refresh the package index, then install the system packages in one apt call.
# The install command's stdout and stderr are redirected to /dev/null, so a
# failed install produces no visible message here.
# NOTE(review): `ffmpeg` is listed twice in the package list below.
!apt update
!apt install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        gnupg2 \
        make \
        cmake \
        ffmpeg \
        swig \
        libz-dev \
        unzip \
        zlib1g-dev \
        libglfw3 \
        libglfw3-dev \
        libxrandr2 \
        libxinerama-dev \
        libxi6 \
        libxcursor-dev \
        libgl1-mesa-dev \
        libgl1-mesa-glx \
        libglew-dev \
        libosmesa6-dev \
        lsb-release \
        ack-grep \
        patchelf \
        wget \
        xpra \
        xserver-xorg-dev \
        xvfb \
        python3-opengl \
        ffmpeg > /dev/null 2>&1

In [None]:
#@title clone homework repo
#@markdown Note that this is the same codebase from homework 1,
#@markdown so you may need to move your old `hw_16831`
#@markdown folder in order to clone the repo again.

#@markdown **Don't delete your old work though!**
#@markdown You will need it for this assignment.

# Work inside the Drive-backed symlink, clone the course repository there, and
# switch into the hw2 folder. The kernel's working directory stays at hw2 for
# the rest of the notebook, which later cells rely on for relative paths.
%cd $SYM_PATH
!git clone https://github.com/LeCAR-Lab/16831-F25-HW.git
%cd 16831-F25-HW/hw2
%pip install -r requirements.txt

# A "subprocess exited with error" message for numpy during the install above
# is fine - numpy is installed manually in the next cell.
# Install the homework package itself in editable mode.
%pip install -e .

In [None]:
# Install numpy directly, in case the requirements install above failed on it.
!pip install numpy

In [None]:
# Download the MuJoCo 2.1.0 linux-x86_64 release archive (fix from hw1)
!wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz
!tar xzvf mujoco210-linux-x86_64.tar.gz
!mkdir -p ~/.mujoco
!mv mujoco210 ~/.mujoco/mujoco210
# Remove whatever still matches `mujoco*` in the working directory
# (the downloaded archive, now that the extracted folder has been moved).
!rm mujoco*

# Python packages used for simulation and for rendering inside the notebook.
%pip install -U mujoco
%pip install -U 'mujoco-py<2.2,>=2.1'
%pip install -U pyvirtualdisplay
%pip install -U gym-notebook-wrapper
%pip install -U "cython<3"

In [None]:
# set env variables
import os

# Library directories that must be on LD_LIBRARY_PATH: the MuJoCo binaries
# installed under /root/.mujoco and the NVIDIA library directory.
_required_library_dirs = ('/root/.mujoco/mujoco210/bin', '/usr/lib/nvidia')

# Start from the current value, or an empty string when the variable is unset,
# so this cell does not raise KeyError on a fresh environment.
_ld_library_path = os.environ.get('LD_LIBRARY_PATH', '')
for _library_dir in _required_library_dirs:
    # Append each directory once only, so re-running the cell does not keep
    # growing the variable with duplicates.
    if _library_dir not in _ld_library_path.split(':'):
        _ld_library_path = f'{_ld_library_path}:{_library_dir}' if _ld_library_path else _library_dir
os.environ['LD_LIBRARY_PATH'] = _ld_library_path

# Location of the MuJoCo 2.1.0 install, read by mujoco-py.
os.environ['MUJOCO_PY_MUJOCO_PATH'] = '/root/.mujoco/mujoco210'

In [None]:
# Extra Python packages: logging (tensorboardx), cement pinned to 2.10.14, and
# the Box2D / pygame packages (pre-release versions allowed via --pre).
!pip install tensorboardx
!pip install cement==2.10.14
!pip install box2d box2d-py pygame --pre

In [None]:
#@title set up virtual display

from pyvirtualdisplay import Display

# Start an invisible 1400x900 virtual display; `display` stays alive as a
# notebook global for the rest of the session.
display = Display(visible=0, size=(1400, 900))
display.start()

In [None]:
#@title test virtual display

#@markdown If you see a video of a four-legged ant fumbling about, setup is complete!
import numpy as np
# Restore the `np.bool8` alias when the installed numpy no longer provides it.
if not hasattr(np, "bool8"):
    np.bool8 = np.bool_

import gym
import gnwrapper

# Wrap the Ant environment so rendered frames can be played back in the notebook.
env = gnwrapper.LoopAnimation(gym.make('Ant-v4'))

observation = env.reset()
# Take up to 100 random actions, rendering after each step.
for i in range(100):
    # NOTE(review): unpacks four values from step(); assumes the installed gym
    # uses the 4-tuple step API - confirm against the pinned gym version.
    obs, rew, term, _ = env.step(env.action_space.sample())
    env.render()
    if term:
      break

env.display()
env.close()

## Editing Code

To edit code, click the folder icon on the left menu. Navigate to the corresponding file (`hw_16831/...`). Double-click a file to open an editor. Colab times out after about 12 hours while it is active (and sooner if you close your browser window). We sync your edits to Google Drive so that you won't lose your work in the event of an instance timeout, but you will need to re-mount your Google Drive and re-install packages with every new instance.

## Run Policy Gradients

In [None]:
# fix error with autoreload

#@title imports
import importlib
import os
import time
import rob831.policies.MLP_policy
import rob831.agents.pg_agent
import rob831.infrastructure.rl_trainer
import rob831.infrastructure.utils

# Reload the homework modules by hand (in place of the autoreload extension,
# left disabled at the bottom of this cell) so edits made to the files are
# picked up without restarting the runtime.
for _module in (
    rob831.policies.MLP_policy,
    rob831.agents.pg_agent,
    rob831.infrastructure.rl_trainer,
    rob831.infrastructure.utils,
):
    importlib.reload(_module)

# Import the classes after reloading so these names refer to the fresh modules.
from rob831.infrastructure.rl_trainer import RL_Trainer
from rob831.agents.pg_agent import PGAgent

#%load_ext autoreload
#%autoreload 2

In [18]:
#@title runtime arguments

class Args:
  """Container for the run settings edited through the Colab form fields.

  Settings are stored as class attributes. The dunder methods below add
  dict-style access (`args['key']`, `args['key'] = value`, `'key' in args`)
  on top of ordinary attribute access.
  """

  def __getitem__(self, key):
    """Return the setting named `key` (dict-style read via getattr)."""
    return getattr(self, key)

  def __setitem__(self, key, val):
    """Store `val` under the name `key` on this instance (dict-style write)."""
    setattr(self, key, val)

  def __contains__(self, key):
    """Report whether a setting named `key` exists."""
    return hasattr(self, key)

  env_name = 'CartPole-v0' #@param
  exp_name = 'q1_sb_rtg_na' #@param

  #@markdown main parameters of interest
  n_iter = 100 #@param {type: "integer"}

  ## PDF will tell you how to set ep_len
  ## and discount for each environment
  ep_len = 200 #@param {type: "integer"}
  discount = 0.95 #@param {type: "number"}

  reward_to_go = True #@param {type: "boolean"}
  nn_baseline = False #@param {type: "boolean"}
  gae_lambda = None #@param {type: "number"}
  dont_standardize_advantages = False #@param {type: "boolean"}

  #@markdown batches and steps
  batch_size = 1000 #@param {type: "integer"}
  eval_batch_size = 400 #@param {type: "integer"}

  num_agent_train_steps_per_iter = 1 #@param {type: "integer"}
  learning_rate =  5e-3 #@param {type: "number"}

  #@markdown MLP parameters
  n_layers = 2 #@param {type: "integer"}
  size = 64 #@param {type: "integer"}

  #@markdown system
  save_params = False #@param {type: "boolean"}
  no_gpu = False #@param {type: "boolean"}
  which_gpu = 0 #@param {type: "integer"}
  seed = 1 #@param {type: "integer"}

  action_noise_std = 0 #@param {type: "number"}

  #@markdown logging
  ## default is to not log video so
  ## that logs are small enough to be
  ## uploaded to Gradescope
  video_log_freq =  -1#@param {type: "integer"}
  scalar_log_freq =  1#@param {type: "integer"}


# Build the settings object used by the cells below.
args = Args()

## ensure compatibility with hw1 code
args.train_batch_size = args.batch_size

# Video logging is optional; warn when it is enabled because the resulting
# event files are too large to submit.
if args.video_log_freq > 0:
  import warnings
  _video_warning = (
      "\nLogging videos will make eventfiles too"
      "\nlarge for the autograder. Set video_log_freq = -1"
      "\nfor the runs you intend to submit."
  )
  warnings.warn(_video_warning)

In [19]:
#@title create directory for logging

# Root folder that holds every run's log directory.
data_path = '''/content/hw_16831/hw2/data'''
if not os.path.exists(data_path):
    os.makedirs(data_path)

# Run folder name: <exp_name>_<env_name>_<day-month-year_hour-minute-second>.
run_folder_name = '_'.join([
    args.exp_name,
    args.env_name,
    time.strftime("%d-%m-%Y_%H-%M-%S"),
])
logdir = os.path.join(data_path, run_folder_name)
args['logdir'] = logdir
if not os.path.exists(logdir):
    os.makedirs(logdir)

In [20]:
## define policy gradient trainer

class PG_Trainer(object):
    """Wire the notebook's run settings into an `RL_Trainer` driving a `PGAgent`."""

    def __init__(self, params):
        """Assemble the agent parameters from `params` and create the trainer.

        `params` is read and written with dict-style access; the keys
        `agent_class`, `agent_params` and `batch_size_initial` are added to it
        in place before it is handed to `RL_Trainer`.
        """
        agent_params = {
            # network / optimizer settings
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            # advantage-estimation settings
            'gamma': params['discount'],
            # the run setting is phrased negatively; the agent receives the positive form
            'standardize_advantages': not params['dont_standardize_advantages'],
            'reward_to_go': params['reward_to_go'],
            'nn_baseline': params['nn_baseline'],
            'gae_lambda': params['gae_lambda'],
            # training-loop settings
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
        }

        params['agent_class'] = PGAgent
        params['agent_params'] = agent_params
        params['batch_size_initial'] = params['batch_size']
        self.params = params

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        """Run `n_iter` training iterations, collecting and evaluating with the agent's actor."""
        actor = self.rl_trainer.agent.actor
        n_iterations = self.params['n_iter']
        self.rl_trainer.run_training_loop(
            n_iterations,
            collect_policy=actor,
            eval_policy=actor,
        )

In [None]:
# Before running the cell below for Q3 (LunarLander), please modify gym's
# `lunar_lander` module (gym.envs.box2d.lunar_lander, imported and reloaded below).

## run training
#@title run training
import importlib
import os
import time
import rob831.policies.MLP_policy
import rob831.agents.pg_agent
import rob831.infrastructure.rl_trainer
import rob831.infrastructure.utils

# patch numpy for newer versions that no longer provide `np.bool8`
import numpy as np
if not hasattr(np, "bool8"):
    np.bool8 = np.bool_

import gym.envs.box2d.lunar_lander as lunar_lander

# auto-reload lunar lander file after modifying
importlib.reload(lunar_lander)

# Reload the homework modules so the latest file edits are used for this run.
importlib.reload(rob831.policies.MLP_policy)
importlib.reload(rob831.agents.pg_agent)
importlib.reload(rob831.infrastructure.rl_trainer)
importlib.reload(rob831.infrastructure.utils)

from rob831.infrastructure.rl_trainer import RL_Trainer
from rob831.agents.pg_agent import PGAgent

# Uses `args` (with `logdir` set) and `PG_Trainer` from the cells above.
print(args.logdir)
trainer = PG_Trainer(args)
trainer.run_training_loop()

In [None]:
#@markdown You can visualize your runs with tensorboard from within the notebook

## requires tensorflow==2.3.0
# Load the TensorBoard notebook extension and point it at the folder that
# holds every run's log directory.
%load_ext tensorboard
%tensorboard --logdir /content/hw_16831/hw2/data

### Assignment experiment run order
1. **CartPole-v0 variance analysis**
   ```bash
   python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b 1500 -dsa --exp_name q1_sb_no_rtg_dsa
   python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b 1500 -rtg -dsa --exp_name q1_sb_rtg_dsa
   python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b 1500 -rtg --exp_name q1_sb_rtg_na
   python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b 6000 -dsa --exp_name q1_lb_no_rtg_dsa
   python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b 6000 -rtg -dsa --exp_name q1_lb_rtg_dsa
   python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b 6000 -rtg --exp_name q1_lb_rtg_na
   ```
2. **InvertedPendulum-v4 hyperparameter sweep**
   ```bash
   python rob831/scripts/run_hw2.py --env_name InvertedPendulum-v4 \
       --ep_len 1000 --discount 0.92 -n 100 -l 2 -s 64 -b <b*> -lr <r*> -rtg \
       --exp_name q2_b<b*>_r<r*>
   ```
   > Replace `<b*>`/`<r*>` with the sweep values you test; record the smallest batch size and largest learning rate that solve the task in the PDF.
3. **LunarLanderContinuous-v4 baseline run**
   ```bash
   python rob831/scripts/run_hw2.py \
       --env_name LunarLanderContinuous-v4 --ep_len 1000 \
       --discount 0.99 -n 100 -l 2 -s 64 -b 10000 -lr 0.005 \
       --reward_to_go --nn_baseline --exp_name q3_b10000_r0.005
   ```
4. **HalfCheetah-v4 comparisons**
   ```bash
   python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 \
       --discount 0.95 -n 100 -l 2 -s 32 -b 10000 -lr 0.02 \
       --exp_name q4_search_b10000_lr0.02
   python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 \
       --discount 0.95 -n 100 -l 2 -s 32 -b 10000 -lr 0.02 -rtg \
       --exp_name q4_search_b10000_lr0.02_rtg
   python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 \
       --discount 0.95 -n 100 -l 2 -s 32 -b 10000 -lr 0.02 --nn_baseline \
       --exp_name q4_search_b10000_lr0.02_nnbaseline
   python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 \
       --discount 0.95 -n 100 -l 2 -s 32 -b 10000 -lr 0.02 -rtg --nn_baseline \
       --exp_name q4_search_b10000_lr0.02_rtg_nnbaseline
   ```
   ```bash
   python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 \
       --discount 0.95 -n 100 -l 2 -s 32 -b <b*> -lr <r*> \
       --exp_name q4_b<b*>_r<r*>
   python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 \
       --discount 0.95 -n 100 -l 2 -s 32 -b <b*> -lr <r*> -rtg \
       --exp_name q4_b<b*>_r<r*>_rtg
   python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 \
       --discount 0.95 -n 100 -l 2 -s 32 -b <b*> -lr <r*> --nn_baseline \
       --exp_name q4_b<b*>_r<r*>_nnbaseline
   python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 \
       --discount 0.95 -n 100 -l 2 -s 32 -b <b*> -lr <r*> -rtg --nn_baseline \
       --exp_name q4_b<b*>_r<r*>_rtg_nnbaseline
   ```
   > Use the `<b*>`/`<r*>` values you identified as optimal in the previous sweep.
5. **Hopper-v4 GAE ablation**
   ```bash
   # lambda in [0, 0.95, 0.99, 1]
   python rob831/scripts/run_hw2.py \
       --env_name Hopper-v4 --ep_len 1000 \
       --discount 0.99 -n 300 -l 2 -s 32 -b 2000 -lr 0.001 \
       --reward_to_go --nn_baseline --action_noise_std 0.5 --gae_lambda <lambda> \
       --exp_name q5_b2000_r0.001_lambda<lambda>
   ```
6. *(Optional bonus)* Parallelization and multi-gradient-step experiments can be launched by repeating `python rob831/scripts/run_hw2.py` with the additional flags described in the PDF.


## Experiment automation and analysis helpers

The cells below automate the launch of the homework experiments and collect the TensorBoard logs required for the HW2 report. Run them after the setup section completes.

In [None]:
#@title Configure homework directories
import os
from pathlib import Path

# The homework root is taken from the kernel's current working directory, so
# this cell must run while the working directory is the hw2 folder (the clone
# cell in the setup section changes into it).
HW2_ROOT = Path.cwd().resolve()
DATA_DIR = HW2_ROOT / 'data'
print(f"Working directory: {HW2_ROOT}")
print(f"Data directory: {DATA_DIR}")
# Create the data folder if it does not exist yet.
DATA_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
#@title Helper functions for loading logs and plotting curves
import shlex
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Sequence, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
from tensorboard.backend.event_processing import event_accumulator

try:
    import pandas as pd
except ImportError:  # pragma: no cover - pandas is available in Colab but optional
    pd = None


@dataclass
class RunConfig:
    """Identifies one logged run and the legend label to use when plotting it."""

    # Text shown in plot legends and summary tables.
    label: str
    # Experiment name; used as the first part of the log-folder name pattern.
    exp_name: str
    # Environment name; appended to `exp_name` when searching for log folders.
    env_name: str


def resolve_run_dirs(exp_name: str, env_name: Optional[str] = None, data_dir: Optional[Path] = None) -> List[Path]:
    """Return the sorted run directories whose names start with the experiment prefix.

    The prefix is `<exp_name>_<env_name>` when `env_name` is given, otherwise
    just `<exp_name>`.

    Args:
        exp_name: Experiment name the folder name must start with.
        env_name: Optional environment name appended to the prefix.
        data_dir: Folder to search. When omitted, the notebook-level `DATA_DIR`
            is looked up at call time, so re-running the configuration cell
            takes effect without redefining this function.

    Returns:
        Matching directories in sorted order; empty when nothing matches.
    """
    import glob  # local import: only needed here, to escape glob metacharacters

    if data_dir is None:
        data_dir = DATA_DIR
    prefix = f'{exp_name}_{env_name}' if env_name else exp_name
    # Escape the literal prefix so characters such as `[` or `*` in a name are
    # matched verbatim; only the trailing `*` acts as a wildcard.
    pattern = glob.escape(prefix) + '*'
    # Keep directories only: a stray file sharing the prefix is not a run folder.
    return sorted(path for path in Path(data_dir).glob(pattern) if path.is_dir())


def load_event_scalars(run_dir: Path, tag: str = 'Eval_AverageReturn') -> Tuple[np.ndarray, np.ndarray]:
    """Read one scalar series from the TensorBoard event files in `run_dir`.

    Args:
        run_dir: Directory containing the run's event files.
        tag: Name of the scalar series to read.

    Returns:
        A `(steps, values)` pair of arrays, one entry per logged event.
    """
    # A size guidance of 0 asks the accumulator to keep every scalar event.
    accumulator = event_accumulator.EventAccumulator(str(run_dir), size_guidance={'scalars': 0})
    accumulator.Reload()

    step_list = []
    value_list = []
    for scalar_event in accumulator.Scalars(tag):
        step_list.append(scalar_event.step)
        value_list.append(scalar_event.value)
    return np.array(step_list), np.array(value_list)


def collect_run_data(configs: Sequence[RunConfig], tag: str = 'Eval_AverageReturn', selector: Union[str, int] = 'latest'):
    """Load one scalar series per run configuration.

    Args:
        configs: Runs to load.
        tag: Scalar series to read from each run.
        selector: `'latest'` picks the most recently modified matching folder,
            an integer indexes into the sorted matches, and any other value
            picks the first match.

    Returns:
        A list of dicts (label, exp_name, env_name, run_dir, steps, values,
        tag), one per configuration that had at least one matching folder.
    """

    def _choose(candidates):
        # Apply the selector policy to a non-empty list of matching folders.
        if selector == 'latest':
            return max(candidates, key=lambda path: path.stat().st_mtime)
        if isinstance(selector, int):
            return candidates[selector]
        return candidates[0]

    collected = []
    for cfg in configs:
        candidates = resolve_run_dirs(cfg.exp_name, cfg.env_name)
        if not candidates:
            print(f"[warn] No runs found for exp_name='{cfg.exp_name}' env_name='{cfg.env_name}'.")
            continue
        chosen_dir = _choose(candidates)
        steps, values = load_event_scalars(chosen_dir, tag)
        collected.append(dict(
            label=cfg.label,
            exp_name=cfg.exp_name,
            env_name=cfg.env_name,
            run_dir=chosen_dir,
            steps=steps,
            values=values,
            tag=tag,
        ))
    return collected


def _trailing_mean(values, window: int) -> np.ndarray:
    """Trailing moving average whose window shrinks at the start of the series."""
    data = np.asarray(values, dtype=float)
    running_total = np.concatenate(([0.0], np.cumsum(data)))
    end = np.arange(1, len(data) + 1)
    # The first `window - 1` points average over however many samples exist so far.
    start = np.maximum(end - window, 0)
    return (running_total[end] - running_total[start]) / (end - start)


def plot_learning_curves(run_data, title: str, ylabel: str = 'Average Return', smoothing_window: Optional[int] = None):
    """Plot one learning curve per entry of `run_data` on a shared figure.

    Args:
        run_data: Sequence of dicts with `steps`, `values` and `label` keys.
        title: Figure title.
        ylabel: Label for the y axis.
        smoothing_window: When set, and a series has at least that many points,
            the series is replaced by its trailing moving average.

    Returns:
        None. Prints a warning and draws nothing when `run_data` is empty.
    """
    if not run_data:
        print('[warn] No run data available to plot.')
        return
    plt.figure(figsize=(8, 5))
    for item in run_data:
        values = item['values']
        steps = item['steps']
        if smoothing_window and len(values) >= smoothing_window:
            if pd is not None:
                smoothed = pd.Series(values).rolling(smoothing_window, min_periods=1).mean().to_numpy()
            else:
                # Same trailing average as the pandas path. A centred,
                # zero-padded convolution would disagree with it and pull both
                # ends of the curve toward zero.
                smoothed = _trailing_mean(values, smoothing_window)
            plt.plot(steps, smoothed, label=item['label'])
        else:
            plt.plot(steps, values, label=item['label'])
    plt.title(title)
    plt.xlabel('Iteration')
    plt.ylabel(ylabel)
    plt.grid(True, linestyle='--', alpha=0.4)
    plt.legend()
    plt.show()


def summarize_run_data(run_data):
    """Show a per-run summary: iteration count, final return and best return.

    Runs with no logged values are left out. The summary is displayed as a
    DataFrame when pandas is available and printed row by row otherwise.
    Returns None in every case.
    """
    if not run_data:
        return

    summary_rows = [
        {
            'label': entry['label'],
            'exp_name': entry['exp_name'],
            'env': entry['env_name'],
            'iterations': int(len(entry['values'])),
            'final_return': float(entry['values'][-1]),
            'max_return': float(np.max(entry['values'])),
            'logdir': entry['run_dir'].name,
        }
        for entry in run_data
        if len(entry['values']) != 0
    ]

    if not summary_rows:
        print('[warn] No scalar values logged yet.')
        return

    if pd is None:
        for summary_row in summary_rows:
            print(summary_row)
    else:
        display(pd.DataFrame(summary_rows))


def first_iteration_at_or_above(values: np.ndarray, steps: np.ndarray, threshold: float) -> Optional[int]:
    """Return the step of the first value that is >= `threshold`, or None if there is none."""
    for step, value in zip(steps, values):
        if value >= threshold:
            return int(step)
    return None


def report_threshold_crossings(run_data, threshold: float):
    """Print, for each run, the first iteration at which it reached `threshold`.

    Args:
        run_data: Sequence of dicts with `label`, `values` and `steps` keys.
        threshold: Return level to look for.

    Runs with no logged values get their own message instead of a maximum.
    """
    for item in run_data:
        values = item['values']
        if len(values) == 0:
            # np.max raises ValueError on an empty array, so report this case separately.
            print(f"{item['label']}: no scalar values logged yet.")
            continue
        step = first_iteration_at_or_above(values, item['steps'], threshold)
        if step is None:
            print(f"{item['label']}: did not reach {threshold} (max={np.max(values):.1f}).")
        else:
            print(f"{item['label']}: reached {threshold} at iteration {step}.")


def parse_list(value: str, cast=float):
    """Convert a comma-separated string (or an existing list/tuple) into a list of `cast` values.

    For string input, surrounding whitespace is stripped from each piece and
    empty pieces are dropped before casting. A list or tuple is cast element by
    element; any other input is converted with `str()` first.
    """
    if not isinstance(value, (list, tuple)):
        stripped_pieces = (piece.strip() for piece in str(value).split(','))
        value = [piece for piece in stripped_pieces if piece]
    return list(map(cast, value))


def extract_flag(tokens: List[str], flag: str) -> Optional[str]:
    """Return the token that follows the first occurrence of `flag`.

    Returns None when `flag` is absent or is the last token.
    """
    try:
        position = tokens.index(flag)
    except ValueError:
        return None
    # Slicing yields an empty list when the flag is the final token.
    following = tokens[position + 1:position + 2]
    return following[0] if following else None


def run_experiment_commands(commands: Sequence[str], skip_existing: bool = True):
    """Run each shell command in order, optionally skipping runs that already have logs.

    Args:
        commands: Command lines to execute.
        skip_existing: When true, a command is skipped if it carries both
            `--exp_name` and `--env_name` and a matching log folder already exists.

    Raises:
        subprocess.CalledProcessError: If a command exits with a non-zero status.
    """
    for cmd in commands:
        tokens = shlex.split(cmd)
        exp_name = extract_flag(tokens, '--exp_name')
        env_name = extract_flag(tokens, '--env_name')
        # The leading newline keeps consecutive commands visually separated.
        # It has to be written as an escape: a raw line break inside the
        # string literal is a syntax error.
        print(f"\n>>> {cmd}")
        if skip_existing and exp_name and env_name and resolve_run_dirs(exp_name, env_name):
            print(f"[skip] Logs already exist for {exp_name} ({env_name}). Delete the folder in {DATA_DIR} to rerun.")
            continue
        subprocess.run(cmd, shell=True, check=True)


## Experiment 1 (CartPole-v0): variance analysis

Run the small- and large-batch experiments and then generate the plots required for the report.

In [None]:
#@title Run CartPole small-batch experiments
# Three small-batch (-b 1500) CartPole runs; runs that already have logs are
# skipped by run_experiment_commands.
cartpole_small_commands = [
    "python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b 1500 -dsa --exp_name q1_sb_no_rtg_dsa",
    "python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b 1500 -rtg -dsa --exp_name q1_sb_rtg_dsa",
    "python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b 1500 -rtg --exp_name q1_sb_rtg_na",
]
run_experiment_commands(cartpole_small_commands)


In [None]:
#@title Run CartPole large-batch experiments
# Same three configurations as the small-batch cell, with -b 6000.
cartpole_large_commands = [
    "python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b 6000 -dsa --exp_name q1_lb_no_rtg_dsa",
    "python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b 6000 -rtg -dsa --exp_name q1_lb_rtg_dsa",
    "python rob831/scripts/run_hw2.py --env_name CartPole-v0 -n 150 -b 6000 -rtg --exp_name q1_lb_rtg_na",
]
run_experiment_commands(cartpole_large_commands)


In [None]:
#@title Plot CartPole learning curves
q1_tag = 'Eval_AverageReturn'  #@param ["Eval_AverageReturn", "Train_AverageReturn"]
# Legend labels follow the flags each run was launched with. The `dsa` runs
# pass `-dsa`, presumably the short form of `dont_standardize_advantages`
# (confirm in run_hw2.py), which turns advantage standardization OFF; the `na`
# run omits it, so standardization stays on.
small_configs = [
    RunConfig('No RTG, no advantage standardization', 'q1_sb_no_rtg_dsa', 'CartPole-v0'),
    RunConfig('RTG, no advantage standardization', 'q1_sb_rtg_dsa', 'CartPole-v0'),
    RunConfig('RTG + standardized advantages', 'q1_sb_rtg_na', 'CartPole-v0'),
]
large_configs = [
    RunConfig('No RTG, no advantage standardization', 'q1_lb_no_rtg_dsa', 'CartPole-v0'),
    RunConfig('RTG, no advantage standardization', 'q1_lb_rtg_dsa', 'CartPole-v0'),
    RunConfig('RTG + standardized advantages', 'q1_lb_rtg_na', 'CartPole-v0'),
]
# One figure and one summary table per batch size.
small_data = collect_run_data(small_configs, tag=q1_tag)
plot_learning_curves(small_data, title=f'CartPole-v0 (small batch, tag={q1_tag})', ylabel=q1_tag.replace('_', ' '))
summarize_run_data(small_data)
large_data = collect_run_data(large_configs, tag=q1_tag)
plot_learning_curves(large_data, title=f'CartPole-v0 (large batch, tag={q1_tag})', ylabel=q1_tag.replace('_', ' '))
summarize_run_data(large_data)


## Experiment 2 (InvertedPendulum-v4): hyperparameter sweep

Use the cell below to sweep over batch sizes and learning rates. Adjust the comma-separated lists as needed to search for the optimal configuration.

In [None]:
#@title Run InvertedPendulum sweep
batch_sizes = "2000, 4000, 6000"  #@param {type:"string"}
learning_rates = "0.005, 0.01, 0.02"  #@param {type:"string"}

# Both lists are parsed as floats; batch sizes are converted to int when the
# command and experiment name are built.
b_values = parse_list(batch_sizes, cast=float)
r_values = parse_list(learning_rates, cast=float)

# One command per (batch size, learning rate) pair.
sweep_commands = []
for b in b_values:
    for lr in r_values:
        # The experiment name embeds the float's default string form; the
        # plotting cell must build the same string to locate the run.
        exp = f"q2_b{int(b)}_r{lr}"
        cmd = (
            "python rob831/scripts/run_hw2.py --env_name InvertedPendulum-v4 "
            "--ep_len 1000 --discount 0.92 -n 100 -l 2 -s 64 "
            f"-b {int(b)} -lr {lr} -rtg --exp_name {exp}"
        )
        sweep_commands.append(cmd)

run_experiment_commands(sweep_commands)


In [None]:
#@title Plot the best InvertedPendulum run
best_batch_size = 2000  #@param {type:"integer"}
best_learning_rate = 0.01  #@param {type:"number"}
q2_tag = 'Eval_AverageReturn'  #@param ["Eval_AverageReturn", "Train_AverageReturn"]

# Rebuild the experiment name using the same format as the sweep cell.
best_exp = f"q2_b{best_batch_size}_r{best_learning_rate}"
q2_data = collect_run_data([RunConfig('Selected hyperparameters', best_exp, 'InvertedPendulum-v4')], tag=q2_tag)
plot_learning_curves(q2_data, title=f'InvertedPendulum-v4 ({best_exp})', ylabel=q2_tag.replace('_', ' '))
summarize_run_data(q2_data)
# Report the first iteration at which the selected run reached a return of 1000.
if q2_data:
    report_threshold_crossings(q2_data, threshold=1000)


## Experiment 3 (LunarLanderContinuous-v4): baseline run

Launch the baseline run and then visualize its learning curve.

In [None]:
#@title Run LunarLander baseline experiment
# A single command, written as two adjacent string literals that Python
# concatenates into one list element.
lunarlander_commands = [
    "python rob831/scripts/run_hw2.py --env_name LunarLanderContinuous-v4 --ep_len 1000 "
    "--discount 0.99 -n 100 -l 2 -s 64 -b 10000 -lr 0.005 --reward_to_go --nn_baseline --exp_name q3_b10000_r0.005"
]
run_experiment_commands(lunarlander_commands)


In [None]:
#@title Plot LunarLander baseline curve
q3_tag = 'Eval_AverageReturn'  #@param ["Eval_AverageReturn", "Train_AverageReturn"]
# Load, plot and summarize the single baseline run launched in the previous cell.
lunar_data = collect_run_data([RunConfig('LunarLander baseline', 'q3_b10000_r0.005', 'LunarLanderContinuous-v4')], tag=q3_tag)
plot_learning_curves(lunar_data, title='LunarLanderContinuous-v4 baseline', ylabel=q3_tag.replace('_', ' '))
summarize_run_data(lunar_data)


## Experiment 4 (HalfCheetah-v4): policy gradient comparisons

The following cells cover both the provided baseline configuration sweep and the follow-up experiments with the optimal hyperparameters you identify.

In [None]:
#@title Run HalfCheetah baseline comparison experiments
# Four runs at -b 10000 / -lr 0.02 covering every combination of the -rtg and
# --nn_baseline flags.
halfcheetah_baseline_commands = [
    "python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 --discount 0.95 -n 100 -l 2 -s 32 -b 10000 -lr 0.02 --exp_name q4_search_b10000_lr0.02",
    "python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 --discount 0.95 -n 100 -l 2 -s 32 -b 10000 -lr 0.02 -rtg --exp_name q4_search_b10000_lr0.02_rtg",
    "python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 --discount 0.95 -n 100 -l 2 -s 32 -b 10000 -lr 0.02 --nn_baseline --exp_name q4_search_b10000_lr0.02_nnbaseline",
    "python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 --discount 0.95 -n 100 -l 2 -s 32 -b 10000 -lr 0.02 -rtg --nn_baseline --exp_name q4_search_b10000_lr0.02_rtg_nnbaseline",
]
run_experiment_commands(halfcheetah_baseline_commands)


In [None]:
#@title Plot HalfCheetah baseline comparisons
q4_tag = 'Eval_AverageReturn'  #@param ["Eval_AverageReturn", "Train_AverageReturn"]
# One legend entry per run launched by the baseline comparison cell.
baseline_configs = [
    RunConfig('Baseline (no RTG / no baseline)', 'q4_search_b10000_lr0.02', 'HalfCheetah-v4'),
    RunConfig('Baseline + RTG', 'q4_search_b10000_lr0.02_rtg', 'HalfCheetah-v4'),
    RunConfig('Baseline + NN baseline', 'q4_search_b10000_lr0.02_nnbaseline', 'HalfCheetah-v4'),
    RunConfig('Baseline + RTG + NN baseline', 'q4_search_b10000_lr0.02_rtg_nnbaseline', 'HalfCheetah-v4'),
]
baseline_data = collect_run_data(baseline_configs, tag=q4_tag)
plot_learning_curves(baseline_data, title='HalfCheetah-v4 baseline comparisons', ylabel=q4_tag.replace('_', ' '))
summarize_run_data(baseline_data)


In [None]:
#@title Run HalfCheetah hyperparameter sweep
halfcheetah_batch_sizes = "5000, 7500, 10000"  #@param {type:"string"}
halfcheetah_learning_rates = "0.01, 0.02, 0.03"  #@param {type:"string"}

hc_b_values = parse_list(halfcheetah_batch_sizes, cast=float)
hc_lr_values = parse_list(halfcheetah_learning_rates, cast=float)

# For every (batch size, learning rate) pair, queue four runs covering each
# combination of the -rtg and --nn_baseline flags.
hc_sweep_commands = []
for b in hc_b_values:
    for lr in hc_lr_values:
        # Name prefix shared by the four variants; the plotting cell rebuilds
        # the same string from the selected values.
        exp = f"q4_b{int(b)}_r{lr}"
        base_cmd = (
            "python rob831/scripts/run_hw2.py --env_name HalfCheetah-v4 --ep_len 150 --discount 0.95 -n 100 -l 2 -s 32 "
            f"-b {int(b)} -lr {lr}"
        )
        hc_sweep_commands.extend([
            f"{base_cmd} --exp_name {exp}",
            f"{base_cmd} -rtg --exp_name {exp}_rtg",
            f"{base_cmd} --nn_baseline --exp_name {exp}_nnbaseline",
            f"{base_cmd} -rtg --nn_baseline --exp_name {exp}_rtg_nnbaseline",
        ])

run_experiment_commands(hc_sweep_commands)


In [None]:
#@title Plot HalfCheetah runs with selected b* and r*
optimal_batch_size = 10000  #@param {type:"integer"}
optimal_learning_rate = 0.02  #@param {type:"number"}
q4_opt_tag = 'Eval_AverageReturn'  #@param ["Eval_AverageReturn", "Train_AverageReturn"]

# Rebuild the experiment-name prefix used by the sweep cell for these values.
opt_prefix = f"q4_b{optimal_batch_size}_r{optimal_learning_rate}"
optimal_configs = [
    RunConfig('No RTG / no baseline', opt_prefix, 'HalfCheetah-v4'),
    RunConfig('+ RTG', f"{opt_prefix}_rtg", 'HalfCheetah-v4'),
    RunConfig('+ NN baseline', f"{opt_prefix}_nnbaseline", 'HalfCheetah-v4'),
    RunConfig('+ RTG + NN baseline', f"{opt_prefix}_rtg_nnbaseline", 'HalfCheetah-v4'),
]
optimal_data = collect_run_data(optimal_configs, tag=q4_opt_tag)
plot_learning_curves(optimal_data, title=f'HalfCheetah-v4 (b*={optimal_batch_size}, r*={optimal_learning_rate})', ylabel=q4_opt_tag.replace('_', ' '))
summarize_run_data(optimal_data)


## Experiment 5 (Hopper-v4): GAE ablation

Run the four GAE settings and compare their performance.

In [None]:
#@title Run Hopper GAE experiments
gae_lambdas = "0.0, 0.95, 0.99, 1.0"  #@param {type:"string"}

# Lambdas are parsed as floats, so the experiment names embed the float form
# (for example `lambda0.0` and `lambda1.0`).
lambda_values = parse_list(gae_lambdas, cast=float)
# One Hopper run per lambda value; all other settings are fixed.
hopper_commands = []
for lam in lambda_values:
    exp = f"q5_b2000_r0.001_lambda{lam}"
    cmd = (
        "python rob831/scripts/run_hw2.py --env_name Hopper-v4 --ep_len 1000 --discount 0.99 "
        "-n 300 -l 2 -s 32 -b 2000 -lr 0.001 --reward_to_go --nn_baseline --action_noise_std 0.5 "
        f"--gae_lambda {lam} --exp_name {exp}"
    )
    hopper_commands.append(cmd)

run_experiment_commands(hopper_commands)


In [None]:
#@title Plot Hopper GAE learning curves
q5_tag = 'Eval_AverageReturn'  #@param ["Eval_AverageReturn", "Train_AverageReturn"]

# Reuse the lambdas from the run cell when it has been executed in this
# session; otherwise fall back to the default four values so this cell can be
# run on its own against existing logs.
lambda_values_for_plot = lambda_values if 'lambda_values' in globals() else parse_list('0.0,0.95,0.99,1.0', cast=float)

# Experiment names must match the format used when the runs were launched.
gae_configs = [RunConfig(f"lambda={lam}", f"q5_b2000_r0.001_lambda{lam}", 'Hopper-v4') for lam in lambda_values_for_plot]
gae_data = collect_run_data(gae_configs, tag=q5_tag)
# Curves are smoothed with a 5-point moving average before plotting.
plot_learning_curves(gae_data, title='Hopper-v4 GAE comparison', ylabel=q5_tag.replace('_', ' '), smoothing_window=5)
summarize_run_data(gae_data)
