In [None]:
####### Pip Installs gymnasium and Stable-Baselines
!pip install gymnasium[box2d] stable-baselines3 shimmy

Collecting stable-baselines3
  Downloading stable_baselines3-2.7.1-py3-none-any.whl.metadata (4.8 kB)
Collecting shimmy
  Downloading Shimmy-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.* (from gymnasium[box2d])
  Downloading swig-4.4.0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.4.0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading stable_baselines3-2.7.1-py3-none-any.whl (188 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.0/188.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25

In [None]:
###### pip Installs dependencies and code for google drive mount
# 1. Install system dependencies
!apt-get install -y swig cmake ffmpeg xvfb python3-opengl

# 2. Install Python packages
!pip install gymnasium[box2d] stable-baselines3[extra] pyvirtualdisplay

# 3. Mount Google Drive for persistent storage
from google.colab import drive
drive.mount('/content/drive')

# Create a dedicated folder in your Drive for the Hardcore agent
import os
save_path = "/content/drive/MyDrive/RL_Hardcore_Project/"
os.makedirs(save_path, exist_ok=True)

# 4. Start virtual display
from pyvirtualdisplay import Display
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3-opengl is already the newest version (3.1.5+dfsg-1).
swig is already the newest version (4.0.2-1ubuntu1).
cmake is already the newest version (3.22.1-1ubuntu1.22.04.2).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
xvfb is already the newest version (2:21.1.4-2ubuntu1.7~22.04.16).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<pyvirtualdisplay.display.Display at 0x7dbf1c9306e0>

In [None]:
###### setting up gym environment
import gymnasium as gym
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.monitor import Monitor

# Folder on Drive that receives the Monitor episode log; created on first use.
log_dir = "/content/drive/MyDrive/RL_Hardcore_Project/logs/"
os.makedirs(log_dir, exist_ok=True)


def make_env():
    """Create one monitored BipedalWalker Hardcore environment.

    Returns:
        The Hardcore walker (rendering to RGB arrays) wrapped in a Monitor that
        logs per-episode reward ('r') and length ('l') under ``log_dir``.
    """
    hardcore_walker = gym.make("BipedalWalkerHardcore-v3", render_mode="rgb_array")
    return Monitor(hardcore_walker, log_dir)

# Wrap the single environment in a VecEnv, then normalize observations and rewards.
# Observations are clipped to [-10, 10] after normalization.
env = VecNormalize(
    DummyVecEnv([make_env]),
    norm_obs=True,
    norm_reward=True,
    clip_obs=10.0,
)

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  from pkg_resources import resource_stream, resource_exists
  return datetime.utcnow().replace(tzinfo=utc)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in

In [None]:
###### Code for SAC model definition and learning

import os
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CheckpointCallback

# 1. Periodic checkpoints written to save_path every 25k steps.
# Each checkpoint also stores the replay buffer and the VecNormalize statistics,
# so a run can be resumed from Drive with its memory and normalization intact.
checkpoint_callback = CheckpointCallback(
    name_prefix="sac_hardcore",
    save_path=save_path,
    save_freq=25_000,
    save_vecnormalize=True,
    save_replay_buffer=True,
)

# 2. SAC hyperparameters used for the Hardcore task, gathered in one place.
sac_hyperparams = dict(
    verbose=1,
    learning_rate=3e-4,
    buffer_size=1_000_000,  # replay memory; this is what gets saved alongside the model
    batch_size=256,
    ent_coef='auto',
    gamma=0.99,
    tau=0.005,
    train_freq=1,
    gradient_steps=1,
    policy_kwargs={"net_arch": [512, 512]},
)

# MLP policy trained on the normalized vectorized environment built earlier.
model = SAC("MlpPolicy", env, **sac_hyperparams)

# 3. Train, then always persist the model, normalization stats and replay buffer.
print("Starting Hardcore training. To stop and save early, press the STOP button.")

# Flipped to True only when learn() returns normally. It decides whether the saved
# artifacts are labelled "final" or "manual_stop".
training_completed = False

try:
    model.learn(
        total_timesteps=2000000, # Large goal for Hardcore
        callback=checkpoint_callback,
        progress_bar=True,
        log_interval=10
    )
    training_completed = True
except KeyboardInterrupt:
    print("\n[MANUAL STOP] Interrupt detected. Performing emergency save to Google Drive...")
finally:
    # Runs on normal completion, on a manual stop, and on any other exception
    # (which is re-raised after the save).
    if training_completed:
        # A finished run is saved under the "*_final" names that the evaluation
        # cell looks for, instead of being mislabelled as a manual stop.
        final_zip = f"{save_path}sac_hardcore_final"
        final_pkl = f"{save_path}vec_normalize_hardcore_final.pkl"
        final_rb  = f"{save_path}replay_buffer_hardcore_final.pkl"
    else:
        final_zip = f"{save_path}sac_hardcore_manual_stop"
        final_pkl = f"{save_path}vec_normalize_manual_stop.pkl"
        final_rb  = f"{save_path}replay_buffer_manual_stop.pkl"

    # Save Model Weights (Stable-Baselines3 appends the .zip extension)
    model.save(final_zip)
    # Save Normalization Stats
    env.save(final_pkl)
    # Save Replay Buffer (Memory)
    model.save_replay_buffer(final_rb)

    print("--- ALL DATA SAVED TO DRIVE ---")
    print(f"Model: {final_zip}.zip")
    print(f"Stats: {final_pkl}")
    print(f"Buffer: {final_rb}")

In [None]:
###### code for plotting reward results
import pandas as pd
import matplotlib.pyplot as plt
from stable_baselines3.common.monitor import load_results

# Read the Monitor episode log(s) stored under log_dir into a single DataFrame.
results = load_results(log_dir)
episode_axis = results.l.index  # one x position per logged episode

fig, ax = plt.subplots(figsize=(10, 5))

# Raw per-episode reward, drawn faintly behind the smoothed curve.
ax.plot(episode_axis, results.r, alpha=0.2, color='blue', label='Episode Reward')

# 50-episode moving average; undefined (NaN) until 50 episodes are available.
rolling_mean = results.r.rolling(window=50).mean()
ax.plot(episode_axis, rolling_mean, color='red', label='Smoothed Mean (50 ep)')

ax.set_title("Hardcore Bipedal Walker Learning Curve")
ax.set_xlabel("Episodes")
ax.set_ylabel("Reward")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
###### code for producing video for showing walker performance after learning
import base64
from pathlib import Path
from IPython import display as ipythondisplay
from stable_baselines3.common.vec_env import VecVideoRecorder, VecNormalize

def show_video(video_path):
    """Embed an MP4 file in the notebook output as an inline HTML5 player.

    Args:
        video_path: Location of the video file (anything ``Path`` accepts).

    If the path is not an existing file, a message is printed and nothing is
    displayed. Otherwise the file is read in full and inlined as a base64
    data URI.
    """
    clip = Path(video_path)
    if not clip.is_file():
        print("Video file not found.")
        return

    encoded = base64.b64encode(clip.read_bytes()).decode("ascii")
    player_html = f'''
            <video width="600" height="400" controls>
                <source src="data:video/mp4;base64,{encoded}" type="video/mp4" />
            </video>'''
    ipythondisplay.display(ipythondisplay.HTML(data=player_html))

# 1. Locate the saved model and its matching normalization statistics on Drive.
# Prefer the "*_final" pair; otherwise fall back to the "*_manual_stop" pair written
# by the training cell's emergency save. Model and stats are always taken from the
# same pair so observations are normalized with the statistics the policy was
# trained on.
artifact_pairs = [
    (f"{save_path}sac_hardcore_final.zip", f"{save_path}vec_normalize_hardcore_final.pkl"),
    (f"{save_path}sac_hardcore_manual_stop.zip", f"{save_path}vec_normalize_manual_stop.pkl"),
]
for model_path, stats_path in artifact_pairs:
    if Path(model_path).is_file() and Path(stats_path).is_file():
        break
else:
    tried = "\n".join(f"  {m}\n  {s}" for m, s in artifact_pairs)
    raise FileNotFoundError(
        "No saved model / VecNormalize pair found. Looked for:\n" + tried
    )
print(f"Evaluating model: {model_path}")


# 2. Reconstruct Eval Environment (Hardcore)
def make_eval_env():
    """Build the Hardcore walker with a Monitor that writes no log file.

    The training factory attaches its Monitor to log_dir; creating another one
    there would reopen (and by default overwrite) the training monitor.csv that
    the learning-curve plot reads.
    """
    return Monitor(gym.make("BipedalWalkerHardcore-v3", render_mode="rgb_array"))


eval_env = DummyVecEnv([make_eval_env])
eval_env = VecNormalize.load(stats_path, eval_env)

# IMPORTANT: freeze the running statistics and report raw (un-normalized) rewards.
eval_env.training = False
eval_env.norm_reward = False

# 3. Setup the Video Recorder (recording starts at step 0).
video_folder = "./videos_hardcore/"
video_prefix = "hardcore-walker-result"
video_length = 2000  # Hardcore episodes are longer due to obstacles
eval_env = VecVideoRecorder(
    eval_env,
    video_folder,
    record_video_trigger=lambda x: x == 0,
    video_length=video_length,
    name_prefix=video_prefix
)

# 4. Load the trained model
model = SAC.load(model_path, env=eval_env)

# 5. Run evaluation for at most video_length steps, stopping at the first episode end.
print("Testing Hardcore Agent and Recording Video...")
obs = eval_env.reset()
total_reward = 0.0

for i in range(video_length):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = eval_env.step(action)
    total_reward += rewards[0]
    if dones[0]:
        print(f"Episode finished early at step {i}")
        break

# Closing the recorder finalizes the video file.
eval_env.close()
print(f"Final Evaluation Score: {total_reward:.2f}")

# 6. Display the result. The recorder names the file after the step range it was
# asked to record, even when the episode ends before video_length steps.
video_file = Path(video_folder) / f"{video_prefix}-step-0-to-step-{video_length}.mp4"
show_video(video_file)