In [4]:
# -----------------------------------------------------------------
# STEP 0: FULL COLAB SETUP
# -----------------------------------------------------------------
# 1. Update the Linux package manager
!apt-get update

# 2. Install SWIG (the C++ build dependency for Box2D)
!apt-get install -y swig

# 3. Install the Python libraries (gym, box2d, stable-baselines3)
!pip install gymnasium "gymnasium[box2d]"
!pip install stable-baselines3[extra]
!pip install imageio-ffmpeg

print("--- All dependencies installed successfully! ---")

0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Connecting to security.                                                                               Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Connected                                                                               Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Waiting for headers] [Waiting for headers] [3 InRelease 3,632 B/3,632 B 1000% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.                                                                               Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Waiting for headers] [4 InRelease 14.2 kB/129 kB 11%] [Waiting for headers]             

In [6]:

# General imports
import gymnasium as gym
import numpy as np
import time
import glob
import io
import base64

# Imports for the RL agent and training
from stable_baselines3 import PPO  # We'll use PPO as our algorithm
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv

# Imports for video display
from IPython.display import HTML
import imageio

print("--- Libraries installed and imported ---")

# %%
# -----------------------------------------------------------------
# HELPER FUNCTION: VIDEO VISUALIZATION
# -----------------------------------------------------------------
# This helper function will let us watch our trained agents
# It is an updated version of the one in your original notebook

def show_video_of_model(model, env, video_filename="video.mp4"):
    """
    Renders the environment and saves a video of the model's performance.
    """
    print(f"--- Generating video for {video_filename} ---")
    frames = []

    # Note: We must create a *new* env for this.
    # *** CORRECTION HERE ***
    video_env = gym.make(env.spec.id, render_mode='rgb_array', **env.spec.kwargs)
    obs, _ = video_env.reset()
    done = False

    while not done:
        frames.append(video_env.render())
        # The stable-baselines3 model.predict() handles the state
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = video_env.step(action)
        if done or truncated:
            break

    video_env.close()

    # Save the video
    imageio.mimsave(video_filename, frames, fps=30)
    print(f"--- Video saved as {video_filename} ---")


def show_video(video_filename="video.mp4"):
    """
    Finds a .mp4 file and displays it in the notebook.
    """
    mp4list = glob.glob(video_filename)
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        return HTML(data=f'''<video alt="test" autoplay loop controls style="height: 400px;">
                        <source src="data:video/mp4;base64,{encoded.decode('ascii')}" type="video/mp4" />
                     </video>''')
    else:
        print(f"Could not find video {video_filename}")
        return None

# %%
# -----------------------------------------------------------------
# PART 1: THE CROSS-DOMAIN PROBLEM
# Train on Earth, Fail on Moon
# -----------------------------------------------------------------
print("\n--- PART 1: Proving the Cross-Domain Problem ---")

# Create the two separate environments as defined in your proposal
# Earth Gravity: -9.8 m/s^2
# Moon Gravity: -1.62 m/s^2
EARTH_GRAVITY = -9.8
MOON_GRAVITY = -1.62

# Create the training (Earth) environment
# *** CORRECTION HERE ***
train_env = make_vec_env("LunarLander-v3",
                         n_envs=16,
                         env_kwargs={"gravity": EARTH_GRAVITY})

# Create the testing (Moon) environment
# *** CORRECTION HERE ***
test_env_moon = gym.make("LunarLander-v3", gravity=MOON_GRAVITY)

# --- 1A: Train the Agent ---
print("--- Training PPO agent on EARTH gravity... ---")
# Initialize the PPO agent
# "MlpPolicy" is the same as the 'Network' class in your notebook
model_earth = PPO("MlpPolicy", train_env, verbose=0)

# Train the agent (this will take 1-2 minutes)
start_time = time.time()
model_earth.learn(total_timesteps=200_000)
end_time = time.time()
print(f"--- Training finished in {end_time - start_time:.2f} seconds ---")


# --- 1B: Evaluate the Agent ---
print("--- Evaluating 'Earth-Trained' model... ---")

# Evaluate the model on the Earth environment (it should do well)
mean_reward_earth, std_reward_earth = evaluate_policy(model_earth, train_env, n_eval_episodes=10)
print(f"Avg. reward on EARTH: {mean_reward_earth:.2f} +/- {std_reward_earth:.2f}")

# Evaluate the *same* model on the Moon environment (it should fail)
mean_reward_moon, std_reward_moon = evaluate_policy(model_earth, test_env_moon, n_eval_episodes=10)
print(f"Avg. reward on MOON: {mean_reward_moon:.2f} +/- {std_reward_moon:.2f}")

print("\n--- CONCLUSION: The agent trained on Earth cannot land on the Moon! ---")

# Generate a video of the failure
show_video_of_model(model_earth, test_env_moon, "1_fail_on_moon.mp4")

# %%
# Display the video of the agent failing on the Moon
show_video("1_fail_on_moon.mp4")

# %%
# -----------------------------------------------------------------
# PART 2: THE DOMAIN RANDOMIZATION (DR) SOLUTION
# Train on *all* gravities, Succeed on *both* Earth and Moon
# -----------------------------------------------------------------
print("\n--- PART 2: Implementing the Domain Randomization Solution ---")

# This class is your "Domain Randomization Engine"
class DomainRandomizationWrapper(gym.Wrapper):
    """
    This wrapper randomizes the environment's gravity at the
    start of each new episode.
    """
    def __init__(self, env):
        super().__init__(env)
        # Define the range of gravities to sample from
        self.gravity_range = [MOON_GRAVITY, EARTH_GRAVITY] # [-1.62, -9.8]
        print(f"Domain Randomization Wrapper initialized with gravity range {self.gravity_range}")

    def reset(self, **kwargs):
        """
        This is the key function. It's called at the start of every episode.
        """
        # 1. Sample a new, random gravity
        new_gravity = np.random.uniform(low=self.gravity_range[1],
                                          high=self.gravity_range[0])

        # 2. Apply this gravity to the underlying Box2D world
        # We use 'unwrapped' to access the core environment
        self.env.unwrapped.world.gravity = (0, new_gravity)

        # 3. Call the original reset function
        return self.env.reset(**kwargs)

# --- 2A: Create and Train the DR Agent ---

# Create a function to instantiate our new random environment
def make_random_env():
    # *** CORRECTION HERE ***
    env = gym.make("LunarLander-v3", gravity=EARTH_GRAVITY) # Start with base gravity
    env = DomainRandomizationWrapper(env)
    return env

# Create a vectorized environment of our new random env
train_env_random = make_vec_env(make_random_env, n_envs=16)

print("--- Training ROBUST agent on RANDOM gravities... ---")
# Initialize a new PPO agent
model_robust = PPO("MlpPolicy", train_env_random, verbose=0)

# Train the agent (this will take 1-2 minutes)
start_time = time.time()
model_robust.learn(total_timesteps=200_000)
end_time = time.time()
print(f"--- Training finished in {end_time - start_time:.2f} seconds ---")


# --- 2B: Evaluate the Robust Agent ---
print("--- Evaluating 'Robust-Trained' model... ---")

# Evaluate the model on the Earth environment
# *** CORRECTION HERE ***
test_env_earth = gym.make("LunarLander-v3", gravity=EARTH_GRAVITY)
mean_reward, std_reward = evaluate_policy(model_robust, test_env_earth, n_eval_episodes=10)
print(f"Avg. reward on EARTH: {mean_reward:.2f} +/- {std_reward:.2f}")

# Evaluate the model on the Moon environment
# (test_env_moon was already created with v3)
mean_reward, std_reward = evaluate_policy(model_robust, test_env_moon, n_eval_episodes=10)
print(f"Avg. reward on MOON: {mean_reward:.2f} +/- {std_reward:.2f}")

print("\n--- CONCLUSION: The robust agent can land on both Earth AND the Moon! ---")

# Generate a video of the success
show_video_of_model(model_robust, test_env_moon, "2_success_on_moon.mp4")

# %%
# Display the video of the robust agent succeeding on the Moon
show_video("2_success_on_moon.mp4")

# %%
# Generate a video of the robust agent succeeding on Earth
show_video_of_model(model_robust, test_env_earth, "3_success_on_earth.mp4")

# %%
# Display the video of the robust agent succeeding on Earth
show_video("3_success_on_earth.mp4")

--- Libraries installed and imported ---

--- PART 1: Proving the Cross-Domain Problem ---
--- Training PPO agent on EARTH gravity... ---
--- Training finished in 284.17 seconds ---
--- Evaluating 'Earth-Trained' model... ---
Avg. reward on EARTH: -149.37 +/- 47.87
Avg. reward on MOON: -116.10 +/- 98.06

--- CONCLUSION: The agent trained on Earth cannot land on the Moon! ---
--- Generating video for 1_fail_on_moon.mp4 ---




--- Video saved as 1_fail_on_moon.mp4 ---

--- PART 2: Implementing the Domain Randomization Solution ---
Domain Randomization Wrapper initialized with gravity range [-1.62, -9.8]
Domain Randomization Wrapper initialized with gravity range [-1.62, -9.8]
Domain Randomization Wrapper initialized with gravity range [-1.62, -9.8]
Domain Randomization Wrapper initialized with gravity range [-1.62, -9.8]
Domain Randomization Wrapper initialized with gravity range [-1.62, -9.8]
Domain Randomization Wrapper initialized with gravity range [-1.62, -9.8]
Domain Randomization Wrapper initialized with gravity range [-1.62, -9.8]
Domain Randomization Wrapper initialized with gravity range [-1.62, -9.8]
Domain Randomization Wrapper initialized with gravity range [-1.62, -9.8]
Domain Randomization Wrapper initialized with gravity range [-1.62, -9.8]
Domain Randomization Wrapper initialized with gravity range [-1.62, -9.8]
Domain Randomization Wrapper initialized with gravity range [-1.62, -9.8]
Domain



--- Video saved as 2_success_on_moon.mp4 ---
--- Generating video for 3_success_on_earth.mp4 ---




--- Video saved as 3_success_on_earth.mp4 ---
