<a href="https://colab.research.google.com/github/zahra-eslamian/artificial_intelligence_A_to_Z/blob/main/dqn_lunar_lander.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# System dependency needed to build Box2D / box2d-py
!apt-get install -y swig

# Gymnasium with Box2D support.
# %pip installs into the environment of the running kernel, and the extras
# are quoted so the shell never treats the square brackets as a glob pattern.
%pip install gymnasium
%pip install "gymnasium[box2d]"

# (Optional) Only if you really need Atari, not needed for LunarLander:
# %pip install "gymnasium[atari]"

# DQN dependencies
%pip install torch
# The ffmpeg extra provides the backend imageio uses to write the .mp4 video.
%pip install "imageio[ffmpeg]"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 41 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 727 kB in 1s (564 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 121713 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubuntu1

In [5]:
import random
from collections import deque

import gymnasium as gym
import imageio
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# =====================
# Hyperparameters
# =====================
ENV_ID = "LunarLander-v3"

SEED = 0                  # seed for the Python, NumPy and PyTorch global RNGs

GAMMA = 0.99              # discount factor used in the TD target
LR = 5e-4                 # a bit smaller, more stable
BATCH_SIZE = 64           # transitions sampled per optimisation step
BUFFER_SIZE = 100_000     # replay buffer capacity (oldest transitions are evicted)
MIN_REPLAY_SIZE = 5_000   # start learning earlier

EPS_START = 1.0
EPS_END = 0.05
EPS_DECAY = 100_000       # decay over ~100k steps

INTERPOLATION_PARAMETER = 1e-3  # tau for soft target network updates

MAX_EPISODES = 1000
MAX_STEPS_PER_EPISODE = 1000

VIDEO_FILENAME = "lunar_lander_dqn.mp4"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Seed every global RNG the notebook draws from so that network
# initialisation, epsilon-greedy exploration and replay sampling are
# repeatable after a kernel restart.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


# =====================
# DQN Network
# =====================
class DQN(nn.Module):
    """Fully connected Q-network.

    Maps an observation vector of size ``obs_dim`` to ``n_actions`` outputs
    through two hidden layers of 128 units with ReLU activations.
    """

    def __init__(self, obs_dim, n_actions):
        super().__init__()
        hidden_units = 128
        # Layers are created in input-to-output order and stored under
        # ``self.net`` so parameter names stay ``net.0``, ``net.2``, ``net.4``.
        stack = [
            nn.Linear(obs_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, n_actions),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for input ``x``."""
        q_values = self.net(x)
        return q_values


# =====================
# Replay Buffer
# =====================
class ReplayBuffer:
    """Fixed-capacity FIFO store of ``(state, action, reward, next_state, done)`` tuples.

    Args:
        capacity: Maximum number of transitions kept; once full, the oldest
            transition is dropped on every ``push``.
        device: Torch device that sampled batches are placed on. Defaults to
            CUDA when available, otherwise CPU.
    """

    def __init__(self, capacity, device=None):
        self.buffer = deque(maxlen=capacity)
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

    def push(self, state, action, reward, next_state, done):
        """Append one transition, evicting the oldest one when at capacity."""
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` as tensors on
            ``self.device``. ``actions`` is int64; the others are float32.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        def to_tensor(column, np_dtype):
            # Stack into ONE contiguous NumPy array first: handing torch a
            # tuple of ndarrays makes it convert element by element, which is
            # very slow and triggers a PyTorch warning.
            return torch.as_tensor(np.asarray(column, dtype=np_dtype), device=self.device)

        return (
            to_tensor(states, np.float32),
            to_tensor(actions, np.int64),
            to_tensor(rewards, np.float32),
            to_tensor(next_states, np.float32),
            to_tensor(dones, np.float32),
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


# =====================
# Epsilon-greedy policy
# =====================
def get_epsilon(step):
    """Return the exploration rate for ``step``.

    The value falls linearly from ``EPS_START`` at step 0 to ``EPS_END`` at
    step ``EPS_DECAY`` and stays at ``EPS_END`` afterwards.
    """
    span = EPS_START - EPS_END
    steps_remaining = max(0, EPS_DECAY - step)
    return EPS_END + span * steps_remaining / EPS_DECAY


def select_action(policy_net, state, step, n_actions):
    """Pick an action with an epsilon-greedy rule.

    With probability ``get_epsilon(step)`` a uniformly random action index in
    ``[0, n_actions)`` is returned; otherwise the argmax of ``policy_net``'s
    output for ``state`` is returned as a Python int.
    """
    explore_prob = get_epsilon(step)

    # Explore
    if random.random() < explore_prob:
        return random.randrange(n_actions)

    # Exploit: add a batch dimension, then take the best-scoring action.
    observation = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    with torch.no_grad():
        greedy_action = torch.argmax(policy_net(observation), dim=1)
    return int(greedy_action.item())


# =====================
# Soft target update (Polyak averaging)
# =====================
def soft_update(target_net, policy_net, tau):
    """Blend ``policy_net``'s parameters into ``target_net`` in place.

    Every target parameter becomes ``tau * policy + (1 - tau) * target``.
    Parameters are paired in ``parameters()`` order; ``policy_net`` is not
    modified.
    """
    paired_params = zip(target_net.parameters(), policy_net.parameters())
    with torch.no_grad():
        for dst, src in paired_params:
            blended = tau * src.data + (1.0 - tau) * dst.data
            dst.data.copy_(blended)


# =====================
# Optimize DQN
# =====================
def compute_loss(policy_net, target_net, optimizer, replay_buffer):
    """Run one optimisation step on a sampled batch and return the loss.

    Samples ``BATCH_SIZE`` transitions from ``replay_buffer``, forms the
    one-step TD target from ``target_net``, applies a Huber loss to
    ``policy_net``'s Q-values for the taken actions, and performs one
    gradient-clipped optimizer step.

    Returns:
        The scalar loss value as a Python float.
    """
    states, actions, rewards, next_states, dones = replay_buffer.sample(BATCH_SIZE)

    # Q(s, a) for the action that was actually taken in each transition.
    action_index = actions.unsqueeze(1)
    predicted_q = policy_net(states).gather(1, action_index).squeeze(1)

    # Bootstrapped target; the (1 - dones) factor removes the next-state
    # term for transitions flagged as done.
    with torch.no_grad():
        best_next_q, _ = target_net(next_states).max(1)
        td_target = rewards + GAMMA * best_next_q * (1 - dones)

    # Huber loss (SmoothL1Loss) instead of MSE
    huber = nn.SmoothL1Loss()
    loss = huber(predicted_q, td_target)

    optimizer.zero_grad()
    loss.backward()
    # Clip the global gradient norm before stepping, for stability.
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), max_norm=1.0)
    optimizer.step()

    return loss.item()


# =====================
# Evaluation of trained agent
# =====================
def evaluate_agent(policy_net, n_episodes=20):
    """Roll out the greedy policy and report summary statistics.

    A fresh ``ENV_ID`` environment is created; episode ``ep`` is reset with
    ``seed=ep`` and played with the argmax action of ``policy_net`` until the
    environment reports termination or truncation.

    Args:
        policy_net: Network mapping a batch of observations to per-action values.
        n_episodes: Number of evaluation episodes to run.

    Returns:
        List with the total reward of each episode (empty if no episode ran).
    """
    env = gym.make(ENV_ID)
    rewards = []

    try:
        for ep in range(n_episodes):
            state, _ = env.reset(seed=ep)
            done = False
            ep_reward = 0.0

            while not done:
                state_v = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
                with torch.no_grad():
                    q_values = policy_net(state_v)
                    action = int(torch.argmax(q_values, dim=1).item())

                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                ep_reward += reward
                state = next_state

            rewards.append(ep_reward)
    finally:
        # Release the environment even when a rollout raises.
        env.close()

    if not rewards:
        # np.min / np.max raise on an empty sequence, so report and stop here.
        print("\nEvaluation skipped: no episodes were run.\n")
        return rewards

    print(f"\nEvaluation over {n_episodes} episodes:")
    print(f"  Mean reward: {np.mean(rewards):.2f}")
    print(f"  Min reward:  {np.min(rewards):.2f}")
    print(f"  Max reward:  {np.max(rewards):.2f}\n")
    return rewards


# =====================
# Record best episode video
# =====================
def record_best_video(policy_net, filename=VIDEO_FILENAME, episodes=10):
    """
    Run the trained policy for several episodes and save a video
    of the *best* episode (highest total reward).

    Episode ``ep`` is reset with ``seed=ep`` and played greedily (argmax of
    ``policy_net``) for at most ``MAX_STEPS_PER_EPISODE`` steps. One frame is
    rendered after every environment step.

    Args:
        policy_net: Network mapping a batch of observations to per-action values.
        filename: Output path handed to ``imageio.get_writer``.
        episodes: Number of rollouts to choose the best one from.

    Returns:
        None. Nothing is written when ``episodes`` is 0.
    """
    video_env = gym.make(ENV_ID, render_mode="rgb_array")

    best_frames = None
    best_reward = -float("inf")

    try:
        for ep in range(episodes):
            state, _ = video_env.reset(seed=ep)
            done = False
            step = 0
            ep_reward = 0.0
            frames = []

            while not done and step < MAX_STEPS_PER_EPISODE:
                state_v = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
                with torch.no_grad():
                    q_values = policy_net(state_v)
                    action = int(torch.argmax(q_values, dim=1).item())

                next_state, reward, terminated, truncated, _ = video_env.step(action)
                done = terminated or truncated

                frame = video_env.render()
                frames.append(frame)

                state = next_state
                ep_reward += reward
                step += 1

            print(f"Video rollout episode {ep}: total reward = {ep_reward:.2f}")

            # Keep only the frames of the highest-reward episode seen so far.
            if ep_reward > best_reward:
                best_reward = ep_reward
                best_frames = frames
    finally:
        # Release the render environment even when a rollout raises.
        video_env.close()

    if best_frames is None:
        print("No episodes recorded, something went wrong.")
        return

    print(f"\nSaving best episode (reward={best_reward:.2f}) "
          f"to {filename} with {len(best_frames)} frames...")
    with imageio.get_writer(filename, fps=30) as writer:
        for frame in best_frames:
            writer.append_data(frame)
    print("Video saved:", filename)


# =====================
# Main training loop
# =====================
if __name__ == "__main__":
    # End-to-end run: build the networks, warm up the replay buffer with a
    # random policy, train with epsilon-greedy DQN and soft target updates,
    # then evaluate the result and record a video.
    env = gym.make(ENV_ID)
    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.n

    print("Observation dim:", obs_dim)
    print("Number of actions:", n_actions)

    # Networks: the target network starts as an exact copy of the policy network.
    policy_net = DQN(obs_dim, n_actions).to(device)
    target_net = DQN(obs_dim, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=LR)
    replay_buffer = ReplayBuffer(BUFFER_SIZE)

    # Fill replay buffer with random experience
    print("Filling replay buffer with random policy...")
    state, _ = env.reset(seed=0)
    # The action space has its own RNG; seed it like the environment so the
    # random warm-up is repeatable.
    env.action_space.seed(0)
    while len(replay_buffer) < MIN_REPLAY_SIZE:
        action = env.action_space.sample()
        next_state, reward, terminated, truncated, _ = env.step(action)

        # Store `terminated` only: a truncated episode (e.g. a time limit)
        # did not reach a terminal state, so its target must still bootstrap
        # from next_state.
        replay_buffer.push(state, action, reward, next_state, terminated)

        if terminated or truncated:
            state, _ = env.reset()
        else:
            state = next_state

    print(f"Replay buffer filled with {len(replay_buffer)} transitions. Starting training.")

    total_steps = 0
    episode_rewards = []

    for episode in range(1, MAX_EPISODES + 1):
        state, _ = env.reset()
        episode_reward = 0.0

        for _ in range(MAX_STEPS_PER_EPISODE):
            total_steps += 1

            # Choose action
            action = select_action(policy_net, state, total_steps, n_actions)

            # Environment step
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Store experience; the terminal flag excludes truncation so the
            # TD target keeps its bootstrap term for truncated transitions.
            replay_buffer.push(state, action, reward, next_state, terminated)

            state = next_state
            episode_reward += reward

            # Optimize model (one gradient step per environment step)
            compute_loss(policy_net, target_net, optimizer, replay_buffer)

            # Soft update target network
            soft_update(target_net, policy_net, INTERPOLATION_PARAMETER)

            # Either signal ends the rollout.
            if terminated or truncated:
                break

        episode_rewards.append(episode_reward)
        avg_reward = np.mean(episode_rewards[-20:])

        print(
            f"Episode {episode:4d} | "
            f"Reward: {episode_reward:7.2f} | "
            f"Avg(20): {avg_reward:7.2f} | "
            f"Steps: {total_steps:7d} | "
            f"Epsilon: {get_epsilon(total_steps):.3f}"
        )

        # Simple "solved" condition: 20-episode moving average of at least 200.
        if avg_reward >= 200.0 and episode >= 20:
            print("Environment solved, stopping training.")
            break

    env.close()

    # Evaluate trained agent
    evaluate_agent(policy_net, n_episodes=20)

    # Record video of the best episode among several rollouts
    record_best_video(policy_net, filename=VIDEO_FILENAME, episodes=10)


Using device: cpu
Observation dim: 8
Number of actions: 4
Filling replay buffer with random policy...
Replay buffer filled with 5000 transitions. Starting training.
Episode    1 | Reward: -103.66 | Avg(20): -103.66 | Steps:      73 | Epsilon: 0.999
Episode    2 | Reward: -187.40 | Avg(20): -145.53 | Steps:     172 | Epsilon: 0.998
Episode    3 | Reward: -202.66 | Avg(20): -164.57 | Steps:     263 | Epsilon: 0.998
Episode    4 | Reward:  -83.26 | Avg(20): -144.24 | Steps:     365 | Epsilon: 0.997
Episode    5 | Reward: -140.18 | Avg(20): -143.43 | Steps:     435 | Epsilon: 0.996
Episode    6 | Reward:  -71.93 | Avg(20): -131.51 | Steps:     506 | Epsilon: 0.995
Episode    7 | Reward: -148.75 | Avg(20): -133.98 | Steps:     604 | Epsilon: 0.994
Episode    8 | Reward: -323.09 | Avg(20): -157.62 | Steps:     726 | Epsilon: 0.993
Episode    9 | Reward: -267.25 | Avg(20): -169.80 | Steps:     817 | Epsilon: 0.992
Episode   10 | Reward: -110.44 | Avg(20): -163.86 | Steps:     879 | Epsilon: 0



Video rollout episode 9: total reward = 265.08

Saving best episode (reward=284.38) to lunar_lander_dqn.mp4 with 651 frames...
Video saved: lunar_lander_dqn.mp4
