# Flappy Bird — Dueling Double DQN (State-Based, Gymnasium)
This notebook implements a **state-based** Dueling Double DQN for `FlappyBird-v0` from **flappy-bird-gymnasium**.
It includes:
- Dueling DQN network architecture
- Double DQN training logic
- Replay buffer with `push()` and `sample()`
- Reward shaping and survival/progress bonuses
- Slower epsilon decay
- Gradient clipping and step-based target updates
- Model saving/loading and optional gameplay recording

Run on GPU (Colab/VS Code) for faster training. Adjust `episodes` for quick tests.


In [19]:
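# Install dependencies (uncomment to run in Colab / if packages are missing).
# The %pip magic installs into the environment of the running kernel.
# opencv-python provides the `cv2` module imported below.
#%pip install --quiet gymnasium flappy-bird-gymnasium torch numpy matplotlib "imageio[ffmpeg]" tqdm opencv-python
#%pip install --quiet moviepy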


In [20]:
import random
import os
import math
import time
import numpy as np
import gymnasium as gym
import flappy_bird_gymnasium
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
from IPython.display import Video, display
import imageio
import matplotlib.pyplot as plt
import cv2

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device:', device)


Device: cuda


In [21]:
# Dueling DQN (state-based)
class DuelingDQN(nn.Module):
    """Dueling Q-network for flat (vector) states.

    Two fully connected ReLU layers feed a scalar state-value head and a
    per-action advantage head. The heads are combined as
    ``Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)``.

    Args:
        state_size: Length of the input state vector.
        action_size: Number of discrete actions (width of the output).
        hidden: Width of both hidden layers.
    """

    def __init__(self, state_size, action_size, hidden=128):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.value_head = nn.Linear(hidden, 1)
        self.adv_head = nn.Linear(hidden, action_size)

    def forward(self, x):
        """Return Q-values of shape ``(batch, action_size)``.

        Args:
            x: A NumPy array, tensor, or array-like holding one state (1-D)
                or a batch of states (2-D). A 1-D input is treated as a
                single state and gets a leading batch dimension.
        """
        if not torch.is_tensor(x):
            x = torch.as_tensor(np.asarray(x))
        # Follow the layer weights instead of a module-level `device` global,
        # so the network works wherever `.to(...)` has placed it.
        x = x.to(device=self.fc1.weight.device, dtype=torch.float32)
        if x.dim() == 1:
            x = x.unsqueeze(0)
        h = F.relu(self.fc1(x))
        h = F.relu(self.fc2(h))
        value = self.value_head(h)
        adv = self.adv_head(h)
        # Subtracting the mean advantage makes the value/advantage split
        # identifiable: the mean of Q over actions equals V.
        return value + (adv - adv.mean(dim=1, keepdim=True))


In [22]:
# Replay buffer
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Args:
        capacity: Maximum number of stored transitions; once full, the oldest
            transition is dropped for each new one.
        device: Optional torch device (or device string) on which sampled
            batches are created. When omitted, CUDA is used if available,
            otherwise the CPU, so the buffer does not rely on a module-level
            ``device`` variable existing.
    """

    def __init__(self, capacity=50000, device=None):
        self.buffer = deque(maxlen=capacity)
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A tuple ``(states, actions, rewards, next_states, dones)`` of
            tensors on ``self.device``. States are float32, actions are
            int64, rewards and done flags are float32.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        # Stack on the NumPy side first; building a tensor from a tuple of
        # arrays element by element is much slower.
        states = np.asarray(states, dtype=np.float32)
        next_states = np.asarray(next_states, dtype=np.float32)
        return (
            torch.from_numpy(states).to(self.device),
            torch.tensor(actions, dtype=torch.long, device=self.device),
            torch.tensor(rewards, dtype=torch.float32, device=self.device),
            torch.from_numpy(next_states).to(self.device),
            torch.tensor(dones, dtype=torch.float32, device=self.device),
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


In [23]:
# Utilities: stacking, action selection, save/load
def stack_frames(buffer_deque, new_state, k):
    """Add ``new_state`` to the frame history and return the stacked state.

    An empty ``buffer_deque`` is primed with ``k`` copies of ``new_state``;
    otherwise the observation is appended once. The deque is modified in
    place and its contents are concatenated along the last axis.
    """
    copies = 1 if len(buffer_deque) else k
    buffer_deque.extend([new_state] * copies)
    return np.concatenate(list(buffer_deque), axis=-1)

def select_action(q_net, state, action_space, epsilon):
    """Pick an action with an epsilon-greedy policy.

    Args:
        q_net: Network mapping a ``(1, state_dim)`` float tensor to Q-values.
        state: Current state as an array-like of numbers.
        action_space: Object whose ``sample()`` returns a random action.
        epsilon: Probability of taking a random action instead of the
            greedy one.

    Returns:
        The sampled random action, or the greedy action index as an ``int``.
    """
    if random.random() < epsilon:
        return action_space.sample()
    state_t = torch.from_numpy(np.asarray(state, dtype=np.float32)).unsqueeze(0)
    # Place the input on the network's own device rather than relying on a
    # module-level `device` variable that may not match the network.
    first_param = next(q_net.parameters(), None)
    if first_param is not None:
        state_t = state_t.to(first_param.device)
    with torch.no_grad():
        qvals = q_net(state_t)
        return int(torch.argmax(qvals, dim=1).item())

def save_model(net, path):
    """Write the network's parameters (its ``state_dict``) to ``path``."""
    weights = net.state_dict()
    torch.save(weights, path)

def load_model(net, path):
    """Load parameters saved with ``save_model`` into ``net`` in place.

    Args:
        net: Module whose architecture matches the saved ``state_dict``.
        path: File path or file-like object accepted by ``torch.load``.

    The checkpoint is mapped onto the device ``net`` already lives on (CPU
    when the module has no parameters), so no module-level ``device``
    variable is needed.
    """
    first_param = next(net.parameters(), None)
    target = first_param.device if first_param is not None else torch.device('cpu')
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state_dict needs and avoids running arbitrary pickled code.
    state_dict = torch.load(path, map_location=target, weights_only=True)
    net.load_state_dict(state_dict)


In [24]:
# Training function (Dueling + Double DQN, state-based) with improved stability
def train_dueling_double_dqn_state(
    env_name='FlappyBird-v0',
    episodes=2000,
    gamma=0.99,
    lr=5e-5,
    batch_size=128,
    memory_size=100000,
    epsilon_start=1.0,
    epsilon_end=0.05,
    epsilon_decay=0.9998,      # slower decay
    target_update_steps=4000,  # update target network by training steps
    reward_scale=5.0,
    survival_bonus=0.02,
    progress_coef=0.001,
    grad_clip=10.0,
    frame_stack_k=3,
    save_path='best_flappy_dqn_state.pth',
    record_video=False,
    video_path='flappy_state_trained.mp4'
):
    """Train a Dueling Double DQN agent on stacked state vectors.

    Args:
        env_name: Gymnasium environment id.
        episodes: Number of training episodes.
        gamma: Discount factor.
        lr: Adam learning rate.
        batch_size: Replay batch size; training starts once the buffer holds
            this many transitions.
        memory_size: Replay buffer capacity.
        epsilon_start, epsilon_end: Initial and minimum exploration rates.
        epsilon_decay: Multiplicative epsilon decay applied after every
            environment step.
        target_update_steps: Number of gradient steps between hard copies of
            the online network into the target network.
        reward_scale: Multiplier applied to the environment reward.
        survival_bonus: Constant added to the shaped reward every step.
        progress_coef: Weight of the progress bonus (decrease of observation
            element 2 between consecutive steps).
        grad_clip: Maximum gradient norm.
        frame_stack_k: Number of consecutive observations concatenated into
            one state.
        save_path: Where the best checkpoint (by 100-episode moving average
            of the unshaped return) is written.
        record_video: If true, record 10 greedy episodes after training.
        video_path: Video file name, placed under the ``videos`` directory.

    Returns:
        ``(q_net, rewards_history)``: the online network as it stands at the
        end of training and the list of unshaped episode returns.
    """
    env = gym.make(env_name)
    try:
        obs, _ = env.reset()
        state_dim = len(obs) * frame_stack_k
        action_dim = env.action_space.n

        q_net = DuelingDQN(state_dim, action_dim).to(device)
        target_net = DuelingDQN(state_dim, action_dim).to(device)
        target_net.load_state_dict(q_net.state_dict())
        optimizer = optim.Adam(q_net.parameters(), lr=lr)
        memory = ReplayBuffer(memory_size)

        epsilon = epsilon_start
        train_steps = 0
        rewards_history = []
        best_avg = -1e9
        moving_avg_window = 100

        for ep in range(1, episodes + 1):
            obs, _ = env.reset()
            frame_deque = deque(maxlen=frame_stack_k)
            state = stack_frames(frame_deque, obs, frame_stack_k)
            total_reward = 0.0
            done = False
            prev_h_dist = None

            while not done:
                action = select_action(q_net, state, env.action_space, epsilon)
                next_obs, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated

                env_r = float(reward)
                # NOTE(review): element 2 is assumed to be the horizontal
                # distance to the next pipe -- confirm against the
                # environment's observation layout.
                try:
                    h_dist = float(next_obs[2])
                except (IndexError, TypeError, ValueError):
                    # Observation has no usable element 2: skip the bonus.
                    h_dist = None

                # Progress bonus: reward a decrease of the distance feature.
                progress_bonus = 0.0
                if prev_h_dist is not None and h_dist is not None:
                    progress_bonus = progress_coef * max(0.0, prev_h_dist - h_dist)
                if h_dist is not None:
                    prev_h_dist = h_dist

                # Shaped reward, clipped to keep updates stable.
                shaped = env_r * reward_scale + survival_bonus + progress_bonus
                shaped = float(max(-2.0, min(2.0, shaped)))

                next_state = stack_frames(frame_deque, next_obs, frame_stack_k)

                # Only a true terminal state stops bootstrapping. A
                # time-limit truncation still has a meaningful next-state
                # value, so `truncated` must not zero the target.
                memory.push(state, action, shaped, next_state, float(terminated))
                state = next_state
                total_reward += env_r  # log original environment reward (not shaped)

                if len(memory) >= batch_size:
                    states_b, actions_b, rewards_b, next_states_b, dones_b = memory.sample(batch_size)

                    # Q(s, a) for the actions actually taken.
                    q_values = q_net(states_b).gather(1, actions_b.unsqueeze(1)).squeeze(1)

                    # Double DQN: the online net picks the next action, the
                    # target net evaluates it.
                    with torch.no_grad():
                        next_actions = q_net(next_states_b).argmax(dim=1, keepdim=True)
                        next_q = target_net(next_states_b).gather(1, next_actions).squeeze(1)
                        target_q = rewards_b + gamma * next_q * (1.0 - dones_b)

                    loss = F.mse_loss(q_values, target_q)
                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(q_net.parameters(), grad_clip)
                    optimizer.step()

                    train_steps += 1
                    if train_steps % target_update_steps == 0:
                        target_net.load_state_dict(q_net.state_dict())

                # Decay epsilon once per environment step.
                epsilon = max(epsilon_end, epsilon * epsilon_decay)

            rewards_history.append(total_reward)
            # A negative-index slice already copes with histories shorter
            # than the window.
            avg_recent = float(np.mean(rewards_history[-moving_avg_window:]))

            # Checkpoint only once a full window exists, so early noisy
            # averages cannot claim the "best" slot.
            if len(rewards_history) >= moving_avg_window and avg_recent > best_avg:
                best_avg = avg_recent
                save_model(q_net, save_path)

            if ep % 10 == 0 or ep == 1:
                print(f"Episode {ep}/{episodes} | Reward: {total_reward:.2f} | Avg{moving_avg_window}: {avg_recent:.2f} | Epsilon: {epsilon:.4f} | Buffer: {len(memory)}")
    finally:
        # Release the environment even if training fails or is interrupted.
        env.close()

    if record_video:
        video_path = os.path.join("videos", video_path)
        # The writer does not create missing directories; without this a
        # finished training run could fail at the recording step.
        os.makedirs(os.path.dirname(video_path), exist_ok=True)

        num_episodes = 10
        max_steps = 500

        env = gym.make(env_name, render_mode='rgb_array')
        try:
            writer = imageio.get_writer(video_path, fps=30)
            try:
                for ep in range(1, num_episodes + 1):
                    obs, _ = env.reset()
                    frame_deque = deque(maxlen=frame_stack_k)
                    state = stack_frames(frame_deque, obs, frame_stack_k)
                    done = False
                    step = 0

                    while not done and step < max_steps:
                        step += 1
                        # epsilon = 0.0 -> purely greedy playback.
                        action = select_action(q_net, state, env.action_space, 0.0)
                        obs, _, terminated, truncated, _ = env.step(action)
                        done = terminated or truncated
                        state = stack_frames(frame_deque, obs, frame_stack_k)
                        frame = env.render()

                        # Overlay text showing episode number
                        frame = cv2.putText(
                            frame.copy(),
                            f"Episode {ep}",
                            (20, 40),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            1.2,
                            (255, 255, 255),
                            3,
                            cv2.LINE_AA,
                        )
                        writer.append_data(frame)

                    print(f"Recorded episode {ep}/{num_episodes}")
            finally:
                writer.close()
        finally:
            env.close()
        print(f"Saved compiled video to {video_path}")

    return q_net, rewards_history


In [25]:
# Full training run; `model` is the final online network and `rewards` the
# per-episode unshaped returns.
model, rewards = train_dueling_double_dqn_state(
    # environment and run length
    env_name='FlappyBird-v0',
    episodes=10000,
    frame_stack_k=3,
    # optimisation
    lr=5e-5,
    gamma=0.99,
    batch_size=128,
    memory_size=100000,
    grad_clip=10.0,
    target_update_steps=4000,
    # exploration schedule
    epsilon_start=1.0,
    epsilon_end=0.05,
    epsilon_decay=0.9998,
    # reward shaping
    reward_scale=5.0,
    survival_bonus=0.02,
    progress_coef=0.001,
    # outputs
    save_path='best_flappy_dqn_state.pth',
    record_video=True,
    video_path='flappy_state_trained.mp4',
)



Episode 1/10000 | Reward: -8.10 | Avg100: -8.10 | Epsilon: 0.9900 | Buffer: 50
Episode 10/10000 | Reward: -8.70 | Avg100: -7.92 | Epsilon: 0.9048 | Buffer: 500
Episode 20/10000 | Reward: -8.10 | Avg100: -7.83 | Epsilon: 0.8187 | Buffer: 1000
Episode 30/10000 | Reward: -8.10 | Avg100: -7.58 | Epsilon: 0.7408 | Buffer: 1500
Episode 40/10000 | Reward: -6.90 | Avg100: -7.29 | Epsilon: 0.6703 | Buffer: 2000
Episode 50/10000 | Reward: -7.50 | Avg100: -7.31 | Epsilon: 0.6065 | Buffer: 2500
Episode 60/10000 | Reward: -6.30 | Avg100: -7.00 | Epsilon: 0.5488 | Buffer: 3000
Episode 70/10000 | Reward: -8.70 | Avg100: -6.85 | Epsilon: 0.4966 | Buffer: 3500
Episode 80/10000 | Reward: -4.50 | Avg100: -6.80 | Epsilon: 0.4493 | Buffer: 4000
Episode 90/10000 | Reward: -4.50 | Avg100: -6.51 | Epsilon: 0.4065 | Buffer: 4500
Episode 100/10000 | Reward: -2.10 | Avg100: -6.43 | Epsilon: 0.3678 | Buffer: 5000
Episode 110/10000 | Reward: -0.90 | Avg100: -6.22 | Epsilon: 0.3328 | Buffer: 5500
Episode 120/10000 

In [26]:
def record_best_dueling_dqn_videos(
    env_name='FlappyBird-v0',
    model_path='best_flappy_dqn_state.pth',
    output_dir='videos',
    output_filename='flappybird_best10.mp4',
    frame_stack_k=3,
    num_eval_episodes=30,
    top_k=10,
    max_steps=500
):
    """Play greedy evaluation episodes and save the best ones as one video.

    Args:
        env_name: Gymnasium environment id.
        model_path: Checkpoint holding a ``DuelingDQN`` ``state_dict``.
        output_dir: Directory for the video (created if missing).
        output_filename: File name of the merged video.
        frame_stack_k: Number of stacked observations per state; must match
            the value used when the checkpoint was trained.
        num_eval_episodes: How many episodes to play.
        top_k: How many of the highest-reward episodes to write, best first.
        max_steps: Per-episode step cap.

    Only the frames of the current ``top_k`` best episodes (plus the episode
    just played) are kept in memory, rather than every evaluated episode.
    """
    os.makedirs(output_dir, exist_ok=True)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    keep = max(top_k, 0)

    env = gym.make(env_name, render_mode='rgb_array')
    try:
        state_dim = len(env.reset()[0]) * frame_stack_k
        action_dim = env.action_space.n

        # Load trained model. weights_only=True limits unpickling to tensors
        # and plain containers, which is all a state_dict contains.
        model = DuelingDQN(state_dim, action_dim).to(device)
        model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
        model.eval()

        best_episodes = []  # (reward, episode_index, frames), best first
        print(f"Evaluating {num_eval_episodes} episodes to pick best {top_k}...")

        for ep in range(num_eval_episodes):
            obs, _ = env.reset()
            frame_deque = deque(maxlen=frame_stack_k)
            state = stack_frames(frame_deque, obs, frame_stack_k)
            done, total_reward, step = False, 0.0, 0
            frames = []

            while not done and step < max_steps:
                with torch.no_grad():
                    q_values = model(torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0))
                    action = torch.argmax(q_values, dim=1).item()

                obs, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                state = stack_frames(frame_deque, obs, frame_stack_k)
                frames.append(env.render())
                total_reward += reward
                step += 1

            # Keep a running top-k. Ties are ordered by episode index, which
            # matches a stable descending sort over all episodes.
            best_episodes.append((total_reward, ep, frames))
            best_episodes.sort(key=lambda item: (-item[0], item[1]))
            del best_episodes[keep:]
            print(f"Episode {ep+1}/{num_eval_episodes} | Reward: {total_reward:.2f}")
    finally:
        env.close()

    print(f"Saving top {top_k} episodes merged into one video...")

    merged_path = os.path.join(output_dir, output_filename)
    writer = imageio.get_writer(merged_path, fps=30)
    try:
        for i, (reward, _, frames) in enumerate(best_episodes, 1):
            for frame in frames:
                writer.append_data(frame)
            print(f"Added episode {i} (Reward: {reward:.2f})")
    finally:
        # Finalise the file even if a frame fails to encode.
        writer.close()
    print(f"\n Saved merged video at: {merged_path}")

# Evaluate the saved checkpoint with the default settings and write the merged
# best-episodes video to videos/flappybird_best10.mp4.
record_best_dueling_dqn_videos()


Evaluating 30 episodes to pick best 10...


  model.load_state_dict(torch.load(model_path, map_location=device))


Episode 1/30 | Reward: 49.40
Episode 2/30 | Reward: 48.20
Episode 3/30 | Reward: 44.60
Episode 4/30 | Reward: 47.00
Episode 5/30 | Reward: 47.60
Episode 6/30 | Reward: 45.80
Episode 7/30 | Reward: 44.00
Episode 8/30 | Reward: 47.60
Episode 9/30 | Reward: 31.40
Episode 10/30 | Reward: 50.00
Episode 11/30 | Reward: 14.20
Episode 12/30 | Reward: 17.50
Episode 13/30 | Reward: 49.40
Episode 14/30 | Reward: 47.00
Episode 15/30 | Reward: 46.40
Episode 16/30 | Reward: 50.60
Episode 17/30 | Reward: 41.00
Episode 18/30 | Reward: 44.00
Episode 19/30 | Reward: 45.80
Episode 20/30 | Reward: 48.80
Episode 21/30 | Reward: 45.20
Episode 22/30 | Reward: 48.20
Episode 23/30 | Reward: 46.40
Episode 24/30 | Reward: 45.20
Episode 25/30 | Reward: 47.00
Episode 26/30 | Reward: 46.40
Episode 27/30 | Reward: 50.00
Episode 28/30 | Reward: 9.50
Episode 29/30 | Reward: 33.00
Episode 30/30 | Reward: 48.80
Saving top 10 episodes merged into one video...
Added episode 1 (Reward: 50.60)
Added episode 2 (Reward: 50.00

In [27]:
# Embed the merged video in the notebook output; the path matches the default
# output_dir/output_filename of record_best_dueling_dqn_videos.
display(Video("videos/flappybird_best10.mp4", embed=True))