In [3]:
%load_ext autoreload
%autoreload 2

import os
import time
import numpy as np
import torch
from tqdm import trange

from game.flappy_bird import FlappyBirdEnv
from agent.dqn_agent import DQNAgent, DQNConfig
# Checkpoint file that the training loop overwrites periodically (and once more
# when training finishes) so a live viewer can reload the latest weights.
CKPT_PATH = "agent/flappy_dqn_live.pth"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:

# --- Environment --------------------------------------------------------------
# The environment is built with a fixed seed; gap and speed are passed as-is.
env = FlappyBirdEnv(pipe_gap=160, pipe_speed=2.0, seed=42)

# Sizes handed to the agent configuration.
# NOTE(review): 5 matches the five fields unpacked by the warm-start heuristic and
# 2 matches the actions it emits -- confirm both against FlappyBirdEnv.
state_dim = 5
action_dim = 2

# --- Agent --------------------------------------------------------------------
# Same keyword arguments as before, grouped by purpose (order is irrelevant for
# keyword arguments).
cfg = DQNConfig(
    # network input/output sizes
    state_dim=state_dim,
    action_dim=action_dim,
    # optimisation
    gamma=0.99,
    lr=1e-3,
    batch_size=64,
    gradient_clip_norm=5.0,
    # replay buffer and target network
    replay_size=50_000,
    start_learning_after=1000,  # earlier learning
    target_update_freq=1_000,
    # exploration (eps_*) settings
    eps_start=1.0,
    eps_end=0.05,
    eps_decay_steps=20000,
)

agent = DQNAgent(cfg)
print("Using device:", agent.cfg.device)

Using device: cpu


In [8]:
import numpy as np

def simple_heuristic_action(state):
    """Pick an action by comparing the bird's height with the gap midpoint.

    Args:
        state: A sequence of exactly five values, unpacked in order as
            (bird_y, bird_vel, next_x, top, bottom). Only the first and the
            last two are used.

    Returns:
        1 if ``bird_y`` is more than 0.03 above the midpoint of ``top`` and
        ``bottom`` numerically (i.e. ``bird_y > midpoint + 0.03``), else 0.
        NOTE(review): presumably 1 means "flap" and y grows downward --
        confirm against FlappyBirdEnv.

    Raises:
        ValueError: If ``state`` does not unpack into exactly five values.
    """
    bird_y, _bird_vel, _next_x, gap_top, gap_bottom = state
    gap_mid = (gap_top + gap_bottom) / 2.0
    # 0.03 is a dead band: nothing is triggered until the bird is clearly past
    # the midpoint.
    if bird_y > gap_mid + 0.03:
        return 1
    return 0

# Warm-start the replay buffer: play `warm_steps` environment steps with the
# heuristic policy, replacing its choice with a uniformly random action a
# fraction `warm_random_prob` of the time, and store every transition.
warm_steps = 5000
warm_random_prob = 0.2

# A dedicated, seeded generator keeps the warm-start action sequence repeatable
# across kernel restarts instead of depending on NumPy's global random state.
warm_rng = np.random.default_rng(0)

s = env.reset()
for _ in range(warm_steps):
    if warm_rng.random() < warm_random_prob:
        # int(...) keeps the action a plain Python int, as the heuristic returns.
        a = int(warm_rng.integers(2))
    else:
        a = simple_heuristic_action(s)
    ns, r, d, info = env.step(a)
    agent.store(s, a, r, ns, d)
    # Start a fresh episode as soon as the current one ends.
    s = env.reset() if d else ns
print("Replay warm-started with heuristic + random steps.")

Replay warm-started with heuristic + random steps.


In [9]:
# Training run: play `num_episodes` episodes, storing each transition and
# calling the agent's train step once per environment step.
num_episodes = 500
max_steps_per_ep = 10_000

moving_avg_window = 50   # number of most recent episodes averaged in the log line
autosave_every = 1000    # checkpoint whenever agent.total_steps is a multiple of this
ep_returns = []          # undiscounted return of every finished episode
losses = []              # every non-None value returned by agent.train_step()

# Make sure the checkpoint directory exists before the first autosave.
ckpt_dir = os.path.dirname(CKPT_PATH)
if ckpt_dir:
    os.makedirs(ckpt_dir, exist_ok=True)

for ep in trange(num_episodes, desc="Training"):
    state = env.reset()
    ep_return = 0.0

    for t in range(max_steps_per_ep):
        action = agent.select_action(state)
        next_state, reward, done, info = env.step(action)

        agent.store(state, action, reward, next_state, done)
        loss = agent.train_step()
        if loss is not None:
            losses.append(loss)

        # Autosave periodically so a live viewer can reload the weights.
        # NOTE(review): relies on the agent advancing `total_steps` itself --
        # confirm it changes every step, otherwise this saves repeatedly while
        # the counter sits on a multiple of `autosave_every`.
        if agent.total_steps % autosave_every == 0:
            agent.save(CKPT_PATH)

        ep_return += reward
        state = next_state
        if done:
            break

    ep_returns.append(ep_return)
    if (ep + 1) % 10 == 0:
        # At least one return has been recorded by now, so the slice is never empty.
        mavg = np.mean(ep_returns[-moving_avg_window:])
        # The label is derived from the window so it cannot drift from the setting.
        print(
            f"Ep {ep+1}/{num_episodes} | Return: {ep_return:.1f} | "
            f"{moving_avg_window}-ep avg: {mavg:.1f} | Eps: {agent.epsilon:.3f}"
        )

# final save
agent.save(CKPT_PATH)
print("Training finished. Saved:", CKPT_PATH)

Training:   2%|▏         | 11/500 [00:01<01:10,  6.98it/s]

Ep 10/500 | Return: -62.0 | 50-ep avg: -63.7 | Eps: 0.982


Training:   4%|▍         | 21/500 [00:03<01:06,  7.25it/s]

Ep 20/500 | Return: -65.0 | 50-ep avg: -63.1 | Eps: 0.964


Training:   6%|▌         | 31/500 [00:04<01:01,  7.66it/s]

Ep 30/500 | Return: -66.0 | 50-ep avg: -63.6 | Eps: 0.947


Training:   8%|▊         | 41/500 [00:05<00:57,  7.98it/s]

Ep 40/500 | Return: -65.0 | 50-ep avg: -63.9 | Eps: 0.929


Training:  10%|█         | 50/500 [00:07<01:21,  5.50it/s]

Ep 50/500 | Return: -65.0 | 50-ep avg: -63.9 | Eps: 0.912


Training:  12%|█▏        | 61/500 [00:09<00:56,  7.73it/s]

Ep 60/500 | Return: -65.0 | 50-ep avg: -64.2 | Eps: 0.895


Training:  14%|█▍        | 71/500 [00:10<01:13,  5.81it/s]

Ep 70/500 | Return: -62.0 | 50-ep avg: -64.7 | Eps: 0.878


Training:  16%|█▌        | 81/500 [00:12<00:52,  7.91it/s]

Ep 80/500 | Return: -66.0 | 50-ep avg: -64.8 | Eps: 0.861


Training:  18%|█▊        | 92/500 [00:13<00:46,  8.69it/s]

Ep 90/500 | Return: -66.0 | 50-ep avg: -64.6 | Eps: 0.843


Training:  20%|██        | 101/500 [00:14<00:45,  8.78it/s]

Ep 100/500 | Return: -62.0 | 50-ep avg: -64.8 | Eps: 0.826


Training:  22%|██▏       | 111/500 [00:16<00:46,  8.42it/s]

Ep 110/500 | Return: -64.0 | 50-ep avg: -64.5 | Eps: 0.808


Training:  24%|██▍       | 120/500 [00:17<00:38,  9.90it/s]

Ep 120/500 | Return: -60.0 | 50-ep avg: -64.0 | Eps: 0.790


Training:  26%|██▌       | 131/500 [00:18<00:39,  9.43it/s]

Ep 130/500 | Return: -66.0 | 50-ep avg: -63.8 | Eps: 0.772


Training:  28%|██▊       | 141/500 [00:19<00:42,  8.40it/s]

Ep 140/500 | Return: -62.0 | 50-ep avg: -63.6 | Eps: 0.754


Training:  30%|███       | 151/500 [00:20<00:39,  8.77it/s]

Ep 150/500 | Return: -63.0 | 50-ep avg: -63.3 | Eps: 0.736


Training:  32%|███▏      | 161/500 [00:21<00:37,  9.06it/s]

Ep 160/500 | Return: -61.0 | 50-ep avg: -63.2 | Eps: 0.718


Training:  34%|███▍      | 171/500 [00:22<00:33,  9.89it/s]

Ep 170/500 | Return: -64.0 | 50-ep avg: -63.2 | Eps: 0.700


Training:  36%|███▌      | 181/500 [00:24<00:48,  6.57it/s]

Ep 180/500 | Return: -66.0 | 50-ep avg: -62.9 | Eps: 0.682


Training:  38%|███▊      | 191/500 [00:25<00:35,  8.66it/s]

Ep 190/500 | Return: -66.0 | 50-ep avg: -62.1 | Eps: 0.662


Training:  40%|████      | 201/500 [00:27<00:43,  6.90it/s]

Ep 200/500 | Return: -64.0 | 50-ep avg: -61.1 | Eps: 0.642


Training:  42%|████▏     | 211/500 [00:28<00:34,  8.31it/s]

Ep 210/500 | Return: -61.0 | 50-ep avg: -60.7 | Eps: 0.622


Training:  44%|████▍     | 221/500 [00:29<00:35,  7.79it/s]

Ep 220/500 | Return: -61.0 | 50-ep avg: -60.6 | Eps: 0.604


Training:  46%|████▌     | 231/500 [00:31<00:42,  6.35it/s]

Ep 230/500 | Return: -66.0 | 50-ep avg: -60.4 | Eps: 0.586


Training:  48%|████▊     | 241/500 [00:33<00:46,  5.56it/s]

Ep 240/500 | Return: -38.0 | 50-ep avg: -60.6 | Eps: 0.566


Training:  50%|█████     | 251/500 [00:34<00:40,  6.14it/s]

Ep 250/500 | Return: -57.0 | 50-ep avg: -61.2 | Eps: 0.547


Training:  52%|█████▏    | 260/500 [00:36<00:35,  6.70it/s]

Ep 260/500 | Return: -59.0 | 50-ep avg: -61.5 | Eps: 0.529


Training:  54%|█████▍    | 271/500 [00:38<00:35,  6.51it/s]

Ep 270/500 | Return: -62.0 | 50-ep avg: -60.9 | Eps: 0.509


Training:  56%|█████▌    | 281/500 [00:39<00:36,  6.08it/s]

Ep 280/500 | Return: -64.0 | 50-ep avg: -60.5 | Eps: 0.489


Training:  58%|█████▊    | 291/500 [00:41<00:29,  7.13it/s]

Ep 290/500 | Return: -63.0 | 50-ep avg: -60.8 | Eps: 0.470


Training:  60%|██████    | 301/500 [00:42<00:29,  6.83it/s]

Ep 300/500 | Return: -64.0 | 50-ep avg: -60.3 | Eps: 0.450


Training:  62%|██████▏   | 311/500 [00:44<00:34,  5.51it/s]

Ep 310/500 | Return: -60.0 | 50-ep avg: -60.0 | Eps: 0.431


Training:  64%|██████▍   | 321/500 [00:46<00:34,  5.26it/s]

Ep 320/500 | Return: -42.0 | 50-ep avg: -59.1 | Eps: 0.409


Training:  66%|██████▌   | 331/500 [00:48<00:26,  6.27it/s]

Ep 330/500 | Return: -63.0 | 50-ep avg: -58.0 | Eps: 0.387


Training:  68%|██████▊   | 341/500 [00:49<00:29,  5.33it/s]

Ep 340/500 | Return: -54.0 | 50-ep avg: -57.1 | Eps: 0.366


Training:  70%|███████   | 351/500 [00:51<00:29,  5.04it/s]

Ep 350/500 | Return: -46.0 | 50-ep avg: -55.7 | Eps: 0.343


Training:  72%|███████▏  | 360/500 [00:53<00:23,  6.04it/s]

Ep 360/500 | Return: -60.0 | 50-ep avg: -55.1 | Eps: 0.322


Training:  74%|███████▍  | 370/500 [00:55<00:25,  5.17it/s]

Ep 370/500 | Return: -56.0 | 50-ep avg: -52.7 | Eps: 0.294


Training:  76%|███████▌  | 381/500 [00:57<00:20,  5.68it/s]

Ep 380/500 | Return: -28.0 | 50-ep avg: -51.5 | Eps: 0.270


Training:  78%|███████▊  | 390/500 [01:00<00:35,  3.06it/s]

Ep 390/500 | Return: -28.0 | 50-ep avg: -46.5 | Eps: 0.237


Training:  80%|████████  | 401/500 [01:03<00:22,  4.32it/s]

Ep 400/500 | Return: -53.0 | 50-ep avg: -43.6 | Eps: 0.207


Training:  82%|████████▏ | 411/500 [01:05<00:22,  3.90it/s]

Ep 410/500 | Return: 16.0 | 50-ep avg: -38.4 | Eps: 0.174


Training:  84%|████████▍ | 420/500 [01:08<00:22,  3.53it/s]

Ep 420/500 | Return: 7.0 | 50-ep avg: -35.8 | Eps: 0.140


Training:  86%|████████▌ | 431/500 [01:11<00:14,  4.74it/s]

Ep 430/500 | Return: -72.0 | 50-ep avg: -30.0 | Eps: 0.101


Training:  88%|████████▊ | 441/500 [01:14<00:14,  3.99it/s]

Ep 440/500 | Return: -72.0 | 50-ep avg: -29.5 | Eps: 0.067


Training:  90%|█████████ | 450/500 [01:20<00:28,  1.77it/s]

Ep 450/500 | Return: 56.0 | 50-ep avg: -15.7 | Eps: 0.050


Training:  92%|█████████▏| 460/500 [01:27<00:25,  1.54it/s]

Ep 460/500 | Return: 43.0 | 50-ep avg: -1.6 | Eps: 0.050


Training:  94%|█████████▍| 470/500 [01:33<00:20,  1.45it/s]

Ep 470/500 | Return: 43.0 | 50-ep avg: 12.8 | Eps: 0.050


Training:  96%|█████████▌| 480/500 [01:40<00:14,  1.36it/s]

Ep 480/500 | Return: 43.0 | 50-ep avg: 24.5 | Eps: 0.050


Training:  98%|█████████▊| 490/500 [01:47<00:06,  1.49it/s]

Ep 490/500 | Return: 43.0 | 50-ep avg: 35.6 | Eps: 0.050


Training: 100%|██████████| 500/500 [01:52<00:00,  4.43it/s]

Ep 500/500 | Return: -20.0 | 50-ep avg: 34.2 | Eps: 0.050
Training finished. Saved: agent/flappy_dqn_live.pth





In [10]:
# Write the trained weights to their own file, creating the target directory
# first in case it is missing.
save_path = "agent/flappy_dqn.pth"
save_dir = os.path.dirname(save_path)
os.makedirs(save_dir, exist_ok=True)

agent.save(save_path)
print("Saved:", save_path)

Saved: agent/flappy_dqn.pth


In [11]:

# Evaluation: roll out one rendered episode choosing the arg-max Q action at
# every step (no epsilon exploration) and report the summed reward.
max_eval_steps = 5000  # hard cap in case the episode never terminates

# NOTE(review): closing before reset presumably clears a window left over from
# an earlier run -- confirm FlappyBirdEnv needs this.
env.close()
state = env.reset()

total_reward = 0.0
try:
    for _ in range(max_eval_steps):
        # Greedy action for evaluation (no epsilon)
        with torch.no_grad():
            state_t = torch.from_numpy(state).float().unsqueeze(0).to(agent.cfg.device)
            q_vals = agent.q_net(state_t)
            action = int(torch.argmax(q_vals, dim=1).item())

        next_state, reward, done, info = env.step(action)
        env.render(fps=60)
        total_reward += reward
        state = next_state
        if done:
            break

    print("Eval episode reward:", total_reward)
finally:
    # Always release the environment, even if the rollout raises or the cell
    # is interrupted part-way through.
    env.close()

Eval episode reward: 74.0
