In [2]:
%load_ext autoreload
%autoreload 2

import os
import time
import numpy as np
import torch
from tqdm import trange

from game.flappy_bird import FlappyBirdEnv
from agent.dqn_agent import DQNAgent, DQNConfig
# Checkpoint file that the training loop overwrites periodically (and once more
# at the end) so that a separate live viewer can reload the latest weights.
CKPT_PATH = "agent/flappy_dqn_live.pth"

pygame 2.6.1 (SDL 2.28.4, Python 3.12.6)
Hello from the pygame community. https://www.pygame.org/contribute.html


  from pkg_resources import resource_stream, resource_exists


In [3]:

# One seed shared by the environment and the global NumPy / PyTorch generators,
# so a fresh "Restart Kernel -> Run All" seeds every generator the same way.
# NOTE(review): if DQNAgent draws from Python's built-in `random` module, that
# generator needs seeding too -- confirm in agent/dqn_agent.py.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

env = FlappyBirdEnv(seed=SEED)

# Sizes handed to the agent's config.
# NOTE(review): presumably the length of the observation vector and the number
# of discrete actions exposed by FlappyBirdEnv -- confirm against the env.
state_dim = 5
action_dim = 2

# Hyperparameters for the DQN agent; everything a reader might tune lives here.
cfg = DQNConfig(
	state_dim=state_dim,
	action_dim=action_dim,
	gamma=0.99,
	lr=1e-3,
	batch_size=64,
	replay_size=50_000,
	start_learning_after=5_000,
	target_update_freq=1_000,
	eps_start=1.0,
	eps_end=0.05,
	eps_decay_steps=50_000,
	gradient_clip_norm=5.0,
)

agent = DQNAgent(cfg)
print("Using device:", agent.cfg.device)

Using device: cpu


In [4]:
# --- Training-run settings -------------------------------------------------
num_episodes = 500
max_steps_per_ep = 10_000  # hard cap on the number of steps in one episode
moving_avg_window = 50     # how many of the latest episodes the logged average covers
autosave_every = 1_000     # write the live checkpoint whenever total_steps hits a multiple of this
log_every = 10             # print one stats line every this many episodes

# Histories collected during this run (reset each time the cell is executed).
ep_returns = []
losses = []

# Make sure the checkpoint directory exists before the first autosave, the same
# way the later "save" cell does for its own path. Skipped when the path has no
# directory component, because os.makedirs("") raises.
ckpt_dir = os.path.dirname(CKPT_PATH)
if ckpt_dir:
	os.makedirs(ckpt_dir, exist_ok=True)

for ep in trange(num_episodes, desc="Training"):
	state = env.reset()
	ep_return = 0.0

	for _step in range(max_steps_per_ep):
		action = agent.select_action(state)
		next_state, reward, done, _info = env.step(action)

		agent.store(state, action, reward, next_state, done)
		# train_step() may return None; only real loss values are recorded.
		loss = agent.train_step()
		if loss is not None:
			losses.append(loss)

		# autosave periodically so a live viewer can reload
		if agent.total_steps % autosave_every == 0:
			agent.save(CKPT_PATH)

		ep_return += reward
		state = next_state
		if done:
			break

	ep_returns.append(ep_return)
	if (ep + 1) % log_every == 0:
		# ep_returns is never empty here (we just appended), so no fallback is needed.
		mavg = np.mean(ep_returns[-moving_avg_window:])
		# The label is derived from moving_avg_window so it cannot drift from the window actually used.
		print(f"Ep {ep+1}/{num_episodes} | Return: {ep_return:.1f} | {moving_avg_window}-ep avg: {mavg:.1f} | Eps: {agent.epsilon:.3f}")

# final save, so the live checkpoint reflects the last training step
agent.save(CKPT_PATH)
print("Training finished. Saved:", CKPT_PATH)

Training:  12%|█▏        | 59/500 [00:00<00:00, 589.29it/s]

Ep 10/500 | Return: -64.0 | 50-ep avg: -64.6 | Eps: 0.993
Ep 20/500 | Return: -64.0 | 50-ep avg: -64.5 | Eps: 0.986
Ep 30/500 | Return: -66.0 | 50-ep avg: -64.8 | Eps: 0.979
Ep 40/500 | Return: -65.0 | 50-ep avg: -64.6 | Eps: 0.972
Ep 50/500 | Return: -63.0 | 50-ep avg: -64.6 | Eps: 0.965
Ep 60/500 | Return: -65.0 | 50-ep avg: -64.7 | Eps: 0.959
Ep 70/500 | Return: -65.0 | 50-ep avg: -64.8 | Eps: 0.952
Ep 80/500 | Return: -64.0 | 50-ep avg: -64.5 | Eps: 0.945
Ep 90/500 | Return: -62.0 | 50-ep avg: -64.5 | Eps: 0.938
Ep 100/500 | Return: -65.0 | 50-ep avg: -64.4 | Eps: 0.931
Ep 110/500 | Return: -65.0 | 50-ep avg: -64.2 | Eps: 0.924
Ep 120/500 | Return: -62.0 | 50-ep avg: -64.0 | Eps: 0.917
Ep 130/500 | Return: -66.0 | 50-ep avg: -63.9 | Eps: 0.909


Training:  27%|██▋       | 137/500 [00:00<00:00, 460.33it/s]

Ep 140/500 | Return: -61.0 | 50-ep avg: -63.9 | Eps: 0.902
Ep 150/500 | Return: -65.0 | 50-ep avg: -64.0 | Eps: 0.896
Ep 160/500 | Return: -65.0 | 50-ep avg: -64.0 | Eps: 0.888
Ep 170/500 | Return: -67.0 | 50-ep avg: -64.2 | Eps: 0.882
Ep 180/500 | Return: -63.0 | 50-ep avg: -64.6 | Eps: 0.875


Training:  37%|███▋      | 186/500 [00:05<00:13, 23.23it/s] 

Ep 190/500 | Return: -58.0 | 50-ep avg: -64.5 | Eps: 0.868
Ep 200/500 | Return: -66.0 | 50-ep avg: -64.8 | Eps: 0.861
Ep 210/500 | Return: -68.0 | 50-ep avg: -64.7 | Eps: 0.854


Training:  43%|████▎     | 213/500 [00:08<00:17, 16.81it/s]

Ep 220/500 | Return: -66.0 | 50-ep avg: -64.8 | Eps: 0.847


Training:  46%|████▌     | 230/500 [00:10<00:18, 14.52it/s]

Ep 230/500 | Return: -66.0 | 50-ep avg: -64.6 | Eps: 0.840


Training:  48%|████▊     | 241/500 [00:12<00:19, 13.03it/s]

Ep 240/500 | Return: -65.0 | 50-ep avg: -64.7 | Eps: 0.833


Training:  50%|████▉     | 249/500 [00:13<00:20, 11.96it/s]

Ep 250/500 | Return: -64.0 | 50-ep avg: -64.4 | Eps: 0.826


Training:  52%|█████▏    | 259/500 [00:14<00:21, 11.07it/s]

Ep 260/500 | Return: -65.0 | 50-ep avg: -64.3 | Eps: 0.819


Training:  54%|█████▍    | 271/500 [00:15<00:22,  9.98it/s]

Ep 270/500 | Return: -65.0 | 50-ep avg: -64.2 | Eps: 0.812


Training:  56%|█████▌    | 281/500 [00:17<00:25,  8.67it/s]

Ep 280/500 | Return: -67.0 | 50-ep avg: -64.3 | Eps: 0.805


Training:  58%|█████▊    | 291/500 [00:18<00:31,  6.67it/s]

Ep 290/500 | Return: -63.0 | 50-ep avg: -64.1 | Eps: 0.798


Training:  60%|██████    | 301/500 [00:19<00:24,  8.05it/s]

Ep 300/500 | Return: -60.0 | 50-ep avg: -63.8 | Eps: 0.791


Training:  62%|██████▏   | 311/500 [00:21<00:22,  8.40it/s]

Ep 310/500 | Return: -66.0 | 50-ep avg: -63.8 | Eps: 0.784


Training:  64%|██████▍   | 321/500 [00:22<00:23,  7.49it/s]

Ep 320/500 | Return: -61.0 | 50-ep avg: -63.4 | Eps: 0.777


Training:  66%|██████▌   | 331/500 [00:23<00:28,  5.86it/s]

Ep 330/500 | Return: -66.0 | 50-ep avg: -63.4 | Eps: 0.770


Training:  68%|██████▊   | 341/500 [00:25<00:22,  6.93it/s]

Ep 340/500 | Return: -63.0 | 50-ep avg: -63.3 | Eps: 0.762


Training:  70%|███████   | 351/500 [00:26<00:21,  6.88it/s]

Ep 350/500 | Return: -63.0 | 50-ep avg: -63.5 | Eps: 0.755


Training:  72%|███████▏  | 361/500 [00:28<00:18,  7.55it/s]

Ep 360/500 | Return: -66.0 | 50-ep avg: -63.3 | Eps: 0.748


Training:  74%|███████▍  | 371/500 [00:29<00:18,  6.94it/s]

Ep 370/500 | Return: -63.0 | 50-ep avg: -63.7 | Eps: 0.741


Training:  76%|███████▌  | 381/500 [00:31<00:17,  6.80it/s]

Ep 380/500 | Return: -61.0 | 50-ep avg: -63.2 | Eps: 0.734


Training:  78%|███████▊  | 391/500 [00:32<00:15,  7.14it/s]

Ep 390/500 | Return: -65.0 | 50-ep avg: -63.0 | Eps: 0.726


Training:  80%|████████  | 401/500 [00:33<00:13,  7.54it/s]

Ep 400/500 | Return: -66.0 | 50-ep avg: -62.9 | Eps: 0.719


Training:  82%|████████▏ | 411/500 [00:35<00:11,  7.60it/s]

Ep 410/500 | Return: -63.0 | 50-ep avg: -62.5 | Eps: 0.712


Training:  84%|████████▍ | 421/500 [00:36<00:09,  8.07it/s]

Ep 420/500 | Return: -64.0 | 50-ep avg: -62.6 | Eps: 0.705


Training:  86%|████████▌ | 431/500 [00:38<00:10,  6.90it/s]

Ep 430/500 | Return: -63.0 | 50-ep avg: -62.6 | Eps: 0.697


Training:  88%|████████▊ | 441/500 [00:39<00:09,  5.98it/s]

Ep 440/500 | Return: -55.0 | 50-ep avg: -62.0 | Eps: 0.689


Training:  90%|█████████ | 451/500 [00:41<00:06,  7.11it/s]

Ep 450/500 | Return: -61.0 | 50-ep avg: -61.8 | Eps: 0.682


Training:  92%|█████████▏| 461/500 [00:42<00:04,  7.87it/s]

Ep 460/500 | Return: -67.0 | 50-ep avg: -62.4 | Eps: 0.675


Training:  94%|█████████▍| 470/500 [00:45<00:09,  3.21it/s]

Ep 470/500 | Return: -63.0 | 50-ep avg: -61.6 | Eps: 0.667


Training:  96%|█████████▌| 481/500 [00:47<00:02,  7.32it/s]

Ep 480/500 | Return: -63.0 | 50-ep avg: -61.6 | Eps: 0.660


Training:  98%|█████████▊| 491/500 [00:48<00:01,  7.14it/s]

Ep 490/500 | Return: -65.0 | 50-ep avg: -62.3 | Eps: 0.653


Training: 100%|██████████| 500/500 [00:49<00:00, 10.05it/s]

Ep 500/500 | Return: -55.0 | 50-ep avg: -62.1 | Eps: 0.645
Training finished. Saved: agent/flappy_dqn_live.pth





In [None]:
# NOTE(review): this cell is a near-duplicate of the earlier training cell. It
# does not create a new agent or env, so running it continues training whatever
# `agent` currently exists, and it resets `ep_returns` / `losses`. Consider
# folding both cells into one parameterized function.

# --- Training-run settings -------------------------------------------------
num_episodes = 500
max_steps_per_ep = 10_000  # hard cap on the number of steps in one episode
moving_avg_window = 50     # how many of the latest episodes the logged average covers
autosave_every = 1_000     # write the checkpoint whenever total_steps hits a multiple of this
log_every = 10             # print one stats line every this many episodes

# Histories collected during this run.
ep_returns = []
losses = []

CKPT_PATH = "agent/flappy_dqn_live.pth"
# Make sure the checkpoint directory exists before the first autosave. Skipped
# when the path has no directory component, because os.makedirs("") raises.
ckpt_dir = os.path.dirname(CKPT_PATH)
if ckpt_dir:
	os.makedirs(ckpt_dir, exist_ok=True)

for ep in trange(num_episodes, desc="Training"):
	state = env.reset()
	ep_return = 0.0

	for _step in range(max_steps_per_ep):
		action = agent.select_action(state)
		next_state, reward, done, _info = env.step(action)

		agent.store(state, action, reward, next_state, done)
		# train_step() may return None; only real loss values are recorded below.
		loss = agent.train_step()

		if agent.total_steps % autosave_every == 0:  # periodic checkpoint
			agent.save(CKPT_PATH)

		if loss is not None:
			losses.append(loss)

		ep_return += reward
		state = next_state
		if done:
			break

	ep_returns.append(ep_return)

	# Display simple stats every few episodes
	if (ep + 1) % log_every == 0:
		# ep_returns is never empty here (we just appended), so no fallback is needed.
		mavg = np.mean(ep_returns[-moving_avg_window:])
		# The label is derived from moving_avg_window so it cannot drift from the window actually used.
		print(f"Ep {ep+1}/{num_episodes} | Return: {ep_return:.1f} | {moving_avg_window}-ep avg: {mavg:.1f} | Eps: {agent.epsilon:.3f}")

print("Training finished.")

NameError: name 'trange' is not defined

In [5]:
# Write the agent's current weights to their own file, creating the parent
# directory first if it is missing.
save_path = "agent/flappy_dqn.pth"
os.makedirs(
	os.path.split(save_path)[0],  # head of the path, i.e. its directory part
	exist_ok=True,
)
agent.save(save_path)
print("Saved:", save_path)

Saved: agent/flappy_dqn.pth


In [6]:

# Evaluation: play a single rendered episode, always taking the action with the
# highest predicted Q-value (no epsilon exploration).
max_eval_steps = 5000  # upper bound on steps so the cell terminates even if the episode never ends

# NOTE(review): close() is called before reset(), presumably to tear down any
# window left over from an earlier run -- confirm FlappyBirdEnv supports
# reset()/render() after close().
env.close()
state = env.reset()

total_reward = 0.0
try:
	for _step in range(max_eval_steps):
		# Greedy action for evaluation (no epsilon); no gradients are needed for inference.
		with torch.no_grad():
			# Add a leading batch dimension and move the state to the agent's device.
			state_t = torch.from_numpy(state).float().unsqueeze(0).to(agent.cfg.device)
			q_vals = agent.q_net(state_t)
			action = int(torch.argmax(q_vals, dim=1).item())

		next_state, reward, done, info = env.step(action)
		env.render(fps=60)
		total_reward += reward
		state = next_state
		if done:
			break

	print("Eval episode reward:", total_reward)
finally:
	# Release the environment even if step()/render() raises or the cell is
	# interrupted, so it is not left open.
	env.close()

Eval episode reward: -72.0
