# Basic DQN for Space Invaders

A minimal implementation of a Deep Q-Network (DQN) agent for the `ALE/SpaceInvaders-v5` Gymnasium environment.

## Install Dependencies

In [None]:
!pip install gymnasium[atari,accept-rom-license]
!pip install ale-py
!pip install torch scipy numpy

# Import and verify ALE is available
import ale_py
import gymnasium as gym
gym.register_envs(ale_py)

## Import Libraries

In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
from scipy.ndimage import zoom
import matplotlib.pyplot as plt
import psutil

# Register ALE environments
import ale_py
gym.register_envs(ale_py)

## Define DQN Network

In [None]:
class DQN(nn.Module):
    """Convolutional Q-network: a three-layer conv stack feeding a two-layer MLP head.

    Args:
        input_shape: (channels, height, width) of a single observation.
        n_actions: Number of outputs produced per observation (one per action).
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()
        in_channels = input_shape[0]
        conv_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*conv_layers)

        # The width of the first linear layer depends on the spatial size left
        # after the convolutions, so measure it with a dummy forward pass.
        n_flat = self._get_conv_out(input_shape)
        head_layers = [
            nn.Linear(n_flat, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        ]
        self.fc = nn.Sequential(*head_layers)

    def _get_conv_out(self, shape):
        """Return how many features the conv stack emits for one input of ``shape``."""
        probe = torch.zeros(1, *shape)
        return int(self.conv(probe).numel())

    def forward(self, x):
        """Return one value per action for every sample in the batch ``x``."""
        batch = x.size(0)
        # Flatten everything except the batch dimension before the MLP head.
        features = self.conv(x).view(batch, -1)
        return self.fc(features)

## Define Replay Buffer

In [None]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A bounded deque discards the oldest transition once it is full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            (states, actions, rewards, next_states, dones), where ``states``
            and ``next_states`` are stacked NumPy arrays and the other three
            are tuples of the stored values.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into per-field columns.
        states, actions, rewards, next_states, dones = zip(*picked)
        return np.array(states), actions, rewards, np.array(next_states), dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

## Frame Preprocessing

In [None]:
def preprocess_frame(frame, size=84):
    """Convert a frame to a normalized ``size`` x ``size`` grayscale image.

    The zoom factors are derived from the frame's own height and width, so
    any input resolution produces a ``(size, size)`` result (a 210x160 frame
    gives the same output as the previous hard-coded factors).

    Args:
        frame: Image array of shape (H, W, C) with at least three channels;
            only the first three are used, weighted as R, G, B. An
            already-grayscale (H, W) array is also accepted.
        size: Side length of the square output. Defaults to 84.

    Returns:
        ``float32`` array of shape ``(size, size)`` holding the resized
        pixel values divided by 255.
    """
    frame = np.asarray(frame)
    if frame.ndim == 2:
        # Already single-channel: skip the colour weighting.
        gray = frame.astype(np.float64)
    else:
        # Luma-style weighted sum of the first three channels.
        gray = np.dot(frame[..., :3], [0.299, 0.587, 0.114])

    height, width = gray.shape
    # Bilinear (order=1) resize to the target square.
    resized = zoom(gray, (size / height, size / width), order=1)

    # Divide by 255 (presumably 8-bit pixel input, mapping to [0, 1]).
    return (resized / 255.0).astype(np.float32)

## Hyperparameters

In [None]:
# Hyperparameters for the baseline run, collected in a single dictionary.
CONFIG = dict(
    # --- Environment ---
    ENV_ID='ALE/SpaceInvaders-v5',  # Gymnasium environment id
    SEED=7,                         # seed for the RNGs and the environment

    # --- Network ---
    N_FRAMES=4,   # consecutive frames stacked into one observation
    N_ACTIONS=6,  # number of network outputs / discrete actions

    # --- Training ---
    N_EPISODES=1000,
    LEARNING_RATE=0.00025,
    GAMMA=0.99,     # discount factor
    BATCH_SIZE=32,  # transitions sampled per optimization step

    # --- Exploration (epsilon-greedy) ---
    EPSILON_START=1.0,
    EPSILON_END=0.1,
    EPSILON_DECAY=10000,  # time constant, in steps, of the exponential decay

    # --- Memory ---
    BUFFER_SIZE=10000,   # replay buffer capacity, in transitions
    TARGET_UPDATE=1000,  # environment steps between target-network syncs
)

## Initialize Environment and Networks

In [None]:
# Setup: run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Seed PyTorch, NumPy and Python's RNG from the same value for reproducibility.
torch.manual_seed(CONFIG['SEED'])
np.random.seed(CONFIG['SEED'])
random.seed(CONFIG['SEED'])

# Networks: the target network starts as an exact copy of the policy network.
policy_net = DQN((CONFIG['N_FRAMES'], 84, 84), CONFIG['N_ACTIONS'])
policy_net = policy_net.to(device)
target_net = DQN((CONFIG['N_FRAMES'], 84, 84), CONFIG['N_ACTIONS'])
target_net = target_net.to(device)
target_net.load_state_dict(policy_net.state_dict())

# Replay memory and the optimizer (only the policy network is optimized).
replay_buffer = ReplayBuffer(CONFIG['BUFFER_SIZE'])
optimizer = optim.Adam(policy_net.parameters(), lr=CONFIG['LEARNING_RATE'])

print(
    f"Environment: {CONFIG['ENV_ID']}",
    f"Action space: {CONFIG['N_ACTIONS']}",
    f"Seed: {CONFIG['SEED']}",
    "Policy Network initialized",
    sep="\n",
)

## Training Functions

In [None]:
def select_action(state, epsilon, policy_net, n_actions, device):
    """Choose an action epsilon-greedily.

    With probability ``epsilon`` a uniformly random action index in
    ``range(n_actions)`` is returned; otherwise the index of the largest
    output of ``policy_net`` for ``state`` (evaluated as a batch of one on
    ``device``, without gradient tracking) is returned.
    """
    explore = random.random() < epsilon
    if explore:
        return random.randrange(n_actions)

    with torch.no_grad():
        # Add the leading batch dimension the network expects.
        batch = torch.FloatTensor(state).unsqueeze(0).to(device)
        best = policy_net(batch).max(dim=1).indices
    return best.item()

def optimize_model(policy_net, target_net, optimizer, replay_buffer, batch_size, gamma, device):
    """Perform one gradient step on ``policy_net`` using a sampled mini-batch.

    Does nothing until ``replay_buffer`` holds at least ``batch_size``
    transitions. Otherwise samples a batch, forms the one-step TD target
    ``reward + (1 - done) * gamma * max_a target_net(next_state)[a]`` and
    minimizes the mean squared error between that target and the value
    ``policy_net`` assigns to the action actually taken.

    Args:
        policy_net: Network being trained; called on a batch of states.
        target_net: Network used only to evaluate next states (never updated here).
        optimizer: Optimizer over ``policy_net``'s parameters.
        replay_buffer: Object supporting ``len()`` and ``sample(batch_size)``
            returning (states, actions, rewards, next_states, dones).
        batch_size: Number of transitions per update.
        gamma: Discount factor applied to the bootstrapped next-state value.
        device: Device the batch tensors are created on.

    Returns:
        None.
    """
    if len(replay_buffer) < batch_size:
        return

    states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

    states = torch.as_tensor(np.asarray(states), dtype=torch.float32, device=device)
    actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64, device=device)
    rewards = torch.as_tensor(np.asarray(rewards), dtype=torch.float32, device=device)
    next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32, device=device)
    dones = torch.as_tensor(np.asarray(dones), dtype=torch.float32, device=device)

    # Value of the action actually taken. squeeze(1) removes only the gather
    # column, so the result stays 1-D even when batch_size == 1.
    current_q = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # The target is a constant w.r.t. the policy parameters; computing it
    # under no_grad avoids building an autograd graph for the target network.
    with torch.no_grad():
        next_q = target_net(next_states).max(dim=1).values
        target_q = rewards + (1.0 - dones) * gamma * next_q

    loss = nn.functional.mse_loss(current_q, target_q)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

## Generic Training Function

In [None]:
def train_dqn(config, policy_net, target_net, optimizer, replay_buffer, device='cpu'):
    """
    Generic DQN training function that works with different DQN variants.

    Runs ``config['N_EPISODES']`` episodes of epsilon-greedy interaction with
    the ``config['ENV_ID']`` environment. Every step stores one transition in
    ``replay_buffer`` and calls ``optimize_model`` once; every
    ``config['TARGET_UPDATE']`` steps the policy weights are copied into
    ``target_net``. Epsilon decays exponentially in the global step count from
    ``EPSILON_START`` towards ``EPSILON_END`` with time constant
    ``EPSILON_DECAY``. Progress is printed every 10 episodes.

    Args:
        config: Configuration dictionary with all hyperparameters
        policy_net: Policy network (DQN, Double DQN, Dueling DQN, etc.)
        target_net: Target network
        optimizer: Optimizer for policy network
        replay_buffer: Experience replay buffer
        device: Device to run on ('cpu' or 'cuda')

    Returns:
        episode_rewards: List of rewards per episode
    """
    import psutil

    n_actions = config['N_ACTIONS']
    n_frames = config['N_FRAMES']
    eps_start = config['EPSILON_START']
    eps_end = config['EPSILON_END']
    eps_decay = config['EPSILON_DECAY']

    episode_rewards = []
    steps = 0

    # Create environment; the try/finally guarantees it is closed even when
    # training is interrupted or raises.
    env = gym.make(config['ENV_ID'])
    try:
        if config.get('SEED') is not None:
            env.reset(seed=config['SEED'])

        for episode in range(config['N_EPISODES']):
            frame, _ = env.reset()
            frame = preprocess_frame(frame)
            # Start the stack with the first frame repeated N_FRAMES times.
            state_stack = deque([frame] * n_frames, maxlen=n_frames)
            state_array = np.array(state_stack)

            episode_reward = 0
            done = False

            while not done:
                # Epsilon decay
                epsilon = eps_end + (eps_start - eps_end) * np.exp(-1. * steps / eps_decay)

                # Select action
                action = select_action(state_array, epsilon, policy_net, n_actions, device)

                # Take step
                next_frame, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated

                # The bounded deque drops the oldest frame on append.
                state_stack.append(preprocess_frame(next_frame))
                next_state_array = np.array(state_stack)

                # Store transition. Only true termination is recorded as the
                # terminal flag: a time-limit truncation ends the episode but
                # the next state still has value, so its target must bootstrap.
                replay_buffer.push(
                    state_array,
                    action,
                    reward,
                    next_state_array,
                    float(terminated)
                )

                # Reuse the same array as the next step's state so consecutive
                # transitions share it instead of each storing its own copy.
                state_array = next_state_array
                episode_reward += reward
                steps += 1

                # Optimize
                optimize_model(policy_net, target_net, optimizer, replay_buffer,
                               config['BATCH_SIZE'], config['GAMMA'], device)

                # Update target network
                if steps % config['TARGET_UPDATE'] == 0:
                    target_net.load_state_dict(policy_net.state_dict())

            episode_rewards.append(episode_reward)

            if episode % 10 == 0:
                # Slicing already copes with fewer than 100 finished episodes.
                avg_score = np.mean(episode_rewards[-100:])

                # Get memory info
                mem = psutil.virtual_memory()
                gpu_mem = 0.0
                if torch.cuda.is_available():
                    gpu_mem = torch.cuda.memory_allocated() / 1024**3  # Convert to GB

                print(f'Episode {episode}\tScore: {episode_reward:.1f}\tAvg: {avg_score:.2f}\tEps: {epsilon:.3f}\tSteps: {steps}')
                print(f'RAM: {mem.percent:.1f}% | GPU: {gpu_mem:.2f}GB | Buffer: {len(replay_buffer)}/{config["BUFFER_SIZE"]}')
    finally:
        env.close()

    print("\nTraining completed!")
    return episode_rewards

## Training Loop

In [None]:
# Train the baseline agent with the objects built in the setup cell and keep
# the per-episode rewards for plotting.
episode_rewards = train_dqn(CONFIG, policy_net, target_net, optimizer, replay_buffer, device=device)

## Consolidated Results Plotting

In [None]:
def plot_consolidated_results(results_dict, window=100, figsize=(14, 8)):
    """
    Plot consolidated training progress for multiple DQN runs.

    Each run is drawn as its ``window``-episode moving average, and its legend
    entry shows the mean of its last ``window`` episodes. Runs with fewer than
    ``window`` episodes cannot produce a moving average; they are skipped with
    a printed notice instead of being dropped silently.

    Args:
        results_dict: Dictionary mapping run names to episode rewards lists
                     e.g., {'DQN_basic': [rewards], 'DoubleDQN': [rewards]}
        window: Window size for moving average (must be >= 1)
        figsize: Figure size tuple

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    plt.figure(figsize=figsize)

    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
    # Averaging kernel, built once and shared by every run.
    kernel = np.ones(window) / window

    for idx, (name, rewards) in enumerate(results_dict.items()):
        n_episodes = len(rewards)
        if n_episodes < window:
            print(f"Skipping '{name}': {n_episodes} episodes is fewer than window={window}")
            continue

        # 'valid' keeps only fully covered windows, so the first point sits at
        # episode index window-1.
        moving_avg = np.convolve(rewards, kernel, mode='valid')
        episodes = range(window - 1, n_episodes)
        # Average over the same window the curve uses, not a fixed 100.
        final_avg = np.mean(rewards[-window:])

        plt.plot(episodes, moving_avg,
                 label=f'{name} (Avg={final_avg:.2f})',
                 color=colors[idx % len(colors)],  # cycle when there are more runs than colors
                 linewidth=2)

    # Add goal lines
    plt.axhline(y=500, color='green', linestyle='--', linewidth=2, label='Goal: 500', alpha=0.7)
    plt.axhline(y=400, color='red', linestyle='--', linewidth=2, label='Goal: 400', alpha=0.7)

    plt.xlabel('Episode #', fontsize=12)
    plt.ylabel(f'Average Score ({window}-Game Window)', fontsize=12)
    plt.title(f'Consolidated DQN Training Progress ({window}-Episode Moving Average)', fontsize=14, fontweight='bold')
    plt.legend(loc='best', fontsize=10)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# Name of the run that was just trained.
run_name = 'DQN_basic'  # Change this for each variant

# Storage for multiple runs (run name -> per-episode rewards), starting with
# the current run.
all_results = {run_name: episode_rewards}

## Plot Current Results

In [None]:
# Plot episode rewards for current run
plt.figure(figsize=(12, 6))
plt.plot(episode_rewards, alpha=0.6, label='Episode Reward')

# Calculate moving average. With fewer episodes than the window, np.convolve
# swaps its arguments and returns a non-empty array while the x-range below is
# empty, which makes plt.plot raise; skip the curve in that case.
window = 100
if len(episode_rewards) >= window:
    moving_avg = np.convolve(episode_rewards, np.ones(window)/window, mode='valid')
    plt.plot(range(window-1, len(episode_rewards)), moving_avg, label=f'Moving Average ({window})', linewidth=2)
else:
    print(f"Only {len(episode_rewards)} episodes recorded; skipping the {window}-episode moving average.")

plt.axhline(y=500, color='r', linestyle='--', label='Target (500)')
plt.axhline(y=400, color='orange', linestyle='--', label='Minimum (400)')

plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title(f'{run_name} - Training Progress on Space Invaders')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Print final statistics over (at most) the last `window` episodes; an empty
# run has no average to report.
if len(episode_rewards) > 0:
    final_avg = np.mean(episode_rewards[-window:])
    print(f"\nFinal average reward (last {min(window, len(episode_rewards))} episodes): {final_avg:.2f}")
else:
    print("\nNo episodes recorded; no final average to report.")

## Plot Consolidated Results (All Runs)

In [None]:
# Compare every run stored so far on one chart of 100-episode moving averages.
plot_consolidated_results(results_dict=all_results, window=100)

In [None]:
# Example with more DQN variants


def _build_agent(config):
    """Seed the RNGs from ``config`` and build a fresh, untrained agent.

    Returns (policy_net, target_net, optimizer, replay_buffer). The target
    network starts as an exact copy of the policy network, matching the
    baseline setup cell.
    """
    seed = config.get('SEED')
    if seed is not None:
        # Re-seed so each variant's weight initialization and exploration
        # follow its own seed rather than leftover RNG state.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    obs_shape = (config['N_FRAMES'], 84, 84)
    policy = DQN(obs_shape, config['N_ACTIONS']).to(device)
    target = DQN(obs_shape, config['N_ACTIONS']).to(device)
    target.load_state_dict(policy.state_dict())

    agent_optimizer = optim.Adam(policy.parameters(), lr=config['LEARNING_RATE'])
    buffer = ReplayBuffer(config['BUFFER_SIZE'])
    return policy, target, agent_optimizer, buffer


# Run names are written as literals below instead of reassigning `run_name`,
# so the Save cell keeps naming its checkpoint after the model it stores.

# Run 1: Basic DQN
# Reuse the rewards recorded by the training cell above. Calling train_dqn
# again on `policy_net` would continue from already-trained weights, a warm
# optimizer and a full replay buffer, so it would not be a comparable baseline.
episode_rewards_1 = episode_rewards
all_results['DQN_basic'] = episode_rewards_1

# Run 2: Lower Learning Rate
CONFIG_v2 = CONFIG.copy()
CONFIG_v2['LEARNING_RATE'] = 0.0001
CONFIG_v2['SEED'] = 42

policy_net_2, target_net_2, optimizer_2, replay_buffer_2 = _build_agent(CONFIG_v2)
episode_rewards_2 = train_dqn(CONFIG_v2, policy_net_2, target_net_2, optimizer_2, replay_buffer_2, device)
all_results['DQN_lower_lr'] = episode_rewards_2

# Run 3: Larger Buffer
CONFIG_v3 = CONFIG.copy()
CONFIG_v3['BUFFER_SIZE'] = 50000
CONFIG_v3['SEED'] = 123

policy_net_3, target_net_3, optimizer_3, replay_buffer_3 = _build_agent(CONFIG_v3)
episode_rewards_3 = train_dqn(CONFIG_v3, policy_net_3, target_net_3, optimizer_3, replay_buffer_3, device)
all_results['DQN_large_buffer'] = episode_rewards_3

# Plot all three together
plot_consolidated_results(all_results, window=100)

## Save Model

In [None]:
# Save the baseline networks, optimizer state, config and every run's rewards
# in a single checkpoint file named after the current run.
checkpoint_path = f'dqn_space_invaders_{run_name}.pth'
torch.save(
    {
        'config': CONFIG,
        'policy_net_state_dict': policy_net.state_dict(),
        'target_net_state_dict': target_net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'all_results': all_results,
    },
    checkpoint_path,
)

print(f"Model saved successfully as '{checkpoint_path}'!")