# Deep Q-Network (DQN) in Python

This notebook demonstrates how to implement a Deep Q-Network (DQN) using PyTorch and Gymnasium (the maintained successor to OpenAI Gym), trained on the CartPole-v1 environment.

# **Step 1: Import Libraries**

In [1]:
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# **Step 2: Define DQN Model**

In [2]:
class DQN(nn.Module):
    """Fully connected Q-network that scores every action for a given state.

    The network is a three-layer perceptron: two hidden layers of 64 units
    with ReLU activations, followed by a linear layer that outputs one value
    per action.

    Args:
        state_dim: Number of input features of a state vector.
        action_dim: Number of output values (one per action).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 64
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_dim)

    def forward(self, x):
        """Return the output of the final linear layer for input ``x``."""
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        return self.fc3(hidden)

# **Step 3: Initialize Environment and Hyperparameters**

In [3]:
# Seed every random number generator the notebook relies on so that
# Restart Kernel -> Run All reproduces the same weights and trajectory.
SEED = 42
random.seed(SEED)        # used by random.sample() on the replay buffer
np.random.seed(SEED)     # used by the epsilon-greedy np.random.rand() draw
torch.manual_seed(SEED)  # used for the DQN weight initialisation below

# Create CartPole environment
env = gym.make("CartPole-v1")
env.action_space.seed(SEED)  # used by env.action_space.sample()
env.reset(seed=SEED)         # seeds the environment RNG once; later reset() calls keep using it
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Q-network, its optimizer and the regression loss used for the TD error
dqn = DQN(state_dim, action_dim)
optimizer = optim.Adam(dqn.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

# Experience replay: bounded FIFO, oldest transitions are dropped first
replay_buffer = deque(maxlen=10000)

# **Step 4: Define Training Parameters**

In [4]:
# Exploration schedule: train_dqn() multiplies epsilon by epsilon_decay after
# every episode and clamps it at min_epsilon.
epsilon = 1.0  # Initial exploration rate (probability of a random action)
epsilon_decay = 0.995  # Multiplicative decay applied once per episode
min_epsilon = 0.01  # Lower bound for epsilon
discount_factor = 0.99  # Weight of the next-state Q-value in the TD target
batch_size = 32  # Number of transitions sampled from the replay buffer per update
num_episodes = 500  # Number of training episodes

# **Step 5: Define Training Loop**

In [6]:
def train_dqn():
    """Train the global ``dqn`` on ``env`` using epsilon-greedy Q-learning.

    Runs ``num_episodes`` episodes. Each environment step stores a transition
    in ``replay_buffer`` and, once the buffer holds more than ``batch_size``
    transitions, performs one gradient step on a random minibatch. After every
    episode the global ``epsilon`` is decayed and the episode's total reward
    is printed.

    Side effects:
        Mutates the globals ``epsilon``, ``replay_buffer``, the weights of
        ``dqn`` and the state of ``optimizer``.
    """
    global epsilon  # decayed here so the schedule persists after the call
    for episode in range(num_episodes):
        state, _ = env.reset()
        state = torch.FloatTensor(state).unsqueeze(0)
        total_reward = 0
        done = False
        while not done:
            # Choose action (epsilon-greedy policy)
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                # Action selection needs no gradients
                with torch.no_grad():
                    action = torch.argmax(dqn(state)).item()

            # Take action. The step API returns separate `terminated` and
            # `truncated` flags; the episode must stop on either one,
            # otherwise the loop keeps stepping past the time limit.
            new_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            new_state = torch.FloatTensor(new_state).unsqueeze(0)
            # Store `terminated` (not `done`): a time-limit truncation is not
            # a terminal state, so the target should still bootstrap from it.
            replay_buffer.append((state, action, reward, new_state, terminated))
            state = new_state
            total_reward += reward

            # Training step
            if len(replay_buffer) > batch_size:
                minibatch = random.sample(replay_buffer, batch_size)
                states, actions, rewards, new_states, dones = zip(*minibatch)
                states = torch.cat(states)
                actions = torch.LongTensor(actions).unsqueeze(1)
                rewards = torch.FloatTensor(rewards).unsqueeze(1)
                new_states = torch.cat(new_states)
                dones = torch.FloatTensor(dones).unsqueeze(1)

                # Q(s, a) for the actions that were actually taken
                q_values = dqn(states).gather(1, actions)
                # max_a' Q(s', a'), treated as a constant target
                with torch.no_grad():
                    next_q_values = dqn(new_states).max(1, keepdim=True)[0]
                # (1 - dones) removes the bootstrap term for terminal states
                target_q_values = rewards + (discount_factor * next_q_values * (1 - dones))

                loss = loss_fn(q_values, target_q_values)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Decay epsilon
        epsilon = max(min_epsilon, epsilon * epsilon_decay)
        print(f"Episode {episode+1}, Total Reward: {total_reward}")

# **Step 6: Train the DQN Agent**

In [7]:
# Run the training loop; prints one "Episode N, Total Reward: R" line per episode.
train_dqn()

Episode 1, Total Reward: 14.0
Episode 2, Total Reward: 31.0
Episode 3, Total Reward: 53.0
Episode 4, Total Reward: 15.0
Episode 5, Total Reward: 17.0
Episode 6, Total Reward: 17.0
Episode 7, Total Reward: 15.0
Episode 8, Total Reward: 32.0
Episode 9, Total Reward: 17.0
Episode 10, Total Reward: 17.0
Episode 11, Total Reward: 22.0
Episode 12, Total Reward: 9.0
Episode 13, Total Reward: 13.0
Episode 14, Total Reward: 13.0
Episode 15, Total Reward: 28.0
Episode 16, Total Reward: 13.0
Episode 17, Total Reward: 27.0
Episode 18, Total Reward: 22.0
Episode 19, Total Reward: 22.0
Episode 20, Total Reward: 18.0
Episode 21, Total Reward: 21.0
Episode 22, Total Reward: 26.0
Episode 23, Total Reward: 21.0
Episode 24, Total Reward: 17.0
Episode 25, Total Reward: 57.0
Episode 26, Total Reward: 20.0
Episode 27, Total Reward: 45.0
Episode 28, Total Reward: 55.0
Episode 29, Total Reward: 31.0
Episode 30, Total Reward: 35.0
Episode 31, Total Reward: 23.0
Episode 32, Total Reward: 17.0
Episode 33, Total 

# **Step 7: Test the Trained Agent**

In [8]:
def test_dqn():
    """Run one greedy episode with the global ``dqn`` and print its total reward.

    No exploration is used: every step takes the action with the highest
    predicted Q-value. The episode ends when the environment reports either
    termination or truncation.
    """
    state, _ = env.reset()
    state = torch.FloatTensor(state).unsqueeze(0)
    done = False
    total_reward = 0
    # Evaluation only: no gradients are needed
    with torch.no_grad():
        while not done:
            action = torch.argmax(dqn(state)).item()
            # Stop on `truncated` as well as `terminated`; otherwise a policy
            # that never fails keeps stepping past the time limit.
            new_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            state = torch.FloatTensor(new_state).unsqueeze(0)
            total_reward += reward
    print(f"Test Run Total Reward: {total_reward}")

In [9]:
# Evaluate the agent for a single greedy episode and print its total reward.
test_dqn()

Test Run Total Reward: 10.0
