In [24]:
import gym
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
from torch.utils.tensorboard import SummaryWriter
import os
import cv2

In [25]:
# DQN model definition
class DQN(nn.Module):
    def __init__(self, output_dim):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(8, 16, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(16, 16, kernel_size=3, stride=1)

        def conv2d_size_out(size, kernel_size, stride):
            return (size - (kernel_size - 1) - 1) // stride + 1

        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(84, 8, 4), 4, 2), 3, 1)
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(84, 8, 4), 4, 2), 3, 1)
        linear_input_size = convw * convh * 16

        self.fc1 = nn.Linear(linear_input_size, 512)
        self.fc2 = nn.Linear(512, output_dim)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(x.size(0), -1) # Flatten the tensor for the fully connected layer
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [26]:
# Environment setup
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)

In [27]:
# Parameters
gamma = 0.99
alpha = 0.001
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
batch_size = 8
memory = deque(maxlen=10000)

In [28]:
# Function to preprocess the state
def preprocess_state(state):
    state = np.array(state, copy=True)
    state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)  # Convert to grayscale
    state = cv2.resize(state, (84, 84))
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    state = state.unsqueeze(0)
    return state

# Function to select an action
def select_action(state, epsilon):
    if random.random() < epsilon:
        return env.action_space.sample()
    else:
        with torch.no_grad():
            return policy_net(state).argmax().item()


In [29]:
# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [30]:
# Model initialization
policy_net = DQN(env.action_space.n).to(device)
optimizer = optim.Adam(policy_net.parameters(), lr=alpha)
loss_function = nn.MSELoss()

# TensorBoard setup
writer = SummaryWriter()

# Training loop
num_episodes = 1100
save_interval = 500
all_rewards = []
avg_rewards = []
avg_max_q_values = []

for episode in range(1, num_episodes + 1):
    state = env.reset()
    state = preprocess_state(state).to(device)
    
    total_reward = 0
    total_loss = 0
    total_max_q = 0
    done = False
    step = 0

    while not done:
        step += 1
        action = select_action(state, epsilon)
        next_state, reward, done, info = env.step(action)
        next_state = preprocess_state(next_state).to(device)

        # Store in memory
        memory.append((state, action, reward, next_state, done))

        # Experience replay
        if len(memory) >= batch_size:
            minibatch = random.sample(memory, batch_size)
            states, actions, rewards, next_states, dones = zip(*minibatch)

            states = torch.cat(states).to(device)
            next_states = torch.cat(next_states).to(device)
            rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
            actions = torch.tensor(actions, dtype=torch.long).to(device)
            dones = torch.tensor(dones, dtype=torch.float32).to(device)

            current_q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze()
            next_q_values = policy_net(next_states).max(1)[0]
            target_q_values = rewards + gamma * next_q_values * (1 - dones)

            loss = loss_function(current_q_values, target_q_values)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_max_q += next_q_values.mean().item()

        state = next_state
        total_reward += reward

    all_rewards.append(total_reward)
    avg_rewards.append(np.mean(all_rewards[-100:]))
    avg_max_q_values.append(total_max_q / step)

    # Update epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    # Logging
    writer.add_scalar('Reward', total_reward, episode)
    writer.add_scalar('Loss', total_loss / step, episode)
    writer.add_scalar('Average_Reward_100_Episodes', avg_rewards[-1], episode)
    writer.add_scalar('Average_Max_Q_Value', avg_max_q_values[-1], episode)

    if episode % 100 == 0:
        print(f'Episode {episode} - Total Reward: {total_reward}, Average Reward (last 100): {avg_rewards[-1]}, Average Max Q-Value: {avg_max_q_values[-1]}, Loss: {total_loss / step}')

    if episode % save_interval == 0:
        model_save_path = os.path.join("saved_models", f"mario_dqn_{episode}.pth")
        torch.save(policy_net.state_dict(), model_save_path)
        print(f'Model saved to {model_save_path}')

writer.close()

Episode 100 - Total Reward: 1106, Average Reward (last 100): 732.91, Average Max Q-Value: 1.3978255548449905, Loss: 0.393717826059018
Episode 200 - Total Reward: 1507, Average Reward (last 100): 1114.82, Average Max Q-Value: 2.084083589146163, Loss: 0.6719064089550508
Episode 300 - Total Reward: 1930, Average Reward (last 100): 1507.75, Average Max Q-Value: 2.634306325097857, Loss: 0.702398296069491
Episode 400 - Total Reward: 2126, Average Reward (last 100): 1760.69, Average Max Q-Value: 2.895023412860749, Loss: 1.346997564173488
Episode 500 - Total Reward: 1414, Average Reward (last 100): 1634.99, Average Max Q-Value: 3.014570888325188, Loss: 1.4534282590336716
Model saved to saved_models\mario_dqn_500.pth
Episode 600 - Total Reward: 1547, Average Reward (last 100): 1725.83, Average Max Q-Value: 3.0706415864338132, Loss: 1.3089137729772562
Episode 700 - Total Reward: 1407, Average Reward (last 100): 1961.69, Average Max Q-Value: 3.312663466250137, Loss: 2.076434823402853
Episode 800 

In [32]:
# Function to preprocess the state (same as before)
def preprocess_state(state):
    state = np.array(state, copy=True)
    state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)  # Convert to grayscale
    state = cv2.resize(state, (84, 84))
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    state = state.unsqueeze(0)
    return state

# Load the environment
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Initialize the model
policy_net = DQN(env.action_space.n).to(device)

# Load the trained model weights
model_path = os.path.join("saved_models", "mario_dqn_1000.pth")
policy_net.load_state_dict(torch.load(model_path, map_location=device))
policy_net.eval()
print(f'Model loaded from {model_path}')

# Testing loop
test_episodes = 10  # Number of episodes to test
for episode in range(1, test_episodes + 1):
    state = env.reset()
    state = preprocess_state(state).to(device)
    total_reward = 0
    done = False
    
    while not done:
        env.render()  # Render the environment to see the agent's performance
        with torch.no_grad():
            action = policy_net(state).argmax().item()
        next_state, reward, done, _ = env.step(action)
        next_state = preprocess_state(next_state).to(device)
        state = next_state
        total_reward += reward
    
    print(f'Episode {episode} - Total Reward: {total_reward}')

env.close()

Using device: cuda
Model loaded from saved_models\mario_dqn_1000.pth
Episode 1 - Total Reward: 680
Episode 2 - Total Reward: 680
Episode 3 - Total Reward: 680
Episode 4 - Total Reward: 680
Episode 5 - Total Reward: 680
Episode 6 - Total Reward: 680
Episode 7 - Total Reward: 680
Episode 8 - Total Reward: 680
Episode 9 - Total Reward: 680
Episode 10 - Total Reward: 680
