# Mountain Car DQN

## Imports

In [3]:
import os
import time
from abc import ABC, abstractmethod
from enum import Enum
from gymnasium import spaces
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

from IPython.display import clear_output

%pip install swig
%pip install "gymnasium[box2d]"

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Class definition

In [5]:
class DQN(nn.Module):
    """Small fully connected network that outputs one value per action.

    The layer widths are ``state_dim -> 12 -> 8 -> action_dim``; a ReLU
    follows each of the first two linear layers and the last layer is linear.
    """

    def __init__(self, state_dim, action_dim):
        """Create the three linear layers.

        Args:
            state_dim: Number of input features.
            action_dim: Number of outputs.
        """
        super().__init__()
        first_width, second_width = 12, 8
        # Attribute names are kept as-is: they become the state_dict keys.
        self.fc1 = nn.Linear(state_dim, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, action_dim)

    def forward(self, x):
        """Run ``x`` through both hidden layers and return the raw output of ``fc3``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

class ReplayBuffer:
    """Fixed-capacity store of transitions backed by a ``deque``.

    Once ``capacity`` entries are held, appending a new one discards the
    oldest (the behaviour of ``deque(maxlen=...)``).
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn without replacement.

        Raises:
            ValueError: Propagated from ``random.sample`` when ``batch_size``
                exceeds the number of stored transitions.
        """
        stored = self.buffer
        return random.sample(stored, batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        stored = self.buffer
        return len(stored)

# DQN Agent
class DQNAgent:
    """DQN agent holding a policy network, a target network and a replay buffer.

    The policy network is optimised with Adam on minibatches sampled from the
    replay buffer; the target network supplies the bootstrap values and is
    only changed by ``update_target_network``.
    """

    BATCH_SIZE = 32          # transitions per optimisation step
    EPSILON_START = 1        # initial exploration probability
    EPSILON_CUTOFF = 0.005   # lower bound applied in decay_epsilon
    EPSILON_DECAY = 0.987    # multiplicative factor applied per decay_epsilon call
    GAMMA = 0.9              # discount applied to the bootstrapped next-state value
    LR = 75e-5               # Adam learning rate
    MEMORY_SIZE = 50000      # replay buffer capacity

    def __init__(self, env):
        """Build both networks, the optimiser and the replay buffer for ``env``.

        Args:
            env: Environment exposing ``observation_space.shape`` and
                ``action_space.n``.
        """
        n_features = env.observation_space.shape[0]
        n_actions = env.action_space.n

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Policy network first, then target network (same creation order as before).
        self.policy_net = DQN(n_features, n_actions).to(self.device)
        self.target_net = DQN(n_features, n_actions).to(self.device)
        # Start the target network as an exact copy of the policy network.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.LR)
        self.memory = ReplayBuffer(self.MEMORY_SIZE)

        self.epsilon = self.EPSILON_START
        self.steps_done = 0
        # Incremented only by decay_epsilon().
        self.total_steps = 0

    def select_e_greedy_action(self, env, state):
        """Return a uniformly random action with probability ``epsilon``, else the greedy one."""
        explore = random.random() < self.epsilon
        if not explore:
            return self.select_greedy_action(state)
        return random.randint(0, env.action_space.n - 1)

    def select_greedy_action(self, state):
        """Return the index of the largest policy-network output for ``state``."""
        batch_of_one = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        with torch.no_grad():
            q_values = self.policy_net(batch_of_one)
        return q_values.argmax(dim=1).item()

    def decay_epsilon(self):
        """Increment ``total_steps`` and shrink ``epsilon``, never below ``EPSILON_CUTOFF``."""
        self.total_steps += 1
        decayed = self.epsilon * self.EPSILON_DECAY
        self.epsilon = max(self.EPSILON_CUTOFF, decayed)

    def step(self):
        """Run one optimisation step on a sampled minibatch.

        Returns without doing anything while the replay buffer holds fewer
        than ``BATCH_SIZE`` transitions.
        """
        if len(self.memory) < self.BATCH_SIZE:
            return

        transitions = self.memory.sample(self.BATCH_SIZE)
        # Transpose the list of transitions into one array per field.
        state_arr, action_arr, reward_arr, next_state_arr, done_arr = (
            np.array(column) for column in zip(*transitions)
        )

        def to_device(array, dtype):
            return torch.tensor(array, dtype=dtype, device=self.device)

        state_batch = to_device(state_arr, torch.float32)
        next_state_batch = to_device(next_state_arr, torch.float32)
        # Per-sample scalars become column vectors so they line up with gather().
        action_batch = to_device(action_arr, torch.int64).unsqueeze(1)
        reward_batch = to_device(reward_arr, torch.float32).unsqueeze(1)
        done_batch = to_device(done_arr, torch.float32).unsqueeze(1)

        # Output of the policy network for the action stored in each transition.
        current_q = self.policy_net(state_batch).gather(1, action_batch)

        # Bootstrap value from the target network; no gradient flows through it.
        with torch.no_grad():
            next_q = self.target_net(next_state_batch).max(dim=1, keepdim=True).values

        # Transitions flagged done contribute only their reward.
        target_q = reward_batch + (self.GAMMA * next_q * (1 - done_batch))

        loss = F.mse_loss(target_q, current_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        """Copy the policy network's weights into the target network."""
        policy_weights = self.policy_net.state_dict()
        self.target_net.load_state_dict(policy_weights)


## Training function

In [7]:

def train_q_values(env, env_name, target_update_interval=2000, training_episodes=1000, agent_class=DQNAgent):
    """Train an agent on ``env`` with shaped rewards, logging and checkpoints.

    Each episode runs until the environment reports termination or
    truncation. Every transition is pushed to ``agent.memory`` and followed by
    one ``agent.step()`` call. Training stops early the first time an episode
    finishes in ``EARLY_STOP_STEPS`` steps or fewer.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        env_name: Not used by this function; kept so existing callers work.
        target_update_interval: Number of environment steps between copies of
            the policy network into the target network.
        training_episodes: Maximum number of episodes to run.
        agent_class: Callable ``agent_class(env)`` returning an agent that
            provides ``select_e_greedy_action``, ``step``, ``decay_epsilon``,
            ``update_target_network``, ``memory``, ``epsilon``,
            ``policy_net``, ``optimizer`` and ``steps_done``.

    Returns:
        ``(agent, rewards_per_episode, epsilon_values, episode_times,
        steps_per_episode)``. The four lists have one entry per episode that
        was run, including an episode that triggered the early stop.
    """
    EARLY_STOP_STEPS = 110      # an episode this short ends training
    LOG_EVERY = 10              # episodes between progress lines
    CHECKPOINT_EVERY = 100      # episodes between checkpoints

    obs_high = env.observation_space.high

    agent = agent_class(env)
    rewards_per_episode = []
    steps_per_episode = []
    epsilon_values = []
    episode_times = []
    # Environment steps across all episodes. The target-network sync is keyed
    # on this counter: agent.total_steps only advances once per episode (in
    # decay_epsilon), so using it here would sync on every step of the first
    # episode and then effectively never again.
    env_steps = 0

    for episode in range(training_episodes):
        start_time = time.time()
        state, _ = env.reset()
        total_reward = 0
        episode_over = False
        steps_taken = 0

        # Epsilon in effect while this episode is played.
        epsilon_values.append(agent.epsilon)

        while not episode_over:
            action = agent.select_e_greedy_action(env, state)
            obs, reward, done, truncated, _ = env.step(action)

            # Reward shaping: bonus proportional to |state[1]|, plus a large
            # bonus when state[0] reaches the observation space's upper bound.
            # NOTE(review): both terms read the state *before* this step, and
            # the second compares against the observation bound rather than a
            # goal value -- confirm the large bonus can actually be reached.
            reward += 3 * abs(state[1])
            reward += 100000 if state[0] >= obs_high[0] else 0

            next_state = np.array(obs, dtype=np.float32)
            # Only `done` (not `truncated`) is stored, so time-limit cut-offs
            # still bootstrap from the next state.
            agent.memory.push(state, action, reward, next_state, done)
            episode_over = done or truncated
            state = next_state
            total_reward += reward
            steps_taken += 1

            agent.step()

            env_steps += 1
            if env_steps % target_update_interval == 0:
                agent.update_target_network()

        # Measure and record the episode before any early exit, so the early
        # stop message has a valid time and the returned lists stay aligned
        # with epsilon_values.
        episode_time = time.time() - start_time
        rewards_per_episode.append(total_reward)
        steps_per_episode.append(steps_taken)
        episode_times.append(episode_time)

        if steps_taken <= EARLY_STOP_STEPS:
            print(f"Episode {episode + 1:}: Total Reward: {total_reward:.3f}, Epsilon: {agent.epsilon:.3f}, Steps: {steps_taken}, Time: {episode_time:.2f}s")
            break

        agent.decay_epsilon()

        if episode % LOG_EVERY == 0:
            print(f"Episode {episode + 1:}: Total Reward: {total_reward:.3f}, Epsilon: {agent.epsilon:.3f}, Steps: {steps_taken}, Time: {episode_time:.2f}s")
        if episode % CHECKPOINT_EVERY == 0:
            # Save the bare weights and a fuller checkpoint side by side.
            torch.save(agent.policy_net.state_dict(), "dqn_mountain_car_env_v0.pth")
            checkpoint = {
                'policy_net_state_dict': agent.policy_net.state_dict(),
                'optimizer_state_dict': agent.optimizer.state_dict(),
                'steps_done': agent.steps_done,
                'episode': episode,
                'rewards': rewards_per_episode,
                'epsilons': epsilon_values,
                'times': episode_times
            }
            torch.save(checkpoint, "dqn_mountain_car_env_v0_checkpoint.pth")
            print("Checkpoint Reached!")

    return agent, rewards_per_episode, epsilon_values, episode_times, steps_per_episode
     


## Mountain Car

In [9]:
env_name = "MountainCar-v0"
mountain_env = gym.make(env_name, max_episode_steps=1000, render_mode="rgb_array")

# train_q_values returns five values (the last is steps per episode);
# unpacking only four raises "ValueError: too many values to unpack".
(
    trained_agent,
    rewards_per_episode,
    epsilon_values,
    episode_times,
    steps_per_episode,
) = train_q_values(mountain_env, env_name, training_episodes=1000)

# Persist the final policy weights once training has returned.
torch.save(trained_agent.policy_net.state_dict(), "dqn_mountain_car_env_v0.pth")

Episode 1: Total Reward: -988.678, Epsilon: 0.987, Steps: 1000, Time: 1.50s
Checkpoint Reached!
Episode 11: Total Reward: -992.208, Epsilon: 0.866, Steps: 1000, Time: 1.46s
Episode 21: Total Reward: -981.603, Epsilon: 0.760, Steps: 1000, Time: 1.61s
Episode 31: Total Reward: -982.253, Epsilon: 0.667, Steps: 1000, Time: 1.53s
Episode 41: Total Reward: -989.893, Epsilon: 0.585, Steps: 1000, Time: 1.51s
Episode 51: Total Reward: -990.714, Epsilon: 0.513, Steps: 1000, Time: 1.54s
Episode 61: Total Reward: -985.377, Epsilon: 0.450, Steps: 1000, Time: 1.61s
Episode 71: Total Reward: -994.694, Epsilon: 0.395, Steps: 1000, Time: 1.65s
Episode 81: Total Reward: -993.932, Epsilon: 0.346, Steps: 1000, Time: 1.69s
Episode 91: Total Reward: -992.484, Epsilon: 0.304, Steps: 1000, Time: 1.55s
Episode 101: Total Reward: -994.288, Epsilon: 0.267, Steps: 1000, Time: 1.76s
Checkpoint Reached!
Episode 111: Total Reward: -995.980, Epsilon: 0.234, Steps: 1000, Time: 1.61s
Episode 121: Total Reward: -991.496