# Mountain Car DQN

## Imports

In [3]:
import os
import time
from abc import ABC, abstractmethod
from enum import Enum
from gymnasium import spaces
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

from IPython.display import clear_output

%pip install swig
%pip install "gymnasium[box2d]"

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Class definition

In [5]:
class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_dim, 12)
        self.fc2 = nn.Linear(12, 8)
        self.fc3 = nn.Linear(8, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        return self.fc3(x)

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

# DQN Agent
class DQNAgent:
    BATCH_SIZE = 32
    EPSILON_START = 1
    EPSILON_CUTOFF = 0.00
    EPSILON_DECAY = 0.987
    GAMMA = 0.9
    LR = 0.01
    MEMORY_SIZE = 100000

    def __init__(self, env):
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = DQN(state_dim, action_dim).to(self.device)
        self.target_net = DQN(state_dim, action_dim).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.LR)
        self.memory = ReplayBuffer(self.MEMORY_SIZE)
        self.steps_done = 0
        self.epsilon = self.EPSILON_START
        self.total_steps = 0

    def select_e_greedy_action(self, env, state):
        if random.random() < self.epsilon:
            return random.randint(0, env.action_space.n - 1)
        else:
            state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
            with torch.no_grad():
                return self.policy_net(state).argmax(dim=1).item()

    def select_greedy_action(self, state):
        state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        with torch.no_grad():
            return self.policy_net(state).argmax(dim=1).item()

    def decay_epsilon(self):
        self.total_steps += 1
        self.epsilon = max(self.EPSILON_CUTOFF, self.epsilon * self.EPSILON_DECAY)

    def step(self):
        if len(self.memory) < self.BATCH_SIZE:
            return
        batch = self.memory.sample(self.BATCH_SIZE)

        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.tensor(np.array(states), dtype=torch.float32, device=self.device)
        actions = torch.tensor(np.array(actions), dtype=torch.int64, device=self.device).unsqueeze(1)
        rewards = torch.tensor(np.array(rewards), dtype=torch.float32, device=self.device).unsqueeze(1)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32, device=self.device)
        dones = torch.tensor(np.array(dones), dtype=torch.float32, device=self.device).unsqueeze(1)
        current_q = self.policy_net(states).gather(1, actions)
        next_q = self.target_net(next_states).max(1, keepdim=True)[0].detach()
        target_q = rewards + (self.GAMMA * next_q * (1 - dones))

        loss = F.mse_loss(target_q, current_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())


## Training function

In [7]:

def train_q_values(env, env_name, target_update_interval=10000, training_episodes=1000, agent_class=DQNAgent):

    SPLITS = 30
    OBS_LOW_VALUES = env.observation_space.low
    OBS_HIGH_VALUES = env.observation_space.high
    
    POS_VALUES =  np.linspace(OBS_LOW_VALUES[0], OBS_HIGH_VALUES[0], SPLITS)
    VEL_VALUES = np.linspace(OBS_LOW_VALUES[1], OBS_HIGH_VALUES[1], SPLITS) 
    
    def quantize_state(state):
        return torch.FloatTensor([
            np.digitize(state[0], POS_VALUES), 
            np.digitize(state[1], VEL_VALUES)
        ])
    
    agent = agent_class(env)
    rewards_per_episode = []
    epsilon_values = []
    episode_times = []
    start_episode = 0
    early_stop_counter = 0

    for episode in range(start_episode, training_episodes):
        start_time = time.time()
        obs, _ = env.reset()
        state = quantize_state(obs)
        total_reward = 0
        episode_over = False
        steps_taken = 0
        episode_transitions = []

        epsilon_values.append(agent.epsilon)

        while not episode_over:
            action = agent.select_e_greedy_action(env, state)
            obs, reward, done, truncated, _ = env.step(action)
            reward += 2 * abs(state[1])
            reward += 10000 if state[0] >= np.max(OBS_HIGH_VALUES) else 0
            next_state = quantize_state(np.array(obs, dtype=np.float32))
            agent.memory.push(state, action, reward, next_state, done)
            episode_over = done or truncated
            state = next_state
            total_reward += reward
            steps_taken += 1
            
            agent.step()
            
            if agent.total_steps % target_update_interval == 0:
                agent.update_target_network()
    
        agent.decay_epsilon()
        rewards_per_episode.append(total_reward)
        end_time = time.time()
        episode_time = end_time - start_time
        episode_times.append(episode_time)

        if episode % 10 == 0:
            print(f"Episode {episode + 1}: Total Reward: {total_reward}, Epsilon: {agent.epsilon:.3f}, Steps: {steps_taken}, Time: {episode_time:.2f}s")
        if episode % 100 == 0:
            # Saving model
            torch.save(agent.policy_net.state_dict(), "dqn_mountain_car_env_v0.pth")
            # Creating checkpoint dictionary
            checkpoint = {
                'policy_net_state_dict': agent.policy_net.state_dict(),
                'optimizer_state_dict': agent.optimizer.state_dict(),
                'total_steps': agent.total_steps,
                'current_episode': episode,
                'rewards_per_episode': rewards_per_episode,
                'epsilon_values': epsilon_values,
                'episode_times': episode_times
            }
            torch.save(checkpoint, "dqn_mountain_car_checkpoints.pth")
            print("Checkpoint Reached!")

    return agent, rewards_per_episode, epsilon_values, episode_times
     


## Mountain Car

In [None]:
env_name = "MountainCar-v0"
mountain_env = gym.make(env_name, max_episode_steps=1000, render_mode="rgb_array")

trained_agent, rewards_per_episode, epsilon_values, episode_times = train_q_values(mountain_env, env_name, training_episodes=20000)
torch.save(trained_agent.policy_net.state_dict(), "dqn_mountain_car_env_v0.pth")

Episode 1: Total Reward: 10028998.0, Epsilon: 0.987, Steps: 1000, Time: 2.39s
Checkpoint Reached!


  state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)


Episode 11: Total Reward: 10028898.0, Epsilon: 0.866, Steps: 1000, Time: 2.35s
Episode 21: Total Reward: 10028878.0, Epsilon: 0.760, Steps: 1000, Time: 2.35s
Episode 31: Total Reward: 10029162.0, Epsilon: 0.667, Steps: 1000, Time: 2.42s
Episode 41: Total Reward: 10028998.0, Epsilon: 0.585, Steps: 1000, Time: 4.01s
Episode 51: Total Reward: 10028924.0, Epsilon: 0.513, Steps: 1000, Time: 2.98s
Episode 61: Total Reward: 10029044.0, Epsilon: 0.450, Steps: 1000, Time: 3.08s
Episode 71: Total Reward: 10028932.0, Epsilon: 0.395, Steps: 1000, Time: 2.77s
Episode 81: Total Reward: 10029082.0, Epsilon: 0.346, Steps: 1000, Time: 2.81s
Episode 91: Total Reward: 10029106.0, Epsilon: 0.304, Steps: 1000, Time: 2.68s
Episode 101: Total Reward: 10029072.0, Epsilon: 0.267, Steps: 1000, Time: 3.54s
Checkpoint Reached!
Episode 111: Total Reward: 10029240.0, Epsilon: 0.234, Steps: 1000, Time: 2.86s
Episode 121: Total Reward: 10029092.0, Epsilon: 0.205, Steps: 1000, Time: 3.00s
Episode 131: Total Reward: 10