# Mountain Car DQN

## Imports

In [1]:
import os
import time
from abc import ABC, abstractmethod
from enum import Enum
from gymnasium import spaces
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

from IPython.display import clear_output

%pip install swig
%pip install "gymnasium[box2d]"



## Mounting Google Drive

In [2]:
# Colab-only helper used to expose the user's Google Drive to this runtime.
from google.colab import drive

# Folder on Drive that the training code writes model weights and checkpoints to.
DRIVE_PATH = '/content/drive/MyDrive/dqn_mountain_car_checkpoints/'

# Mount before creating the folder: DRIVE_PATH lives under the mount point.
drive.mount('/content/drive')
# exist_ok=True keeps re-running this cell from failing when the folder is already there.
os.makedirs(DRIVE_PATH, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Class definitions

In [3]:
class DQN(nn.Module):
    """Fully connected Q-network: two 128-unit ReLU hidden layers plus a linear output head.

    Args:
        state_dim: Number of input features of the first linear layer.
        action_dim: Number of outputs of the last linear layer (one value per action).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        # The attribute names are also the state_dict keys, so they stay fc1/fc2/fc3.
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_dim)

    def forward(self, x):
        """Run ``x`` through both hidden layers and return the raw output-head values."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        # No activation on the head: the outputs are unbounded value estimates.
        return self.fc3(hidden)

class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions for experience replay.

    Backed by a ``deque`` with ``maxlen=capacity``, so once the buffer is full
    every new transition evicts the oldest one.
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def push(self, state, action, reward, next_state, done):
        """Store one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` stored transitions drawn without replacement.

        Raises:
            ValueError: If ``batch_size`` is larger than the number of stored transitions.
        """
        return random.sample(self.buffer, k=batch_size)

class DQNAgent:
    """DQN agent: epsilon-greedy policy network, frozen target network and replay memory.

    Epsilon is a function of the current episode index only:
    ``max(EPSILON_CUTOFF, EPSILON_START * EPSILON_DECAY ** current_episode)``.

    Args:
        state_dim: Size of the observation vector fed to the networks.
        action_dim: Number of discrete actions (network outputs).
    """

    BATCH_SIZE = 32
    EPSILON_START = 1
    EPSILON_CUTOFF = 0.05
    # 0.995 ** n falls to 0.1 after ~460 episodes and reaches the 0.05 cutoff after ~600.
    EPSILON_DECAY = 0.995
    GAMMA = 0.99
    LR = 5e-4
    MEMORY_SIZE = 30000
    # Class-level defaults so both counters exist before __init__ or a training loop sets
    # them; writes through ``self`` create per-instance values and leave these untouched.
    total_steps = 0
    current_episode = 0

    def __init__(self, state_dim, action_dim):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = DQN(state_dim, action_dim).to(self.device)
        self.target_net = DQN(state_dim, action_dim).to(self.device)
        # The target network starts as an exact copy and is only ever updated by copying.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.LR)
        self.memory = ReplayBuffer(self.MEMORY_SIZE)
        self.steps_done = 0
        self.total_steps = 0
        # Needed by decay_epsilon(); previously this only existed if a caller assigned it.
        self.current_episode = 0
        self.epsilon = self.EPSILON_START

    def select_e_greedy_action(self, env, state):
        """Return a uniformly random action with probability ``epsilon``, else the greedy one.

        Args:
            env: Object exposing ``action_space.n`` (the number of discrete actions).
            state: Current observation, convertible to a float32 tensor.
        """
        if random.random() < self.epsilon:
            return random.randint(0, env.action_space.n - 1)
        return self.select_greedy_action(state)

    def select_greedy_action(self, state):
        """Return the index of the action with the highest policy-network output for ``state``."""
        # unsqueeze(0) adds the batch dimension the network is called with.
        state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        with torch.no_grad():
            return self.policy_net(state).argmax(dim=1).item()

    def decay_epsilon(self):
        """Count one step and recompute epsilon from ``current_episode``."""
        self.total_steps += 1
        self.epsilon = max(self.EPSILON_CUTOFF, self.EPSILON_START * (self.EPSILON_DECAY ** self.current_episode))

    def _batch_tensor(self, values, dtype, as_column=False):
        """Stack ``values`` into a tensor on the agent's device, optionally shaped (batch, 1)."""
        tensor = torch.tensor(np.array(values), dtype=dtype, device=self.device)
        return tensor.unsqueeze(1) if as_column else tensor

    def step(self):
        """Run one optimisation step on a sampled minibatch.

        Does nothing while the replay memory holds fewer than ``BATCH_SIZE`` transitions.
        """
        if len(self.memory) < self.BATCH_SIZE:
            return
        batch = self.memory.sample(self.BATCH_SIZE)

        states, actions, rewards, next_states, dones = zip(*batch)
        states = self._batch_tensor(states, torch.float32)
        actions = self._batch_tensor(actions, torch.int64, as_column=True)
        rewards = self._batch_tensor(rewards, torch.float32, as_column=True)
        next_states = self._batch_tensor(next_states, torch.float32)
        dones = self._batch_tensor(dones, torch.float32, as_column=True)

        # Q(s, a) of the actions that were actually taken.
        current_q = self.policy_net(states).gather(1, actions)
        # The bootstrap target is a constant for the optimiser, so build it without autograd.
        with torch.no_grad():
            next_q = self.target_net(next_states).max(1, keepdim=True)[0]
            # (1 - dones) removes the bootstrap term for transitions flagged as done.
            target_q = rewards + (self.GAMMA * next_q * (1 - dones))

        # mse_loss takes (input, target); the value is the same either way.
        loss = F.mse_loss(current_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        """Copy the current policy-network weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

## Training function

In [4]:
def train_q_values(env, env_name, target_update_interval=10, training_episodes=1000, agent_class=DQNAgent, checkpoint_path=None, max_steps=200):
    """Train an agent on ``env`` and return it together with per-episode statistics.

    Each episode is played with the agent's epsilon-greedy policy. Afterwards its
    transitions are pushed to the replay memory and one optimisation step is run per
    collected transition. Every 10 episodes the policy weights and a resumable
    checkpoint are written under DRIVE_PATH.

    Args:
        env: Gymnasium-style environment; ``observation_space.shape[0]`` is used as the
            state size and ``action_space.n`` as the number of actions.
        env_name: Not used by this function; kept so existing calls keep working.
        target_update_interval: The target network is refreshed on every episode whose
            index is a multiple of this value.
        training_episodes: Episode index at which training stops (exclusive).
        agent_class: Class constructed as ``agent_class(state_dim, action_dim)``.
        checkpoint_path: File name inside DRIVE_PATH of a checkpoint to resume from,
            or None to start from scratch. The replay memory is not part of the
            checkpoint, so a resumed run starts with an empty buffer.
        max_steps: Upper bound on environment steps per episode.

    Returns:
        Tuple ``(agent, rewards_per_episode, epsilon_values, episode_times)``.
    """
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = agent_class(state_dim, action_dim)
    rewards_per_episode = []
    epsilon_values = []
    episode_times = []
    start_episode = 0

    # Loading from last checkpoint in case training fails
    if checkpoint_path is not None:
        # map_location lets a checkpoint written on a GPU runtime load on a CPU-only one.
        checkpoint = torch.load(
            os.path.join(DRIVE_PATH, checkpoint_path),
            map_location=getattr(agent, "device", None),
        )
        agent.policy_net.load_state_dict(checkpoint['policy_net_state_dict'])
        # Only the policy weights are checkpointed; the target network restarts as a copy.
        agent.target_net.load_state_dict(checkpoint['policy_net_state_dict'])
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        agent.total_steps = checkpoint['total_steps']
        agent.current_episode = checkpoint['current_episode']
        rewards_per_episode = checkpoint['rewards_per_episode']
        epsilon_values = checkpoint['epsilon_values']
        episode_times = checkpoint['episode_times']
        start_episode = checkpoint['current_episode'] + 1
        agent.epsilon = max(agent.EPSILON_CUTOFF, agent.EPSILON_START * (agent.EPSILON_DECAY ** agent.current_episode))
        print(f"Resumed training from episode {start_episode}")

    for episode in range(start_episode, training_episodes):
        agent.current_episode = episode
        start_time = time.time()
        obs, _ = env.reset()
        state = np.array(obs, dtype=np.float32)
        total_reward = 0
        terminated = False
        truncated = False
        steps_taken = 0
        episode_transitions = []

        epsilon_values.append(agent.epsilon)

        # Stop on either end-of-episode signal: an environment must not be stepped
        # again after it reports truncation, even if max_steps has not been reached.
        while not (terminated or truncated) and steps_taken < max_steps:
            action = agent.select_e_greedy_action(env, state)
            obs, reward, terminated, truncated, _ = env.step(action)
            next_state = np.array(obs, dtype=np.float32)
            # Only real termination is stored as `done`, so a time-limit cut-off
            # still bootstraps from the next state during learning.
            episode_transitions.append((state, action, reward, next_state, terminated))
            state = next_state
            total_reward += reward
            steps_taken += 1

            # Called per step because it also advances agent.total_steps.
            agent.decay_epsilon()

        for transition in episode_transitions:
            agent.memory.push(*transition)

        # One gradient step per environment step collected in this episode.
        for _ in range(len(episode_transitions)):
            agent.step()

        rewards_per_episode.append(total_reward)
        end_time = time.time()
        episode_time = end_time - start_time
        episode_times.append(episode_time)

        if episode % 10 == 0:
            print(f"Episode {episode + 1}: Total Reward: {total_reward}, Epsilon: {agent.epsilon:.3f}, Steps: {steps_taken}, Time: {episode_time:.2f}s")
            # Saving model
            torch.save(agent.policy_net.state_dict(), os.path.join(DRIVE_PATH, "dqn_mountain_car_env_v0.pth"))
            # Creating checkpoint dictionary and saving
            checkpoint = {
                'policy_net_state_dict': agent.policy_net.state_dict(),
                'optimizer_state_dict': agent.optimizer.state_dict(),
                'total_steps': agent.total_steps,
                'current_episode': agent.current_episode,
                'rewards_per_episode': rewards_per_episode,
                'epsilon_values': epsilon_values,
                'episode_times': episode_times
            }
            torch.save(checkpoint, os.path.join(DRIVE_PATH, "dqn_mountain_car_checkpoints.pth"))

        if episode % target_update_interval == 0:
            agent.update_target_network()

    return agent, rewards_per_episode, epsilon_values, episode_times

## Mountain Car

In [5]:
# Seed every RNG the run draws from so a Restart & Run All reproduces the same training.
SEED = 42
random.seed(SEED)        # epsilon-greedy exploration and replay sampling
np.random.seed(SEED)
torch.manual_seed(SEED)  # network weight initialisation

env_name = "MountainCar-v0"
# Use the default goal_velocity (0). The car's speed is capped at 0.07, so the earlier
# goal_velocity=0.1 could never be satisfied: no episode could terminate at the flag
# and every return was stuck at -200.
mountain_env = gym.make(env_name, render_mode="rgb_array")
# Seeds the environment's own RNG; later reset() calls without a seed continue from it.
mountain_env.reset(seed=SEED)

trained_agent, rewards_per_episode, epsilon_values, episode_times = train_q_values(mountain_env, env_name, training_episodes=1500)
# Final save so the weights from the last episode are kept, not just the last periodic save.
torch.save(trained_agent.policy_net.state_dict(), os.path.join(DRIVE_PATH, "dqn_mountain_car_env_v0.pth"))

Episode 1: Total Reward: -200.0, Epsilon: 1.000, Steps: 200, Time: 1.91s
Episode 11: Total Reward: -200.0, Epsilon: 0.951, Steps: 200, Time: 0.38s
Episode 21: Total Reward: -200.0, Epsilon: 0.905, Steps: 200, Time: 0.55s
Episode 31: Total Reward: -200.0, Epsilon: 0.860, Steps: 200, Time: 0.47s
Episode 41: Total Reward: -200.0, Epsilon: 0.818, Steps: 200, Time: 0.40s
Episode 51: Total Reward: -200.0, Epsilon: 0.778, Steps: 200, Time: 0.58s
Episode 61: Total Reward: -200.0, Epsilon: 0.740, Steps: 200, Time: 0.40s
Episode 71: Total Reward: -200.0, Epsilon: 0.704, Steps: 200, Time: 0.47s
Episode 81: Total Reward: -200.0, Epsilon: 0.670, Steps: 200, Time: 0.55s
Episode 91: Total Reward: -200.0, Epsilon: 0.637, Steps: 200, Time: 0.40s
Episode 101: Total Reward: -200.0, Epsilon: 0.606, Steps: 200, Time: 0.40s
Episode 111: Total Reward: -200.0, Epsilon: 0.576, Steps: 200, Time: 0.59s
Episode 121: Total Reward: -200.0, Epsilon: 0.548, Steps: 200, Time: 0.41s
Episode 131: Total Reward: -200.0, E