In [123]:
from collections import namedtuple, deque
import math
import random
from tqdm import tqdm

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Select the compute device once: CUDA when torch reports it as available, otherwise the CPU.
# The networks and every tensor created later in the notebook are placed on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. MountainCar Environment

In [2]:
""" Environment information
ref: https://github.com/openai/gym/blob/master/gym/envs/classic_control/mountain_car.py#L16

Observation Space
    The observation is a `ndarray` with shape `(2,)` where the elements correspond to the following:
    | Num | Observation                          | Min   | Max  | Unit           |
    |-----|--------------------------------------|-------|------|----------------|
    | 0   | position of the car along the x-axis | -1.2  | 0.6  | position (m)   |
    | 1   | velocity of the car                  | -0.07 | 0.07 | velocity (m/s) |
Action Space
    There are 3 discrete deterministic actions:
    | Num | Action                  | Value | Unit         |
    |-----|-------------------------|-------|--------------|
    | 0   | Accelerate to the left  | Inf   | position (m) |
    | 1   | Don't accelerate        | Inf   | position (m) |
    | 2   | Accelerate to the right | Inf   | position (m) |
Reward:
    The goal is to reach the flag placed on top of the right hill as quickly as possible, as such the agent is
    penalised with a reward of -1 for each timestep.
Starting State
    The position of the car is assigned a uniform random value in *[-0.6 , -0.4]*.
    The starting velocity of the car is always assigned to 0.
Episode End
    The episode ends if either of the following happens:
    1. Termination: The position of the car is greater than or equal to 0.5 (the goal position on top of the right hill)
    2. Truncation: The length of the episode is 200.
"""
# Create the MountainCar-v0 environment that every later cell (and the DQN agent) uses.
env = gym.make("MountainCar-v0")
# Uncomment the two lines below to reset the environment and render a preview of it.
# env.reset()
# env.render()

Sample environment image

<img width=300 src="mountain_car.png" />

In [3]:
# Show the observation and action spaces the agent has to work with.
print("observation_space: ", env.observation_space)
print("action_space: ", env.action_space)
# reset() starts a new episode. Its return value is unpacked directly into the two
# observation components (car position, car velocity).
# NOTE(review): this unpacking relies on reset() returning only the observation,
# as it does with the gym version that produced the recorded output.
state_Xposition, state_velocity = env.reset()
print("sample obs: ", (state_Xposition, state_velocity))

observation_space:  Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
action_space:  Discrete(3)
sample obs:  (-0.520838, 0.0)


In [4]:
# sample step: apply action 1 ("Don't accelerate" in the action table above) once and
# print the (observation, reward, done flag, info dict) tuple that step() returns
new_state, reward, done, info = env.step(1)
print("sample step: ", (new_state, reward, done, info))

sample step:  (array([-5.2085871e-01, -2.0705547e-05], dtype=float32), -1.0, False, {})


# 2. Experience Replay

In [5]:
# One environment step as stored in the replay buffer.
Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state'])


class ExperienceReplayMemory:
    """
    Bounded buffer of `Transition` records.

    Transitions gathered while acting in the environment are appended here and
    later drawn at random to train the Q-network. Once `capacity` entries are
    held, adding another one discards the oldest.
    """

    def __init__(self, capacity):
        # A deque with `maxlen` drops items from the opposite end when it is full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Wrap the positional fields in a `Transition` and append it to the buffer."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return a list of `batch_size` stored transitions picked at random without replacement."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [6]:
# sample transition: unpack four placeholder values and wrap them in a Transition
# so the cell output shows the named fields
state, action, reward, next_state = [1, 2, 3, 4]
Transition(state, action, reward, next_state)

Transition(state=1, action=2, reward=3, next_state=4)

# 3. Deep Q-Network

In [14]:
class VanillaNN(nn.Module):
    """
    Fully connected Q-network.

    Maps an observation vector of size `n_observations` to one output per
    action (`n_actions`) through two 128-unit hidden layers with ReLU
    activations; the output layer is linear.
    """

    def __init__(self, n_observations, n_actions):
        super().__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        """Run `x` through both hidden layers (ReLU after each) and the linear output layer."""
        for hidden_layer in (self.layer1, self.layer2):
            x = F.relu(hidden_layer(x))
        return self.layer3(x)

In [122]:
class DQN:
    """
    Deep Q-Network agent for a gym environment with a discrete action space.

    The agent keeps two copies of the Q-network: `policy_net`, which is trained
    and used to pick actions, and `target_net`, which supplies the bootstrap
    values for the TD target and is synchronised with `policy_net` either softly
    (every step, rate `tau`) or by a hard copy every `target_update_freq` steps.
    Transitions are stored in an `ExperienceReplayMemory` and replayed in random
    mini-batches.

    The environment is expected to follow the 4-tuple step API used throughout
    this notebook: `reset()` returns an observation and `step(a)` returns
    `(observation, reward, done, info)`.
    """

    def __init__(self, env, memory_size=10000, batch_size=128, gamma=0.99, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=1000, tau=0.005, learning_rate=1e-4):
        self.env = env  # the gym environment
        self.memory_size = memory_size  # the size of the replay buffer
        self.batch_size = batch_size  # number of transitions sampled from the replay buffer
        self.gamma = gamma  # the discount factor applied to the next-state value in the TD target
        self.epsilon_start = epsilon_start  # the starting value of epsilon
        self.epsilon_end = epsilon_end  # the final value of epsilon
        self.epsilon_decay = epsilon_decay  # controls the rate of exponential decay of epsilon, higher means a slower decay
        self.tau = tau  # the update rate of the target network (soft update only)
        self.learning_rate = learning_rate  # the learning rate of the optimizer

        self.steps_done = 0  # the number of training steps taken in the environment

        # Read the network input/output sizes from the declared spaces. This avoids
        # starting an episode just to measure an observation and does not depend on
        # what reset() happens to return.
        self._n_actions = self.env.action_space.n
        self._n_observations = self.env.observation_space.shape[0]

        # setup NN models; the target network starts as an exact copy of the policy network
        self.policy_net = VanillaNN(self._n_observations, self._n_actions).to(device)
        self.target_net = VanillaNN(self._n_observations, self._n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        # setup optimizer and loss function
        self.optimizer = optim.AdamW(self.policy_net.parameters(), lr=self.learning_rate, amsgrad=True)
        self.criterion = nn.MSELoss()  # ref: https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html
        # self.criterion = nn.SmoothL1Loss()  # ref: https://pytorch.org/docs/stable/generated/torch.nn.SmoothL1Loss.html

        # setup Experience memory
        self.memory = ExperienceReplayMemory(self.memory_size)

    @staticmethod
    def _to_state_tensor(observation):
        """Convert a raw observation into a float32 tensor of shape (1, n_observations) on `device`."""
        return torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

    def choose_action(self, state):
        """
        Chooses an action using an epsilon-greedy policy (with decay rate).

        Epsilon decays exponentially from `epsilon_start` towards `epsilon_end`
        as `steps_done` grows.

        Args:
            state: tensor of shape (1, n_observations) holding a single observation.

        Returns:
            A long tensor of shape (1, 1) containing the chosen action index.
        """
        epsilon_threshold = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.steps_done / self.epsilon_decay)
        if np.random.uniform(0, 1) > epsilon_threshold:
            with torch.no_grad():
                # exploit: return the action with the highest Q value
                state_Qs = self.policy_net(state)  # get Q values for all actions
                action_max_Q = state_Qs.max(1)[1]  # get the action with the highest Q value
                return action_max_Q.view(1, 1)
        else:
            # explore: uniformly random action from the environment's action space
            return torch.tensor([[self.env.action_space.sample()]], device=device, dtype=torch.long)

    def _update_target_network(self, soft_update, hard_update_freq):
        """Synchronise `target_net` with `policy_net`, softly or by a periodic hard copy."""
        if soft_update:
            # Soft update of the target network weights (θ′ ← τθ + (1−τ)θ′)
            target_net_state_dict = self.target_net.state_dict()
            policy_net_state_dict = self.policy_net.state_dict()
            for key in policy_net_state_dict:
                target_net_state_dict[key] = policy_net_state_dict[key]*self.tau + target_net_state_dict[key]*(1-self.tau)
            self.target_net.load_state_dict(target_net_state_dict)
        elif self.steps_done % hard_update_freq == 0:
            # Hard update of the target network weights (θ′ ← θ)
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def train(self, num_episodes=1000, max_steps=200, target_soft_update=False, target_update_freq=1000):
        """
        Train the policy network by interacting with the environment.

        Args:
            num_episodes: number of episodes to run.
            max_steps: upper bound on the number of steps taken per episode.
            target_soft_update: if True, blend the target network towards the policy
                network after every step; otherwise copy the weights every
                `target_update_freq` steps.
            target_update_freq: interval (in environment steps) between hard updates.
        """
        for _ in tqdm(range(num_episodes)):
            state = self._to_state_tensor(self.env.reset())

            for _ in range(max_steps):
                # choose and take action using epsilon-greedy policy
                action = self.choose_action(state)
                observation, reward, done, info = self.env.step(action.item())

                # A genuinely terminal transition is stored with next_state=None so that
                # optimize_model() uses V(s_{t+1}) = 0 for it. An episode that was merely
                # cut off by a time limit is NOT terminal and must still bootstrap.
                # gym's TimeLimit wrapper (4-tuple step API) marks such cut-offs with
                # info["TimeLimit.truncated"]; when the key is absent, `done` is treated
                # as a real termination.
                terminated = done and not info.get("TimeLimit.truncated", False)
                next_state = None if terminated else self._to_state_tensor(observation)

                # store the transition (state, action, reward, next_state) in memory
                reward = torch.tensor([reward], device=device, dtype=torch.float32)
                self.memory.push(state, action, reward, next_state)

                # move to the next state (None only when the episode is over, in which case we break below)
                state = next_state

                # perform one step of the optimization on the policy network
                self.optimize_model()

                # update target network
                self._update_target_network(target_soft_update, target_update_freq)

                # update step counter
                self.steps_done += 1

                if done:
                    break

    def optimize_model(self):
        """
        Run one gradient step on `policy_net` using a random mini-batch from memory.

        Does nothing until the replay buffer holds at least `batch_size` transitions.
        The TD target is r + gamma * max_a Q_target(s', a), with the bootstrap term
        set to 0 for transitions whose stored `next_state` is None.
        """
        if len(self.memory) < self.batch_size:
            return

        # get a batch of transitions from the replay buffer and transpose it into a
        # single Transition whose fields are tuples over the batch
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # compute a mask of non-final states and collect them (final state is the one after which simulation ended)
        non_final_mask = torch.tensor([s is not None for s in batch.next_state], device=device, dtype=torch.bool)
        non_final_next_states = [s for s in batch.next_state if s is not None]
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # compute Q(s_t, a) using the policy_net model and select the columns of actions taken for each batch state
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # compute V(s_{t+1}) for all next states
        # where the expected values of actions are computed based on the target_net model and selected based on the action with max state value
        # note: V(s_{t+1}) = 0 if the state was the final state of the simulation (i.e. non_final_mask is False)
        next_state_values = torch.zeros(self.batch_size, device=device)
        if non_final_next_states:  # torch.cat cannot take an empty list (batch made only of terminal transitions)
            with torch.no_grad():
                next_state_values[non_final_mask] = self.target_net(torch.cat(non_final_next_states)).max(1)[0]

        # compute the expected (target) Q values: r + gamma * V(s_{t+1})
        expected_state_action_values = reward_batch + self.gamma * next_state_values

        # compute the loss between predicted and target Q values
        loss = self.criterion(state_action_values, expected_state_action_values.unsqueeze(1))

        # optimize the model with backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        # in-place gradient clipping (to avoid exploding gradients)
        torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 100)
        self.optimizer.step()

    def generate_sample_episode(self, max_steps=200, display=False):
        """
        Roll out one episode with the current policy and return its transitions.

        Actions come from `choose_action`, so the roll-out is still epsilon-greedy
        at the current value of epsilon. Nothing is stored in the replay memory and
        `steps_done` is not advanced.

        Args:
            max_steps: upper bound on the number of steps taken.
            display: if True, render the environment after every step.

        Returns:
            A list of `[state, action, reward, next_state]` entries (all tensors),
            one per step taken.
        """
        episode = list()

        state = self._to_state_tensor(self.env.reset())
        for _ in tqdm(range(max_steps)):
            # choose and take action using epsilon-greedy policy
            action = self.choose_action(state)
            next_state, reward, done, _info = self.env.step(action.item())
            if display:
                self.env.render()

            # record the transition (state, action, reward, next_state) for the caller
            next_state = self._to_state_tensor(next_state)
            reward = torch.tensor([reward], device=device, dtype=torch.float32)
            episode.append([state, action, reward, next_state])

            # move to the next state
            state = next_state

            if done:
                break

        return episode

In [96]:
# Build an agent with the default hyperparameters and train it for 5000 episodes
# (hard target-network updates, since target_soft_update is left at its default).
dqn_agent = DQN(env)
dqn_agent.train(num_episodes=5000)

100%|██████████| 5000/5000 [12:29<00:00,  6.67it/s]


In [100]:
# Roll out one episode with the trained agent, rendering each step. Actions still go
# through choose_action, so the roll-out is epsilon-greedy rather than purely greedy.
episode = dqn_agent.generate_sample_episode(display=True)

 90%|████████▉ | 179/200 [00:01<00:00, 96.17it/s] 
