# DQN (Deep Q-Network) agent for path finding on a road-network graph

In [1]:
import pandas as pd
import numpy as np
import random
import networkx as nx
import osmnx as ox
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from collections import deque
from tqdm import tqdm
from time import time
import torch
import torch.nn as nn
import torch.nn.functional as F 

import os
import sys
sys.path.append("../scripts")

from plotting import *
from enviroment import *

In [2]:
# Fixed list of 20 integer seeds. NOTE(review): not referenced in the cells
# visible here -- presumably consumed by later experiment cells; confirm.
seeds = [
    960703545,
    1277478588,
    1936856304,
    186872697,
    1859168769,
    1598189534,
    1822174485,
    1871883252,
    694388766,
    188312339,
    773370613,
    2125204119,
    2041095833,
    1384311643,
    1000004583,
    358485174,
    1695858027,
    762772169,
    437720306,
    939612284,
]

# Download the drivable street network around the geocoded address and relabel
# its nodes with integers, so a node label can double as a network input and
# as an output index.
G = nx.convert_node_labels_to_integers(
    ox.graph_from_address('Campinas, São Paulo', network_type='drive')
)

# Integer labels (after relabelling) of the start and goal nodes.
source, target = 507, 235

In [16]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [17]:
class ReplayMemory():
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with `maxlen` drops its oldest entry automatically once
        # `capacity` items are stored.
        self.memory = deque(maxlen=capacity)

    def push(self, values):
        """Append one transition to the buffer."""
        self.memory.append(values)

    def sample(self, batch_size):
        """Return `batch_size` stored transitions drawn without replacement."""
        stored = self.memory
        return random.sample(stored, batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)
    
class DQN(nn.Module):
    """Fully connected Q-network with two ReLU hidden layers of 128 units.

    Maps an input whose last dimension is `states_dim` to `n_actions`
    values (one per action).
    """

    def __init__(self, states_dim, n_actions):
        super().__init__()
        hidden_units = 128
        # Attribute names are part of the state_dict keys; keep them stable.
        self.layer1 = nn.Linear(states_dim, hidden_units)
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.layer3 = nn.Linear(hidden_units, n_actions)

    def forward(self, x):
        """Return the raw (un-activated) output of the last linear layer."""
        hidden = torch.relu(self.layer1(x))
        hidden = torch.relu(self.layer2(hidden))
        q_values = self.layer3(hidden)
        return q_values

class DQNAgent:
    """Deep Q-Network agent for a graph-navigation environment.

    States and actions are both integer node labels: the networks receive the
    current node label as a single float feature and output one value per
    node. Only nodes returned by ``env.G.neighbors(current_node)`` are treated
    as valid actions.

    The environment must provide what this class calls:
    ``G.neighbors(node)``, ``get_n_states()``, ``reset() -> state`` and
    ``step(action) -> (new_state, reward, done)``.

    Args:
        env: Environment object (see above).
        learning_rate: Step size passed to ``torch.optim.Adam``.
            NOTE(review): the default of 0.3 is far above Adam's own default
            (1e-3); it is kept unchanged for backward compatibility.
        gamma: Discount factor used in the TD target.
        tau: Interpolation factor of the soft target-network update.
        max_epsilon: Exploration rate at the start of training.
        min_epsilon: Lower bound of the exploration rate.
        n_episodes: Number of training episodes run by ``train``.
        max_steps: Maximum number of environment steps per episode.
        batch_size: Number of transitions sampled per optimisation step.
        memory_capacity: Maximum number of transitions kept for replay.

    Attributes set by training:
        episode_rewards: Summed reward of each generated episode.
        policy: ``{state: best_neighbour}`` dict, available after ``train``.
    """

    def __init__(
        self,
        env,
        learning_rate=0.3,
        gamma=0.99,
        tau=0.05,
        max_epsilon=1,
        min_epsilon=0.1,
        n_episodes=1000,
        max_steps=1000,
        batch_size=64,
        memory_capacity=1000,
    ):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.n_episodes = n_episodes
        self.max_steps = max_steps
        self.batch_size = batch_size

        n_states = env.get_n_states()
        self.policy_net = DQN(1, n_states)
        self.target_net = DQN(1, n_states)
        # Both networks start from identical weights.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.policy_net.to(device)
        self.target_net.to(device)
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=learning_rate)
        self.memory = ReplayMemory(memory_capacity)

        # Initialised here so generate_episode()/update_epsilon() also work
        # when called before train().
        self.epsilon = max_epsilon
        self.episode_rewards = []

    def update_epsilon(self):
        """Linearly decay epsilon by one episode's share, never below min_epsilon."""
        decay = (self.max_epsilon - self.min_epsilon) / self.n_episodes
        self.epsilon = max(self.min_epsilon, self.epsilon - decay)

    def _neighbor_mask(self, nodes):
        """Return a (len(nodes), n_states) bool tensor; True marks valid actions.

        Row ``i`` is True at every column that is a neighbour of ``nodes[i]``.
        Neighbour labels outside ``range(n_states)`` are ignored.
        """
        n_states = self.env.get_n_states()
        rows, cols = [], []
        for row, node in enumerate(nodes):
            for neighbor in self.env.G.neighbors(node):
                if 0 <= neighbor < n_states:
                    rows.append(row)
                    cols.append(int(neighbor))
        mask = torch.zeros((len(nodes), n_states), device=device, dtype=torch.bool)
        if rows:  # an empty index list would be created as a float tensor
            row_index = torch.tensor(rows, device=device, dtype=torch.long)
            col_index = torch.tensor(cols, device=device, dtype=torch.long)
            mask[row_index, col_index] = True
        return mask

    def greedy_policy(self, state):
        """Greedy policy that returns the neighbour with the highest Q value.

        Args:
            state: Float tensor of shape (1,) holding the current node label.

        Returns:
            Long tensor of shape (1,) with the chosen node label. If the node
            has no neighbours every value is -inf and index 0 is returned.
        """
        valid = self._neighbor_mask([int(state.item())])[0]
        # Action selection never needs gradients.
        with torch.no_grad():
            actions_values = self.policy_net(state)
        # Make non-neighbours equal to -inf so they are not chosen.
        actions_values = actions_values.masked_fill(~valid, float("-inf"))
        return actions_values.argmax().view(1)

    def epsilon_greedy_policy(self, state, epsilon):
        """Epsilon greedy policy that returns a random neighbour with probability epsilon.

        Returns a long tensor of shape (1,), the same dtype as ``greedy_policy``.
        """
        if np.random.uniform(0, 1) < epsilon:
            neighbors = list(self.env.G.neighbors(int(state.item())))
            return torch.tensor([random.choice(neighbors)], device=device, dtype=torch.long)
        return self.greedy_policy(state)

    def optimize_model(self):
        """Run one gradient step on a random replay batch.

        Does nothing until at least ``batch_size`` transitions are stored.
        Stored transitions are rows ``[state, action, reward, next_state, done]``.
        """
        if len(self.memory) < self.batch_size:
            return

        transitions = torch.cat(self.memory.sample(self.batch_size)).view(self.batch_size, -1)
        state_batch = transitions[:, 0].view(-1, 1)
        action_batch = transitions[:, 1].view(-1, 1).long()
        reward_batch = transitions[:, 2].view(-1, 1)
        next_state_batch = transitions[:, 3].view(-1, 1)
        done_batch = transitions[:, 4].view(-1, 1)

        Q_s_a = self.policy_net(state_batch).gather(1, action_batch)

        with torch.no_grad():
            next_nodes = [int(node) for node in next_state_batch.view(-1).tolist()]
            valid = self._neighbor_mask(next_nodes)
            # Only neighbours are reachable, so only they may enter the max.
            next_values = self.target_net(next_state_batch).masked_fill(~valid, float("-inf"))
            Q_s_a_prime = next_values.max(dim=1)[0].view(-1, 1)
            # A node without neighbours has no future value (avoids -inf targets).
            has_action = valid.any(dim=1, keepdim=True)
            Q_s_a_prime = torch.where(has_action, Q_s_a_prime, torch.zeros_like(Q_s_a_prime))
            # Terminal transitions must not bootstrap from the next state.
            target = reward_batch + self.gamma * (1 - done_batch) * Q_s_a_prime

        loss = F.mse_loss(Q_s_a, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def _soft_update_target(self):
        """Move the target network a fraction ``tau`` towards the policy network."""
        target_net_state_dict = self.target_net.state_dict()
        policy_net_state_dict = self.policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = (
                policy_net_state_dict[key] * self.tau
                + target_net_state_dict[key] * (1 - self.tau)
            )
        self.target_net.load_state_dict(target_net_state_dict)

    def generate_episode(self, epsilon):
        """Play one episode with the epsilon-greedy policy, learning after every step.

        Appends the episode's summed reward to ``self.episode_rewards``.
        """
        state = torch.tensor([self.env.reset()], device=device, dtype=torch.float32)
        self.episode_rewards.append(0)

        for _ in range(self.max_steps):
            # Choose action and get reward
            action = self.epsilon_greedy_policy(state, epsilon)
            new_state, reward, done = self.env.step(action.item())
            self.episode_rewards[-1] += reward

            reward = torch.tensor([reward], device=device, dtype=torch.float32)
            new_state = torch.tensor([new_state], device=device, dtype=torch.float32)
            terminal = torch.tensor([float(done)], device=device, dtype=torch.float32)
            # Everything is stored as float32; the action is cast back to long
            # in optimize_model().
            self.memory.push(
                torch.stack([state, action.to(torch.float32), reward, new_state, terminal])
            )
            state = new_state

            self.optimize_model()
            self._soft_update_target()

            if done:
                break

    def train(self):
        """Train for ``n_episodes`` episodes, then extract the greedy policy.

        Resets ``epsilon`` and ``episode_rewards``, so repeated calls start a
        fresh exploration schedule (network weights and replay memory are kept).
        Afterwards ``self.policy`` maps every state to its best-valued neighbour.
        """
        self.epsilon = self.max_epsilon
        self.episode_rewards = []
        for _ in tqdm(range(self.n_episodes)):
            self.generate_episode(self.epsilon)
            self.update_epsilon()

        n_states = self.env.get_n_states()
        states_values = torch.arange(n_states, device=device, dtype=torch.float32).view(-1, 1)
        with torch.no_grad():
            # Restrict every state's choice to its neighbours, like greedy_policy().
            valid = self._neighbor_mask(list(range(n_states)))
            q_values = self.policy_net(states_values).masked_fill(~valid, float("-inf"))
            best_actions = q_values.max(dim=1)[1]
        best_actions = best_actions.cpu().numpy()
        self.policy = {state: action for state, action in enumerate(best_actions)}

In [18]:
# Build the navigation environment on graph G, from node `source` to node `target`.
# NOTE(review): `Environment` is presumably provided by `from enviroment import *`;
# the meaning of the "weighted" option is defined there -- confirm.
env = Environment(G, source, target, "weighted")
# Agent with default hyper-parameters except for a 500-episode training budget.
agent = DQNAgent(env, n_episodes = 500)

: 

In [15]:
# Run the training loop (shows a tqdm progress bar over the episodes).
agent.train()

  5%|▌         | 26/500 [01:42<31:00,  3.92s/it]  


KeyboardInterrupt: 