# DQN

In [1]:
import pandas as pd
import numpy as np
import random
import networkx as nx
import osmnx as ox
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from collections import deque
from tqdm import tqdm
from time import time
import torch
import torch.nn as nn
import torch.nn.functional as F 

import os
import sys
sys.path.append("../scripts")

from plotting import *
from enviroment import *

In [2]:
seeds = [
    960703545, 1277478588, 1936856304, 186872697, 1859168769, 1598189534, 1822174485, 1871883252, 694388766,
    188312339, 773370613, 2125204119, 2041095833, 1384311643, 1000004583, 358485174, 1695858027, 762772169,
    437720306, 939612284
]
G = ox.graph_from_address('Campinas, São Paulo', network_type='drive')
G = nx.convert_node_labels_to_integers(G)
source = 507
target = 235

In [3]:
device = "cpu" #"cuda" if torch.cuda.is_available() else "cpu"

In [136]:
class ReplayMemory():

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, values):
        """Save a transition"""
        self.memory.append(values)

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
    
class DQN(nn.Module):
    def __init__(self, states_dim, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(states_dim, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)

class DQNAgent:
    def __init__(self, env, learning_rate = 0.3, gamma = 0.99, tau = 0.05, max_epsilon = 1, min_epsilon = 0.1, n_episodes = 1000, max_steps = 1000, batch_size = 64):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.n_episodes = n_episodes
        self.max_steps = max_steps
        self.batch_size = batch_size
        self.n_states = env.get_n_states()
        self.policy_net = DQN(self.n_states + 2, self.n_states)
        self.target_net = DQN(self.n_states + 2, self.n_states)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        #self.policy_net.to(device)
        #self.target_net.to(device)
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=learning_rate)
        self.memory = ReplayMemory(1000)

    def get_features(self, state):
        """Return a torch tensor with the features of the state"""
        features = torch.zeros(self.n_states + 2, dtype=torch.float32)#, device=device)
        features[state] = 1
        features[self.n_states] = self.env.G.nodes[state]["x"]
        features[self.n_states + 1] = self.env.G.nodes[state]["y"]
        return features
    

    def update_epsilon(self):
        self.epsilon -= (self.max_epsilon - self.min_epsilon) / self.n_episodes
        

    def greedy_policy(self, state):
        """Greedy policy that returns the action with the highest Q value"""
        neighbors = list(self.env.G.neighbors(state))
        # transform neighbors to boolean array
        neighbors = [True if neighbor in neighbors else False for neighbor in range(self.n_states)]
        actions_values = self.policy_net(self.get_features(state))
        # make non-neighbors equal to -inf so they are not chosen
        neighbors_boolean = torch.tensor(neighbors, dtype=torch.bool) #, device=device)
        actions_values[~neighbors_boolean] = -float("Inf")
        return actions_values.argmax().view(1).item()
    

    def epsilon_greedy_policy(self, state, epsilon):
        """Epsilon greedy policy that returns a random action with probability epsilon"""
        if np.random.uniform(0, 1) < epsilon:
            neighbors = list(self.env.G.neighbors(state))
            return random.choice(neighbors)
        else:
            return self.greedy_policy(state)
    

    def train(self):
        self.epsilon = self.max_epsilon
        self.episode_rewards = []
        for episode in tqdm(range(self.n_episodes)):
            self.generate_episode(self.epsilon)
            self.update_epsilon()
        
        with torch.no_grad():
            self.get_policy()

    
    def generate_episode(self, epsilon):
        state = self.env.reset()
        state_features = self.get_features(state)
        self.episode_rewards.append(0)

        for step in range(self.max_steps):
            # Choose action and get reward
            action = self.epsilon_greedy_policy(state, epsilon)
            new_state, reward, done = self.env.step(action)
            self.episode_rewards[-1] += reward
            new_state_features = self.get_features(new_state)

            self.memory.push(torch.cat([state_features, new_state_features, torch.tensor([action, reward], dtype=torch.float32)]))
            state = new_state

            self.optimize_model()

            # soft update of weights
            target_net_state_dict = self.target_net.state_dict()
            policy_net_state_dict = self.policy_net.state_dict()
            for key in policy_net_state_dict:
                target_net_state_dict[key] = policy_net_state_dict[key]*self.tau + target_net_state_dict[key]*(1-self.tau)
            self.target_net.load_state_dict(target_net_state_dict)

            if done:
                break

    
    def optimize_model(self):
        if len(self.memory) < self.batch_size:
            return
        
        transitions = torch.stack(self.memory.sample(self.batch_size))
        
        state_batch = transitions[:, :self.n_states + 2].view(-1, self.n_states + 2)
        new_state_batch = transitions[:, (self.n_states+2):2*(self.n_states+2)].view(-1, self.n_states + 2)
        action_batch = transitions[:, -2].view(-1, 1).long()
        reward_batch = transitions[:, -1].view(-1, 1)
        
        Q_s_a = self.policy_net(state_batch).gather(1, action_batch)
        Q_s_a_prime = self.target_net(new_state_batch).max(dim=1)[0].view(-1, 1).detach()
        
        
        loss = F.mse_loss(Q_s_a, reward_batch + self.gamma * Q_s_a_prime)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def get_policy(self):
        states_features = [self.get_features(state) for state in range(self.env.get_n_states())]
        states_features = torch.stack(states_features)
        states_values = self.policy_net(states_features).cpu().numpy()
        self.policy = states_values.argmax(axis=1)
        self.policy = {state: action for state, action in enumerate(self.policy)}

In [146]:
env = Environment(G, source, target, "weighted")
agent = DQNAgent(env, n_episodes = 100)

In [147]:
agent.train()

  8%|▊         | 8/100 [01:14<14:20,  9.35s/it]


KeyboardInterrupt: 

In [None]:
agent.get_features(0)

tensor([  1.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.00

In [None]:
agent.greedy_policy(0)

418

In [None]:
agent.epsilon_greedy_policy(0, 1)

418

In [None]:
agent.memory.sample(5)[0]

tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -2.2910e+01,
         5.7200e+02, -9.2886e-02])

In [None]:
agent.memory.sample(5)[0].shape

torch.Size([1216])

In [None]:
agent.memory.sample(5)[0].dtype

torch.float32

In [None]:
agent.policy

{0: 592,
 1: 592,
 2: 592,
 3: 592,
 4: 592,
 5: 592,
 6: 592,
 7: 592,
 8: 592,
 9: 592,
 10: 592,
 11: 592,
 12: 592,
 13: 592,
 14: 592,
 15: 592,
 16: 592,
 17: 592,
 18: 592,
 19: 592,
 20: 592,
 21: 592,
 22: 592,
 23: 592,
 24: 592,
 25: 592,
 26: 592,
 27: 592,
 28: 592,
 29: 592,
 30: 592,
 31: 592,
 32: 592,
 33: 592,
 34: 592,
 35: 592,
 36: 592,
 37: 592,
 38: 592,
 39: 592,
 40: 592,
 41: 592,
 42: 592,
 43: 592,
 44: 592,
 45: 592,
 46: 592,
 47: 592,
 48: 592,
 49: 592,
 50: 592,
 51: 592,
 52: 592,
 53: 592,
 54: 592,
 55: 592,
 56: 592,
 57: 592,
 58: 592,
 59: 592,
 60: 592,
 61: 592,
 62: 592,
 63: 592,
 64: 592,
 65: 592,
 66: 592,
 67: 592,
 68: 592,
 69: 592,
 70: 592,
 71: 592,
 72: 592,
 73: 592,
 74: 592,
 75: 592,
 76: 592,
 77: 592,
 78: 592,
 79: 592,
 80: 592,
 81: 592,
 82: 592,
 83: 592,
 84: 592,
 85: 592,
 86: 592,
 87: 592,
 88: 592,
 89: 592,
 90: 592,
 91: 592,
 92: 592,
 93: 592,
 94: 592,
 95: 592,
 96: 592,
 97: 592,
 98: 592,
 99: 592,
 100: 592,