# Implementação de uma Deep Q-Network

## Importações

In [None]:
import sys
import gym
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple
from PIL import Image
from itertools import count

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

Abaixo vamos importar a classe ReplayBuffer, muito importante para DQNs, cuja implementação você pode ver no nosso repositório em Aprendizado por Reforço Profundo -> Deep Q-Learning -> Experience Replay -> ExperienceReplay.ipynb.

In [None]:
# Needed to import code from a file that lives in a different directory:
# the relative folder is put on the module search path before the import.
sys.path.insert(1, '../Experience Replay')
from ReplayBuffer import ReplayBuffer

## Rede Neural

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class LinearNetwork(nn.Module):
    """
    Fully connected network: two hidden layers of 64 units with ReLU
    activations, followed by a linear output layer.

    in_dim: int
    Number of input features.

    out_dim: int
    Number of output units.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()

        hidden_units = 64
        stack = [
            nn.Linear(in_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, out_dim),
        ]
        # Stored under the attribute name ``layers`` so the parameter names
        # (``layers.0.weight``, ``layers.2.weight``, ...) stay the same.
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the output."""
        output = self.layers(x)
        return output

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ConvNetwork(nn.Module):
    """
    Convolutional network made of three Conv2d + BatchNorm2d + ReLU stages
    followed by a single linear layer.
    """

    def __init__(self, in_channels, h, w, outputs):
        """
        Build the convolutional network.

        in_channels: int
        Number of channels of the input image.

        h: int
        Height of the input image.

        w: int
        Width of the input image.

        outputs: int
        Number of units produced by the final linear layer.
        """
        # Zero-argument super(): the previous call named a class (`DQN`) that
        # does not exist in this file, so building the network raised NameError.
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        # Spatial size left after one unpadded convolution; applied three
        # times (once per conv layer) to size the input of the linear layer.
        def conv2d_size_out(size, kernel_size=5, stride=2):
            return (size - (kernel_size - 1) - 1) // stride + 1

        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        # 32 is the number of channels produced by the last convolution.
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, outputs)

    def forward(self, x):
        """
        Run the forward pass through the network.

        The three conv stages are applied in order, then every sample is
        flattened (the first dimension is kept) and fed to the linear head.
        """
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))

## Agente DQN

In [None]:
class DQNagent:
    """
    DQN agent that uses a ReplayBuffer as its memory.
    """
    def __init__(self,
                 observation_space,
                 action_space,
                 lr=3e-4,
                 gamma=0.99,
                 max_memory=100000,
                 epsilon_init=0.5,
                 epsilon_decay=0.9995,
                 min_epsilon=0.01,
                 network='linear'):
        """
        Initialize the agent with the given parameters.

        Parameters
        ----------
        observation_space
            Space whose ``shape`` sizes the replay buffer and the network.
        action_space
            Space providing ``n`` (number of actions) and ``sample()``.
        lr : float
            Learning rate of the Adam optimizer.
        gamma : float
            Discount factor used in the training target.
        max_memory : int
            First argument given to ``ReplayBuffer`` (presumably its
            capacity -- confirm against the ReplayBuffer implementation).
        epsilon_init : float
            Initial probability of picking a random action.
        epsilon_decay : float
            Factor epsilon is multiplied by on every ``act`` call.
        min_epsilon : float
            Lower bound for epsilon.
        network : str
            ``'linear'`` builds a LinearNetwork, ``'conv'`` a ConvNetwork.

        Raises
        ------
        ValueError
            If ``network`` is neither ``'linear'`` nor ``'conv'``.
        """
        # Fail early with a clear message; otherwise an unknown name would
        # only surface later as a missing ``self.dqn`` attribute.
        if network not in ('linear', 'conv'):
            raise ValueError(
                f"network must be 'linear' or 'conv', got {network!r}"
            )

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.gamma = gamma
        self.memory = ReplayBuffer(max_memory, observation_space.shape[0])
        self.action_space = action_space

        self.epsilon = epsilon_init
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon

        if network == 'linear':
            self.dqn = LinearNetwork(observation_space.shape[0], action_space.n)
        else:
            # The observation shape is read as (height, width, channels).
            # NOTE(review): the replay buffer above is still sized with
            # shape[0] only, and act/train feed states to the network as-is;
            # confirm the data layout before relying on the conv variant.
            h = observation_space.shape[0]
            w = observation_space.shape[1]
            in_channels = observation_space.shape[2]
            outputs = action_space.n
            self.dqn = ConvNetwork(in_channels, h, w, outputs)

        # Both network kinds must live on the same device as the input tensors.
        self.dqn = self.dqn.to(self.device)

        self.optimizer = optim.Adam(self.dqn.parameters(), lr=lr)

    def act(self, state):
        """
        Pick an action for ``state`` with an epsilon-greedy policy.

        Epsilon is decayed (and clamped at ``min_epsilon``) on every call.
        With probability epsilon the action comes from
        ``action_space.sample()``; otherwise it is the argmax over the last
        dimension of the network output, returned as a NumPy array.
        """
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.min_epsilon)

        if np.random.random() < self.epsilon:
            return self.action_space.sample()

        with torch.no_grad():
            state = torch.as_tensor(state, dtype=torch.float32, device=self.device)
            action = self.dqn(state).argmax(dim=-1)

        return action.cpu().numpy()

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.update(state, action, reward, new_state, done)

    def train(self, batch_size=128, epochs=1):
        """
        Run ``epochs`` gradient steps on batches sampled from the memory.

        Nothing happens until the memory holds at least ``10 * batch_size``
        experiences.

        Parameters
        ----------
        batch_size : int
            Number of experiences sampled per gradient step.
        epochs : int
            Number of gradient steps to perform.
        """
        # Warm-up: wait until the buffer holds at least ten batches' worth
        # of experiences before starting to train.
        if batch_size * 10 > self.memory.size:
            return

        for _ in range(epochs):
            # Sample a batch of stored experiences
            (states, actions, rewards, next_states, dones) = self.memory.sample(batch_size)

            # Convert the batch to tensors. Floating inputs are forced to
            # float32 so they match the network weights (NumPy defaults to
            # float64), and ``dones`` becomes float because ``1 - dones`` is
            # not supported for boolean tensors.
            # Assumes actions/rewards/dones come back as 1-D arrays of
            # length batch_size -- TODO confirm against ReplayBuffer.sample.
            states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
            actions = torch.as_tensor(actions, device=self.device).unsqueeze(-1)
            rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device).unsqueeze(-1)
            next_states = torch.as_tensor(next_states, dtype=torch.float32, device=self.device)
            dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device).unsqueeze(-1)

            # Q-value of the action that was actually taken
            q = self.dqn(states).gather(-1, actions.long())

            with torch.no_grad():
                # Highest Q-value in the next state; its contribution is
                # zeroed when the transition ended the episode.
                q2 = self.dqn(next_states).max(dim=-1, keepdim=True)[0]
                target = rewards + (1 - dones) * self.gamma * q2

            loss = F.mse_loss(q, target)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

### Definição de parâmetros

In [None]:
# Id of the gym environment, and the environment instance built from it
env_name = 'CartPole-v0'
env = gym.make(env_name)

In [None]:
# Hyperparameters; most of them are handed to DQNagent when the agent is built
# NOTE(review): BATCH_SIZE is not passed to anything in the visible code;
# DQNagent.train() is called without arguments and uses its own default.
BATCH_SIZE = 128
GAMMA = 0.99          # passed as gamma (discount factor in the training target)
EPS_INIT = 0.7        # passed as epsilon_init (starting epsilon)
EPS_END = 0.05        # passed as min_epsilon (lower bound for epsilon)
EPS_DECAY = 0.999     # passed as epsilon_decay (epsilon is multiplied by it on each act call)
MAX_MEMORY = 100000   # passed as max_memory (forwarded to ReplayBuffer)
NETWORK = 'linear'    # which network the agent builds: 'linear' or 'conv'
OBS_SPACE = env.observation_space
ACT_SPACE = env.action_space

In [None]:
# Bare expression: as the last line of a notebook cell it displays both spaces
OBS_SPACE, ACT_SPACE

### Criando a DQN

In [None]:
# Build the agent from the hyperparameters defined above.
# Despite its name, `dqn_net` holds the DQNagent, not the network itself.
dqn_net = DQNagent(observation_space=OBS_SPACE, 
                   action_space=ACT_SPACE, 
                   lr=3e-4, 
                   gamma=GAMMA, 
                   max_memory=MAX_MEMORY,
                   epsilon_init=EPS_INIT,
                   epsilon_decay=EPS_DECAY,
                   min_epsilon=EPS_END,
                   network=NETWORK)

## Loop

In [None]:
def train(agent, env, timesteps, render=False):
    """
    Train an agent on a gym environment for a fixed number of timesteps.

    Parameters
    ----------
    agent
        Object exposing ``act(state)``,
        ``remember(state, action, reward, next_state, done)`` and ``train()``.
    env
        Environment exposing ``reset()``, ``step(action)`` returning
        ``(next_state, reward, done, info)``, and ``render()``.
    timesteps : int
        Total number of environment steps to run (not episodes).
    render : bool
        When True, ``env.render()`` is called after every step.

    Returns
    -------
    None
    """
    # Reset the environment and keep the initial state
    state = env.reset()

    # Run the requested number of timesteps
    for _ in range(timesteps):
        # Ask the agent for an action given the current state
        action = agent.act(state)

        # Apply the chosen action
        next_state, reward, done, _info = env.step(action)

        # Store the transition produced by the action
        agent.remember(state, action, reward, next_state, done)

        # Train the network from the replay memory
        agent.train()

        if render:
            # Show the environment
            env.render()

        # Start a new episode once the current one has ended; stepping a
        # finished environment would feed invalid transitions to the agent.
        state = env.reset() if done else next_state

In [None]:
# Number of environment steps to run, then start training without rendering
timesteps = 10000
train(dqn_net, env, timesteps, render=False)