In [6]:
from collections import namedtuple
from collections import deque

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

import numpy as np
import numpy.random as nr
from numpy.random import binomial
from numpy.random import choice

import gym
import datetime
from typing import Tuple


Transitions = namedtuple('Transitions', ['obs', 'action', 'reward', 'next_obs', 'done'])

class ReplayBuffer:
    def __init__(self, config):
        replay_buffer_size = config['replay_buffer_size']
        seed = config['seed']
        nr.seed(seed)

        self.replay_buffer_size = replay_buffer_size
        self.obs = deque([], maxlen=self.replay_buffer_size)
        self.action = deque([], maxlen=self.replay_buffer_size)
        self.reward = deque([], maxlen=self.replay_buffer_size)
        self.next_obs = deque([], maxlen=self.replay_buffer_size)
        self.done = deque([], maxlen=self.replay_buffer_size)

    def append_memory(self,
                      obs,
                      action,
                      reward,
                      next_obs,
                      done: bool):
        self.obs.append(obs)
        self.action.append(action)
        self.reward.append(reward)
        self.next_obs.append(next_obs)
        self.done.append(done)

    def sample(self, batch_size):
        buffer_size = len(self.obs)

        idx = nr.choice(buffer_size,
                        size=min(buffer_size, batch_size),
                        replace=False)
        t = Transitions
        t.obs = torch.stack(list(map(self.obs.__getitem__, idx)))
        t.action = torch.stack(list(map(self.action.__getitem__, idx)))
        t.reward = torch.stack(list(map(self.reward.__getitem__, idx)))
        t.next_obs = torch.stack(list(map(self.next_obs.__getitem__, idx)))
        t.done = torch.tensor(list(map(self.done.__getitem__, idx)))[:, None]
        return t

    def clear(self):
        self.obs = deque([], maxlen=self.replay_buffer_size)
        self.action = deque([], maxlen=self.replay_buffer_size)
        self.reward = deque([], maxlen=self.replay_buffer_size)
        self.next_obs = deque([], maxlen=self.replay_buffer_size)
        self.done = deque([], maxlen=self.replay_buffer_size)

In [7]:
Tensor = torch.DoubleTensor
torch.set_default_tensor_type(Tensor)


class DQN:
    def __init__(self, config):

        torch.manual_seed(config['seed'])

        self.lr = config['lr']  # learning rate
        self.C = config['C']  # copy steps
        self.eps_len = config['eps_len']  # length of epsilon greedy exploration
        self.eps_max = config['eps_max']
        self.eps_min = config['eps_min']
        self.discount = config['discount']  # discount factor
        self.batch_size = config['batch_size']  # mini batch size

        self.dims_hidden_neurons = config['dims_hidden_neurons']
        self.dim_obs = config['dim_obs']
        self.dim_action = config['dim_action']

        self.Q = QNetwork(dim_obs=self.dim_obs,
                          dim_action=self.dim_action,
                          dims_hidden_neurons=self.dims_hidden_neurons)
        self.Q_tar = QNetwork(dim_obs=self.dim_obs,
                              dim_action=self.dim_action,
                              dims_hidden_neurons=self.dims_hidden_neurons)

        self.optimizer_Q = torch.optim.Adam(self.Q.parameters(), lr=self.lr)
        self.training_step = 0

    def update(self, buffer):
        t = buffer.sample(self.batch_size)

        s = t.obs
        a = t.action
        r = t.reward
        sp = t.next_obs
        done = t.done

        self.training_step += 1

        # the actual size of the batch in curent step
        b_size = len(s)
        # the list to save bacth Q and target_Q
        Q_values = torch.empty((b_size, 1))
        target_values = torch.empty((b_size, 1))

        for i in range(b_size):
            actions_values = self.Q.forward(s[i])
            action = a[i][0]
            # compute target values considering the terminal condition
            if done[i][0] == True:
                with torch.no_grad():
                    target_values[i] = r[i][0]
            else:
                next_actions_values = self.Q_tar.forward(sp[i])
                max_action_value = max(next_actions_values)
                with torch.no_grad():
                    target_values[i] = r[i][0] + self.discount * max_action_value
            # assign Q value from the batch
            Q_values[i] = actions_values[action]

        # compute the MSE loss
        loss = nn.functional.mse_loss(Q_values, target_values)
        self.optimizer_Q.zero_grad()
        loss.backward()
        self.optimizer_Q.step()
        
        # for update target Q-network every C update steps
        if self.training_step % self.C == 0:
            update_param = self.Q.state_dict()
            self.Q_tar.load_state_dict(update_param)


    def act_operate(self, observation: torch.Tensor):
        # epsilon greedy:
        first_term = self.eps_max * (self.eps_len - self.training_step) / self.eps_len
        eps = max(first_term, self.eps_min)

        explore = binomial(1, eps)

        if explore == 1:
            a = choice(self.dim_action)
        else:
            self.Q.eval()
            Q = self.Q(observation)
            val, a = torch.max(Q, axis=1)
            a = a.item()
            self.Q.train()
        return a


class QNetwork(nn.Module):
    def __init__(self,
                 dim_obs: int,
                 dim_action: int,
                 dims_hidden_neurons: Tuple[int] = (64, 64)):
        if not isinstance(dim_obs, int):
            TypeError('dimension of observation must be int')
        if not isinstance(dim_action, int):
            TypeError('dimension of action must be int')
        if not isinstance(dims_hidden_neurons, tuple):
            TypeError('dimensions of hidden neurons must be tuple of int')

        super(QNetwork, self).__init__()
        self.num_layers = len(dims_hidden_neurons)
        self.dim_action = dim_action

        n_neurons = (dim_obs, ) + dims_hidden_neurons + (dim_action, )
        for i, (dim_in, dim_out) in enumerate(zip(n_neurons[:-2], n_neurons[1:-1])):
            layer = nn.Linear(dim_in, dim_out).double()
            torch.nn.init.xavier_uniform_(layer.weight)
            torch.nn.init.zeros_(layer.bias)
            exec('self.layer{} = layer'.format(i + 1))

        self.output = nn.Linear(n_neurons[-2], n_neurons[-1]).double()
        torch.nn.init.xavier_uniform_(self.output.weight)
        torch.nn.init.zeros_(self.output.bias)

    def forward(self, observation: torch.Tensor):
        x = observation
        for i in range(self.num_layers):
            x = eval('torch.tanh(self.layer{}(x))'.format(i + 1))
        return self.output(x)


In [None]:
############################################################
####     Training for the 'Chart-Pole' Environment     #####
############################################################

Tensor = torch.DoubleTensor
torch.set_default_tensor_type(Tensor)

env = gym.make('CartPole-v1')

config = {
    'dim_obs': 4,  # Q network input
    'dim_action': 2,  # Q network output
    'dims_hidden_neurons': (64, 64),  # Q network hidden
    'lr': 0.0003,  # learning rate, default = 0.0005
    'C': 60,  # copy steps
    'discount': 0.99,  # discount factor
    'batch_size': 64,
    'replay_buffer_size': 100000,
    'eps_min': 0.01,
    'eps_max': 1.0,
    'eps_len': 4000,
    'seed': 1,
}

dqn = DQN(config)
buffer = ReplayBuffer(config)
train_writer = SummaryWriter(log_dir='tensorboard/dqn_{date:%Y-%m-%d_%H:%M:%S}'.format(
                             date=datetime.datetime.now()))

steps = 0  # total number of steps
for i_episode in range(500):
    observation = env.reset()
    done = False
    t = 0  # time steps within each episode
    ret = 0.  # episodic return
    while done is False:
        env.render()  # render to screen

        obs = torch.tensor(env.state)  # observe the environment state

        action = dqn.act_operate(obs[None, :])  # take action

        next_obs, reward, done, info = env.step(action)  # environment advance to next step

        buffer.append_memory(obs=obs,  # put the transition to memory
                             action=torch.from_numpy(np.array([action])),
                             reward=torch.from_numpy(np.array([reward])),
                             next_obs=torch.from_numpy(next_obs),
                             done=done)

        dqn.update(buffer)  # agent learn

        t += 1
        steps += 1
        ret += reward  # update episodic return
        if done:
            print("Episode {} finished after {} timesteps".format(i_episode, t+1))
        train_writer.add_scalar('Performance/episodic_return', ret, i_episode)  # plot

env.close()
train_writer.close()


In [None]:
############################################################
####    Training for the 'MountainCar' Environment     #####
############################################################

Tensor = torch.DoubleTensor
torch.set_default_tensor_type(Tensor)

env = gym.make('MountainCar-v0')

config = {
    'dim_obs': 2,  # Q network input
    'dim_action': 3,  # Q network output
    'dims_hidden_neurons': (256, 256),  # Q network hidden
    'lr': 0.005,  # learning rate, default = 0.0005
    'C': 60,  # copy steps
    'discount': 0.99,  # discount factor
    'batch_size': 64,
    'replay_buffer_size': 100000,
    'eps_min': 0.01,
    'eps_max': 1.0,
    'eps_len': 4000,
    'seed': 1,
}

dqn = DQN(config)
buffer = ReplayBuffer(config)
train_writer = SummaryWriter(log_dir='tensorboard/dqn_{date:%Y-%m-%d_%H:%M:%S}'.format(
                             date=datetime.datetime.now()))

steps = 0  # total number of steps
for i_episode in range(500):
    observation = env.reset()
    done = False
    t = 0  # time steps within each episode
    ret = 0.  # episodic return
    while done is False:
        env.render()  # render to screen

        obs = torch.tensor(env.state)  # observe the environment state

        action = dqn.act_operate(obs[None, :])  # take action

        next_obs, reward, done, info = env.step(action)  # environment advance to next step

        buffer.append_memory(obs=obs,  # put the transition to memory
                             action=torch.from_numpy(np.array([action])),
                             reward=torch.from_numpy(np.array([reward])),
                             next_obs=torch.from_numpy(next_obs),
                             done=done)

        dqn.update(buffer)  # agent learn

        t += 1
        steps += 1
        ret += reward  # update episodic return
        if done:
            print("Episode {} finished after {} timesteps".format(i_episode, t+1))
        train_writer.add_scalar('Performance/episodic_return', ret, i_episode)  # plot

env.close()
train_writer.close()


Episode 0 finished after 201 timesteps
Episode 1 finished after 201 timesteps
Episode 2 finished after 201 timesteps
Episode 3 finished after 201 timesteps
Episode 4 finished after 201 timesteps
Episode 5 finished after 201 timesteps
Episode 6 finished after 201 timesteps
Episode 7 finished after 201 timesteps
Episode 8 finished after 201 timesteps
Episode 9 finished after 201 timesteps
Episode 10 finished after 201 timesteps
Episode 11 finished after 201 timesteps
Episode 12 finished after 201 timesteps
Episode 13 finished after 201 timesteps
Episode 14 finished after 201 timesteps
Episode 15 finished after 201 timesteps
Episode 16 finished after 201 timesteps
Episode 17 finished after 201 timesteps
Episode 18 finished after 201 timesteps
Episode 19 finished after 201 timesteps
Episode 20 finished after 201 timesteps
Episode 21 finished after 201 timesteps
Episode 22 finished after 201 timesteps
Episode 23 finished after 201 timesteps
Episode 24 finished after 201 timesteps
Episode 25

Episode 203 finished after 166 timesteps
Episode 204 finished after 146 timesteps
Episode 205 finished after 170 timesteps
Episode 206 finished after 169 timesteps
Episode 207 finished after 201 timesteps
Episode 208 finished after 150 timesteps
Episode 209 finished after 161 timesteps
Episode 210 finished after 168 timesteps
Episode 211 finished after 194 timesteps
Episode 212 finished after 201 timesteps
Episode 213 finished after 201 timesteps
Episode 214 finished after 201 timesteps
Episode 215 finished after 201 timesteps
Episode 216 finished after 201 timesteps
Episode 217 finished after 117 timesteps
Episode 218 finished after 196 timesteps
Episode 219 finished after 201 timesteps
Episode 220 finished after 141 timesteps
Episode 221 finished after 131 timesteps
Episode 222 finished after 116 timesteps
Episode 223 finished after 192 timesteps
Episode 224 finished after 185 timesteps
Episode 225 finished after 171 timesteps
Episode 226 finished after 132 timesteps
Episode 227 fini

Episode 403 finished after 117 timesteps
Episode 404 finished after 120 timesteps
Episode 405 finished after 131 timesteps
Episode 406 finished after 201 timesteps
Episode 407 finished after 201 timesteps
Episode 408 finished after 176 timesteps
Episode 409 finished after 160 timesteps
Episode 410 finished after 156 timesteps
Episode 411 finished after 156 timesteps
Episode 412 finished after 151 timesteps
Episode 413 finished after 152 timesteps
Episode 414 finished after 161 timesteps
Episode 415 finished after 131 timesteps
Episode 416 finished after 97 timesteps
Episode 417 finished after 169 timesteps
Episode 418 finished after 201 timesteps
Episode 419 finished after 191 timesteps
Episode 420 finished after 201 timesteps
Episode 421 finished after 188 timesteps
Episode 422 finished after 189 timesteps
Episode 423 finished after 129 timesteps
Episode 424 finished after 124 timesteps
Episode 425 finished after 201 timesteps
Episode 426 finished after 201 timesteps
Episode 427 finis