In [54]:
import random
import numpy as np
from itertools import product

In [55]:
class Datagram:
    """Packet carried through the simulated network.

    Holds two integer addresses: `source` (where the packet was created;
    other code in this file passes -1 for seeded packets) and `destination`
    (the router the packet is meant to reach).
    """

    def __init__(self, source: int, destination: int):
        self.source, self.destination = source, destination

In [56]:
class Queue:
    """Bounded first-in, first-out buffer of datagrams."""

    def __init__(self, size: int):
        self.queue = []      # stored items, oldest first
        self.size = size     # maximum number of items accepted
        self.length = 0      # number of items currently stored

    def is_empty(self):
        """Return True when nothing is stored."""
        return self.length == 0

    def enqueue(self, datagram: Datagram):
        """Append `datagram` and return True, or return False if the buffer is full."""
        if self.length >= self.size:
            return False
        self.queue.append(datagram)
        self.length += 1
        return True

    def dequeue(self):
        """Remove and return the oldest item, or return None if the buffer is empty."""
        if self.is_empty():
            return None
        self.length -= 1
        return self.queue.pop(0)

In [57]:
class Event:
    """A datagram in transit over a link.

    `delay` is the number of scheduler steps left before delivery,
    `destination` is the address of the router that will receive it, and
    `datagram` is the packet being carried.
    """

    def __init__(self, delay: int, destination: int, datagram: Datagram):
        self.delay, self.destination = delay, destination
        self.datagram = datagram

In [58]:
class Router:  # router
    """Network node that buffers datagrams in a bounded queue and forwards them.

    `adjacents` maps a neighbour address to an indexable pair whose second
    item is the delay of the link to that neighbour (read by `process`).
    """

    def __init__(self, address: int, size: int):
        self.address = address
        self.adjacents = {}
        self.size = size
        self.queue = Queue(size)

    def start(self, destinations):
        """Replace the queue with one datagram (source -1) per entry of `destinations`.

        Raises Exception when `destinations` holds more entries than the
        queue capacity.
        """
        self.queue = Queue(self.size)
        for target in destinations:
            accepted = self.queue.enqueue(Datagram(-1, target))
            if not accepted:
                raise Exception(f"start do roteador {self.address} excedeu capacidade máxima")

    def receive(self, datagram: Datagram):
        """Accept an incoming datagram and return the resulting reward.

        Returns 10 when the datagram is addressed to this router (it is
        consumed, not queued), -100 when it must be forwarded but the queue
        is full (it is dropped), and 0 when it was queued for forwarding.
        """
        if datagram.destination == self.address:
            return 10
        if self.queue.length == self.size:
            return -100
        self.queue.enqueue(datagram)
        return 0

    def process(self, action: int):
        """Forward the oldest queued datagram to the neighbour `action`.

        Returns an Event carrying the datagram with the delay of the chosen
        link, or None when the queue is empty or the dequeued datagram was
        addressed to this router. `action` is only looked up in `adjacents`
        when there is a datagram to forward.
        """
        if self.queue.is_empty():
            return None
        outgoing = self.queue.dequeue()
        if outgoing.destination == self.address:
            return None
        link_delay = self.adjacents[action][1]
        return Event(link_delay, action, outgoing)

In [6]:
# Manual check of Router: seeding, receiving, forwarding and overflow.
def show_queue(router):
    """Print the destination of each datagram queued in `router`, oldest first."""
    for queued in router.queue.queue:
        print(queued.destination)


def rule():
    """Print the separator line used between the sections of this check."""
    print("----------------")


r0 = Router(0, 10)
r1 = Router(1, 2)
r0.start([1, 2, 1, 1])
show_queue(r0)

rule()

# Destination 3 is not r0's address, so the datagram is queued.
print(f'reward = {r0.receive(Datagram(-1, 3))}')
show_queue(r0)

rule()

# Destination 0 is r0's own address, so the queue is left untouched.
print(f'reward = {r0.receive(Datagram(-1, 0))}')
show_queue(r0)

rule()

# Forward the oldest datagram over a hand-made link to address 4 (delay 8).
r0.adjacents[4] = [r1, 8]
event = r0.process(4)
for field in (event.delay, event.destination, event.datagram.destination):
    print(field)

rule()

show_queue(r0)

rule()

# r1 has capacity 2, so the third datagram to be forwarded does not fit.
for final_destination in (0, 5, 7):
    print(f'reward = {r1.receive(Datagram(-1, final_destination))}')
show_queue(r1)

1
2
1
1
----------------
reward = 0
1
2
1
1
3
----------------
reward = 10
1
2
1
1
3
----------------
8
4
1
----------------
2
1
1
3
----------------
reward = 0
reward = 0
reward = -100
0
5


In [59]:
class Network:
    """Set of routers joined by bidirectional links that each carry a delay."""

    def __init__(self, size: int, gen_prob: float, max_data: int, generate: bool=True) -> None:
        """
        @param size: maximum number of routers the network can hold
        @param gen_prob: probability that one generation attempt creates a datagram
        @param max_data: max number of datagrams that can be generated in one instant
        @param generate: when False, generate_random_data does nothing
        """
        self.size = size
        self.quantity = 0
        self.routers = {}
        self.gen_prob = gen_prob
        self.max_data = max_data
        self.generate = generate

    def add_router(self, size):
        """Create a router with queue capacity `size` at the next free address.

        Addresses are assigned sequentially starting at 1. Returns True when
        the router was added and False when the network is already full.
        """
        if self.quantity >= self.size:
            return False
        self.quantity += 1
        self.routers[self.quantity] = Router(self.quantity, size)
        return True

    # "map" method
    def link(self, address_1, address_2, delay):
        """Connect two existing routers in both directions with the given delay.

        Returns True when the link was created. Returns False when either
        address is unknown or the two routers are already linked.
        """
        if address_1 not in self.routers or address_2 not in self.routers:
            return False
        first = self.routers[address_1]
        second = self.routers[address_2]
        if address_1 in second.adjacents or address_2 in first.adjacents:
            return False
        first.adjacents[address_2] = (second, delay)
        second.adjacents[address_1] = (first, delay)
        return True

    def start(self, routers_state):
        """Reset router queues: entry i of `routers_state` seeds the router at address i + 1."""
        for address, destinations in enumerate(routers_state, start=1):
            self.routers[address].start(destinations)

    def generate_random_data(self):
        """Make up to `max_data` attempts to inject a new datagram at a random router.

        Each attempt succeeds with probability `gen_prob`; sender and receiver
        are distinct routers chosen uniformly. The sender's `receive` result
        is ignored, so a datagram injected into a full queue is dropped.
        Draws come from both `numpy.random` and `random`.
        """
        if not self.generate:
            return
        addresses = list(self.routers)
        if len(addresses) < 2:
            # A receiver different from the sender cannot be chosen.
            return
        for _ in range(self.max_data):
            if np.random.random() < self.gen_prob:
                sender = random.choice(addresses)
                # Assumes the whole network is connected: any other router
                # may be picked as the receiver.
                receiver = random.choice([address for address in addresses if address != sender])
                # print((sender, receiver)) # do not delete: useful for tests
                datagram = Datagram(sender, receiver)
                self.routers[sender].receive(datagram)

In [7]:
# Manual check of Network: adding routers, linking, seeding and random traffic.
def show_queues(net):
    """Print every router address followed by its queued destinations."""
    for node in net.routers.values():
        print("address:", node.address)
        for queued in node.queue.queue:
            print(queued.destination)
        print("-")


network = Network(10, 0.9, 3, True)
for capacity in (10, 3, 6):
    network.add_router(capacity)
for address, node in network.routers.items():
    print(address)
    print(node.address)
    print(node.size)
    print("-")

print("----------------")

# The third call repeats the 1-3 link and is refused.
for first, second, delay in ((1, 2, 8), (1, 3, 40), (1, 3, 60)):
    print(network.link(first, second, delay))

for address in (1, 2, 3):
    neighbours = network.routers[address].adjacents
    print(neighbours)
    print([entry[0].address for entry in neighbours.values()])

print("----------------")

network.start([[2, 3, 2, 2], [], [2, 1, 1]])
show_queues(network)

print("----------------")

# Random traffic: the queues printed below vary from run to run.
network.generate_random_data()
show_queues(network)


1
1
10
-
2
2
3
-
3
3
6
-
----------------
True
True
False
{2: (<__main__.Router object at 0x000001DF6A373FA0>, 8), 3: (<__main__.Router object at 0x000001DF6A371F60>, 40)}
[2, 3]
{1: (<__main__.Router object at 0x000001DF6A371E70>, 8)}
[1]
{1: (<__main__.Router object at 0x000001DF6A371E70>, 40)}
[1]
----------------
address: 1
2
3
2
2
-
address: 2
-
address: 3
2
1
1
-
----------------
(3, 1)
(2, 1)
(1, 3)
address: 1
2
3
2
2
3
-
address: 2
1
-
address: 3
2
1
1
1
-


In [60]:
class Scheduler:
    """Keeps the in-flight events of a network and advances time one step at a time."""

    def __init__(self, network: Network):
        self.network = network
        self.events = []

    def start(self, events):
        """Replace the pending events.

        Each entry of `events` is indexed as [delay, next-hop address, final
        destination]; the carried datagram is created with source -1.
        """
        self.events = []
        self.events.extend(
            Event(spec[0], spec[1], Datagram(-1, spec[2])) for spec in events
        )

    def process(self, action: list[int]):
        """Advance the simulation by one step and return the delivery reward.

        In order: events whose delay is 0 are handed to their destination
        router and removed; the remaining events have their delay reduced by
        one; every router forwards one datagram using `action[address - 1]`
        (events created now keep their full delay this step); finally the
        network may inject random traffic. The return value is the sum of
        the rewards reported by the receiving routers.
        """
        routers = self.network.routers
        arrived = [event for event in self.events if event.delay == 0]
        in_flight = [event for event in self.events if event.delay != 0]

        reward = 0
        for event in arrived:
            reward += routers[event.destination].receive(event.datagram)

        self.events = in_flight
        for event in in_flight:
            event.delay -= 1

        for router in routers.values():
            produced = router.process(action[router.address - 1])
            if produced is not None:
                self.events.append(produced)

        self.network.generate_random_data()
        return reward

In [99]:
# Manual check of Scheduler: seed three events, then advance one step.
def print_events():
    """Print delay, next hop and final destination of every pending event."""
    for pending in scheduler.events:
        print(pending.delay, pending.destination, pending.datagram.destination)


def print_queues():
    """Print each router's capacity and the destinations waiting in its queue."""
    for node in network.routers.values():
        print(node.size, [datagram.destination for datagram in node.queue.queue])


scheduler = Scheduler(network)
scheduler.start([[5, 1, 3], [8, 3, 1], [0, 2, 3]])
print_events()
print("----------------")
print_queues()
print("----------------")
reward = scheduler.process([2, 1, 1])
print(f'reward = {reward}')
print_events()
print_queues()


5 1 3
8 3 1
0 2 3
----------------
10 [2, 3, 2, 2, 3]
3 [3]
6 [2, 1, 1, 2]
----------------
reward = 0
4 1 3
7 3 1
8 2 2
8 1 3
40 1 2
10 [3, 2, 2, 3]
3 [3]
6 [1, 1, 2]


5 1 3
8 3 1
0 2 3
----------------
10 [2, 3, 2, 2, 3]
3 [3]
6 [2, 1, 1, 2]
----------------
reward = 0
4 1 3
7 3 1
8 2 2
8 1 3
40 1 2
10 [3, 2, 2, 3]
3 [3]
6 [1, 1, 2]

In [100]:
# Advance one more step, this time with router 1 forwarding to router 3.
reward = scheduler.process([3, 1, 1])
print(f'reward = {reward}')
for pending in scheduler.events:
    print(pending.delay, pending.destination, pending.datagram.destination)
for node in network.routers.values():
    waiting = [datagram.destination for datagram in node.queue.queue]
    print(node.size, waiting)

reward = 0
3 1 3
6 3 1
7 2 2
7 1 3
39 1 2
40 3 3
8 1 3
40 1 1
10 [2, 2, 3]
3 []
6 [1, 2]


reward = 0
3 1 3
6 3 1
7 2 2
7 1 3
39 1 2
40 3 3
8 1 3
40 1 1
10 [2, 2, 3]
3 []
6 [1, 2]

In [61]:
class Environment:
    """Reinforcement-learning view of a Network driven by a Scheduler.

    A state is the pair returned by `get_state`: a list of
    [delay, next hop, final destination] triples (one per pending event) and
    a list with, for each router in address order, the destinations of its
    queued datagrams. A joint action holds one entry per router, in address
    order.
    """

    def __init__(self, network: Network, scheduler: Scheduler):
        self.network = network
        self.scheduler = scheduler

    def get_all_states(self):
        """Return an iterator over enumerated (event triple, router queues) pairs.

        Router queues range over every sequence of neighbour addresses of
        length 0 up to the router capacity. Event triples range over delays
        0..min link delay and all (router, router) address pairs.

        NOTE(review): the first item of each yielded pair is a single
        [delay, next hop, final destination] triple, whereas `get_state`
        returns a list of such triples — confirm which shape consumers expect.
        Raises ValueError if some router has no links.
        """
        def get_router_states(router):
            # 0 marks an empty slot; a filling is valid only when its empty
            # slots form a suffix, and it is reported without them.
            alphabet = list(router.adjacents.keys()) + [0]
            states = []
            for slots in product(alphabet, repeat=router.size):
                if 0 not in slots:
                    states.append(list(slots))
                    continue
                first_gap = slots.index(0)
                if all(value == 0 for value in slots[first_gap:]):
                    states.append(list(slots[:first_gap]))
            return states

        routers = self.network.routers
        per_router = [get_router_states(router) for router in routers.values()]
        routers_states = [list(combination) for combination in product(*per_router)]

        min_delay = min(
            min(router.adjacents[address][1] for address in router.adjacents)
            for router in routers.values()
        )
        events_states = [
            list(triple)
            for triple in product(range(min_delay + 1), routers.keys(), routers.keys())
        ]

        return product(events_states, routers_states)

    def get_state(self):
        """Return the current (events_state, routers_state) pair, built from fresh lists."""
        events_state = [
            [event.delay, event.destination, event.datagram.destination]
            for event in self.scheduler.events
        ]
        routers_state = [
            [datagram.destination for datagram in router.queue.queue]
            for router in self.network.routers.values()
        ]
        return (events_state, routers_state)

    def take_action(self, action):
        """Apply a joint action for one step and return (reward, new_state).

        The reward is minus the number of queued datagrams and pending events
        measured before the step, plus the delivery reward of the step.
        """
        backlog = sum(router.queue.length for router in self.network.routers.values())
        reward = -backlog - len(self.scheduler.events)
        reward += self.scheduler.process(action)
        return reward, self.get_state()

    def start(self, state = None):
        """Reset scheduler and network to `state`; by default no events and empty queues."""
        if state is None:
            state = ([], [[] for _ in self.network.routers])
        self.scheduler.start(state[0])
        self.network.start(state[1])

    def possible_actions(self, state = None):
        """Return every joint action available in `state` (default: the current state).

        A router with an empty queue contributes the single choice -1; any
        other router contributes the addresses of its neighbours. An empty
        queue is recognised by comparison with `[]`, so `state` must use
        lists for the router queues.
        """
        if state is None:
            state = self.get_state()
        choices = []
        for address in self.network.routers:
            if state[1][address - 1] == []:
                choices.append([-1])
            else:
                choices.append(list(self.network.routers[address].adjacents))
        return list(product(*choices))

O estado atualmente é um par ordenado com:
- 1º elemento: lista de estados dos eventos, onde cada estado é (delay, destino imediato, destino final)
- 2º elemento: lista de estados dos roteadores, onde cada estado corresponde a um roteador (via endereço) e é uma lista dos destinos dos datagramas (na ordem da fila)

# Cria Rede Genérica
![rede 1](./img/rede1.png)

In [62]:
# Four routers of capacity 4; one datagram is generated on every step.
network = Network(4, 1.0, 1)

for _ in range(4):
    network.add_router(4)

# (address, address, delay) of each bidirectional link.
for first, second, delay in ((1, 2, 3), (1, 3, 10), (1, 4, 2), (2, 4, 6)):
    network.link(first, second, delay)

scheduler = Scheduler(network)

env = Environment(network, scheduler)

In [21]:
# Count the enumerated states lazily: get_all_states() returns an iterator,
# so the states need not all be held in memory just to be counted.
print(sum(1 for _ in env.get_all_states()))

27907440


In [63]:
# Inspect the current state; as the last expression of the cell its value is echoed.
env.get_state()

([], [[], [], [], []])

In [64]:
# With no argument, start() resets to no pending events and empty queues.
env.start()
for snapshot in (env.get_state(), env.possible_actions()):
    print(snapshot)

([], [[], [], [], []])
[(-1, -1, -1, -1)]


In [65]:
# Start from a hand-written state: two pending events and three non-empty queues.
initial_events = [[4, 1, 2], [1, 2, 4]]
initial_queues = [[2, 4, 3], [4], [1, 2, 1, 1], []]
env.start((initial_events, initial_queues))
print(env.get_state())
print(env.possible_actions())

([[4, 1, 2], [1, 2, 4]], [[2, 4, 3], [4], [1, 2, 1, 1], []])
[(2, 1, 1, -1), (2, 4, 1, -1), (3, 1, 1, -1), (3, 4, 1, -1), (4, 1, 1, -1), (4, 4, 1, -1)]


In [66]:
# One entry per router, in address order; -1 is used for the router with an empty queue.
joint_action = [3, 4, 1, -1]
env.take_action(joint_action)
print(env.get_state())
print(env.possible_actions())

(3, 1)
([[3, 1, 2], [0, 2, 4], [10, 3, 2], [6, 4, 4], [10, 1, 1]], [[4, 3], [], [2, 1, 1, 1], []])
[(2, -1, 1, -1), (3, -1, 1, -1), (4, -1, 1, -1)]


(3, 1)
([[3, 1, 2], [0, 2, 4], [10, 3, 2], [6, 4, 4], [10, 1, 1]], [[4, 3], [], [2, 1, 1, 1], []])
[(2, -1, 1, -1), (3, -1, 1, -1), (4, -1, 1, -1)]

In [None]:
class Agent_Q_Learning:
    """Tabular Q-learning agent.

    `values` maps a hashable state key to a dict {action: estimate} and
    `policy` maps the same key to the current greedy action. State keys are
    the environment states with every nested list turned into a tuple,
    because the states returned by the environment contain lists and cannot
    be used as dictionary keys directly.
    """

    def __init__(self, env):
        self.env = env
        self.policy = {}
        self.values = {}

    @staticmethod
    def _freeze(obj):
        """Return `obj` with every nested list/tuple converted to a tuple."""
        if isinstance(obj, (list, tuple)):
            return tuple(Agent_Q_Learning._freeze(item) for item in obj)
        return obj

    def _action_values(self, state):
        """Return the {action: estimate} row of `state`, creating it on first use.

        New rows start at 0.0 for every action listed by
        `env.possible_actions(state)`; `state` must be in the form the
        environment produced it.
        """
        key = self._freeze(state)
        if key not in self.values:
            self.values[key] = {
                action: 0.0 for action in self.env.possible_actions(state)
            }
        return self.values[key]

    def q_learning(self, alpha, epsilon, time_steps, gama):
        """Run `time_steps` steps of epsilon-greedy Q-learning from a fresh start.

        @param alpha: step size of the update
        @param epsilon: probability of picking a uniformly random action
        @param time_steps: number of environment steps to run
        @param gama: discount factor
        """
        self.initialize()  # reset the value table and the policy

        self.env.start()

        for _ in range(time_steps):
            state = self.env.get_state()
            row = self._action_values(state)

            if random.random() >= epsilon:
                action = max(row, key=row.get)
            else:
                action = random.choice(list(row))

            reward, next_state = self.env.take_action(action)
            next_row = self._action_values(next_state)
            row[action] += alpha * (reward + gama * max(next_row.values()) - row[action])
            # Keep the greedy policy in step with the updated estimates.
            self.policy[self._freeze(state)] = max(row, key=row.get)

    def initialize(self):
        """Forget every learned estimate; rows are recreated lazily as states are visited."""
        self.values = {}
        self.policy = {}

In [None]:
class StateActionArray:
    """Table holding one value per (state, action) pair.

    Pairs that were never written read as 0.0. States and actions may contain
    nested lists; they are converted to nested tuples to build the key.
    """

    def __init__(self, env: Environment):
        self.env = env
        self._table = {}

    @staticmethod
    def _freeze(obj):
        """Return `obj` with every nested list/tuple converted to a tuple."""
        if isinstance(obj, (list, tuple)):
            return tuple(StateActionArray._freeze(item) for item in obj)
        return obj

    def get(self, state, action):
        """Return the value stored for (state, action), or 0.0 if none was stored."""
        return self._table.get((self._freeze(state), self._freeze(action)), 0.0)

    def set(self, state, action, value):
        """Store `value` for (state, action)."""
        self._table[(self._freeze(state), self._freeze(action))] = value

class Policy:
    """Epsilon-greedy tabular policy.

    Each state has one stored action. A state seen for the first time gets a
    uniformly random action from `env.possible_actions(state)`, which is then
    remembered. With probability `eps`, `get` returns a uniformly random
    possible action in place of the stored one.
    """

    def __init__(self, env:Environment, eps: float=0):
        self.env = env
        self.eps = eps
        self._choice = {}

    @staticmethod
    def _freeze(obj):
        """Return `obj` with every nested list/tuple converted to a tuple."""
        if isinstance(obj, (list, tuple)):
            return tuple(Policy._freeze(item) for item in obj)
        return obj

    def get(self, state):
        """Return the action to take in `state` (see the class description)."""
        key = self._freeze(state)
        if key not in self._choice:
            self._choice[key] = random.choice(self.env.possible_actions(state))
        if self.eps and random.random() < self.eps:
            return random.choice(self.env.possible_actions(state))
        return self._choice[key]

    def set(self, state, action):
        """Make `action` the stored action for `state`."""
        self._choice[self._freeze(state)] = action
        
def sarsa(env: Environment, returns: StateActionArray, policy: Policy, gamma: float, alpha: float, max_steps: int = 400):
    """Run one SARSA episode, updating `returns` and `policy` in place.

    @param env: environment; it is reset with `env.start()`
    @param returns: value table read and written through get/set
    @param policy: policy queried with get and improved greedily with set
    @param gamma: discount factor
    @param alpha: step size
    @param max_steps: maximum number of steps; None means no limit

    The episode ends when `env.terminal()` is true or after `max_steps`
    steps. An environment without a `terminal` method is treated as never
    terminal, so the step limit is what ends the episode in that case.
    """
    env.start()
    is_terminal = getattr(env, "terminal", lambda: False)

    state = env.get_state()
    action = policy.get(state)
    steps = 0
    while True:
        reward, next_state = env.take_action(action)
        steps += 1
        next_action = policy.get(next_state)
        current_return = returns.get(state, action)
        ended = is_terminal()

        # Bootstrap from the next pair unless the episode really terminated;
        # stopping because of the step limit is not a termination.
        next_return = 0
        if not ended:
            next_return = returns.get(next_state, next_action)

        new_return = current_return + alpha * (gamma * next_return - current_return + reward)
        returns.set(state, action, new_return)

        # Greedy improvement for the state just updated: its own actions must
        # be requested explicitly because the environment has already moved on.
        actions = env.possible_actions(state)
        values = [returns.get(state, act) for act in actions]
        policy.set(state, actions[np.argmax(values)])

        state = next_state
        action = next_action
        if ended or (max_steps is not None and steps >= max_steps):
            break

In [None]:
class Episode:
    """One trajectory generated by an agent acting in an environment.

    After `genEpisode`, `pairs[i]` is the (state, action) of step i,
    `rewards[i]` the reward it produced and `steps` the number of steps.
    """

    def __init__(self, env : Environment, agent, initial_state, initial_action, max_steps: int = 400):
        """
        @param env: environment, restarted at `initial_state` for each episode
        @param agent: object whose `choose_action(state)` picks every later action
        @param initial_state: state passed to `env.start`
        @param initial_action: action taken in the first step
        @param max_steps: maximum number of steps per episode
        """
        self.env = env
        self.initial_state = initial_state
        self.initial_action = initial_action
        self.pairs = []
        self.rewards = []
        self.agent = agent
        self.steps = 0
        self.max_steps = max_steps

    def genEpisode(self):
        """Generate the trajectory, discarding any previously generated one.

        Stops when the environment returns -1 as the state or when
        `max_steps` steps were taken.
        """
        # Start from a clean record so the method can be called again.
        self.pairs = []
        self.rewards = []
        self.steps = 0

        self.env.start(self.initial_state)
        state = self.initial_state
        action = self.initial_action
        while True:
            self.pairs.append((state, action))
            reward, state = self.env.take_action(action)
            self.rewards.append(reward)
            self.steps += 1
            if state == -1 or self.steps >= self.max_steps:
                return
            action = self.agent.choose_action(state)


In [None]:
class Agent_MC:
    """First-visit Monte Carlo control agent.

    `hashmap` maps a state key to its list of possible actions, `Q_values`
    maps (state key, action) to [mean return, visit count] and `policy` maps
    a state key to the current greedy action. State keys are environment
    states with every nested list turned into a tuple, because states
    returned by the environment contain lists and are not hashable.

    `strategy` is 'ExpStarts' for exploring starts; any other value selects
    the epsilon-greedy variant, which uses `eps` (None is treated as 0).
    """

    def __init__(self, env: Environment, strategy, eps=None):
        self.env = env
        self.hashmap = {}
        self.Q_values = {}
        self.policy = {}
        self.strategy = strategy
        self.eps = eps
        if strategy == 'ExpStarts':
            # NOTE(review): this registers every state produced by
            # env.get_all_states(); confirm that those states have the shape
            # env.start() expects and that the table fits in memory.
            for state in env.get_all_states():
                self._register(state)

    @staticmethod
    def _freeze(obj):
        """Return `obj` with every nested list/tuple converted to a tuple."""
        if isinstance(obj, (list, tuple)):
            return tuple(Agent_MC._freeze(item) for item in obj)
        return obj

    def _register(self, state):
        """Make sure `state` has table entries and return its key.

        A new state gets its possible actions, a [0, 0] entry per action and
        a uniformly random initial policy action. `state` must be in the form
        the environment produced it when it is not registered yet.
        """
        key = self._freeze(state)
        if key not in self.hashmap:
            actions = [self._freeze(action) for action in self.env.possible_actions(state)]
            self.hashmap[key] = actions
            for action in actions:
                self.Q_values[(key, action)] = [0, 0]
            self.policy[key] = random.choice(actions)
        return key

    def initialize(self):
        """Reset the estimates of every known state and draw a new random policy."""
        for key, actions in self.hashmap.items():
            for action in actions:
                self.Q_values[(key, action)] = [0, 0]
        for key, actions in self.hashmap.items():
            self.policy[key] = random.choice(actions)

    def choose_action(self, state):
        """Return the action for `state`, registering the state if it is new.

        With exploring starts the policy action is returned; otherwise a
        uniformly random possible action is returned with probability `eps`.
        """
        key = self._register(state)
        if self.strategy != 'ExpStarts':
            eps = 0.0 if self.eps is None else self.eps
            if np.random.random() < eps:
                return random.choice(self.hashmap[key])
        return self.policy[key]

    def learn(self, numEpisodes, gamma):
        """Run `numEpisodes` episodes of the configured strategy with discount `gamma`."""
        if self.strategy == 'ExpStarts':
            self.MC_ExpStarts(numEpisodes, gamma)
        else:
            self.MC_EpsSoft(numEpisodes, gamma)

    def _update_from_episode(self, episode, gamma):
        """Apply the first-visit update of one finished episode.

        Walks the episode backwards accumulating the discounted return; at
        the first visit of each (state, action) pair the running mean is
        updated and the policy of that state is made greedy.
        """
        keys = [
            (self._freeze(state), self._freeze(action))
            for state, action in episode.pairs
        ]
        first_visit = {}
        for index, pair in enumerate(keys):
            first_visit.setdefault(pair, index)

        g = 0
        for step in range(len(keys) - 1, -1, -1):
            g = gamma * g + episode.rewards[step]
            pair = keys[step]
            if first_visit[pair] != step:
                continue
            estimate = self.Q_values.setdefault(pair, [0, 0])
            estimate[0] = (estimate[0] * estimate[1] + g) / (estimate[1] + 1)
            estimate[1] += 1
            state_key = pair[0]
            actions = self.hashmap.get(state_key)
            if actions:
                self.policy[state_key] = max(
                    actions, key=lambda act: self.Q_values[(state_key, act)][0]
                )

    def MC_ExpStarts(self, numEpisodes, gamma):
        """Monte Carlo control with exploring starts over the registered states."""
        start_states = list(self.hashmap)
        if numEpisodes > 0 and not start_states:
            raise ValueError("MC_ExpStarts needs at least one registered state")
        for _ in range(numEpisodes):
            initial_state = random.choice(start_states)
            initial_action = random.choice(self.hashmap[initial_state])
            episode = Episode(self.env, self, initial_state, initial_action)
            episode.genEpisode()
            self._update_from_episode(episode, gamma)

    def MC_EpsSoft(self, numEpisodes, gamma): # epsilon-greedy implementation
        """Monte Carlo control where each episode starts from the environment's current state."""
        for _ in range(numEpisodes):
            initial_state = self.env.get_state()
            initial_action = self.choose_action(initial_state)
            episode = Episode(self.env, self, initial_state, initial_action)
            episode.genEpisode()
            self._update_from_episode(episode, gamma)
