In [1]:
from spark_env.env import Environment
import numpy as np
import networkx as nx
from matplotlib import pyplot as plt
import dgl
import torch
from spark_env.job_dag import JobDAG
from spark_env.node import Node
import torch.nn as nn
from dgl.nn.pytorch import GraphConv
import torch.nn.functional as F 
from numpy.random import randint

# Device string used by every `.to(cuda)` call in this notebook. Fall back to
# the CPU when no CUDA device is available so the notebook still runs there.
cuda = "cuda" if torch.cuda.is_available() else "cpu"

class GCN(nn.Module):
    """Two stacked graph convolutions, each followed by a ReLU.

    Args:
        features: width of the input node feature vectors.
        hidden_layer_size: output width of the first graph convolution.
        embedding_size: output width of the second graph convolution, i.e.
            the size of the returned per-node embedding.
    """

    def __init__(self, features=5, hidden_layer_size=10, embedding_size=10):
        super().__init__()

        self.conv1 = GraphConv(features, hidden_layer_size)
        self.conv2 = GraphConv(hidden_layer_size, embedding_size)

    def forward(self, g, inputs):
        """Return the ReLU-activated output of both convolutions on graph `g`."""
        hidden = torch.relu(self.conv1(g, inputs))
        embedding = torch.relu(self.conv2(g, hidden))
        return embedding

# Notebook-wide GCN instance with default sizes, moved to the `cuda` device.
# EnvironmentWrapper.observe calls this global to embed the graph nodes.
gnn = GCN().to(cuda)

Using backend: pytorch


In [2]:
class EnvironmentWrapper:
    """Fixed-size-observation wrapper around the prebuilt `Environment`.

    Each observation embeds the graph nodes with the module-level `gnn`,
    keeps the embeddings of at most `view_range` frontier nodes (starting at a
    sliding `offset`), zero-pads to `view_range` rows, flattens the result and
    appends the number of free executors reported by the environment.

    Args:
        view_range: number of frontier-node slots in one observation.
        reset_prob: parameter of the geometric distribution used to draw the
            `max_time` passed to `Environment.reset`.
        max_exec: executor limit passed to the environment when no frontier
            node is available.
        env_len: stored on the instance; not used by this class.
        turn: how far the view offset moves when the action's direction
            rounds to 1.
    """

    def __init__(self, view_range=50, reset_prob=5e-7, max_exec=100, env_len = 500, turn = 40) -> None:

        # Set pre built environment
        self.env = Environment()

        # environment parameters
        self.reset_prob = reset_prob
        self.max_exec = max_exec

        # wrapper parameters
        self.range = view_range
        self.env_len = env_len
        self.offset = 0
        self.turn = turn

        # Defaults must exist BEFORE reset(): reset() calls observe(), which
        # fills these from the environment, and they must not be overwritten
        # afterwards.
        self.frontier_nodes = []
        self.source_exec = max_exec
        self.logits = None  # node embeddings from the most recent gnn pass

        # seed the prebuilt environment and take the first observation
        self.reset()

    def reset(self):
        """Reseed and reset the environment, rewind the view and re-observe."""
        seed = np.random.randint(12, 1234567)
        self.env.seed(seed)
        self.env.reset(max_time=np.random.geometric(self.reset_prob))
        self.offset = 0
        self.logits = None  # drop embeddings cached for the previous episode
        self.observe()

    def observe(self, gnn_reset=True):
        """Fetch a new observation and encode it as a flat state vector.

        Updates `self.frontier_nodes` and `self.source_exec` from the
        environment as a side effect.

        Args:
            gnn_reset: when True (default) the node embeddings are recomputed
                with `gnn`. When False the embeddings cached by the previous
                call are reused if there are any; they are only valid if the
                graph's node indexing has not changed since then.

        Returns:
            1-D tensor of length `view_range * embedding_width + 1`; the last
            entry is the number of free executors.
        """
        G, frontier_nodes, num_source_exec, action_map, node_inputs = self.env.new_observation()

        # remember the frontier nodes and the graph index of each of them
        self.frontier_nodes = list(frontier_nodes)
        leaf_nodes = [action_map.inverse_map[node] for node in self.frontier_nodes]
        self.source_exec = num_source_exec

        # embed the graph nodes (or reuse the cached embeddings)
        if gnn_reset or self.logits is None:
            self.logits = gnn(G.to(cuda), node_inputs.to(cuda))
        embeddings = self.logits

        # pick the frontier nodes visible from the current offset; view slot k
        # shows frontier entry (offset + k + 1) % num_frontier
        num_frontier = len(leaf_nodes)
        visible = min(self.range, num_frontier)
        required_indices = [
            leaf_nodes[(self.offset + i) % num_frontier]
            for i in range(1, visible + 1)
        ]

        # an explicit long index keeps the empty-frontier case well defined
        index = torch.tensor(required_indices, dtype=torch.long, device=embeddings.device)
        selected = embeddings[index]

        # zero-pad to `view_range` rows, using the actual embedding width
        padding = torch.zeros(
            self.range - selected.shape[0],
            embeddings.shape[1],
            dtype=selected.dtype,
            device=selected.device,
        )
        state = torch.cat([selected, padding]).flatten()

        # append the number of free executors with a matching dtype/device
        executors = torch.tensor([self.source_exec], dtype=state.dtype, device=state.device)
        return torch.cat([state, executors])

    def step(self, action, early_stop=True):
        """Apply one action and return `(state, reward, done)`.

        Args:
            action: `(direction, job, limit)`. `direction` is rounded and
                multiplied by `turn` to move the view offset, `job` selects a
                frontier node and `limit` is the executor limit.
            early_stop: accepted for interface compatibility; currently unused.

        Returns:
            The new state vector, the environment reward (0 when no frontier
            node was available) and the environment's done flag.
        """
        direction, job, limit = action

        # Nothing can be scheduled: advance the environment with no node, then
        # observe so the returned state and `self.frontier_nodes` reflect the
        # environment AFTER the step.
        if len(self.frontier_nodes) == 0:
            reward, done = self.env.new_step(None, self.max_exec)
            state = self.observe()
            # NOTE(review): the environment reward is discarded here and 0 is
            # reported instead; confirm that this is intended.
            return state, 0, done

        # clamp the selected index to the available frontier nodes
        # NOTE(review): observe() shows frontier entry (offset + k + 1) % n in
        # view slot k while this picks (offset + job) % view_range; confirm
        # whether the two mappings are meant to line up.
        index = (self.offset + job) % self.range
        if index >= len(self.frontier_nodes):
            index = len(self.frontier_nodes) - 1

        # keep the executor limit within [1, free executors]
        if limit > self.source_exec:
            limit = self.source_exec
        if limit == 0:
            limit = 1

        # update the view offset to check for more jobs i.e. stay or move right
        self.offset += (round(direction)*self.turn)
        self.offset = self.offset % self.range

        # take the step in the wrapped environment, then observe the result
        reward, done = self.env.new_step(self.frontier_nodes[index], limit)
        state = self.observe()

        return state, reward, done


# Notebook-wide wrapped environment (default parameters) shared by the
# rollout and evaluation cells below.
env = EnvironmentWrapper()

In [3]:
from numpy.random import randint
# Baseline: play 100 episodes with uniformly random actions and print the
# total reward and the number of steps of each episode.
for i in range (100):
    R = 0  # cumulative reward of the current episode
    env.reset()
    done = False
    steps = 0
    nodes = []  # one debug line per step: free executors vs. chosen limit
    while not done:
        # (direction in {0, 1}, job slot in [0, 50), executor limit in [2, 5))
        action = (randint(2), randint(0, 50), randint(2, 5))
        # env.source_exec is read BEFORE the step, i.e. it is the executor
        # count the action was chosen against
        nodes.append(" source exec : {} - chosen : {} \n".format(env.source_exec, action[2]))
        state, reward, done = env.step(action)
        R += reward
        steps += 1

    print(R, steps)
# -559.8001999999993 1691

96
-180.32318 269
0
-509.42016999999987 1663
2
-470.51302000000027 1689
80
-329.1735099999998 709
0
-387.2399700000006 1613
0
-441.77567999999997 1693
0
-518.9046599999998 1691
0
-524.7712200000009 1900
0
-453.4716200000006 1703
0
-506.7430599999998 1775
1


AssertionError: 

In [6]:
# Append every debug line recorded by the last rollout to temp.txt.
with open('temp.txt', 'a') as fp:
    fp.writelines(nodes)

In [20]:
# Number of debug lines currently held in `nodes`.
len(nodes)

32

In [14]:
# Debug print of the last rollout's total reward, its step count and one
# recorded entry. NOTE(review): index 1619 is hard-coded and raises
# IndexError unless `nodes` holds at least 1620 entries.
print(R, steps, nodes[1619])

-479.6533599999999 1619 2


In [4]:
class Actor(nn.Module):
    """MLP mapping a state vector to `action_space` unnormalised scores.

    Three hidden blocks (Linear -> LayerNorm -> ReLU) of widths 128, 128 and
    64 are followed by a plain Linear output layer. Every Linear weight is
    re-initialised with Xavier-normal right after the layer is created.

    Args:
        action_space: width of the output vector.
        num_inputs: width of the input state vector.
    """

    @staticmethod
    def _dense(fan_in, fan_out):
        """Build a Linear layer whose weight is Xavier-normal initialised."""
        layer = nn.Linear(fan_in, fan_out)
        nn.init.xavier_normal_(layer.weight)
        return layer

    def __init__(self, action_space=100, num_inputs=501):
        super().__init__()

        # Attribute names and creation order are kept as-is so that saved
        # state dicts stay loadable.
        self.l1 = self._dense(num_inputs, 128)
        self.ln1 = nn.LayerNorm(128)

        self.l2 = self._dense(128, 128)
        self.ln2 = nn.LayerNorm(128)

        self.l3 = self._dense(128, 64)
        self.ln3 = nn.LayerNorm(64)

        self.l4 = self._dense(64, action_space)

    def forward(self, inputs):
        """Return the output scores for `inputs`."""
        hidden = inputs
        blocks = ((self.l1, self.ln1), (self.l2, self.ln2), (self.l3, self.ln3))
        for linear, norm in blocks:
            hidden = F.relu(norm(linear(hidden)))

        return self.l4(hidden)

In [5]:
class Critic(nn.Module):
    """Network scoring a (state, action) pair with a single value.

    The state goes through one Linear -> LayerNorm -> ReLU block, the action
    vector is concatenated to that 128-wide hidden vector, and two further
    blocks plus a Linear head reduce the result to one number. Every Linear
    weight is re-initialised with Xavier-normal.

    Args:
        action_space: width of the action vector passed to `forward`.
        num_inputs: width of the state vector passed to `forward`.
    """

    def __init__(self,  action_space=200, num_inputs=501):
        super(Critic, self).__init__()

        self.l1    = nn.Linear(num_inputs, 128)
        torch.nn.init.xavier_normal_(self.l1.weight)
        self.ln1   = nn.LayerNorm(128)

        # the second layer sees the hidden state AND the action vector
        self.l2    = nn.Linear(128+action_space, 128)
        torch.nn.init.xavier_normal_(self.l2.weight)
        self.ln2   = nn.LayerNorm(128)

        self.l3    = nn.Linear(128, 64)
        torch.nn.init.xavier_normal_(self.l3.weight)
        self.ln3   = nn.LayerNorm(64)

        self.V = nn.Linear(64, 1)
        torch.nn.init.xavier_normal_(self.V.weight)

    def forward(self, inputs, actions):
        """Return the value estimate for `inputs` and `actions`.

        Args:
            inputs: state tensor whose last dimension is `num_inputs`.
            actions: action tensor whose last dimension is `action_space` and
                whose leading dimensions match those of `inputs`.

        Returns:
            Tensor with the same leading dimensions and a last dimension of 1.
        """
        x = self.l1(inputs)
        x = self.ln1(x)
        x = F.relu(x)

        # Join on the feature axis. For unbatched 1-D inputs this equals the
        # default dim=0; it additionally makes batched inputs work.
        x = torch.cat((x, actions), dim=-1)
        x = self.l2(x)
        x = self.ln2(x)
        x = F.relu(x)

        x = self.l3(x)
        x = self.ln3(x)
        x = F.relu(x)

        V = self.V(x)

        return V

In [6]:
class ActorCritic(nn.Module):
    """Two actor heads (node choice and parallelism) plus one shared critic.

    Args:
        action_space: output width of EACH actor; the critic receives both
            outputs concatenated, i.e. `2 * action_space` action features.
        num_inputs: width of the state vector.
    """

    def __init__(self, action_space=100, num_inputs=501):
        super().__init__()

        self.actor_node        = Actor(action_space=action_space, num_inputs=num_inputs)
        self.actor_parallelism = Actor(action_space=action_space, num_inputs=num_inputs)
        self.critic            = Critic(action_space=action_space*2, num_inputs=num_inputs)

    def forward(self, state):
        """Return `(node_scores, parallelism_scores, value)` for `state`."""
        action_node        = self.actor_node(state)
        action_parallelism = self.actor_parallelism(state)

        # Join both score vectors on the feature axis. This is identical to
        # the default dim=0 for 1-D states and keeps batched states aligned.
        joint_actions      = torch.cat([action_node, action_parallelism], dim=-1)
        value_pred         = self.critic(state, joint_actions)

        return action_node, action_parallelism, value_pred

In [7]:
class Agent(object):
    """Actor-critic agent that owns an `ActorCritic` model and an Adam optimizer.

    `act` samples a node action and a parallelism action and records their
    log-probabilities together with the critic value; the caller appends the
    matching reward to `self.rewards`; `learn` then performs one policy-gradient
    update over everything recorded and clears both buffers.

    Args:
        gamma: discount factor used for the returns in `learn`.
    """

    def __init__(self, gamma=0.9995) -> None:
        super().__init__()
        self.model = ActorCritic().to(cuda)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
        self.saved_actions = []  # (log_prob_node, log_prob_parallelism, value) per step
        self.rewards = []        # reward per step, appended by the caller

        self.gamma = gamma
        self.eps = np.finfo(np.float32).eps.item()  # guards the division in learn()
        self.file = "actor_critic"  # checkpoint path prefix used by save()/load()

    @staticmethod
    def __init_weights__(m):
        """Xavier-initialise the weight and zero the bias of a Linear module.

        Takes the module as its only argument, so it can be passed to
        `nn.Module.apply`. Modules that are not `nn.Linear` are left untouched.
        """
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_normal_(m.weight)
            m.bias.data.fill_(0)

    def learn(self):
        """Run one optimisation step over the recorded episode, then clear the buffers.

        Does nothing except clearing the buffers when no action was recorded.
        """
        if not self.saved_actions:
            self.saved_actions.clear()
            self.rewards.clear()
            return

        policy_node = []         # actor (policy) loss terms for the node head
        policy_parallelism = []  # actor (policy) loss terms for the parallelism head
        value_losses = []        # critic (value) loss terms

        # Discounted return of every step. The rewards must be walked from
        # the LAST step backwards so that each return accumulates the rewards
        # that come after it.
        returns = []
        discounted = 0.0
        for reward in reversed(self.rewards):
            discounted = reward + self.gamma * discounted
            returns.append(discounted)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float32).to(cuda)
        # The standard deviation of a single element is NaN, so one-step
        # episodes are left unnormalised.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + self.eps)

        for (log_prob_1, log_prob_2, value), ret in zip(self.saved_actions, returns):
            # value.item() detaches the baseline from the policy gradient
            advantage = ret - value.item()

            # calculate actor (policy) loss
            policy_node.append(-log_prob_1 * advantage)
            policy_parallelism.append(-log_prob_2 * advantage)

            # calculate critic (value) loss using smooth L1 loss
            value_losses.append(F.smooth_l1_loss(value, ret.reshape(1)))

        # reset gradients
        self.optimizer.zero_grad()

        # sum up all the policy and value loss terms
        loss_node = torch.stack(policy_node).sum()
        loss_parallelism = torch.stack(policy_parallelism).sum()
        loss_critic = torch.stack(value_losses).sum()

        # perform backprop
        loss = loss_node + loss_parallelism + loss_critic
        loss.backward()

        self.optimizer.step()

        # reset rewards and action buffer
        self.saved_actions.clear()
        self.rewards.clear()

    def act(self, state):
        """Sample one action per actor head for `state`.

        The two log-probabilities and the critic value are appended to
        `self.saved_actions` for a later `learn` call.

        Returns:
            `(node_action, parallelism_action)` as plain Python ints.
        """
        logits_node, logits_parallelism, value = self.model(state)

        # the actor outputs are treated as unnormalised logits
        acts_node = torch.distributions.Categorical(logits=logits_node)
        acts_parallelism = torch.distributions.Categorical(logits=logits_parallelism)

        # sample an action from each distribution
        action_node = acts_node.sample()
        action_parallelism = acts_parallelism.sample()

        # save to action buffer
        self.saved_actions.append((
            acts_node.log_prob(action_node),
            acts_parallelism.log_prob(action_parallelism),
            value,
        ))

        return action_node.item(), action_parallelism.item()

    def save(self, version="1"):
        """Write the model's state dict to `<self.file>_<version>.pt`."""
        torch.save(self.model.state_dict(), self.file+"_"+version+".pt")

    def load(self, file="./actor_critic", version="1"):
        """Load `<file>_<version>.pt` into the model and switch it to eval mode.

        Also makes `file` the prefix used by subsequent `save` calls.
        """
        self.file = file
        # map_location lets a checkpoint saved on another device load here
        state_dict = torch.load(file+"_"+version+".pt", map_location=cuda)
        self.model.load_state_dict(state_dict)
        self.model.eval()

    def direct_train(self, env: "EnvironmentWrapper", episodes=70):
        """Pre-train on uniformly random actions.

        In every step a random node action and a random parallelism action
        are drawn. The actor outputs are regressed (MSE) onto the one-hot
        encoding of those actions and the critic is regressed (smooth L1)
        onto the immediate reward. One optimizer step is taken per episode
        and the model is saved afterwards.

        Args:
            env: wrapped environment to roll out in.
            episodes: number of episodes to run.
        """
        running_reward = 10
        node_criterion = nn.MSELoss()
        parallelism_criterion = nn.MSELoss()

        for i in range(episodes):
            # per-episode buffers; keeping them across episodes would
            # back-propagate through graphs of earlier, already-updated weights
            node_pred = []
            parallelism_pred = []
            node_act = []
            parallelism_act = []
            value_act = []

            env.reset()
            state = env.observe()
            done = False
            episode_reward = 0

            while not done:

                # actor outputs for the current state
                node, parallelism, _ = self.model(state)
                node_pred.append(node)
                parallelism_pred.append(parallelism)

                # random actions within bounds, encoded as one-hot targets
                n_a = randint(0, 100); p_a = randint(2, 5)
                n_t = torch.zeros(100, device=cuda); n_t[n_a] = 1
                p_t = torch.zeros(100, device=cuda); p_t[p_a] = 1
                node_act.append(n_t)
                parallelism_act.append(p_t)

                # critic prediction for the random actions
                value = self.model.critic(state, torch.cat([n_t, p_t]))

                # Decode the node action into (direction, job) exactly like
                # the evaluation loop does, and pass plain Python ints.
                state, reward, done = env.step((n_a // 50, n_a % 50, p_a))
                target = torch.tensor([reward], dtype=torch.float32).to(cuda)
                value_act.append(F.smooth_l1_loss(value, target))

                episode_reward += reward

            loss_node        = node_criterion(torch.stack(node_pred), torch.stack(node_act))
            loss_parallelism = parallelism_criterion(torch.stack(parallelism_pred), torch.stack(parallelism_act))
            loss_critic      = torch.stack(value_act).sum()

            # A single backward pass over the summed loss yields the same
            # gradients as three separate ones; the optimizer step is what
            # actually updates the weights before they are saved.
            self.optimizer.zero_grad()
            (loss_node + loss_parallelism + loss_critic).backward()
            self.optimizer.step()
            self.save()

            running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

            # log results
            if i % 5 == 0:
                print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                    i, episode_reward, running_reward))


In [8]:
# agent = Agent()
# agent.direct_train(env)

In [9]:
import time

# Roll out the checkpointed agent for a few episodes and log the rewards.
episodes = 10
running_reward = 0  # exponential moving average of the episode reward

# load checkpoint "./actor_critic_3.pt"; Agent.load also puts the model in eval mode
agent = Agent()
agent.load(file="./actor_critic", version="3")


for e in range(episodes):

    env.reset()
    state = env.observe()
    done = False
    episode_reward = 0
    start_time = time.time()
    actions = 0  # number of steps taken in this episode
    while not done:
        node, parallelism = agent.act(state)
        # the node action encodes (direction, job slot) as direction * 50 + slot
        direc = int(node/50)
        node  = node % 50
        state, reward, done = env.step((direc, node, parallelism))
        agent.rewards.append(reward)
        episode_reward += reward
        actions += 1

    # NOTE(review): with agent.learn() commented out nothing clears
    # agent.rewards / agent.saved_actions, so both keep growing across
    # episodes for as long as this loop runs.
    # print(actions, time.time()-start_time)
    # agent.learn()
    # agent.save(version="3")
    running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

    # log results
    if e % 5 == 0:
        print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
            e, episode_reward, running_reward))

    

Episode 0	Last reward: -549.49	Average reward: -27.47
Episode 5	Last reward: 1.00	Average reward: -21.03


In [None]:
# Test stub for the trained network: for now every episode only resets the
# environment and fetches the initial state.

ep = 10  # number of test episodes
for e in range(ep):

    env.reset()
    state = env.observe()

In [48]:
# from matplotlib import pylab

# def save_graph(graph, file_name):
#     plt.figure(num=None, figsize=(20, 20), dpi=80)
#     plt.axis('off')
#     fig = plt.figure(1)
#     pos = nx.spring_layout(graph)
#     nx.draw_networkx_nodes(graph,pos)
#     nx.draw_networkx_edges(graph,pos)
#     nx.draw_networkx_labels(graph,pos)

#     plt.savefig(file_name,bbox_inches="tight")
#     pylab.close()
#     del fig

# save_graph(job_graph, "job_dag")


# DDQN implementation,

# import copy
# from collections import deque
# import random
# import time, datetime

# class Net(nn.Module):

#     def __init__(self, input_dim=501, output_dim=3):
#         super().__init__()
#         self.static = nn.Sequential(
#             nn.Linear(input_dim, 501),
#             nn.ReLU(),
#             nn.Linear(501, 250),
#             nn.ReLU(),
#             nn.Linear(250, 200),
#             nn.ReLU(),
#             nn.Linear(200, 150),
#             nn.ReLU(),
#             nn.Linear(150, output_dim)
#         )

#         self.dynamic = copy.deepcopy(self.static)

#         for p in self.dynamic.parameters():
#             p.requires_grad = False

#     def forward(self, input, model="static"):
#         if model == "static":
#             return self.static(input)
            
#         return self.dynamic(input)

# class Agent():

#     def __init__(self, state_dim=501, action_dim=(2, 50, 100), save_dir=".", assist=False, assist_p=(2, 7)):
#         self.state_dim = state_dim
#         self.action_dim = action_dim
#         self.save_dir = save_dir

#         self.use_cuda = torch.cuda.is_available()

#         # DNN to predict the most optimal action
#         self.net = Net(self.state_dim, len(self.action_dim)).float()
#         self.net = self.net.to(device="cuda")

#         self.exploration_rate = 1
#         self.exploration_rate_decay = 0.99999975
#         self.exploration_rate_min = 0.1
#         self.curr_step = 0

#         self.memory = deque(maxlen=100000)
#         self.batch_size = 32

#         self.save_every = 5e3  # no. of experiences

#         self.gamma = 0.9

#         self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025)
#         self.loss_fn = torch.nn.CrossEntropyLoss()

#         self.burnin = 1e4  # min. experiences before training
#         self.learn_every = 3  # no. of experiences between updates to Q_online
#         self.sync_every = 1e4  # no. of experiences between Q_target & Q_online sync

#         self.assist = assist
#         self.assist_range = assist_p

#     def act(self, state):
#         # EXPLORE
#         if np.random.rand() < self.exploration_rate:
#             direction, job, executor = self.action_dim
#             if self.assist:
#                 action_idx = (np.random.randint(direction), np.random.randint(job)/job, np.random.randint(self.assist_range[0], self.assist_range[1])/executor)
#             else :
#                 action_idx = torch.tensor([np.random.randint(direction), np.random.randint(job)/job, np.random.randint(1, executor)/executor])

#         # EXPLOIT
#         else:
#             state = state.cuda()
#             action_idx = self.net(state, model="dynamic")

#         # decrease exploration_rate
#         self.exploration_rate *= self.exploration_rate_decay
#         self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)

#         # increment step
#         self.curr_step += 1
#         return action_idx

#     def cache(self, state, next_state, action, reward, done):
#         reward = torch.tensor([reward]).cuda()
#         done = torch.tensor([done]).cuda()
#         state = state.cuda()
#         next_state = next_state.cuda()
#         action = action.cuda()
#         self.memory.append((state, next_state, action, reward, done))

#     def recall(self):
#         batch = random.sample(self.memory, self.batch_size)
#         state, next_state, action, reward, done = map(torch.stack, zip(*batch))
#         return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()

#     def td_estimate(self, state, action):
#         current_Q = self.net(state, model="static")[
#             np.arange(0, self.batch_size), action
#         ]  # Q_online(s,a)
#         return current_Q

#     @torch.no_grad()
#     def td_target(self, reward, next_state, done):
#         best_action = self.net(next_state, model="static")
#         next_Q = self.net(next_state, model="dynamic")[
#             np.arange(0, self.batch_size, dtype=np.int64), best_action
#         ]
#         return (reward + (1 - done.float()) * self.gamma * next_Q).float()

#     def update_Q_online(self, td_estimate, td_target):
#         loss = self.loss_fn(td_estimate, td_target)
#         self.optimizer.zero_grad()
#         loss.backward()
#         self.optimizer.step()
#         return loss.item()

#     def sync_Q_target(self):
#         self.net.dynamic.load_state_dict(self.net.dynamic.state_dict())

#     def save(self):
#         save_path = (
#             self.save_dir + f"/sched_net_{int(self.curr_step // self.save_every)}.chkpt"
#         )
#         torch.save(
#             dict(model=self.net.state_dict(), exploration_rate=self.exploration_rate),
#             save_path,
#         )
#         print(f"Sched_net saved to {save_path} at step {self.curr_step}")

#     def learn(self):
#         if self.curr_step % self.sync_every == 0:
#             self.sync_Q_target()

#         if self.curr_step % self.save_every == 0:
#             self.save()

#         if self.curr_step < self.burnin:
#             return None, None

#         if self.curr_step % self.learn_every != 0:
#             return None, None

#         # Sample from memory
#         state, next_state, action, reward, done = self.recall()

#         # Get TD Estimate
#         td_est = self.td_estimate(state, action)

#         # Get TD Target
#         td_tgt = self.td_target(reward, next_state, done)

#         # Backpropagate loss through Q_online
#         loss = self.update_Q_online(td_est, td_tgt)

#         return (td_est.mean().item(), loss)

# class MetricLogger:
#     def __init__(self, save_dir:str):
#         self.save_log = save_dir + "/log"
#         with open(self.save_log, "w") as f:
#             f.write(
#                 f"{'Episode':>8}{'Step':>8}{'Epsilon':>10}{'MeanReward':>15}"
#                 f"{'MeanLength':>15}{'MeanLoss':>15}{'MeanQValue':>15}"
#                 f"{'TimeDelta':>15}{'Time':>20}\n"
#             )
#         self.ep_rewards_plot = save_dir + "/reward_plot.jpg"
#         self.ep_lengths_plot = save_dir + "/length_plot.jpg"
#         self.ep_avg_losses_plot = save_dir + "/loss_plot.jpg"
#         self.ep_avg_qs_plot = save_dir + "/q_plot.jpg"

#         # History metrics
#         self.ep_rewards = []
#         self.ep_lengths = []
#         self.ep_avg_losses = []
#         self.ep_avg_qs = []

#         # Moving averages, added for every call to record()
#         self.moving_avg_ep_rewards = []
#         self.moving_avg_ep_lengths = []
#         self.moving_avg_ep_avg_losses = []
#         self.moving_avg_ep_avg_qs = []

#         # Current episode metric
#         self.init_episode()

#         # Timing
#         self.record_time = time.time()

#     def log_step(self, reward, loss, q):
#         self.curr_ep_reward += reward
#         self.curr_ep_length += 1
#         if loss:
#             self.curr_ep_loss += loss
#             self.curr_ep_q += q
#             self.curr_ep_loss_length += 1

#     def log_episode(self):
#         "Mark end of episode"
#         self.ep_rewards.append(self.curr_ep_reward)
#         self.ep_lengths.append(self.curr_ep_length)
#         if self.curr_ep_loss_length == 0:
#             ep_avg_loss = 0
#             ep_avg_q = 0
#         else:
#             ep_avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5)
#             ep_avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5)
#         self.ep_avg_losses.append(ep_avg_loss)
#         self.ep_avg_qs.append(ep_avg_q)

#         self.init_episode()

#     def init_episode(self):
#         self.curr_ep_reward = 0.0
#         self.curr_ep_length = 0
#         self.curr_ep_loss = 0.0
#         self.curr_ep_q = 0.0
#         self.curr_ep_loss_length = 0

#     def record(self, episode, epsilon, step):
#         mean_ep_reward = np.round(np.mean(self.ep_rewards[-100:]), 3)
#         mean_ep_length = np.round(np.mean(self.ep_lengths[-100:]), 3)
#         mean_ep_loss = np.round(np.mean(self.ep_avg_losses[-100:]), 3)
#         mean_ep_q = np.round(np.mean(self.ep_avg_qs[-100:]), 3)
#         self.moving_avg_ep_rewards.append(mean_ep_reward)
#         self.moving_avg_ep_lengths.append(mean_ep_length)
#         self.moving_avg_ep_avg_losses.append(mean_ep_loss)
#         self.moving_avg_ep_avg_qs.append(mean_ep_q)

#         last_record_time = self.record_time
#         self.record_time = time.time()
#         time_since_last_record = np.round(self.record_time - last_record_time, 3)

#         print(
#             f"Episode {episode} - "
#             f"Step {step} - "
#             f"Epsilon {epsilon} - "
#             f"Mean Reward {mean_ep_reward} - "
#             f"Mean Length {mean_ep_length} - "
#             f"Mean Loss {mean_ep_loss} - "
#             f"Mean Q Value {mean_ep_q} - "
#             f"Time Delta {time_since_last_record} - "
#             f"Time {datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}"
#         )

#         with open(self.save_log, "a") as f:
#             f.write(
#                 f"{episode:8d}{step:8d}{epsilon:10.3f}"
#                 f"{mean_ep_reward:15.3f}{mean_ep_length:15.3f}{mean_ep_loss:15.3f}{mean_ep_q:15.3f}"
#                 f"{time_since_last_record:15.3f}"
#                 f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'):>20}\n"
#             )

#         for metric in ["ep_rewards", "ep_lengths", "ep_avg_losses", "ep_avg_qs"]:
#             plt.plot(getattr(self, f"moving_avg_{metric}"))
#             plt.savefig(getattr(self, f"{metric}_plot"))
#             plt.clf()