In [1]:
# Relative path of the span-shape CSV that is loaded into `df` below.
datafile = '../data/Topology/for_generation/toutiao_span_shape.csv'

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from scipy import stats

In [5]:
import json

In [6]:
# Load the span-shape table; the first CSV column becomes the index.
# NOTE(review): 'unicode_escape' is an unusual file encoding — confirm it is intended.
df = pd.read_csv(datafile, encoding= 'unicode_escape',index_col=0)

## Trace Tree--Generation

In [8]:
# Shared state for the trace-tree generation below.
# NOTE(review): no random seed is set in the cells visible here, so every run
# generates different trees.
# Alias (not a copy) of the loaded table; its 'depth' and 'Total_Child' columns are read.
df_span_child = df
# Number of traces, i.e. of root nodes, to generate.
generated_trace_num = 1000
# Root ids are 0 .. generated_trace_num - 1.
tree_root = range(generated_trace_num)
# node id -> list of child ids, filled in by generate_children.
tree_dic = {}
# Next unused node id; non-root ids start right after the root ids.
cur_id = generated_trace_num
def generate_children(parent_id, cur_depth, cur_id):
    """Recursively grow the subtree under ``parent_id`` and record it in ``tree_dic``.

    The number of children is drawn at random from the ``Total_Child`` values
    that ``df_span_child`` holds for rows with ``depth == cur_depth`` (missing
    values count as 0). The children receive consecutive ids starting at
    ``cur_id``; their own descendants are numbered afterwards, child by child.

    Parameters
    ----------
    parent_id : int
        Node whose children are generated; becomes a key of ``tree_dic``.
    cur_depth : int
        Depth of ``parent_id``.
    cur_id : int
        First id that has not been handed out yet.

    Returns
    -------
    int
        The next unused id once the whole subtree has been numbered.
    """
    # Keep the id counter a plain int so ids never turn into numpy floats
    # (Total_Child becomes a float column as soon as it contains NaN).
    cur_id = int(cur_id)

    at_depth = df_span_child['depth'] == cur_depth
    child_counts = df_span_child.loc[at_depth, 'Total_Child'].fillna(0).values
    if len(child_counts) == 0:
        # No row at this depth: either we are past the deepest observed level
        # or the data has a gap. Both make the node a leaf; sampling from an
        # empty array would raise.
        tree_dic[parent_id] = []
        return cur_id

    # Truncate to an int once and use the same value for the id list and the
    # counter; a negative count is treated as "no children".
    child_num = max(int(np.random.choice(child_counts)), 0)
    child_id_list = list(range(cur_id, cur_id + child_num))
    cur_id += child_num
    tree_dic[parent_id] = child_id_list
    for child_id in child_id_list:
        cur_id = generate_children(child_id, cur_depth + 1, cur_id)
    return cur_id
# Grow one tree per root. cur_id is threaded through every call so that node
# ids stay unique across all generated traces.
for root_id in tree_root:
    # Every root is expanded starting from depth 0.
    depth = 0
    cur_id = generate_children(root_id, depth, cur_id)

## Synchronous/Asynchronous Call Separation

In [41]:
def separate_group(call_list):
    """Split a node's child list into consecutive groups.

    Groups are cut serially from the front of ``call_list``; the node order is
    randomly generated, so which children land in which group carries no
    meaning. ``duration_estimate`` takes the maximum inside a group and sums
    over the groups.

    Layout for ``n = len(call_list)``:

    * ``n < 2``: a single group holding ``call_list`` itself.
    * otherwise: a head group of ``floor(0.7 * n) + 1`` elements, then groups
      of two while more than two elements remain, then one final group with
      whatever is left (one or two elements).

    Parameters
    ----------
    call_list : list
        Child ids of one node.

    Returns
    -------
    list of list
        The groups, in order; concatenated they reproduce ``call_list``.
    """
    n = len(call_list)
    if n < 2:
        return [call_list]

    # floor(0.7 * n) in exact integer arithmetic. int(n * 0.7) truncates a
    # binary float and comes out one short for some sizes (90 * 0.7 evaluates
    # to 62.99999999999999).
    head_size = n * 7 // 10 + 1
    group_list = [call_list[:head_size]]

    n_cur = head_size
    while n - n_cur > 2:
        group_list.append(call_list[n_cur:n_cur + 2])
        n_cur += 2
    if n_cur < n:
        group_list.append(call_list[n_cur:])
    return group_list

In [43]:
# Group layout of every node: node id -> list of child groups.
nodes_structure = {
    node: separate_group(children) for node, children in tree_dic.items()
}

## Time Dependency--Generation

In [28]:
def generate_DT(m=3.0, v=0.5, lower_bound=1.8):
    """Draw one random value ``10 ** X`` with ``X`` from a truncated normal.

    ``X`` follows a normal distribution with location ``m`` and scale ``v``
    (``v`` is passed as the scale, i.e. a standard deviation, not a variance),
    truncated to the interval ``[lower_bound, 100]``.

    Parameters
    ----------
    m : float
        Location of the underlying normal, in log10 units.
    v : float
        Scale of the underlying normal, in log10 units.
    lower_bound : float
        Lower truncation point, in log10 units.

    Returns
    -------
    float
        ``10`` raised to the sampled exponent.
    """
    # truncnorm expects its bounds in standard-normal units.
    a = (lower_bound - m) / v
    b = (100 - m) / v
    exponent = stats.truncnorm.rvs(a, b, loc=m, scale=v, random_state=None)
    return 10 ** exponent

In [30]:
# Running time of each module: one draw per node of the tree.
node_DT = {node: generate_DT(m=1.85, v=0.15, lower_bound=1) for node in tree_dic}
# Message sizes for each pair of modules (edge), as rows of
# [parent, child, send size, receive size].
edge_MS = [
    [parent, child, generate_DT(v=1.0), generate_DT(v=1.0)]
    for parent, children in tree_dic.items()
    for child in children
]

In [31]:
# Edge table: one row per parent -> child edge with its generated message sizes.
df_edge_info = pd.DataFrame(edge_MS, columns=['start','end','send_MS','recv_MS'])

In [32]:
# Display the edge table.
df_edge_info

Unnamed: 0,start,end,send_MS,recv_MS
0,0,1000,15420.553167,400.748464
1,1000,1001,148.194680,1993.521991
2,1001,1002,1679.162911,2284.858820
3,1002,1003,19235.494013,229014.460299
4,1003,1004,105.905503,64.709731
...,...,...,...,...
25944,993,26944,241.390361,2868.195860
25945,993,26945,5556.208075,12268.641091
25946,993,26946,999.102548,79.381063
25947,993,26947,141.600577,473.301522


In [161]:
def predic_RPC_cost(send, recv):
    """Predict the RPC cost of one call from its message sizes.

    The cost is linear in ``send + recv``: it equals 4.592256 when the total is
    0 and reaches 9.021986 when ``(send + recv) / 250`` equals 400.
    NOTE(review): the units of the sizes and of the result are not visible
    here — confirm against the source of these constants.

    Parameters
    ----------
    send : float
        Size of the message sent.
    recv : float
        Size of the message received.

    Returns
    -------
    float
        Predicted cost.
    """
    base_cost = 4.592256
    cost_at_full_scale = 9.021986
    scaled_volume = (send + recv) / 250
    increment = scaled_volume * (cost_at_full_scale - base_cost) / 400
    return increment + base_cost

## Runtime Distribution

In [None]:
# The start time of a parent is the last end time of all groups.
# The runtime of a group equals that of its longest member; the runtime of all children is the sum over all groups.

In [49]:
# Memo of estimated durations: node id -> duration, filled in by duration_estimate.
DE = {}

In [167]:
def duration_estimate(node_id):
    """Estimate the duration of ``node_id`` and memoise it in ``DE``.

    A node without children takes its own ``node_DT`` value. Otherwise every
    child contributes its own estimate plus the RPC cost predicted from the
    message sizes on the edge ``node_id -> child``. A group costs as much as
    its most expensive child, and the costs of the groups listed in
    ``nodes_structure[node_id]`` are added up.

    NOTE(review): for a node that has children, its own ``node_DT`` entry is
    not added to the result — confirm that this is intended.

    Parameters
    ----------
    node_id : int
        Node to estimate; must be a key of ``tree_dic``.

    Returns
    -------
    float
        The estimated duration, also stored in ``DE[node_id]``.

    Raises
    ------
    IndexError
        If ``df_edge_info`` has no row for one of the node's child edges.
    """
    if len(tree_dic[node_id]) == 0:
        DE[node_id] = node_DT[node_id]
        return node_DT[node_id]

    # Narrow the edge table to this node's outgoing edges once, instead of
    # scanning the whole table twice for every child.
    out_edges = df_edge_info[df_edge_info['start'] == node_id]

    d_sum = 0
    for g in nodes_structure[node_id]:
        g_max = 0
        for nn in g:
            d_nn = DE[nn] if nn in DE else duration_estimate(nn)
            # Communication cost between nn and node_id; the first matching
            # row supplies both message sizes.
            edge = out_edges[out_edges['end'] == nn]
            send_MS = edge['send_MS'].values[0]
            recv_MS = edge['recv_MS'].values[0]
            d_nn = d_nn + predic_RPC_cost(send_MS, recv_MS)
            if d_nn > g_max:
                g_max = d_nn
        d_sum += g_max
    DE[node_id] = d_sum
    return d_sum