In [2]:
import torch
import os
import torch.nn as nn
import torch.profiler
import torch.fx as fx
import pandas as pd
import pygraphviz as pgv
from VGG import VGG16

# pandas display settings applied for the rest of the notebook.
_DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.width': None,
    'display.max_colwidth': 1000,
}
for _option, _value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option, _value)


# Model setup: a VGG16 instance and one random 1x3x224x224 input, both moved to the GPU.
model = VGG16().cuda()
input_data = torch.randn(1, 3, 224, 224).cuda()



In [3]:
# Extract the computation graph with torch.fx and tabulate it.
model_name='VGG16'
tracer = fx.Tracer()
graph = tracer.trace(model)

# Wrap the traced graph in a GraphModule so submodules can be looked up by target.
graph_module = fx.GraphModule(model, graph)

# Qualified name -> submodule, built once instead of once per call_module node.
modules_by_qualname = dict(graph_module.named_modules())

data = []   # one row per graph node, in graph order
adj = {}    # node name -> names of the nodes that consume its output
for node in graph.nodes:
    downstream = [user.name for user in node.users]
    upstream = [input_node.name for input_node in node.all_input_nodes]

    # extend() copies the names, so the list stored in `data` stays independent of `adj`.
    adj.setdefault(node.name, []).extend(downstream)

    # Only call_module nodes carry a module name and a module repr.
    module_qualname = ""
    detailed_op = ""
    if node.op == "call_module":
        module_qualname = node.target
        detailed_op = str(modules_by_qualname[node.target])

    data.append([
        node.op, node.name, len(downstream), len(upstream), downstream, upstream,
        node.target, str(node.args), module_qualname, detailed_op
    ])


static_graph = pd.DataFrame(data, columns=[
    "op_type", "name", "num_downstream", "num_upstream", "downstream", "upstream",
    "target", "args", "module_qualname", "detailed_op"
])

# Label every node. call_module nodes become 'nn.Module: <Class>_<k>', where k
# counts the nodes of that class seen so far; these labels are what the later
# profiler cell compares against the 'name' column of python_function.csv.
# Every other node keeps its FX node name.
# NOTE(review): a module invoked several times yields several call_module nodes
# and therefore several indices here - confirm this matches the profiler's
# naming before using the notebook on a model that reuses layers.
op_type = {}       # class name -> number of call_module nodes of that class
name_module = {}   # node name -> [label used when t=0, label used when t=1]
module_column = []
for node_name, detailed_op in zip(static_graph['name'], static_graph['detailed_op']):
    if detailed_op != "":
        # The class name is the text before the first '(' of the module repr.
        key = detailed_op.split("(")[0]
        index = op_type.get(key, 0)
        op_type[key] = index + 1
        module_name = 'nn.Module: ' + key + '_' + str(index)
        name_module[node_name] = [detailed_op, module_name]
    else:
        module_name = node_name
        name_module[node_name] = [node_name, node_name]
    module_column.append(module_name)
# Assign the whole column at once rather than enlarging the frame cell by cell.
static_graph['module'] = module_column


# Same adjacency list, keyed and valued by module label instead of node name.
adj_module = {}
for source_name, neighbor_names in adj.items():
    adj_module[name_module[source_name][1]] = [
        name_module[neighbor_name][1] for neighbor_name in neighbor_names
    ]

#print(dict(graph_module.named_modules()))


In [4]:
# Visualise the computation graph
def draw_graph(adj, name_module, start_node='x', output_file='graph.png', t=0):
    """Draw every node reachable from ``start_node`` and write the image to disk.

    The walk is a depth-first traversal that follows ``adj`` in list order. It
    uses an explicit stack rather than recursion, so long chains of nodes
    cannot hit Python's recursion limit.

    Args:
        adj: Mapping from a node name to the list of its successor names.
            Nodes missing from the mapping are treated as having no successors.
        name_module: Mapping from a node name to an indexable of labels;
            ``name_module[node][t]`` is used as the label of ``node``.
        start_node: Name of the node the traversal starts from.
        output_file: Path handed to ``AGraph.draw``.
        t: Index selecting which label of ``name_module[node]`` to display.

    Raises:
        KeyError: If ``start_node`` or any reachable node is missing from
            ``name_module``.
    """
    G = pgv.AGraph(strict=True, directed=True)

    visited = {start_node}
    G.add_node(start_node, label=name_module[start_node][t])
    # Each stack entry pairs a node with an iterator over its remaining
    # successors, which reproduces the visiting order of a recursive DFS.
    stack = [(start_node, iter(adj.get(start_node, ())))]
    while stack:
        node, remaining = stack[-1]
        for neighbor in remaining:
            G.add_node(neighbor, label=name_module[neighbor][t])
            G.add_edge(node, neighbor, arrowsize=0.6)
            if neighbor not in visited:
                visited.add(neighbor)
                stack.append((neighbor, iter(adj.get(neighbor, ()))))
                break  # descend into the new node before finishing this one
        else:
            # All successors of `node` have been handled.
            stack.pop()

    G.graph_attr['splines'] = 'true'
    G.graph_attr['rankdir'] = 'TB'  # top to bottom
    G.node_attr['shape'] = 'box'
    G.node_attr['style'] = 'filled'
    G.node_attr['fillcolor'] = 'lightblue'
    G.node_attr['fontname'] = 'Consolas'
    G.node_attr['fontsize'] = 15
    G.layout(prog='dot')
    G.draw(output_file)

# Render the graph twice: first with label index 1, then with label index 0.
for label_index in (1, 0):
    draw_graph(adj, name_module, output_file=f'graph_{label_index}.png', t=label_index)

In [6]:
# Capture a trace with the PyTorch profiler.
Wait=2
Warmup=0
Active=1
Repeat=1
# Number of forward passes needed to walk through the whole schedule.
total=(Wait+Warmup+Active)*Repeat

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA
    ],
    schedule=torch.profiler.schedule(
        wait=Wait,  # wait steps
        warmup=Warmup,  # warmup steps
        active=Active,  # active steps
        repeat=Repeat   # number of repeats
    ),
    #on_trace_ready=torch.profiler.tensorboard_trace_handler(dir_name='./log',worker_name=model_name),  # save the trace log
    record_shapes=True,  # record input shapes
    with_stack=True  # record stack traces
) as prof:
    for _ in range(total):  # one profiler step per forward pass
        output = model(input_data)
        prof.step()

# One spelling of the log directory is shared by the export and the converter
# (the original used './log' for one and 'log' for the other).
log_directory='log'
output_directory='log_csv'
log_dir = log_directory

# Save the Chrome trace as <log_dir>/<model_name>.json.
# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory.
os.makedirs(log_dir, exist_ok=True)
json_path = os.path.join(log_dir, f'{model_name}.json')
prof.export_chrome_trace(json_path)

# Convert the JSON traces to CSV files.
from convert_json_to_csv import convert_json_to_csv
convert_json_to_csv(output_directory,log_directory)

  warn("Profiler won't be using warmup, this can skew profiler results")


正在处理文件 VGG16.json...
文件 VGG16.json 已成功转化为 log_csv/VGG16 目录下的csv
所有文件转换完成。


In [7]:
# Load the per-model CSV tables written by the converter.
csv_dir = os.path.join(output_directory, model_name)
python_function = pd.read_csv(os.path.join(csv_dir, 'python_function.csv'))
cuda_runtime = pd.read_csv(os.path.join(csv_dir, 'cuda_runtime.csv'))
kernel = pd.read_csv(os.path.join(csv_dir, 'kernel.csv'))

# These do not depend on the module, so compute them once outside the loop.
is_launch = cuda_runtime['name'].str.contains('cudaLaunchKernel')
runtime_end = cuda_runtime['ts'] + cuda_runtime['dur']

# For every module label in static_graph, collect the kernels launched inside
# the time span of the first python_function row carrying that label.
module_kernel = {}   # module label -> list of (kernel name, correlation id)
kernel_module = {}   # kernel name  -> list of module labels that launched it
for module in static_graph['module']:
    module_kernel[module] = []
    df = python_function[python_function['name'] == module].head(1)
    if len(df) == 0:
        # No python_function row has this label; the module keeps an empty list.
        continue

    start = df['ts'].values[0]
    end = start + df['dur'].values[0]
    # Launch calls that start and finish inside [start, end] ...
    launch = cuda_runtime[(cuda_runtime['ts'] >= start) & (runtime_end <= end) & is_launch]
    # ... joined to kernel rows through the shared correlation id.
    tmp_kernel = kernel[kernel['correlation'].isin(launch['correlation'])]

    launched = list(zip(tmp_kernel['name'], tmp_kernel['correlation']))
    module_kernel[module] = launched
    for kernel_name, _correlation in launched:
        kernel_module.setdefault(kernel_name, []).append(module)

# All module labels, in static_graph order.
module_list=list(module_kernel.keys())
# All kernel names, in order of first appearance.
kernel_list=list(kernel_module.keys())

In [10]:

def draw_module_kernel(adj_module, module_kernel, output_file='module_kernel',module_edge=0,mangle=0,label_width=25):
    """Draw modules, the kernels attributed to them, and write a PNG.

    Modules are drawn as light-blue boxes and kernels as light-green ellipses,
    with one edge from each module to each of its kernels. Kernel nodes are
    keyed by kernel name, so a kernel listed under several modules appears once.

    Args:
        adj_module: Mapping from a module label to the labels of its
            successors. Only read when ``module_edge`` is truthy.
        module_kernel: Mapping from a module label to a list of sequences
            whose first element is the kernel name.
        output_file: Prefix of the image path; the file written is
            ``output_file + str(module_edge) + '_' + str(mangle) + '.png'``.
        module_edge: When truthy, also draw red module-to-module edges.
        mangle: Only used as part of the output file name.
        label_width: Number of characters per line of a kernel label
            (previously hard-coded to 25). Must be positive.
    """
    G = pgv.AGraph(strict=True, directed=True)

    # Module nodes
    for module in module_kernel:
        G.add_node(module, shape='box', style='filled', fillcolor='lightblue')

    # Optional module-to-module edges
    if module_edge:
        for module, neighbors in adj_module.items():
            for neighbor in neighbors:
                G.add_edge(module, neighbor, arrowsize=0.5, color='red')

    # Kernel nodes and module-to-kernel edges
    for module, kernels in module_kernel.items():
        for kernel in kernels:
            kernel_name = kernel[0]
            # Break the name into fixed-width lines so the node stays narrow.
            kernel_label = '\n'.join(
                kernel_name[i:i + label_width]
                for i in range(0, len(kernel_name), label_width)
            )
            G.add_node(kernel_name, label=kernel_label, shape='ellipse',
                       style='filled', fillcolor='lightgreen')
            G.add_edge(module, kernel_name, arrowsize=0.6)

    G.graph_attr['splines'] = 'true'
    G.graph_attr['rankdir'] = 'LR'
    G.node_attr['fontname'] = 'Consolas'
    G.node_attr['fontsize'] = 15
    # Graphviz layout programs, per the original author's notes:
    #   dot   - hierarchical layout, for directed graphs
    #   neato - force-directed layout, for undirected graphs
    #   fdp   - force-directed layout, for undirected graphs
    #   sfdp  - multi-scale force-directed layout, for large graphs
    #   twopi - radial layout
    #   circo - circular layout
    G.layout(prog='dot')
    G.draw(output_file+str(module_edge)+'_'+str(mangle)+'.png')

# draw_module_kernel(adj_module,module_kernel,module_edge=1)


In [48]:
#cupti——pytorch profiler 的kernel对应关系
import pandas as pd
import json
# Symbol table loaded from JSON; in the visible code it is only referenced by
# the disabled drawing variant further down.
with open('demangled_symbols_1.json','r') as f:
    dic = json.load(f)

# Kernel tables from the two tracing sources.
# NOTE(review): both paths are absolute and machine-specific.
cupti_kernel = pd.read_csv('/data/zkx/cupti_to_csv/trans_to_csv/trace_no_profiler_con_res/conc_kernel.csv')
py_kernel = pd.read_csv('/data/zkx/operator_to_kernel/log_csv/VGG16/kernel.csv')

# Order each table by its own start-time column.
cupti_kernel = cupti_kernel.sort_values(by='start')
py_kernel = py_kernel.sort_values(by='ts')

# Pair the two tables row by row in sorted order: CUPTI name -> profiler name.
# A CUPTI name that shows up again must map to the same profiler name.
kernel_dict = {}
assert len(cupti_kernel)==len(py_kernel)
for cupti_name, py_name in zip(cupti_kernel['name'], py_kernel['name']):
    if cupti_name in kernel_dict:
        assert kernel_dict[cupti_name] == py_name
    else:
        kernel_dict[cupti_name] = py_name

# Inverse mapping: profiler name -> CUPTI name (a later duplicate value wins).
reverse_kernel_dict = {py_name: cupti_name for cupti_name, py_name in kernel_dict.items()}

# Disabled variant of draw_module_kernel, kept inside a string literal so it is
# never executed. It differs from the live function in how kernel nodes are
# labelled: the kernel name is mapped through reverse_kernel_dict and the
# label is the 'short_name' entry of the matching record in `dic`.
'''
def draw_module_kernel_1(adj_module, module_kernel, output_file='module_kernel',module_edge=0):
    G = pgv.AGraph(strict=True, directed=True)
    
    # 添加 module 节点
    for module in module_kernel:
        G.add_node(module, shape='box', style='filled', fillcolor='lightblue')
    
    if module_edge:
        for module in adj_module:
            for neighbor in adj_module[module]:
                G.add_edge(module, neighbor, arrowsize=0.5, color='red')
    
    for module in module_kernel:
        for kernel in module_kernel[module]:
            k=reverse_kernel_dict[kernel[0]]
            k=dic[k]['short_name']
            kernel_label = '\n'.join([k[i:i+25] for i in range(0, len(k), 25)]) #每25个字符换行
            G.add_node(kernel[0], label=kernel_label,shape='ellipse', style='filled', fillcolor='lightgreen')
            G.add_edge(module, kernel[0], arrowsize=0.6)
    

    
    G.graph_attr['splines'] = 'true'
    G.graph_attr['rankdir'] = 'LR' 
    G.node_attr['fontname'] = 'Consolas'
    G.node_attr['fontsize'] = 15
    G.layout(prog='dot')
    G.draw(output_file+str(module_edge)+'_2'+'.png')

draw_module_kernel_1(adj_module,module_kernel,module_edge=0)
'''


import cxxfilt

def demangle_with_cxxfilt(symbol):
    """Return ``cxxfilt.demangle(symbol)``, or ``symbol`` itself on ``InvalidName``."""
    try:
        demangled = cxxfilt.demangle(symbol)
    except cxxfilt.InvalidName:
        # Hand the input back unchanged when cxxfilt rejects it.
        demangled = symbol
    return demangled


#解析kernel_dict 的key 然后 与 value 对比
# for key,value in kernel_dict.items():
#     if demangle_with_cxxfilt(key)!=value:
#         print('py_name:',value)
#         print('cupti_name:',demangle_with_cxxfilt(key))
#         print('-----------------')



In [None]:
op_type

{'Conv2d': 13,
 'BatchNorm2d': 13,
 'ReLU6': 13,
 'MaxPool2d': 5,
 'Linear': 3,
 'Dropout': 2}

In [None]:
'''
item in prof.key_averages() parameters:
[
 'add',
 'count',
 'cpu_children',
 'cpu_memory_usage',
 'cpu_parent',
 'cpu_time',
 'cpu_time_str',
 'cpu_time_total',
 'cpu_time_total_str',
 'cuda_time',
 'device_memory_usage',
 'device_time',
 'device_time_str',
 'device_time_total',
 'device_time_total_str',
 'device_type',
 'flops',
 'input_shapes',
 'is_async',
 'is_legacy',
 'is_remote',
 'is_user_annotation',
 'key',
 'node_id',
 'scope',
 'self_cpu_memory_usage',
 'self_cpu_time_total',
 'self_cpu_time_total_str',
 'self_device_memory_usage',
 'self_device_time_total',
 'self_device_time_total_str',
 'stack',
 'use_device']
'''
# One record per entry of prof.key_averages(): its key, device-time totals and call count.
data = [
    {
        "Name": item.key,
        "GPU total": item.device_time_total,
        "Self GPU total": item.self_device_time_total,
        "Calls": item.count,
    }
    for item in prof.key_averages()
]

df = pd.DataFrame(data)

# Names of every entry whose key contains 'aten'.
is_aten = df['Name'].str.contains('aten')
aten = df.loc[is_aten, 'Name'].to_list()

# Entries with non-zero self device time, minus any whose name contains
# 'aten', 'Mem' or 'Profiler'.
kernel = df[df['Self GPU total'] > 0]
for excluded in ('aten', 'Mem', 'Profiler'):
    kernel = kernel[~kernel['Name'].str.contains(excluded)]
kernel = kernel.reset_index(drop=True)
py_kernel = kernel['Name'].unique().tolist()
aten

['aten::conv2d',
 'aten::convolution',
 'aten::_convolution',
 'aten::cudnn_convolution',
 'aten::reshape',
 'aten::view',
 'aten::add_',
 'aten::batch_norm',
 'aten::_batch_norm_impl_index',
 'aten::cudnn_batch_norm',
 'aten::empty_like',
 'aten::empty',
 'aten::hardtanh_',
 'aten::clone',
 'aten::empty_strided',
 'aten::copy_',
 'aten::hardtanh',
 'aten::clamp',
 'aten::to',
 'aten::max_pool2d',
 'aten::max_pool2d_with_indices',
 'aten::linear',
 'aten::t',
 'aten::transpose',
 'aten::as_strided',
 'aten::addmm',
 'aten::dropout',
 'aten::native_dropout']