In [None]:
from TraceLens import TreePerfAnalyzer
from typing import Dict, Any


In [None]:
# Placeholder location of the profiler trace; point this at a real JSON trace before running.
path = '/path/to/profile.json'

# Build the analyzer straight from the trace file.
# NOTE(review): add_python_func=True presumably keeps Python-function events in the
# tree so that nn.Module scopes are available later — confirm against TraceLens docs.
perf_analyzer = TreePerfAnalyzer.from_file(
    path,
    add_python_func=True,
)

In [None]:
tree = perf_analyzer.tree

# Name of the nn.Module event used as the root of the latency tree.
root_module_name = 'nn.Module: DeepseekV2DecoderLayer_2'

# Take the first event with a matching name. `.get` skips events that carry no
# 'name' key, and the `None` default avoids an opaque StopIteration when the
# module is absent from this trace.
event = next((e for e in tree.events if e.get('name') == root_module_name), None)
if event is None:
    raise ValueError(f"No event named {root_module_name!r} found in the trace at {path!r}")

# NOTE(review): presumably annotates `event` and its nn.Module descendants with the
# 'GPU Time' / 'nn_module_children' fields read by the printing helpers — confirm.
perf_analyzer.build_nn_module_latency_tree(event)

In [None]:
def _format_pct_of(part, whole):
    """Format ``part`` as a percentage of ``whole``; return ``'N/A'`` when ``whole`` is zero."""
    if not whole:
        # A zero denominator has no meaningful percentage; avoid ZeroDivisionError.
        return "N/A"
    return f"{part / whole * 100:.2f}%"


def print_nn_module_latency_tree(root_nn_module, tree=None):
    """Print the nn.Module latency hierarchy rooted at ``root_nn_module``.

    Each node is printed on one line with its UID, name and GPU time; every
    non-root line also shows the node's share of its parent's GPU time.

    Args:
        root_nn_module: Event dict for the root module. It must provide 'UID'
            and 'GPU Time', and may provide 'name', 'nn_module_children'
            (a list of child UIDs) and 'Non-nn.Module GPU Time'.
        tree: Object exposing ``get_UID2event(uid)`` used to resolve child
            UIDs. When omitted, the notebook-level ``perf_analyzer.tree`` is
            used, and only if some node actually has children.
    """
    _traverse_nn_modules_subtree_recursive(root_nn_module, _prefix="", is_last=True, tree=tree)


def _traverse_nn_modules_subtree_recursive(node: Dict[str, Any], _prefix: str, is_last: bool,
                                           parent_gpu_time=None, tree: Any = None):
    """Print ``node`` and, recursively, its nn.Module children as an ASCII tree.

    Args:
        node: Event dict for the current module (see ``print_nn_module_latency_tree``).
        _prefix: Indentation/guide characters accumulated from the ancestors.
        is_last: Whether this node is the last entry printed under its parent;
            selects the corner connector instead of the tee connector.
        parent_gpu_time: GPU time of the parent node, or ``None`` for the root
            (no percentage is printed for the root).
        tree: Object exposing ``get_UID2event(uid)``; falls back to the global
            ``perf_analyzer.tree`` when ``None`` and children must be resolved.
    """
    connector = "└── " if is_last else "├── "
    name = node.get('name', 'Unknown')
    gpu_time = node['GPU Time']
    print_str = f"{_prefix}{connector}UID: {node['UID']}, Name: {name}, GPU Time: {gpu_time:.2f} µs"
    if parent_gpu_time is not None:
        print_str += f", Pct Parent: {_format_pct_of(gpu_time, parent_gpu_time)}"
    print(print_str)

    # Tolerate both a missing key and an explicit None value.
    nn_module_children = node.get('nn_module_children') or []
    non_nn_module_gpu_time = node.get('Non-nn.Module GPU Time')

    # The synthetic "Non-nn.Module" row, when present, is printed after all
    # children, so it counts as one more entry when deciding which is last.
    child_count_print = len(nn_module_children) + (1 if non_nn_module_gpu_time else 0)

    if nn_module_children and tree is None:
        # Resolve the fallback lazily so leaf nodes never need the global.
        tree = perf_analyzer.tree

    new_prefix = _prefix + ("    " if is_last else "│   ")
    for i, child_uid in enumerate(nn_module_children):
        child = tree.get_UID2event(child_uid)
        _traverse_nn_modules_subtree_recursive(
            child,
            new_prefix,
            is_last=(i == child_count_print - 1),
            parent_gpu_time=gpu_time,
            tree=tree,
        )

    if non_nn_module_gpu_time:
        print(f"{new_prefix}└── Non-nn.Module GPU Time: {non_nn_module_gpu_time:.2f} µs, "
              f"Pct Parent: {_format_pct_of(non_nn_module_gpu_time, gpu_time)}")


In [None]:
# Render the latency tree with the selected nn.Module event as its root.
print_nn_module_latency_tree(root_nn_module=event)