In [None]:
from pprint import pprint
import json
import pandas as pd
from TraceLens import TreePerfAnalyzer

In [None]:
# Replace with the path to your own profile. A single-rank profile taken from
# a multi-GPU run works as well.
path = '/path/to/profile.json'

# Build the analyzer from the trace file at `path`; the cells below reuse it.
perf_analyzer = TreePerfAnalyzer.from_file(path)

In [None]:
# Get a breakdown of the GPU timeline: busy time, idle time, communication time, etc.
perf_analyzer.get_df_gpu_timeline()

In [None]:
# Table of all lowest-level CPU operations (from the call-stack perspective)
# and the time they "induce" on the GPU.
df_kernel_launchers = perf_analyzer.get_df_kernel_launchers()
# Display only the head of the table, rounded to 2 decimals for readability.
df_kernel_launchers.round(2).head()

In [None]:
# Collapse the kernel-launcher table by op name; the result is an
# op-wise breakdown of GPU time.
df_kernel_launchers_summary = perf_analyzer.get_df_kernel_launchers_summary(
    df_kernel_launchers
)
df_kernel_launchers_summary.round(2).head()

In [None]:
# Drill into a single op: filter the launcher table by op name, then group
# by the input dims to get a per-shape breakdown.
df_kernel_launchers_summary_name_shapes = perf_analyzer.get_df_kernel_launchers_summary_by_shape(
    df_kernel_launchers,
    "aten::mm",
)
df_kernel_launchers_summary_name_shapes.round(2)

In [None]:
# Roofline for ops
# Currently supported: GEMM, CONV fwd+bwd, FA
# (many more coming soon)

# Example 1: GEMM
# Collect every event in the trace tree whose name is one of the GEMM ops.
gemm_events = [
    event
    for event in perf_analyzer.tree.events
    if event['name'] in ['aten::addmm', 'aten::mm', 'aten::_scaled_mm']
]
print(f"Found {len(gemm_events)} gemm events")

# Take an example event and compute perf metrics.
# Guarded: a trace without any GEMM op would otherwise raise IndexError on
# gemm_events[0] and abort a "Run All" of the notebook.
if gemm_events:
    gemm_event = gemm_events[0]
    print("Event dict:")
    pprint(gemm_event)
    print("Perf metrics dict:")
    pprint(perf_analyzer.compute_perf_metrics(gemm_event))
else:
    print("No gemm events in this trace; skipping the single-event example.")


In [None]:
# Perf-metrics table covering every GEMM event collected above
# (forward events only, kernel names included).
df_gemm_ops = perf_analyzer.build_df_perf_metrics(
    gemm_events,
    bwd=False,
    non_data_mov=True,
    include_kernel_names=True,
)
df_gemm_ops.head()

In [None]:
# Summarize by grouping across the params (M, K, N and bias) and computing
# aggregate metrics; only the 'mean' aggregation is requested here.
perf_analyzer.summarize_df_perf_metrics(df_gemm_ops, ['mean'])

In [None]:
# Example 2a: FlashAttention forward
# Pick out the FlashAttnFunc events, build their forward perf metrics and
# summarize with the mean.
fa_events = [
    evt
    for evt in perf_analyzer.tree.events
    if evt['name'] == 'FlashAttnFunc'
]
df_fa_fwd_ops = perf_analyzer.build_df_perf_metrics(
    fa_events,
    bwd=False,
    non_data_mov=True,
)
perf_analyzer.summarize_df_perf_metrics(df_fa_fwd_ops, ['mean'])

In [None]:
# Example 2b: FlashAttention backward
# Note: the backward events belonging to a forward-pass event are found by
# traversing the autograd links, so the forward event list is passed again
# with bwd=True.
df_fa_bwd_ops = perf_analyzer.build_df_perf_metrics(
    fa_events,
    bwd=True,
    non_data_mov=True,
)
perf_analyzer.summarize_df_perf_metrics(df_fa_bwd_ops, ['mean'])

In [None]:
# Example 3a: convolution forward
# Select the aten::convolution events, build their forward perf metrics and
# summarize with the mean.
conv_events = [
    evt
    for evt in perf_analyzer.tree.events
    if evt['name'] == 'aten::convolution'
]
df_conv_fwd_ops = perf_analyzer.build_df_perf_metrics(
    conv_events,
    bwd=False,
    non_data_mov=True,
)
perf_analyzer.summarize_df_perf_metrics(df_conv_fwd_ops, ['mean'])

In [None]:
# Example 3b: convolution backward
# Same convolution event list as the forward example, now with bwd=True.
df_conv_bwd_ops = perf_analyzer.build_df_perf_metrics(
    conv_events,
    bwd=True,
    non_data_mov=True,
)
perf_analyzer.summarize_df_perf_metrics(df_conv_bwd_ops, ['mean'])

In [None]:
# Example 4: unary elementwise ops
# Op names treated as unary elementwise in this example.
unary_elemwise_op_names = [
    'aten::copy', 'aten::copy_',
    # Fixed typo: this entry used to read 'atem::clamp_min', so the
    # out-of-place clamp_min op was silently left out of the filter.
    'aten::clamp_min', 'aten::clamp_min_',
    'aten::sigmoid',
]

# Keep only the events whose name is in the list above, then build and
# summarize their forward perf metrics (kernel names included).
unary_elementwise_events = [
    event
    for event in perf_analyzer.tree.events
    if event['name'] in unary_elemwise_op_names
]
df_unary_elementwise_ops = perf_analyzer.build_df_perf_metrics(
    unary_elementwise_events,
    bwd=False,
    non_data_mov=True,
    include_kernel_names=True,
)
perf_analyzer.summarize_df_perf_metrics(df_unary_elementwise_ops, ['mean'])

In [None]:
# Example 5: binary elementwise ops
# Op names treated as binary elementwise in this example.
binary_elemwise_op_names = [
    'aten::div',
    'aten::div_',
    'aten::mul',
    'aten::mul_',
    'aten::add',
    'aten::add_',
    'aten::sigmoid_backward',
    'aten::threshold_backward',
]

# Keep only the events whose name is in the list above, then build and
# summarize their forward perf metrics (kernel names included).
binary_elementwise_events = [
    evt
    for evt in perf_analyzer.tree.events
    if evt['name'] in binary_elemwise_op_names
]
df_binary_elementwise_ops = perf_analyzer.build_df_perf_metrics(
    binary_elementwise_events,
    bwd=False,
    non_data_mov=True,
    include_kernel_names=True,
)
perf_analyzer.summarize_df_perf_metrics(df_binary_elementwise_ops, ['mean'])