In [None]:
# This notebook demonstrates call stack analysis using TraceLens.
# The first part of the notebook is the usual perf breakdown analysis to set up the context.
# The second part is the call stack analysis, which is the main focus of this notebook.

from pprint import pprint
import json
import pandas as pd
from TraceLens import TreePerfAnalyzer

In [None]:
# Point this at your own profile; a single-rank profile taken from a multi-GPU run works too.
path = '/your/path/to/profile.json'
# add_python_func=True keeps the python function events, which is what lets a call stack be
# traced back to the PyTorch python frontend and the model code. Pass add_python_func=False
# instead when only the backend is of interest.
perf_analyzer = TreePerfAnalyzer.from_file(
    path,
    add_python_func=True,
)

In [None]:
# One row per lowest-level CPU op (lowest in the call stack sense), together with
# the GPU time that op "induces". Kernel names are included in the table.
df_kernel_launchers = perf_analyzer.get_df_kernel_launchers(
    include_kernel_names=True,
)
# Preview: rounding is element-wise, so rounding only the previewed rows shows the same values.
df_kernel_launchers.head().round(2)

In [None]:
# Collapse the launcher table by op name to get an op-wise breakdown of GPU time.
df_kernel_launchers_summary = perf_analyzer.get_df_kernel_launchers_summary(
    df_kernel_launchers
)
# Preview the first rows, rounded to 2 decimals for readability.
df_kernel_launchers_summary.head().round(2)

In [None]:
# Detailed breakdown over every kernel-launching CPU op: rows are grouped by unique
# (op name + input dims/types/strides/concrete args) and GPU time is aggregated per group.
# This shows which op, with which arguments, costs the most time.
perf_analyzer.get_df_kernel_launchers_unique_args(
    df_kernel_launchers,
    include_pct=True,
)

In [None]:
# The same unique-args breakdown, narrowed to a single op so it can be drilled into.
# 'aten::copy_' serves as the example: it is a common op that reduces flops utilization.
df_op_interest = perf_analyzer.get_df_kernel_launchers_unique_args(
    df_kernel_launchers,
    event_name="aten::copy_",
    include_pct=True,
)
df_op_interest.head()

In [None]:
# for further analysis of where this copy comes from we can look at the call stack
# Fail early with a readable message: sampling from an empty frame raises an opaque error.
if df_op_interest.empty:
    raise ValueError(
        "df_op_interest is empty: no kernel launchers matched the selected op, "
        "so there is no call stack to inspect"
    )
# A fixed random_state makes the picked example (and every output derived from it below)
# identical on each Restart & Run All; change the seed to look at a different row.
sample_row = df_op_interest.sample(1, random_state=0).iloc[0]
# 'ex_UID' is the UID of an example event for this argument combination
uid = sample_row['ex_UID']
evt = perf_analyzer.tree.get_UID2event(uid)
# Print the chain of parents of the event, showing the listed fields for cpu ops.
# The returned value is kept as root_node so that it can be used as a subtree root later.
root_node = perf_analyzer.tree.traverse_parents_and_print(
    evt,
    cpu_op_fields=('Input Dims', 'Input type', 'Input Strides', 'Concrete Inputs'),
)

In [None]:
# now we can do a subtree print to further contextualize this copy
# root_node is the value returned by traverse_parents_and_print for the sampled event;
# presumably the top-most ancestor of that event -- TODO confirm against TraceLens.
perf_analyzer.tree.traverse_subtree_and_print(root_node)

In [None]:
# demonstrate manually traversing the tree
# Events are dict-like records linked by UID: 'children' holds the UIDs of the child events
# and 'parent' holds the UID of the parent event. get_UID2event resolves a UID to its event.
uid = sample_row['ex_UID']
evt = perf_analyzer.tree.get_UID2event(uid)
print(f"Event UID: {evt['UID']}, Name: {evt.get('name', 'Unknown')}")

print("Traversing children of the event:")
# An event without a 'children' entry simply has nothing to iterate over (no KeyError).
children_uids = evt.get('children') or []
for child_uid in children_uids:
    child_event = perf_analyzer.tree.get_UID2event(child_uid)
    name = child_event.get('name')
    print(f"UID: {child_uid}, Name: {name}")

print("Traversing parents of the event:")
parent_uid = evt.get('parent')
if parent_uid is None:
    # No 'parent' entry: there is nothing to look up, so do not call get_UID2event(None).
    print("Event has no parent")
else:
    parent_event = perf_analyzer.tree.get_UID2event(parent_uid)
    name = parent_event.get('name')
    print(f"UID: {parent_uid}, Name: {name}")
