In [None]:
from pprint import pprint
import json
import pandas as pd
from TraceLens import TreePerfAnalyzer

In [None]:
# Location of the profiler trace to analyze (placeholder: point this at a real profile JSON).
path = '/path/to/profile.json'
# Build the analyzer from the trace file; the cells below read events through `perf_analyzer.tree`.
# NOTE(review): add_python_func=True presumably adds Python function events to the tree; confirm against the TraceLens docs.
perf_analyzer = TreePerfAnalyzer.from_file(path, add_python_func=True)

In [None]:
def get_next_host_op(perf_analyzer, host_op):
    """
    Find the host op whose GPU event directly follows this host op's GPU work.

    The GPU events launched by ``host_op`` are looked up, the one with the
    largest ``ts`` is taken as the last one, and the GPU event registered at
    the following ``stream_index`` of the same stream is fetched. The host op
    returned is the parent of that GPU event's launcher.

    Args:
        perf_analyzer: analyzer whose ``tree`` exposes ``get_UID2event``,
            ``dict_stream_index2event`` and ``get_parent_event``.
        host_op: event mapping whose ``'gpu_events'`` entry lists GPU event UIDs.

    Returns:
        The parent event of the launcher of the next GPU event in the stream.

    Raises:
        ValueError: if ``host_op`` has no GPU events (entry missing, ``None``
            or empty).
        AssertionError: if the GPU events span more than one stream, or if the
            next GPU event, its launcher, or its host op cannot be found.
    """
    gpu_event_uids = host_op.get('gpu_events')
    # An empty list is rejected together with a missing entry. Previously it
    # fell through to the single-stream assertion below and failed with a
    # message about streams that did not describe the real problem.
    if gpu_event_uids is None or len(gpu_event_uids) == 0:
        raise ValueError("Host op does not have gpu events")
    gpu_events = [perf_analyzer.tree.get_UID2event(uid) for uid in gpu_event_uids]
    gpu_streams = [e['args']['stream'] for e in gpu_events]
    # as a simplifying assumption we assume all gpu events are in the same stream
    assert len(set(gpu_streams)) == 1, "Not all GPU events are in the same stream"
    gpu_stream = gpu_streams[0]
    # The last GPU event of this host op is the one with the largest start timestamp.
    sorted_gpu_events = sorted(gpu_events, key=lambda x: x['ts'])
    last_event = sorted_gpu_events[-1]
    # The successor is looked up by (stream, stream_index + 1).
    stream_index = last_event['args']['stream_index']
    next_index = stream_index + 1
    next_gpu_event = perf_analyzer.tree.dict_stream_index2event.get((gpu_stream, next_index), None)
    assert next_gpu_event is not None, "No next gpu event found in the stream"
    # lets get the parent host op of this next gpu event
    # tree is like host op -> runtime op (cuda/ hip launch) -> gpu op
    next_gpu_event_launcher = perf_analyzer.tree.get_parent_event(next_gpu_event)
    assert next_gpu_event_launcher is not None, "No launcher event found for the next gpu event"
    next_gpu_event_host_op = perf_analyzer.tree.get_parent_event(next_gpu_event_launcher)
    assert next_gpu_event_host_op is not None, "No host op found for the next gpu event"
    return next_gpu_event_host_op


In [None]:
def summarize_gpu_events_for_host_op(perf_analyzer, host_op):
    """
    Print a one-line summary for each GPU event launched by a host op.

    A header line with the host op's name is printed first, followed by one
    line per GPU event showing its UID, stream, stream index, duration and
    the first 64 characters of its name.

    Raises ValueError when the host op has no 'gpu_events' entry.
    """
    uids = host_op.get('gpu_events')
    if uids is None:
        raise ValueError("Host op does not have gpu events")

    # Resolve every UID before printing anything, so a failed lookup emits no partial output.
    kernels = []
    for uid in uids:
        kernels.append(perf_analyzer.tree.get_UID2event(uid))

    print("GPU Events for host op: ", host_op['name'])
    for kernel in kernels:
        kernel_uid = kernel['UID']
        stream = kernel['args']['stream']
        stream_index = kernel['args']['stream_index']
        duration = kernel['dur']
        short_name = kernel['name'][:64]  # names are truncated to 64 characters
        print(f"UID: {kernel_uid}, Stream: {stream}, Stream Index: {stream_index}, Duration: {duration}, Name: {short_name}")

In [None]:

# Collect every event in the tree whose name is one of the two MIOpen ops of interest.
miopen_events = [e for e in perf_analyzer.tree.events if e['name'] in ['aten::miopen_convolution', 'aten::miopen_batch_norm']]
for evt in miopen_events:
    print('miopen fwd event:')
    print('UID:', evt['UID'], 'Name:', evt['name'])
    # List the GPU events launched by this MIOpen host op.
    summarize_gpu_events_for_host_op(perf_analyzer, evt)
    print('')
    print("Next host op:")
    # Host op that launched the GPU event following this op's last GPU event in the same stream.
    next_evt = get_next_host_op(perf_analyzer, evt)
    print(next_evt['name'])
    summarize_gpu_events_for_host_op(perf_analyzer, next_evt)
    # Resolve the next host op's GPU event UIDs to events so the kernel name can be inspected.
    gpu_events = [perf_analyzer.tree.get_UID2event(uid) for uid in next_evt.get('gpu_events')]
    # Only the first listed GPU event of the next host op is checked.
    # NOTE(review): gpu_events[0] raises IndexError if the next host op's 'gpu_events' list is empty.
    if 'elementwise' in gpu_events[0]['name']:
        print('Elementwise kernel found after miopen kernel -> Fusion opportunity!')
    print("==="*20)
    # Stop after the first MIOpen event; remove this break to scan all of them.
    break
    



In [None]:
# An event's UID can be used to fetch it directly for debug analysis.
# Example: this UID is hard-coded; substitute one that exists in the loaded trace
# (for instance a UID printed by the cells above).
uid = 233728
event = perf_analyzer.tree.get_UID2event(uid)

# Traverse the parents of this event and print them.
perf_analyzer.tree.traverse_parents_and_print(event)

In [None]:
# Traverse the children: print the subtree under the event with this UID.
# The UID is hard-coded; substitute one that exists in the loaded trace.
uid = 60134
event = perf_analyzer.tree.get_UID2event(uid)
perf_analyzer.tree.traverse_subtree_and_print(event)