In [None]:
import json
import os
import subprocess
import sys
import time
import zipfile

import torch # needed for replay feature
import torchvision.models as torchvision_models
from torch.profiler import profile, record_function, ProfilerActivity

from TraceLens import TreePerfAnalyzer, EventReplayer

In [None]:
# lets create a dir and cwd to it
def create_dir_and_cwd(dir_name):
    """
    Create ``dir_name`` if it does not exist yet and make it the current working directory.

    Args:
        dir_name (str): Directory path. A relative path is resolved against the
            current working directory at call time.
    """
    # exist_ok=True replaces the separate os.path.exists() check, so there is no
    # gap between "check" and "create" in which the directory could appear.
    os.makedirs(dir_name, exist_ok=True)
    os.chdir(dir_name)
    print(f"Changed working directory to: {os.getcwd()}")

# NOTE: the path is relative, so re-running this cell without restarting the kernel
# creates and enters a nested "event_replay_example_wd" inside the current one.
create_dir_and_cwd("event_replay_example_wd")

In [None]:
# We will profile a resnet model to demo event replay
# If you want to replay from TraceLens perf report, you can skip down the notebook

def profile_resnet(path=None):
    """
    Profile forward + backward passes of a ResNet-18 and export a Chrome trace.

    The model and the random input / gradient tensors are placed on the "cuda"
    device in bfloat16.

    Args:
        path (str, optional): Output path of the exported trace.
            Defaults to "resnet_trace.json".
    Returns:
        str: path of the exported trace
    """
    device = "cuda"
    dtype = torch.bfloat16
    model = torchvision_models.resnet18().to(device=device, dtype=dtype)
    batch = 20
    C_IN, H_IN, W_IN = 3, 224, 224
    dummy_input = torch.randn(batch, C_IN, H_IN, W_IN).to(device=device, dtype=dtype)
    # Passed to backward() as the upstream gradient, so it must have the shape of the
    # model output (assumes the default 1000-class ResNet-18 head — TODO confirm).
    dummy_output = torch.randn(batch, 1000).to(device=device, dtype=dtype)
    if path is None:
        path = "resnet_trace.json"

    def trace_handler(p):
        p.export_chrome_trace(path)

    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
    wait = 10
    warmup = 5
    active = 3
    with profile(
        activities=activities,
        schedule=torch.profiler.schedule(
            wait=wait,
            warmup=warmup,
            active=active,
            repeat=1
            ),
        record_shapes=True,
        on_trace_ready=trace_handler
    ) as p:
        # With repeat=1 the schedule covers a single wait/warmup/active cycle, so
        # running exactly that many steps is enough; extra iterations would only
        # cost time without being recorded.
        for _ in range(wait + warmup + active):
            out = model(dummy_input)
            out.backward(dummy_output)
            p.step()
    return path


In [None]:
# Runs the profiling loop defined above (it uses the "cuda" device) and keeps the
# path of the exported trace for the analysis cells below.
profile_path = profile_resnet()

In [None]:
# Replace profile_path with the path to your own profile if you have one;
# it can be a single-rank profile from a multi-GPU run as well.
# If you are only interested in replaying from a perf report, skip down to that section.
perf_analyzer = TreePerfAnalyzer.from_file(profile_path)

In [None]:
# Table of kernel-launching ops, with the launched kernel names included
# (presumably one row per launcher event — see the TraceLens docs to confirm).
df_kernel_launchers = perf_analyzer.get_df_kernel_launchers(include_kernel_names=True)

# Summary of the launchers by unique args; the op to replay (e.g. a conv bwd op)
# is picked from this table in the next cell.
df_kernel_launchers_unique_args = perf_analyzer.get_df_kernel_launchers_unique_args(df_kernel_launchers, include_pct=True)
df_kernel_launchers_unique_args.head()

In [None]:
# get interesting op to replay based on row idx or any other method you like
device="cuda"
# NOTE: 11 is just an example position in the table above; which op it selects depends
# on the trace, and .iloc raises IndexError if the table has fewer than 12 rows.
row_idx = 11
row = df_kernel_launchers_unique_args.iloc[row_idx]
uid = row['ex_UID'] # get uid for row of interest
# look up the full event for that UID in the trace tree and replay it once
evt_to_replay = perf_analyzer.tree.get_UID2event(uid)
my_replayer = EventReplayer(evt_to_replay, device=device, verbose=True)
my_replayer.replay()


In [None]:
# very useful for understanding the op args
# (this is the same info that the batched-replay cells further down dump to JSON)

my_replayer.get_repro_info()

In [None]:
# a very quick and dirty benchmark function to check the fidelity of the replayed op
def benchmark_func(func, device, warmup=50, avg_steps=100):
    """
    Benchmark a function with warmup and average steps.
    Disclaimer: This method would be inaccurate for very short ops.
    Args:
        func (callable): The function to benchmark; it is called with no arguments.
        device (str | int | torch.device): Device the function runs on. CUDA devices
            are synchronized right before and right after the timed loop; other
            device types (e.g. "cpu") are timed without any synchronization.
        warmup (int): Number of warmup iterations.
        avg_steps (int): Number of iterations to average over (must be >= 1).
    Returns:
        float: Average time taken per iteration in microseconds.
    Raises:
        ValueError: If avg_steps is smaller than 1.
    """
    if avg_steps < 1:
        raise ValueError(f"avg_steps must be >= 1, got {avg_steps}")

    # Only CUDA work is asynchronous with respect to the host timer below.
    needs_sync = torch.device(device).type == "cuda"

    # Warmup phase
    for _ in range(warmup):
        func()

    # Benchmarking phase
    if needs_sync:
        torch.cuda.synchronize(device)
    # perf_counter() is monotonic and high resolution; time.time() can jump when the
    # system clock is adjusted.
    start_time = time.perf_counter()
    for _ in range(avg_steps):
        func()
    if needs_sync:
        torch.cuda.synchronize(device)
    end_time = time.perf_counter()

    elapsed_time = end_time - start_time
    avg_time_sec = elapsed_time / avg_steps
    avg_time_us = avg_time_sec * 1e6

    return avg_time_us

In [None]:
# Check fidelity of replay
# replay_time_mean is a host-side wall-clock average per replay() call (see benchmark_func),
# while the reference value is read from the 'total_direct_kernel_time_mean' column of the
# row selected earlier.
# NOTE(review): presumably that column is in microseconds too — confirm before trusting the diff.
replay_time_mean = benchmark_func(my_replayer.replay, device)
profile_time_mean = row['total_direct_kernel_time_mean']
# relative difference with the profile as baseline (a zero profile time would divide by zero)
percent_diff = (replay_time_mean - profile_time_mean) / profile_time_mean * 100
print(f"Average time per replay: {replay_time_mean:.2f} us")
print(f"Profile time mean: {profile_time_mean:.2f} us")
print(f"Percent difference: {percent_diff:.2f}%")
print(f"Abs difference: {replay_time_mean - profile_time_mean:.2f} us")


In [None]:
# lets further verify the replay fidelity by profiling the replay
# and checking the kernels 
def profile_the_replay(replayer, path="replay_trace.json"):
    """
    Profile repeated replays of an event and export the result as a Chrome trace.

    Args:
        replayer (EventReplayer): The EventReplayer object; its replay() method is
            called once per profiler step.
        path (str): Output path of the exported trace.
    Returns:
        str: path of the replayed events trace
    """

    def trace_handler(p):
        p.export_chrome_trace(path)

    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
    wait = 10
    warmup = 5
    active = 10
    with profile(
        activities=activities,
        schedule=torch.profiler.schedule(
            wait=wait,
            warmup=warmup,
            active=active,
            repeat=1
            ),
        record_shapes=True,
        on_trace_ready=trace_handler
    ) as p:
        # one replay per profiler step, for exactly one wait/warmup/active cycle
        for _ in range(wait + warmup + active):
            replayer.replay()
            p.step()

    return path


In [None]:
# Profile the replay and load the resulting trace with a second analyzer
replay_profile_path = profile_the_replay(my_replayer)
replay_perf_analyzer = TreePerfAnalyzer.from_file(replay_profile_path)
# events in the replay trace that carry the same name as the replayed op;
# only the first match is inspected (IndexError if there is none)
replay_evts = [e for e in replay_perf_analyzer.tree.events if e.get('name') == my_replayer.event.get('name')]
replay_evt = replay_evts[0]
# resolve the UIDs listed under 'gpu_events' to events and collect their names,
# once for the replay trace and once for the original ("ground truth") trace
replay_kernels = [replay_perf_analyzer.tree.get_UID2event(uid) for uid in replay_evt.get('gpu_events', [])]
replay_kernels_names = [e.get('name') for e in replay_kernels]
gt_kernels = [perf_analyzer.tree.get_UID2event(uid) for uid in evt_to_replay.get('gpu_events', [])]
gt_kernels_names = [e.get('name') for e in gt_kernels]
# names are truncated to 128 characters for printing only
print(f"GT kernels:")
for gt_name in gt_kernels_names:
    print(gt_name[:128])
print()
print("Replay kernels:")
for replay_name in replay_kernels_names:
    print(replay_name[:128])
print()
# set comparison on the full names: order and multiplicity of the kernels are ignored
assert set(replay_kernels_names) == set(gt_kernels_names), f"Replay kernels: {set(replay_kernels_names)} do not match ground truth kernels: {set(gt_kernels_names)}"


Replaying a batch of operations - batched replay

In [None]:
# list of events of interest for batched replay and repro generation
# lets say we are interested in all conv bwd ops
conv_bwd_mask = df_kernel_launchers_unique_args['name'] == 'aten::convolution_backward'
df_kernel_launchers_filt = df_kernel_launchers_unique_args[conv_bwd_mask]
# A comprehension keeps its loop variable local, so the notebook-level `row` and `uid`
# (selected earlier and read by the fidelity-check cell) are not overwritten here.
ops_interest = [
    perf_analyzer.tree.get_UID2event(ex_uid)  # full event for the example UID of each row
    for ex_uid in df_kernel_launchers_filt['ex_UID']
]
    

In [None]:
# save replay info as json
import json
repro_data_list = []
processed_count = 0

for event in ops_interest:
    # Initialize EventReplayer (device doesn't matter here, just for schema matching)
    # Set lazy=True as we only need the IR, not immediate tensor creation.
    # Verbose can be helpful for debugging schema mismatches during extraction.
    replayer = EventReplayer(event, lazy=True, verbose=False) # Set verbose=True for debug

    # Extract the serializable info
    repro_info = replayer.get_repro_info()
    repro_data_list.append(repro_info)
    processed_count += 1
print(f"Processed {processed_count} events.")

# --- Save the Extracted Data ---
OUTPUT_REPRO_FILE = 'event_replay_ir.json'
# Defined unconditionally because the batched-replay and zip cells below read it.
abs_path_replay_ir_json = os.path.abspath(OUTPUT_REPRO_FILE)
if repro_data_list:
    print(f"\nSaving {len(repro_data_list)} extracted operator infos to '{OUTPUT_REPRO_FILE}'...")
    with open(OUTPUT_REPRO_FILE, 'w') as f:
        json.dump(repro_data_list, f, indent=4)
    print("Save complete.")
    print(f"Repro data saved to: {abs_path_replay_ir_json}")
else:
    # Do not claim a save that did not happen; the cells below need this file.
    print(f"No events of interest found; '{OUTPUT_REPRO_FILE}' was not written.")



In [None]:
# running the batched replay
import subprocess
import os
import sys

from TraceLens import EventReplay
# batched_replay.py is located next to the EventReplay module inside the TraceLens package
dir_batched_replay = os.path.dirname(EventReplay.__file__)
batched_replay_file = os.path.join(dir_batched_replay, "batched_replay.py")
print(f"Running batched replay from directory: {dir_batched_replay}")
cmd = [
    sys.executable,      # the kernel's own interpreter; a bare "python" may resolve to a different environment
    batched_replay_file, # path to the batched replay script
    abs_path_replay_ir_json, # path to the replay IR file
    "--verbose"
]
# run from the script's directory; stdout/stderr are captured as text for reporting below
result = subprocess.run(cmd, cwd=dir_batched_replay,
                        capture_output=True, text=True)
if result.returncode != 0:
    print(f"Error running batched replay: {result.stderr}")
else:
    print("Batched replay completed successfully.")
    print(result.stdout)



In [None]:
# STANDALONE ARTIFACTS FOR REPRO -  independent of model code or tracelens code
# artifacts include (a) the replay IR json, (b) utils.py, (c) batched_replay.py, (d) batched_replay_readme.md
import zipfile
import os

# all helper files are taken from the directory of the batched replay script
utils_file_path = os.path.join(dir_batched_replay, "utils.py")
batched_replay_file = os.path.join(dir_batched_replay, "batched_replay.py")
readme_file_path = os.path.join(dir_batched_replay, "batched_replay_readme.md")
files = [
    abs_path_replay_ir_json,  # Path to the replay IR file
    utils_file_path,  # Path to utils.py
    batched_replay_file,  # Path to batched_replay.py
    readme_file_path  # path to the readme
]
zip_file_path = 'replay_code.zip'
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
    for file in files:
        zipf.write(file, arcname=os.path.basename(file))  # store under the base name so the archive is flat
print(f"Created zip file: {zip_file_path}")

Replay ops from report

In [None]:
# let's first generate the TraceLens perf report
# (imports repeated here so this section also runs when the batched-replay cells were skipped)
import subprocess
import sys

# hacky method to get the path to the TraceLens examples script:
# it assumes an "examples" directory sits next to the TraceLens package directory
import TraceLens
TraceLens_dir = os.path.dirname(os.path.dirname(TraceLens.__file__))
script_path = os.path.join(TraceLens_dir, 'examples', 'generate_perf_report.py')
perf_report_path = "perf_report.xlsx"
cmd = [
    sys.executable,   # the kernel's own interpreter; a bare "python" may resolve to a different environment
    script_path,      # path to the generate_perf_report.py script
    "--profile_json_path", profile_path,  # path to the profile json file
    "--output_xlsx_path", perf_report_path
]

# stdout/stderr are captured as text for reporting below
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
    print(f"Error generating perf report: {result.stderr}")
else:
    print("Perf report generated successfully.")
    print(f"Perf report saved to: {perf_report_path}")
    print(result.stdout)


In [None]:
# we can replay events from the perf reports as well - without the full profile too!
# This is because we essentially require the args and the op name to replay
# excel -> df -> for each row (row -> event -> replayer -> replayer IR -> append to replayer IR list) -> save replayer IR list as json
import pandas as pd
import ast
# read sheet from excel
# NOTE: the report path is hard-coded here (not taken from perf_report_path) and is
# resolved relative to the current working directory.

df_unique_ops = pd.read_excel('perf_report.xlsx', sheet_name='kernel_launchers_unique_args')

def row_to_evt(row):
    """
    Build a minimal event dict (op name + args) from one row of the perf report.

    The four argument columns hold Python literals stored as text; each of them
    is parsed back into a Python object with ast.literal_eval.

    Args:
        row: Mapping-like row (e.g. a pandas Series) providing 'name',
            'Input Dims', 'Input Strides', 'Input type' and 'Concrete Inputs'.
    Returns:
        dict: {'name': <op name>, 'args': {<arg column>: <parsed value>, ...}}
    """
    op_name = row['name']
    arg_columns = ('Input Dims', 'Input Strides', 'Input type', 'Concrete Inputs')
    parsed_args = {}
    for column in arg_columns:
        parsed_args[column] = ast.literal_eval(row[column])
    return {'name': op_name, 'args': parsed_args}


In [None]:
repro_data_list = []
processed_count = 0
# lets say we are interested in the following ops
# (named op_names_interest so the list of events called `ops_interest` in the
# batched-replay section is not replaced by a list of strings)
# NOTE(review): these look like MIOpen-backed op names; reports from other backends
# presumably use different names — adjust the list to match your report.
op_names_interest = ['aten::miopen_convolution',
                     'aten::convolution_backward',
                     'aten::miopen_batch_norm',
                     'aten::miopen_batch_norm_backward']

df_ops_interest = df_unique_ops[df_unique_ops['name'].isin(op_names_interest)].copy()

# loop variable is not called `row`, so the row selected for the single-op replay stays intact
for _, report_row in df_ops_interest.iterrows():
    event = row_to_evt(report_row)
    # Initialize EventReplayer similar to above
    replayer = EventReplayer(event, lazy=True, verbose=False)
    # Extract the serializable info
    repro_info = replayer.get_repro_info()
    repro_data_list.append(repro_info)
    processed_count += 1
print(f"Processed {processed_count} events.")
# --- Save the Extracted Data ---
OUTPUT_REPRO_FILE = 'report_event_replay_ir.json'
if repro_data_list:
    print(f"\nSaving {len(repro_data_list)} extracted operator infos to '{OUTPUT_REPRO_FILE}'...")
    with open(OUTPUT_REPRO_FILE, 'w') as f:
        json.dump(repro_data_list, f, indent=4)
    print("Save complete.")
else:
    # Make the no-match case visible: the zip cell below expects this file to exist.
    print(f"No matching ops found in the report; '{OUTPUT_REPRO_FILE}' was not written.")

In [None]:
# STANDALONE ARTIFACTS FOR REPRO -  independent of model code or tracelens code - same drill as before
# artifacts include (a) the replay IR json, (b) utils.py, (c) batched_replay.py, (d) batched_replay_readme.md
import os
import zipfile

from TraceLens import EventReplay

# Resolve the helper files here as well, so this cell also works when the
# batched-replay section above was skipped.
dir_batched_replay = os.path.dirname(EventReplay.__file__)
utils_file_path = os.path.join(dir_batched_replay, "utils.py")
batched_replay_file = os.path.join(dir_batched_replay, "batched_replay.py")
readme_file_path = os.path.join(dir_batched_replay, "batched_replay_readme.md")
files = [
    OUTPUT_REPRO_FILE,  # Path to the replay IR file
    utils_file_path,  # Path to utils.py
    batched_replay_file,  # Path to batched_replay.py
    readme_file_path  # path to the readme
]
zip_file_path = 'report_replay_code.zip'
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
    for file in files:
        zipf.write(file, arcname=os.path.basename(file))  # store under the base name so the archive is flat
print(f"Created zip file: {zip_file_path}")