In [None]:
import time
import torch # needed for replay feature
from TraceLens import TreePerfAnalyzer, EventReplayer

In [None]:
# Path to the profiler trace to analyze; replace it with your own profile path.
# A single-rank profile taken from a multi-GPU run works as well.
path ="/home/buffer/resnet_trace.json"
# Build the analyzer from the trace file; later cells query it through `perf_analyzer`.
perf_analyzer = TreePerfAnalyzer.from_file(path)

In [None]:
# Replay works for any op. This cell selects the convolution ops; to analyze
# GEMMs instead, use ['aten::addmm', 'aten::mm', 'aten::_scaled_mm'] as op names.
# NOTE: the `gemm` variable names are kept because later cells refer to them.
target_op_names = ['aten::convolution']
gemm_events = [
    candidate
    for candidate in perf_analyzer.tree.events
    if candidate['name'] in target_op_names
]

# Per-op perf metrics first, then a summary that aggregates them with 'mean'.
df_gemm_ops = perf_analyzer.build_df_perf_metrics(gemm_events)
df_gemm_summary = perf_analyzer.summarize_df_perf_metrics(df_gemm_ops, ['mean'])
df_gemm_summary

In [None]:
# Device on which the selected op is replayed.
device="cuda"
# Position (0-based, used with .iloc) of the summary row to replay.
row_idx = 1
row = df_gemm_summary.iloc[row_idx]
uid = row['UID_first'] # UID stored in the 'UID_first' column of the row of interest
# Look up the trace event that corresponds to this UID.
evt = perf_analyzer.tree.get_UID2event(uid)
# Build a replayer for that event and run it once.
my_replayer = EventReplayer(evt, device=device, verbose=False)
my_replayer.replay()


In [None]:
# Display the repro info of the replayed event (the same call is used further
# down to build the JSON replay IR).
my_replayer.get_repro_info()

In [None]:
def benchmark_func(func, device, warmup=50, avg_steps=100):
    """
    Benchmark a function with warmup and average steps.

    The result is host wall-clock time measured around `avg_steps`
    back-to-back calls of `func`, divided by `avg_steps`.
    Disclaimer: This method would be inaccurate for very short ops.

    Args:
        func (callable): The function to benchmark; it is called with no arguments.
        device: Device the benchmarked work runs on (anything accepted by
            `torch.device`, or None). For a CUDA device, the device is
            synchronized right before the timer starts and right before it
            stops. For a non-CUDA device no synchronization is done. None
            means "the current CUDA device" when CUDA is available.
        warmup (int): Number of untimed warmup iterations.
        avg_steps (int): Number of timed iterations to average over; must be >= 1.
    Returns:
        float: Average time taken per iteration in microseconds.
    Raises:
        ValueError: If `avg_steps` is smaller than 1.
    """
    # Fail early instead of dividing by zero after all the warmup work is done.
    if avg_steps < 1:
        raise ValueError(f"avg_steps must be >= 1, got {avg_steps}")

    # Only CUDA devices have to be synchronized with the host.
    if device is None:
        needs_sync = torch.cuda.is_available()
    else:
        needs_sync = torch.device(device).type == "cuda"

    def _sync():
        # Wait for all queued work on the device to finish (no-op off CUDA).
        if needs_sync:
            torch.cuda.synchronize(device)

    # Warmup phase
    for _ in range(warmup):
        func()

    # Benchmarking phase
    _sync()
    # perf_counter is monotonic and has the highest available resolution,
    # unlike time.time(), which can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    for _ in range(avg_steps):
        func()
    _sync()
    end_time = time.perf_counter()

    elapsed_time = end_time - start_time
    avg_time_sec = elapsed_time / avg_steps
    avg_time_us = avg_time_sec * 1e6

    return avg_time_us

In [None]:
# Fidelity check: compare the benchmarked time of the replayed op against the
# mean kernel time that the profile summary reports for the selected row.
replay_time_mean = benchmark_func(my_replayer.replay, device)
profile_time_mean = row['Kernel Time (µs)_mean']

# Absolute gap (replay minus profile), and the same gap relative to the profile.
abs_diff_us = replay_time_mean - profile_time_mean
percent_diff = abs_diff_us / profile_time_mean * 100

print(
    f"Average time per replay: {replay_time_mean:.2f} us",
    f"Profile time mean: {profile_time_mean:.2f} us",
    f"Percent difference: {percent_diff:.2f}%",
    f"Abs difference: {abs_diff_us:.2f} us",
    sep="\n",
)


In [None]:
# Collect one representative event per summary row for batched replay.
# A comprehension is used on purpose: its loop variables stay local, so the
# notebook-level `row` and `uid` chosen in the single-op replay cell are not
# overwritten. (The fidelity-check cell reads `row`; with a plain `for` loop
# here, re-running that cell afterwards would compare against the last
# summary row instead of the replayed one.)
unique_gemm_events = [
    perf_analyzer.tree.get_UID2event(summary_row['UID_first'])
    for _, summary_row in df_gemm_summary.iterrows()
]
    

In [None]:
# Save the replay info (IR) of every selected event as JSON.
import json

# --- Extract the serializable repro info ---
repro_data_list = []
for event in unique_gemm_events:
    # Initialize EventReplayer (device doesn't matter here, just for schema matching).
    # lazy=True because only the IR is needed, not immediate tensor creation.
    # Verbose can be helpful for debugging schema mismatches during extraction.
    replayer = EventReplayer(event, lazy=True, verbose=False) # Set verbose=True for debug

    repro_info = replayer.get_repro_info()
    repro_data_list.append(repro_info)

# Derived from the list so the count can never drift from what was collected.
processed_count = len(repro_data_list)

# --- Save the Extracted Data ---
OUTPUT_REPRO_FILE = '/home/buffer/event_replay_ir.json'
if repro_data_list:
    print(f"\nSaving {len(repro_data_list)} extracted operator infos to '{OUTPUT_REPRO_FILE}'...")
    with open(OUTPUT_REPRO_FILE, 'w') as f:
        json.dump(repro_data_list, f, indent=4)
    print("Save complete.")
else:
    # Say so explicitly: the following cells still read OUTPUT_REPRO_FILE and
    # would otherwise silently pick up a file left over from an earlier run.
    print(f"\nNo operator infos were extracted; '{OUTPUT_REPRO_FILE}' was not written. "
          "Any existing file at that path is stale.")


In [None]:
# Run the standalone batched replay script on the saved IR in a subprocess.
import subprocess
import sys
from TraceLens.EventReplay import batched_replay

cmd = [
    # Use the interpreter that runs this kernel: a bare "python" from PATH may
    # be a different environment without torch / TraceLens installed.
    sys.executable or "python",
    batched_replay.__file__, # batched_replay.py
    str(OUTPUT_REPRO_FILE),  # argument: path to ir json
    "--verbose",
]

print("Running subprocess:\n", " ".join(cmd), "\n")
# Capture both streams as text so they can be shown in the notebook output.
result = subprocess.run(cmd, capture_output=True, text=True)

print("stdout:\n", result.stdout)
if result.stderr:
    print("stderr:\n", result.stderr)
# Surface failures explicitly; stderr alone is not a reliable failure signal.
if result.returncode != 0:
    print(f"batched_replay exited with return code {result.returncode}")


In [None]:
# STANDALONE ARTIFACTS FOR REPRO - independent of model code or tracelens code
# artifacts include (a) the replay IR json, (b) utils.py, (c) batched_replay.py,
# (d) batched_replay_readme.md
import os
import zipfile
# Imported here as well so this cell does not rely on an earlier cell's import.
from TraceLens.EventReplay import batched_replay
from TraceLens.EventReplay import utils as tl_utils

files = [
    OUTPUT_REPRO_FILE,
    tl_utils.__file__,  # Path to utils.py
    batched_replay.__file__,  # Path to batched_replay.py
    # Readme expected in the same directory as batched_replay.py.
    os.path.join(os.path.dirname(batched_replay.__file__), 'batched_replay_readme.md'),
]
zip_file_path = '/home/buffer/replay_code.zip'
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
    for file in files:
        # Store every file flat in the archive, under its base name only.
        zipf.write(file, arcname=os.path.basename(file))
print(f"Created zip file: {zip_file_path}")

In [None]:
# Display the path of the batched_replay module file.
batched_replay.__file__