In [1]:
import os
import glob
import json
import pandas as pd

In [2]:
def load_gpu_util(dlprof_summary_file):
    """Extract GPU utilization figures from a profiler summary JSON file.

    The file must contain a top-level "Summary Report" object whose
    "GPU Idle %" and "Tensor Core Kernel Utilization %" entries are
    indexable; only the first element of each is used.

    Args:
        dlprof_summary_file: Path of the JSON summary to read.

    Returns:
        dict with two float entries:
            "sm_util": 100 minus the first "GPU Idle %" value.
            "tc_util": the first "Tensor Core Kernel Utilization %" value.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if one of the expected keys is missing.
    """
    with open(dlprof_summary_file) as handle:
        parsed = json.load(handle)
    report = parsed["Summary Report"]

    # Busy percentage is derived as the complement of the reported idle percentage.
    sm_busy = float(100 - report["GPU Idle %"][0])
    tc_busy = float(report["Tensor Core Kernel Utilization %"][0])
    return {"sm_util": sm_busy, "tc_util": tc_busy}

def parse_pl_timings(pl_profile_file):
    """Parse a pipe-delimited profiler report into a name -> timing mapping.

    The first seven lines of the file are skipped unconditionally (treated
    as header material). Of the remaining lines, every line containing a
    "|" is split on that character; the stripped first field becomes the
    key and the stripped second field is converted to float as the value
    (presumably the mean duration of that section -- confirm against the
    profiler's output format). Lines without "|" are ignored. If a name
    appears more than once, the last occurrence wins.

    Args:
        pl_profile_file: Path of the text report to read.

    Returns:
        dict mapping section name (str) to its timing (float).

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the second field of a data row is not a valid float.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(pl_profile_file) as profile:
        lines = [line.rstrip("\n") for line in profile]

    mean_timings = {}
    for line in lines[7:]:
        if "|" in line:
            fields = [field.strip() for field in line.split("|")]
            mean_timings[fields[0]] = float(fields[1])
    return mean_timings

In [3]:
# GPU / compute-type / model combinations whose result folders are scanned.
gpu_names = [
    "v100-16gb-300w"
]

compute_types = [
    "amp"
]

model_names = [
    "distilroberta-base"
]

# Column order must match the concatenation row_1 + row_2 + row_3 built below.
columns = ["gpu", "compute", "model", "seq_len", "batch_size",
           "cpu_time", "forward", "backward", "test_loss",
           "vram_usage", "vram_io",
           "sm_util", "tc_util", ]
rows = []

# Profiler sections whose timings are summed into the "cpu_time" column.
cpu_time_sections = ["get_train_batch", "on_batch_start", "on_train_batch_start",
                     "training_step_end", "on_after_backward",
                     "on_batch_end", "on_train_batch_end"]

for gn in gpu_names:
    for ct in compute_types:
        for mn in model_names:
            # One sub-directory per run, named "<seq_len>-<batch_size>".
            path = os.path.join("./results", gn, ct, mn, "*")
            # Lexicographic order, so e.g. "128-8" sorts after "128-64".
            configs = sorted(glob.glob(path))
            for c in configs:
                print(c)
                try:
                    # basename() instead of split("/") so this also works when
                    # glob returns paths with the platform's own separator.
                    seq_len, batch_size = os.path.basename(c).split("-")
                    row_1 = [gn, ct, mn, int(seq_len), int(batch_size)]

                    pl_timings = parse_pl_timings(os.path.join(c, "pl_profile.txt"))
                    cpu_time = sum(pl_timings[k] for k in cpu_time_sections)

                    metrics_0 = pd.read_csv(os.path.join(c, "version_0", "metrics.csv"))
                    metrics_1 = pd.read_csv(os.path.join(c, "version_1", "metrics.csv"))
                    sm_util = metrics_0["gpu_id: 0/utilization.gpu (%)"].mean()
                    vram_usage = metrics_0["gpu_id: 0/memory.used (MB)"].mean()
                    vram_io = metrics_0["gpu_id: 0/utilization.memory (%)"].mean()
                    # Test loss is averaged over the two logged versions.
                    test_loss = (metrics_0["test_loss"].mean() + metrics_1["test_loss"].mean())/2
                    row_2 = [cpu_time, pl_timings["model_forward"], pl_timings["model_backward"], test_loss]

                    util_data = load_gpu_util(os.path.join(c, "dlprof_summary.json"))
                    # SM utilization is the average of the logged metric and
                    # the value derived from the profiler summary.
                    sm_util = (sm_util + util_data["sm_util"])/2
                    row_3 = [vram_usage, vram_io, sm_util, util_data["tc_util"]]

                    row = row_1 + row_2 + row_3
                    print(row)
                    rows.append(row)
                except Exception as e:
                    # Best-effort collection: an incomplete or malformed run is
                    # skipped, but report which one and the exception type so a
                    # bare KeyError message (just the quoted key) is not cryptic.
                    print("skipping {}: {}: {}".format(c, type(e).__name__, e))

./results/v100-16gb-300w/amp/distilroberta-base/128-128
['v100-16gb-300w', 'amp', 'distilroberta-base', 128, 128, 0.004445738, 0.018347, 0.13208, 0.2723444998264313, 8631.666666666666, 43.166666666666664, 59.75239179611637, 40.07509189659681]
./results/v100-16gb-300w/amp/distilroberta-base/128-32
['v100-16gb-300w', 'amp', 'distilroberta-base', 128, 32, 0.0012208020000000002, 0.01968, 0.038257, 0.29111315310001373, 4367.666666666667, 21.5, 35.37566173502956, 37.79442775995403]
./results/v100-16gb-300w/amp/distilroberta-base/128-64
['v100-16gb-300w', 'amp', 'distilroberta-base', 128, 64, 0.001465613, 0.018414, 0.066787, 0.280166357755661, 5683.0, 33.0, 47.14957089250426, 39.0195921764923]
./results/v100-16gb-300w/amp/distilroberta-base/128-8
['v100-16gb-300w', 'amp', 'distilroberta-base', 128, 8, 0.000818426, 0.016901, 0.032901, 0.3316252827644348, 3280.6666666666665, 8.416666666666666, 16.981641262911136, 36.40462718063971]
./results/v100-16gb-300w/amp/distilroberta-base/512-128
'get_tr

In [4]:
# Assemble the collected per-run rows into a table and preview the first rows.
df = pd.DataFrame(data=rows, columns=columns)
df.head(n=5)

Unnamed: 0,gpu,compute,model,seq_len,batch_size,cpu_time,forward,backward,test_loss,vram_usage,vram_io,sm_util,tc_util
0,v100-16gb-300w,amp,distilroberta-base,128,128,0.004446,0.018347,0.13208,0.272344,8631.666667,43.166667,59.752392,40.075092
1,v100-16gb-300w,amp,distilroberta-base,128,32,0.001221,0.01968,0.038257,0.291113,4367.666667,21.5,35.375662,37.794428
2,v100-16gb-300w,amp,distilroberta-base,128,64,0.001466,0.018414,0.066787,0.280166,5683.0,33.0,47.149571,39.019592
3,v100-16gb-300w,amp,distilroberta-base,128,8,0.000818,0.016901,0.032901,0.331625,3280.666667,8.416667,16.981641,36.404627
4,v100-16gb-300w,amp,distilroberta-base,512,32,0.001251,0.019799,0.18679,0.182629,13479.666667,51.833333,69.563309,32.270593
